smhavens committed
Commit 69ae466 · 1 Parent(s): 5ee2a12

mass reformatting for tokenization of dataset

Files changed (1)
  1. app.py +7 -3
app.py CHANGED
@@ -28,7 +28,7 @@ def mean_pooling(model_output, attention_mask):
 
 
 def tokenize_function(examples):
-    return tokenizer(examples["text"], padding="max_length", truncation=True)
+    return tokenizer(examples["text"])
 
 
 def compute_metrics(eval_pred):
@@ -51,13 +51,17 @@ def training():
     # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
     # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
 
+    dataset = dataset["train"].map(tokenize_function, batched=True)
+    dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
+    dataset.format['type']
+
+    print(dataset)
+
     train_examples = []
     train_data = dataset['train']
     # For agility we only 1/2 of our available data
     n_examples = dataset['train'].num_rows // 2
 
-
-
     for i in range(n_examples):
         example = train_data[i]
         # print(example)
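
The first hunk drops padding="max_length" and truncation=True from tokenize_function, so examples keep their natural token lengths and padding is deferred to batch time. A minimal sketch of the pattern this enables, dynamic padding via a data collator; the checkpoint name and the collator wiring are assumptions for illustration, not part of this commit:

from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint

def tokenize_function(examples):
    # No padding/truncation here; each example keeps its natural length.
    return tokenizer(examples["text"])

# Pads each batch only to the length of its longest member,
# rather than to a fixed model-wide max length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Dynamic padding typically wastes far fewer pad tokens than padding="max_length", since most batches never approach the model maximum.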
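The second hunk maps tokenize_function over the train split with batched=True and calls set_format(type="torch", ...), so indexing the dataset yields PyTorch tensors for the listed columns (the bare dataset.format['type'] line only reads that setting back). A sketch of how such a torch-formatted dataset is commonly consumed, reusing the assumed data_collator above; the DataLoader wiring is likewise an assumption, not part of this commit:

from torch.utils.data import DataLoader

# `dataset` is the tokenized, torch-formatted dataset from the hunk above.
loader = DataLoader(dataset, batch_size=8, collate_fn=data_collator)

for batch in loader:
    # Each batch is a dict of tensors (input_ids, token_type_ids,
    # attention_mask, labels), padded to the batch's longest sequence.
    print(batch["input_ids"].shape)
    break

Note that after the reassignment dataset = dataset["train"].map(...), the unchanged dataset['train'] a few lines later would index a column of the mapped Dataset rather than a split, likely raising a KeyError at runtime.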