Spaces:
Sleeping
Sleeping
smhavens
committed on
Commit
·
69ae466
1
Parent(s):
5ee2a12
mass reformatting for tokenization of dataset
Browse files
app.py
CHANGED
@@ -28,7 +28,7 @@ def mean_pooling(model_output, attention_mask):
|
|
28 |
|
29 |
|
30 |
def tokenize_function(examples):
|
31 |
-
return tokenizer(examples["text"]
|
32 |
|
33 |
|
34 |
def compute_metrics(eval_pred):
|
@@ -51,13 +51,17 @@ def training():
|
|
51 |
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
|
52 |
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
train_examples = []
|
55 |
train_data = dataset['train']
|
56 |
# For agility we only use 1/2 of our available data
|
57 |
n_examples = dataset['train'].num_rows // 2
|
58 |
|
59 |
-
|
60 |
-
|
61 |
for i in range(n_examples):
|
62 |
example = train_data[i]
|
63 |
# print(example)
|
|
|
28 |
|
29 |
|
30 |
def tokenize_function(examples):
|
31 |
+
return tokenizer(examples["text"])
|
32 |
|
33 |
|
34 |
def compute_metrics(eval_pred):
|
|
|
51 |
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
|
52 |
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
|
53 |
|
54 |
+
dataset = dataset["train"].map(tokenize_function, batched=True)
|
55 |
+
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
|
56 |
+
dataset.format['type']
|
57 |
+
|
58 |
+
print(dataset)
|
59 |
+
|
60 |
train_examples = []
|
61 |
train_data = dataset['train']
|
62 |
# For agility we only use 1/2 of our available data
|
63 |
n_examples = dataset['train'].num_rows // 2
|
64 |
|
|
|
|
|
65 |
for i in range(n_examples):
|
66 |
example = train_data[i]
|
67 |
# print(example)
|