Spaces:
Sleeping
Sleeping
Update train_model.py
Browse files- train_model.py +17 -4
train_model.py
CHANGED
@@ -68,16 +68,29 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
|
|
68 |
|
69 |
# Log some examples to check dataset structure
|
70 |
logging.info(f"Example data from the dataset: {dataset[:5]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
def tokenize_function(examples):
|
73 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
# Tokenize with truncation and padding
|
75 |
tokens = tokenizer(
|
76 |
examples['text'],
|
77 |
truncation=True,
|
78 |
max_length=sequence_length,
|
79 |
-
padding=
|
80 |
-
return_tensors=None # Let the collator handle tensor
|
81 |
)
|
82 |
# Log the tokens for debugging
|
83 |
logging.info(f"Tokenized example: {tokens}")
|
@@ -87,7 +100,7 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
|
|
87 |
logging.error(f"Problematic example: {examples}")
|
88 |
raise e
|
89 |
|
90 |
-
# Tokenize the dataset
|
91 |
tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
|
92 |
logging.info("Dataset tokenization complete.")
|
93 |
return tokenized_datasets
|
@@ -215,7 +228,7 @@ def main():
|
|
215 |
if args.task == "generation":
|
216 |
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
217 |
elif args.task == "classification":
|
218 |
-
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Handle padding dynamically during batching
|
219 |
else:
|
220 |
logging.error("Unsupported task type for data collator.")
|
221 |
raise ValueError("Unsupported task type for data collator.")
|
|
|
68 |
|
69 |
# Log some examples to check dataset structure
|
70 |
logging.info(f"Example data from the dataset: {dataset[:5]}")
|
71 |
+
|
72 |
+
def clean_text(text):
    """Coerce one raw dataset value into a single string for the tokenizer.

    Args:
        text: A raw value taken from the ``text`` column. It may be a
            string, a list/tuple of fragments (possibly nested), ``None``,
            or any other object.

    Returns:
        str: ``""`` for ``None``; the space-joined, recursively cleaned
        fragments for a list or tuple (``None`` fragments are dropped);
        ``str(text)`` for everything else.
    """
    if text is None:
        # str(None) would produce the literal word "None", which would then
        # be tokenized as if it were real text.
        return ""
    if isinstance(text, (list, tuple)):
        # Recurse so nested sequences are flattened instead of being
        # rendered as their Python repr (e.g. "['a', 'b']").
        return " ".join(
            clean_text(part) for part in text if part is not None
        )
    return str(text)
|
77 |
|
78 |
def tokenize_function(examples):
|
79 |
try:
|
80 |
+
# Clean text to ensure correct format
|
81 |
+
examples['text'] = [clean_text(text) for text in examples['text']]
|
82 |
+
|
83 |
+
# Log the type and structure of text to debug
|
84 |
+
logging.info(f"Type of examples['text']: {type(examples['text'])}")
|
85 |
+
logging.info(f"First example type: {type(examples['text'][0])}")
|
86 |
+
|
87 |
# Tokenize with truncation only; padding is deferred to the data collator
|
88 |
tokens = tokenizer(
|
89 |
examples['text'],
|
90 |
truncation=True,
|
91 |
max_length=sequence_length,
|
92 |
+
padding=False, # Defer padding to data collator
|
93 |
+
return_tensors=None # Let the data collator handle tensor creation
|
94 |
)
|
95 |
# Log the tokens for debugging
|
96 |
logging.info(f"Tokenized example: {tokens}")
|
|
|
100 |
logging.error(f"Problematic example: {examples}")
|
101 |
raise e
|
102 |
|
103 |
+
# Tokenize the dataset using the modified tokenize_function
|
104 |
tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
|
105 |
logging.info("Dataset tokenization complete.")
|
106 |
return tokenized_datasets
|
|
|
228 |
if args.task == "generation":
|
229 |
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
|
230 |
elif args.task == "classification":
|
231 |
+
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest') # Handle padding dynamically during batching
|
232 |
else:
|
233 |
logging.error("Unsupported task type for data collator.")
|
234 |
raise ValueError("Unsupported task type for data collator.")
|