Vishwas1 committed on
Commit
8955717
·
verified ·
1 Parent(s): 4a9e5f8

Update train_model.py

Browse files
Files changed (1) hide show
  1. train_model.py +17 -4
train_model.py CHANGED
@@ -68,16 +68,29 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
68
 
69
  # Log some examples to check dataset structure
70
  logging.info(f"Example data from the dataset: {dataset[:5]}")
 
 
 
 
 
 
71
 
72
  def tokenize_function(examples):
73
  try:
 
 
 
 
 
 
 
74
  # Tokenize with truncation and padding
75
  tokens = tokenizer(
76
  examples['text'],
77
  truncation=True,
78
  max_length=sequence_length,
79
- padding='max_length', # Force padding to max length for debugging
80
- return_tensors=None # Let the collator handle tensor conversion
81
  )
82
  # Log the tokens for debugging
83
  logging.info(f"Tokenized example: {tokens}")
@@ -87,7 +100,7 @@ def load_and_prepare_dataset(task, dataset_name, tokenizer, sequence_length):
87
  logging.error(f"Problematic example: {examples}")
88
  raise e
89
 
90
- # Tokenize the dataset
91
  tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
92
  logging.info("Dataset tokenization complete.")
93
  return tokenized_datasets
@@ -215,7 +228,7 @@ def main():
215
  if args.task == "generation":
216
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
217
  elif args.task == "classification":
218
- data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Handle padding dynamically during batching
219
  else:
220
  logging.error("Unsupported task type for data collator.")
221
  raise ValueError("Unsupported task type for data collator.")
 
68
 
69
  # Log some examples to check dataset structure
70
  logging.info(f"Example data from the dataset: {dataset[:5]}")
71
+
72
+ def clean_text(text):
73
+ # Ensure each text is a string
74
+ if isinstance(text, list):
75
+ return " ".join([str(t) for t in text])
76
+ return str(text)
77
 
78
  def tokenize_function(examples):
79
  try:
80
+ # Clean text to ensure correct format
81
+ examples['text'] = [clean_text(text) for text in examples['text']]
82
+
83
+ # Log the type and structure of text to debug
84
+ logging.info(f"Type of examples['text']: {type(examples['text'])}")
85
+ logging.info(f"First example type: {type(examples['text'][0])}")
86
+
87
  # Tokenize with truncation; padding is deferred to the data collator
88
  tokens = tokenizer(
89
  examples['text'],
90
  truncation=True,
91
  max_length=sequence_length,
92
+ padding=False, # Defer padding to data collator
93
+ return_tensors=None # Let the data collator handle tensor creation
94
  )
95
  # Log the tokens for debugging
96
  logging.info(f"Tokenized example: {tokens}")
 
100
  logging.error(f"Problematic example: {examples}")
101
  raise e
102
 
103
+ # Tokenize the dataset using the modified tokenize_function
104
  tokenized_datasets = dataset.shuffle(seed=42).select(range(500)).map(tokenize_function, batched=True)
105
  logging.info("Dataset tokenization complete.")
106
  return tokenized_datasets
 
228
  if args.task == "generation":
229
  data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
230
  elif args.task == "classification":
231
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest') # Handle padding dynamically during batching
232
  else:
233
  logging.error("Unsupported task type for data collator.")
234
  raise ValueError("Unsupported task type for data collator.")