art-manuh committed
Commit 25ebe24 · verified · 1 Parent(s): a6835cd

Changed model

Files changed (1): app.py (+12 -11)
app.py CHANGED
@@ -14,7 +14,7 @@ dataset = load_dataset("mwitiderrick/swahili")
 print(f"Dataset columns: {dataset['train'].column_names}")

 # Initialize the tokenizer and model
-model_name = "gpt2"  # Use GPT-2 for text generation
+model_name = "distilgpt2"  # Use a smaller GPT-2 variant (DistilGPT-2) for efficiency
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 model = GPT2LMHeadModel.from_pretrained(model_name)

@@ -24,21 +24,21 @@ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

 # Preprocess the dataset
 def preprocess_function(examples):
-    # Tokenize and format the dataset
     encodings = tokenizer(
-        examples['text'],  # Use 'text' column from your dataset
+        examples['text'],
         truncation=True,
-        padding='max_length',  # Ensure consistent length
+        padding='max_length',
         max_length=512
     )
-    encodings['labels'] = encodings['input_ids']  # Use input_ids directly as labels
+    encodings['labels'] = encodings['input_ids']
     return encodings

 # Tokenize the dataset
 try:
     tokenized_datasets = dataset.map(
         preprocess_function,
-        batched=True
+        batched=True,
+        batch_size=1000  # Adjust batch size for efficiency
     )
 except Exception as e:
     print(f"Error during tokenization: {e}")
@@ -46,13 +46,14 @@ except Exception as e:
 # Define training arguments
 training_args = TrainingArguments(
     output_dir='./results',
-    per_device_train_batch_size=4,
+    per_device_train_batch_size=2,  # Lowered batch size to prevent OOM errors
     num_train_epochs=1,
     logging_dir='./logs',
-    logging_steps=500,  # Log every 500 steps
-    evaluation_strategy="steps",  # Use evaluation strategy
-    save_steps=10_000,  # Save checkpoint every 10,000 steps
-    save_total_limit=2,  # Keep only the last 2 checkpoints
+    logging_steps=500,
+    evaluation_strategy="steps",
+    save_steps=5000,  # Save checkpoints more frequently
+    save_total_limit=2,
+    gradient_accumulation_steps=8,  # Accumulate gradients to simulate a larger batch size
 )

 # Define Trainer
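The diff stops at the "# Define Trainer" comment, so the Trainer wiring itself is not part of this commit. A minimal sketch of what typically follows, assuming the standard transformers Trainer API and a held-out split carved from the train set (the split and variable names below are illustrative, not taken from app.py):

from transformers import Trainer

# Illustrative only: app.py's actual Trainer setup is not shown in this diff.
split = tokenized_datasets["train"].train_test_split(test_size=0.1)  # assumed eval split

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],  # needed because evaluation_strategy="steps"
)

trainer.train()

With per_device_train_batch_size=2 and gradient_accumulation_steps=8, the effective batch size per device works out to 16, which is what the gradient-accumulation comment in the diff is aiming for.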