Changed model
app.py CHANGED
@@ -14,7 +14,7 @@ dataset = load_dataset("mwitiderrick/swahili")
 print(f"Dataset columns: {dataset['train'].column_names}")
 
 # Initialize the tokenizer and model
-model_name = "gpt2"  # Use GPT-2 for
+model_name = "gpt2-small"  # Use a smaller variant of GPT-2 for efficiency
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 model = GPT2LMHeadModel.from_pretrained(model_name)
 
@@ -24,21 +24,21 @@ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
 
 # Preprocess the dataset
 def preprocess_function(examples):
-    # Tokenize and format the dataset
     encodings = tokenizer(
         examples['text'],
         truncation=True,
         padding='max_length',
         max_length=512
     )
     encodings['labels'] = encodings['input_ids']
     return encodings
 
 # Tokenize the dataset
 try:
     tokenized_datasets = dataset.map(
         preprocess_function,
-        batched=True
+        batched=True,
+        batch_size=1000  # Adjust batch size for efficiency
     )
 except Exception as e:
     print(f"Error during tokenization: {e}")
@@ -46,13 +46,14 @@ except Exception as e:
 # Define training arguments
 training_args = TrainingArguments(
     output_dir='./results',
-    per_device_train_batch_size=
+    per_device_train_batch_size=2,  # Lowered batch size to prevent OOM errors
     num_train_epochs=1,
     logging_dir='./logs',
     logging_steps=500,
     evaluation_strategy="steps",
-    save_steps=
+    save_steps=5000,  # Save checkpoints more frequently
     save_total_limit=2,
+    gradient_accumulation_steps=8,  # Accumulate gradients to simulate larger batch size
 )
 
 # Define Trainer