---
datasets:
- asr-malayalam/indicvoices-v1a
- Tensoic/GPTeacher-Malayalam
language:
- ml
- en
metrics:
- accuracy
base_model:
- google/mt5-small
pipeline_tag: translation
---
|
Training script for fine-tuning `google/mt5-small` for English-to-Malayalam translation:

```python
import argparse

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from utils import compute_metrics
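# NOTE: `utils` is a local helper module that is not shown here. `compute_metrics`
# is assumed to take the (generated_ids, label_ids) pair produced when
# predict_with_generate=True and return a dict of metric values, e.g.
# {"accuracy": ...} to match the metric declared in the metadata above.
# A sketch of one possible implementation is given after the script.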
|
|
|
def load_dataset(file_path):
    """Load the parallel corpus from CSV and split off a validation set.

    The CSV is expected to provide `english_text` and `malayalam_text` columns.
    """
    df = pd.read_csv(file_path)
    dataset = Dataset.from_pandas(df)
    # Hold out 10% of the rows for evaluation (fixed seed for reproducibility)
    split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
    return split_dataset
|
|
|
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize source (English) and target (Malayalam) texts."""
    inputs = examples["english_text"]
    targets = examples["malayalam_text"]

    # Tokenize inputs and targets in a single call; `text_target` routes the
    # targets through the tokenizer's target-side settings (the modern
    # replacement for the deprecated `as_target_tokenizer` context manager).
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Replace padding token ids in the labels with -100 so that padding
    # positions are ignored when the loss is computed.
    model_inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in model_inputs["labels"]
    ]
    return model_inputs
|
|
|
def main(args):
    # Load tokenizer and base model
    model_name = "google/mt5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Load and split the parallel corpus
    dataset = load_dataset("dataset/malayalam_dataset.csv")

    # Tokenize both splits, dropping the raw text columns
    tokenized_datasets = dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=dataset["train"].column_names,
    )
|
|
|
    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="./model",
        evaluation_strategy="epoch",
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        save_total_limit=2,
        predict_with_generate=True,
        logging_dir="./logs",
        logging_steps=100,
        push_to_hub=True,
    )
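    # Note: push_to_hub=True requires being logged in to the Hugging Face Hub
    # beforehand (e.g. via `huggingface-cli login`).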
|
|
|
    # Create the data collator for seq2seq batching
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Initialize the trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the final model and tokenizer
    trainer.save_model("./model")
    tokenizer.save_pretrained("./model")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    args = parser.parse_args()
    main(args)
```
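
The script expects `dataset/malayalam_dataset.csv` with `english_text` and `malayalam_text` columns, plus a local `utils.py` providing `compute_metrics`. That helper is not included here; the sketch below is one possible implementation, assuming the sequence-level exact-match "accuracy" declared in the metadata above and the `google/mt5-small` tokenizer:

```python
# utils.py -- illustrative sketch only; not part of the original script.
import numpy as np
from transformers import AutoTokenizer

_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")


def compute_metrics(eval_preds):
    """Sequence-level exact-match accuracy on decoded predictions."""
    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 marks ignored label positions; map them back to the pad token
    # so they can be decoded.
    labels = np.where(labels != -100, labels, _tokenizer.pad_token_id)
    decoded_preds = _tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = _tokenizer.batch_decode(labels, skip_special_tokens=True)
    matches = [
        int(pred.strip() == label.strip())
        for pred, label in zip(decoded_preds, decoded_labels)
    ]
    return {"accuracy": float(np.mean(matches))}
```

Training can then be launched with the argparse defaults shown above, e.g. `python train.py --epochs 3 --batch_size 8 --learning_rate 2e-5` (the script filename `train.py` is an assumption).

After training, the checkpoint saved to `./model` can be loaded for inference. A minimal sketch using the standard `transformers` generation API (beam size chosen arbitrarily):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForSeq2SeqLM.from_pretrained("./model")

text = "How are you today?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```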