---
datasets:
- asr-malayalam/indicvoices-v1a
- Tensoic/GPTeacher-Malayalam
language:
- ml
- en
metrics:
- accuracy
base_model:
- deepseek-ai/DeepSeek-R1
pipeline_tag: translation
---
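The training script below fine-tunes `google/mt5-small` for English → Malayalam translation using the Hugging Face `Seq2SeqTrainer` API.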
```python
import argparse

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from utils import compute_metrics


def load_dataset(file_path):
    """Load the parallel corpus from a CSV file and split it into train/validation sets."""
    df = pd.read_csv(file_path)
    dataset = Dataset.from_pandas(df)
    # Hold out 10% of the examples for validation
    split_dataset = dataset.train_test_split(test_size=0.1)
    return split_dataset


def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize the source (English) and target (Malayalam) texts."""
    inputs = examples["english_text"]
    targets = examples["malayalam_text"]
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    # Tokenize the targets with the target-side tokenizer settings
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def main(args):
    # Load tokenizer and model
    model_name = "google/mt5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Load and split the dataset
    dataset = load_dataset("dataset/malayalam_dataset.csv")

    # Tokenize both splits
    tokenized_datasets = dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="./model",
        evaluation_strategy="epoch",
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        save_total_limit=2,
        predict_with_generate=True,
        logging_dir="./logs",
        logging_steps=100,
        push_to_hub=True,
    )

    # Create the data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Initialize the trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train the model
    trainer.train()

    # Save the final model and tokenizer
    trainer.save_model("./model")
    tokenizer.save_pretrained("./model")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    args = parser.parse_args()
    main(args)
```
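
The script expects a parallel CSV at `dataset/malayalam_dataset.csv` with `english_text` and `malayalam_text` columns (the path and column names come from `load_dataset` and `preprocess_function` above). A minimal sketch of how such a file might be produced, with two illustrative placeholder rows:

```python
import pandas as pd

# Build the CSV that the training script reads. These two rows are only
# illustrative placeholders; the real file should hold the full parallel corpus.
pd.DataFrame(
    {
        "english_text": ["Good morning.", "Where is the library?"],
        "malayalam_text": ["സുപ്രഭാതം.", "ലൈബ്രറി എവിടെയാണ്?"],
    }
).to_csv("dataset/malayalam_dataset.csv", index=False)
```

Assuming the script is saved as `train.py` (the file name is not stated here), training can then be launched with, for example, `python train.py --epochs 3 --batch_size 8 --learning_rate 2e-5`; the flags correspond to the `argparse` arguments defined at the bottom of the script.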
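
The script imports `compute_metrics` from a local `utils` module that is not shown in this card. A minimal sketch of what such a function could look like, assuming a sacreBLEU-based metric and the same `google/mt5-small` tokenizer; the repository's actual implementation may differ:

```python
import numpy as np
import evaluate
from transformers import AutoTokenizer

# Assumption: the tokenizer is reloaded here so the module is self-contained.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
bleu = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    """Decode generated ids and reference labels, then score with sacreBLEU."""
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace any -100 ignore indices with the pad token id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )
    return {"bleu": result["score"]}
```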
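
Once training finishes, the model and tokenizer are saved to `./model`. A short inference sketch, assuming that directory exists locally:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned checkpoint saved by the training script
tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForSeq2SeqLM.from_pretrained("./model")

# The script trains without a task prefix, so the raw English sentence is the input
text = "How are you today?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```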