import os
import pandas as pd
from torch.utils.data import Dataset
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    Trainer,
    TrainingArguments,
)
from huggingface_hub import HfFolder
# Save the Hugging Face token (if not already saved)
token = os.getenv("HF_TOKEN")
if token:
    HfFolder.save_token(token)
    print("Token saved successfully!")
else:
    print("HF_TOKEN environment variable not set. Ensure your token is saved for authentication.")

# Step 1: Define Dataset Class
class HindiDataset(Dataset):
    def __init__(self, data_path, tokenizer, max_length=512):
        """
        Dataset class for Hindi translation tasks.

        Args:
            data_path (str): Path to the dataset file (a TSV with "source" and "target" columns).
            tokenizer (MBart50TokenizerFast): Tokenizer for mBART-50.
            max_length (int): Maximum sequence length for tokenization.
        """
        self.data = pd.read_csv(data_path, sep="\t")
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source = self.data.iloc[idx]["source"]
        target = self.data.iloc[idx]["target"]

        # Tokenizing source and target in one call (via text_target) makes the
        # tokenizer emit "labels" with the correct target-language tokens.
        encodings = self.tokenizer(
            source,
            text_target=target,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        labels = encodings["labels"].squeeze()
        # Mask padding in the labels with -100 so it is ignored by the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": labels,
        }

# Step 2: Load Tokenizer and Dataset
data_path = "hindi_dataset.tsv"  # Path to your dataset file
# mbart-large-50 pairs with the MBart50 tokenizer. The language codes tell it
# which language tokens to prepend; en_XX assumes English source text, so
# adjust src_lang if your source side is a different language.
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50", src_lang="en_XX", tgt_lang="hi_IN"
)
train_dataset = HindiDataset(data_path, tokenizer)
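
# The dataset loader expects a tab-separated file with "source" and "target"
# columns. A minimal sketch of a smoke-test file (the sentences and filename
# below are illustrative, not part of any real dataset):
#
#     pd.DataFrame(
#         {"source": ["Hello, how are you?"], "target": ["नमस्ते, आप कैसे हैं?"]}
#     ).to_csv("hindi_dataset_sample.tsv", sep="\t", index=False)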

# Step 3: Load Pre-trained mBART Model
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

# Step 4: Define Training Arguments
# (No evaluation strategy is set because no eval_dataset is passed to the
# Trainer below; see the note after the Trainer for how to enable it.)
training_args = TrainingArguments(
    output_dir="./mbart-hindi",          # Output directory for model checkpoints
    per_device_train_batch_size=4,       # Training batch size per GPU
    per_device_eval_batch_size=4,        # Evaluation batch size per GPU
    save_steps=500,                      # Save a checkpoint every 500 steps
    save_total_limit=2,                  # Keep only the 2 most recent checkpoints
    logging_dir="./logs",                # Directory for training logs
    num_train_epochs=3,                  # Number of training epochs
    learning_rate=5e-5,                  # Learning rate
    weight_decay=0.01,                   # Weight decay for the optimizer
    report_to="none",                    # Disable third-party logging
)

# Step 5: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)
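
# If you have a held-out split, you can enable periodic evaluation. A sketch,
# assuming a hypothetical "hindi_dev.tsv" in the same TSV format (the argument
# is named evaluation_strategy on older transformers releases):
#
#     eval_dataset = HindiDataset("hindi_dev.tsv", tokenizer)
#     training_args = TrainingArguments(..., eval_strategy="steps", eval_steps=500)
#     trainer = Trainer(..., eval_dataset=eval_dataset)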

# Step 6: Train the Model
print("Starting training...")
trainer.train()
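
# A run interrupted mid-training can be resumed from the most recent
# checkpoint under output_dir:
#
#     trainer.train(resume_from_checkpoint=True)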

# Step 7: Save the Fine-Tuned Model
output_dir = "./mbart-hindi-model"
print(f"Saving fine-tuned model to {output_dir}...")
trainer.save_model(output_dir)

# Step 8: Test the Fine-Tuned Model
print("Testing the fine-tuned model...")
model = MBartForConditionalGeneration.from_pretrained(output_dir)
tokenizer = MBart50TokenizerFast.from_pretrained(output_dir)

test_text = "Translate this to Hindi."
inputs = tokenizer(test_text, return_tensors="pt")
# mBART-50 needs the target language forced as the first generated token,
# otherwise it tends to continue in the source language.
outputs = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"])
print("Generated Translation:", tokenizer.decode(outputs[0], skip_special_tokens=True))