Spaces:

nullHawk
/

english-hindi_translator

Sleeping

App Files Files Community

nullHawk committed on Jun 10

Commit

9a41f63

verified ·

1 Parent(s): 23beb2b

init

Browse files

Files changed (27) hide show

.gitattributes +1 -0
__pycache__/inference.cpython-310.pyc +0 -0
app.py +55 -0
bin/eng_vocab.pkl +3 -0
bin/hin_vocab.pkl +3 -0
bin/seq2seq.pth +3 -0
data/hindi_english_parallel.csv +3 -0
inference.py +76 -0
models/__pycache__/attention.cpython-310.pyc +0 -0
models/__pycache__/decoder.cpython-310.pyc +0 -0
models/__pycache__/encoder.cpython-310.pyc +0 -0
models/__pycache__/seq2seq.cpython-310.pyc +0 -0
models/attention.py +25 -0
models/decoder.py +59 -0
models/encoder.py +26 -0
models/seq2seq.py +34 -0
requirment.txt +4 -0
train.py +77 -0
utils/__pycache__/config.cpython-310.pyc +0 -0
utils/__pycache__/config.cpython-312.pyc +0 -0
utils/__pycache__/data_loader.cpython-310.pyc +0 -0
utils/__pycache__/data_loader.cpython-312.pyc +0 -0
utils/__pycache__/preprocessing.cpython-310.pyc +0 -0
utils/__pycache__/preprocessing.cpython-312.pyc +0 -0
utils/config.py +31 -0
utils/data_loader.py +95 -0
utils/preprocessing.py +48 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text

__pycache__/inference.cpython-310.pyc ADDED Viewed

Binary file (2.41 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from utils.config import config
+from inference import main, translate_sentence
+from models.encoder import Encoder
+from models.decoder import Decoder
+from models.seq2seq import Seq2Seq
+import gradio as gr
+import pickle
+import torch
+def translate(inp):
+    """Translate one English string to Hindi; used as the Gradio callback.
+    Reads the module-level ``model``, ``eng_vocab`` and ``hin_vocab`` that
+    ``main()`` assigns before the interface is launched.
+    """
+    return translate_sentence(inp, model, eng_vocab, hin_vocab, config.device)
+def main():
+    """Load the vocabularies and trained weights, then launch the Gradio UI.
+    The loaded objects are published as module globals so that
+    ``translate`` can reach them from the interface callback.
+    """
+    global model, eng_vocab, hin_vocab
+    # Load vocabularies
+    with open('bin/eng_vocab.pkl', 'rb') as eng_file:
+        eng_vocab = pickle.load(eng_file)
+    with open('bin/hin_vocab.pkl', 'rb') as hin_file:
+        hin_vocab = pickle.load(hin_file)
+    # Encoder and decoder take the same hyperparameters; only the
+    # vocabulary size passed as the first argument differs.
+    device = config.device
+    shared_args = (
+        config.embedding_dim,
+        config.hidden_size,
+        config.num_layers,
+        config.dropout,
+    )
+    encoder = Encoder(len(eng_vocab), *shared_args).to(device)
+    decoder = Decoder(len(hin_vocab), *shared_args).to(device)
+    model = Seq2Seq(encoder, decoder, device).to(device)
+    state = torch.load("bin/seq2seq.pth", map_location=device)
+    model.load_state_dict(state)
+    # Single text box in, single text box out.
+    gr.Interface(fn=translate, inputs='textbox', outputs='textbox').launch()
+if __name__ == "__main__":
+    main()

bin/eng_vocab.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e79a4e87ee83d027731c589031c086e07e5b1acca16be6c4739487ed36910a71
+size 546070

bin/hin_vocab.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a7df77cef431e4c96d47b7a1754bb60dd02950852aeb522331003f8185e5f078
+size 1777961

bin/seq2seq.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f82aedf07d90622f29769e19b7e62f14e1ecc98e66f75f2240c8bb26bebc5a49
+size 421474417

data/hindi_english_parallel.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ea0c215aab91c26d35c22a2ad878c8ae14332ec480de007c7b9b961ef19d1eb9
+size 400990503

inference.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import torch
+from utils.config import config
+from utils.preprocessing import clean_text, clean_hindi
+from utils.data_loader import TranslationDataset
+from models.encoder import Encoder
+from models.decoder import Decoder
+from models.seq2seq import Seq2Seq
+import pickle
+def translate_sentence(sentence, model, eng_vocab, hin_vocab, device):
+    """Greedily decode a Hindi translation for one English sentence.
+    Args:
+        sentence: Raw English text; it is normalised with ``clean_text``.
+        model: Seq2Seq model exposing ``encoder`` and ``decoder``.
+        eng_vocab: English word -> index mapping containing ``'<unk>'``.
+        hin_vocab: Hindi word -> index mapping containing ``'<start>'``
+            and ``'<end>'``.
+        device: Torch device on which the input tensors are created.
+    Returns:
+        The decoded Hindi words joined by single spaces, without the
+        ``<start>``/``<end>`` markers. An empty string is returned when
+        the cleaned sentence contains no words.
+    """
+    model.eval()
+    words = clean_text(sentence).split()
+    if not words:
+        # An empty token list would hand the encoder a zero-length sequence.
+        return ''
+    unk_id = eng_vocab['<unk>']
+    end_id = hin_vocab['<end>']
+    # Convert to tensor
+    tokens = [eng_vocab.get(word, unk_id) for word in words]
+    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
+    # Invert the vocabulary once instead of scanning it for every token.
+    # setdefault keeps the first word seen for an index, matching the old
+    # list(...).index(i) lookup.
+    index_to_word = {}
+    for word, index in hin_vocab.items():
+        index_to_word.setdefault(index, word)
+    decoded = []
+    with torch.no_grad():
+        encoder_outputs, hidden = model.encoder(src_tensor)
+        prev_token = hin_vocab['<start>']
+        for _ in range(config.max_length):
+            trg_tensor = torch.LongTensor([prev_token]).to(device)
+            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
+            prev_token = output.argmax(1).item()
+            if prev_token == end_id:
+                break
+            # Only real words are collected, so nothing is sliced off when
+            # decoding stops at max_length without ever emitting <end>.
+            decoded.append(prev_token)
+    # An index with no vocabulary entry is rendered as <unk> rather than
+    # raising ValueError.
+    return ' '.join(index_to_word.get(i, '<unk>') for i in decoded)
+def main():
+    """Run an interactive English -> Hindi translation prompt on stdin.
+    Loads the pickled vocabularies and the trained weights from ``bin/``,
+    then translates each entered sentence until the user types ``exit``.
+    """
+    # Load vocabularies
+    with open('bin/eng_vocab.pkl', 'rb') as eng_file:
+        eng_vocab = pickle.load(eng_file)
+    with open('bin/hin_vocab.pkl', 'rb') as hin_file:
+        hin_vocab = pickle.load(hin_file)
+    # Load model
+    device = config.device
+    shared_args = (
+        config.embedding_dim,
+        config.hidden_size,
+        config.num_layers,
+        config.dropout,
+    )
+    encoder = Encoder(len(eng_vocab), *shared_args).to(device)
+    decoder = Decoder(len(hin_vocab), *shared_args).to(device)
+    model = Seq2Seq(encoder, decoder, device).to(device)
+    state = torch.load("bin/seq2seq.pth", map_location=device)
+    model.load_state_dict(state)
+    # Interactive translation; the exit check is case-insensitive.
+    prompt = "Enter English sentence (type 'exit' to quit): "
+    while (sentence := input(prompt)).lower() != 'exit':
+        translation = translate_sentence(sentence, model, eng_vocab, hin_vocab, config.device)
+        print(f"Hindi Translation: {translation}\n")
+if __name__ == "__main__":
+    main()

models/__pycache__/attention.cpython-310.pyc ADDED Viewed

Binary file (1.03 kB). View file

models/__pycache__/decoder.cpython-310.pyc ADDED Viewed

Binary file (1.43 kB). View file

models/__pycache__/encoder.cpython-310.pyc ADDED Viewed

Binary file (1.02 kB). View file

models/__pycache__/seq2seq.cpython-310.pyc ADDED Viewed

Binary file (1.26 kB). View file

models/attention.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class Attention(nn.Module):
+    """Additive attention that weights encoder outputs against a decoder state."""
+    def __init__(self, hidden_dim):
+        super().__init__()
+        # Projects each concatenated (decoder state, encoder output) pair.
+        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
+        # Reduces every projected pair to one unnormalised score.
+        self.v = nn.Linear(hidden_dim, 1, bias=False)
+    def forward(self, hidden, encoder_outputs):
+        """Return softmax weights over the source positions.
+        Shapes follow the original annotations:
+            hidden: [1, batch_size, hidden_dim]
+            encoder_outputs: [src_len, batch_size, hidden_dim]
+            result: [src_len, batch_size], normalised along dim 0.
+        """
+        steps = encoder_outputs.shape[0]
+        # Tile the decoder state so it lines up with every source position.
+        tiled = hidden.repeat(steps, 1, 1)
+        paired = torch.cat((tiled, encoder_outputs), dim=2)
+        scores = self.v(torch.tanh(self.attn(paired))).squeeze(2)
+        return F.softmax(scores, dim=0)

models/decoder.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import torch.nn as nn
+import torch
+from models.attention import Attention
+from utils.config import config
+class Decoder(nn.Module):
+    """Attention-based GRU decoder that scores one target token per call."""
+    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
+        super().__init__()
+        self.output_dim = output_dim
+        self.attention = Attention(hidden_dim)
+        self.embedding = nn.Embedding(output_dim, embedding_dim)
+        # GRU dropout is switched off when there is only one layer.
+        rnn_dropout = dropout if n_layers > 1 else 0
+        # The GRU consumes the token embedding concatenated with the
+        # attention context, hence embedding_dim + hidden_dim inputs.
+        self.rnn = nn.GRU(
+            embedding_dim + hidden_dim,
+            hidden_dim,
+            num_layers=n_layers,
+            dropout=rnn_dropout,
+        )
+        # The output layer sees the GRU output concatenated with the context.
+        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, input, hidden, encoder_outputs):
+        """Run a single decoding step.
+        Shapes follow the original annotations:
+            input: [batch_size]
+            hidden: [n_layers, batch_size, hidden_dim]
+            encoder_outputs: [src_len, batch_size, hidden_dim]
+        Returns:
+            (prediction [batch_size, output_dim], updated hidden state).
+        """
+        # [1, batch_size, embedding_dim]
+        token_emb = self.dropout(self.embedding(input.unsqueeze(0)))
+        # Attend with the last layer's hidden state: [src_len, batch_size]
+        weights = self.attention(hidden[-1], encoder_outputs)
+        # [batch_size, 1, src_len] x [batch_size, src_len, hidden_dim]
+        context = torch.bmm(
+            weights.transpose(0, 1).unsqueeze(1),
+            encoder_outputs.transpose(0, 1),
+        )
+        # Back to time-major: [1, batch_size, hidden_dim]
+        context = context.transpose(0, 1)
+        rnn_out, hidden = self.rnn(torch.cat((token_emb, context), dim=2), hidden)
+        features = torch.cat((rnn_out.squeeze(0), context.squeeze(0)), dim=1)
+        # prediction: [batch_size, output_dim]
+        return self.fc_out(features), hidden

models/encoder.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import torch.nn as nn
+from utils.config import config
+class Encoder(nn.Module):
+    """Unidirectional GRU encoder over embedded source token ids."""
+    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
+        super().__init__()
+        self.embedding = nn.Embedding(input_dim, embedding_dim)
+        # GRU dropout is switched off when there is only one layer.
+        rnn_dropout = dropout if n_layers > 1 else 0
+        self.rnn = nn.GRU(
+            embedding_dim,
+            hidden_dim,
+            num_layers=n_layers,
+            dropout=rnn_dropout,
+            bidirectional=False,
+        )
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, src):
+        """Encode a batch of source sentences.
+        Shapes follow the original annotations:
+            src: [batch_size, src_len]
+        Returns:
+            outputs [src_len, batch_size, hidden_dim] and
+            hidden [n_layers * num_directions, batch_size, hidden_dim].
+        """
+        # Embed, apply dropout, then switch to the time-major layout that
+        # the GRU (batch_first left at its default) expects.
+        time_major = self.dropout(self.embedding(src)).transpose(0, 1)
+        outputs, hidden = self.rnn(time_major)
+        return outputs, hidden

models/seq2seq.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+import torch.nn as nn
+from models.encoder import Encoder
+from models.decoder import Decoder
+class Seq2Seq(nn.Module):
+    """Couples an encoder and a decoder and unrolls the decoder over a target."""
+    def __init__(self, encoder, decoder, device):
+        super().__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+        self.device = device
+    def forward(self, src, trg, teacher_forcing_ratio=0.5):
+        """Decode ``trg`` step by step with optional teacher forcing.
+        Shapes follow the original annotations:
+            src: [batch_size, src_len]
+            trg: [batch_size, trg_len]
+        Returns:
+            Scores of shape [batch_size, trg_len, vocab]; position 0 is
+            never written and therefore stays zero.
+        """
+        batch_size, trg_len = trg.shape[0], trg.shape[1]
+        vocab_size = self.decoder.output_dim
+        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)
+        encoder_outputs, hidden = self.encoder(src)
+        # First token is <start>
+        step_input = trg[:, 0]
+        for t in range(1, trg_len):
+            scores, hidden = self.decoder(step_input, hidden, encoder_outputs)
+            outputs[t] = scores
+            # One random draw per step decides between the gold token and
+            # the model's own best guess as the next input.
+            if torch.rand(1) < teacher_forcing_ratio:
+                step_input = trg[:, t]
+            else:
+                step_input = scores.argmax(1)
+        return outputs.transpose(0, 1)

requirment.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+torch
+pandas
+numpy
+tqdm

train.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import torch
+import torch.nn as nn
+import torch.optim as optim
+from tqdm import tqdm
+from utils.config import config
+from utils.data_loader import get_data_loaders
+from models.encoder import Encoder
+from models.decoder import Decoder
+from models.seq2seq import Seq2Seq
+def init_weights(m):
+    """Re-initialise every parameter reachable from module ``m`` in place.
+    Parameters whose name contains ``'weight'`` are drawn from a normal
+    distribution (mean 0, std 0.01); all others are set to zero.
+    """
+    for param_name, tensor in m.named_parameters():
+        if 'weight' not in param_name:
+            nn.init.constant_(tensor.data, 0)
+            continue
+        nn.init.normal_(tensor.data, mean=0, std=0.01)
+def train():
+    """Train the Seq2Seq model and save its weights after every epoch.
+    Data loaders and vocabularies come from ``get_data_loaders()`` and all
+    hyperparameters from ``config``. Each epoch writes a
+    ``seq2seq_epoch_<n>.pth`` state dict to the current working directory.
+    """
+    # NOTE(review): val_loader is unpacked here but never used below, so no
+    # validation pass is run.
+    train_loader, val_loader, eng_vocab, hin_vocab = get_data_loaders()
+    print(f"Final English vocab size: {len(eng_vocab)}")
+    print(f"Final Hindi vocab size: {len(hin_vocab)}")
+    # Model initialization
+    enc = Encoder(
+        len(eng_vocab),
+        config.embedding_dim,
+        config.hidden_size,
+        config.num_layers,
+        config.dropout
+    ).to(config.device)
+    dec = Decoder(
+        len(hin_vocab),
+        config.embedding_dim,
+        config.hidden_size,
+        config.num_layers,
+        config.dropout
+    ).to(config.device)
+    model = Seq2Seq(enc, dec, config.device).to(config.device)
+    # apply() calls init_weights on the model and on each of its submodules.
+    model.apply(init_weights)
+    # Optimizer and loss
+    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
+    # Index 0 is the padding value used by collate_fn and the '<pad>' id.
+    criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
+    # Training loop
+    for epoch in range(config.epochs):
+        model.train()
+        epoch_loss = 0
+        for src, trg in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
+            src, trg = src.to(config.device), trg.to(config.device)
+            optimizer.zero_grad()
+            output = model(src, trg, config.teacher_forcing_ratio)
+            output_dim = output.shape[-1]
+            # Seq2Seq.forward never fills position 0, so it is dropped from
+            # both the scores and the targets before flattening for the loss.
+            output = output[:, 1:].reshape(-1, output_dim)
+            trg = trg[:, 1:].reshape(-1)
+            loss = criterion(output, trg)
+            loss.backward()
+            # Clip the gradient norm to 1 before the optimizer step.
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
+            optimizer.step()
+            epoch_loss += loss.item()
+        # Mean of the per-batch losses for this epoch.
+        avg_loss = epoch_loss / len(train_loader)
+        print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")
+        # Save model
+        torch.save(model.state_dict(), f"seq2seq_epoch_{epoch+1}.pth")
+if __name__ == "__main__":
+    train()

utils/__pycache__/config.cpython-310.pyc ADDED Viewed

Binary file (794 Bytes). View file

utils/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (996 Bytes). View file

utils/__pycache__/data_loader.cpython-310.pyc ADDED Viewed

Binary file (3.77 kB). View file

utils/__pycache__/data_loader.cpython-312.pyc ADDED Viewed

Binary file (3.84 kB). View file

utils/__pycache__/preprocessing.cpython-310.pyc ADDED Viewed

Binary file (1.6 kB). View file

utils/__pycache__/preprocessing.cpython-312.pyc ADDED Viewed

Binary file (2.36 kB). View file

utils/config.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import torch
+class Config:
+    """Paths and hyperparameters shared by the training and inference code.
+    All settings are class attributes; the module exposes one ready-made
+    instance named ``config``.
+    """
+    # Data
+    data_path = "data/hindi_english_parallel.csv"
+    train_ratio = 0.8
+    # Preprocessing
+    # max_length used to be assigned twice in this class (both times 20);
+    # it is now defined once.
+    max_length = 20  # Maximum sentence length
+    min_word_count = 3
+    # Model
+    embedding_dim = 256
+    hidden_size = 512
+    num_layers = 2
+    dropout = 0.5
+    # Training
+    batch_size = 64
+    learning_rate = 0.001
+    epochs = 20
+    teacher_forcing_ratio = 0.5
+    max_vocab_english = 5000
+    max_vocab_hindi = 10000
+    # Device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+config = Config()

utils/data_loader.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import pickle
+import torch
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+from utils.preprocessing import prepare_data, build_vocab
+from utils.config import config
+class TranslationDataset(Dataset):
+    """Parallel English/Hindi sentences served as tensors of token ids."""
+    def __init__(self, english_sentences, hindi_sentences, eng_vocab, hin_vocab):
+        self.english_sentences = english_sentences
+        self.hindi_sentences = hindi_sentences
+        self.eng_vocab = eng_vocab
+        self.hin_vocab = hin_vocab
+        self.eng_vocab_size = len(eng_vocab)
+        self.hin_vocab_size = len(hin_vocab)
+    def __len__(self):
+        return len(self.english_sentences)
+    def __getitem__(self, idx):
+        """Return ``{'english': LongTensor, 'hindi': LongTensor}`` for pair ``idx``.
+        Words missing from a vocabulary map to its ``'<unk>'`` id, and any
+        id at or above the vocabulary size is clamped to the last valid
+        index.
+        """
+        eng_cap = self.eng_vocab_size - 1
+        hin_cap = self.hin_vocab_size - 1
+        # Look up each whitespace-separated word and clamp in one pass.
+        eng_ids = [
+            min(self.eng_vocab.get(token, self.eng_vocab['<unk>']), eng_cap)
+            for token in self.english_sentences[idx].split()
+        ]
+        hin_ids = [
+            min(self.hin_vocab.get(token, self.hin_vocab['<unk>']), hin_cap)
+            for token in self.hindi_sentences[idx].split()
+        ]
+        return {
+            'english': torch.tensor(eng_ids, dtype=torch.long),
+            'hindi': torch.tensor(hin_ids, dtype=torch.long),
+        }
+def collate_fn(batch):
+    """Pad a list of dataset items into two batch-first id tensors.
+    Returns ``(english, hindi)``, each zero-padded on the right to the
+    longest sequence in the batch.
+    """
+    pad_sequence = torch.nn.utils.rnn.pad_sequence
+    eng_padded, hin_padded = (
+        pad_sequence(
+            [item[field] for item in batch],
+            padding_value=0,
+            batch_first=True,
+        )
+        for field in ('english', 'hindi')
+    )
+    return eng_padded, hin_padded
+def get_data_loaders():
+    """Build the train/validation loaders and both vocabularies.
+    A fixed 10% sample (random_state=42) of the prepared corpus is kept,
+    pairs with more than ``config.max_length`` whitespace-separated tokens
+    on either side are dropped, and the remainder is split by
+    ``config.train_ratio``. Vocabularies are built from the training part
+    only and pickled to ``eng_vocab.pkl`` / ``hin_vocab.pkl`` in the
+    current working directory.
+    Returns:
+        (train_loader, val_loader, eng_vocab, hin_vocab)
+    """
+    def word_count(text):
+        return len(text.split())
+    df = prepare_data().sample(frac=0.1, random_state=42)
+    df['eng_len'] = df['english'].apply(word_count)
+    df['hin_len'] = df['hindi'].apply(word_count)
+    short_enough = (
+        (df['eng_len'] <= config.max_length)
+        & (df['hin_len'] <= config.max_length)
+    )
+    df = df[short_enough]
+    eng_sentences = df['english'].tolist()
+    hin_sentences = df['hindi'].tolist()
+    # Split data
+    cut = int(len(eng_sentences) * config.train_ratio)
+    train_eng, val_eng = eng_sentences[:cut], eng_sentences[cut:]
+    train_hin, val_hin = hin_sentences[:cut], hin_sentences[cut:]
+    # Build vocabularies
+    eng_vocab = build_vocab(train_eng)
+    hin_vocab = build_vocab(train_hin, is_hindi=True)
+    # Create datasets and loaders; only the training loader shuffles.
+    loaders = []
+    for eng_part, hin_part, shuffle in (
+        (train_eng, train_hin, True),
+        (val_eng, val_hin, False),
+    ):
+        dataset = TranslationDataset(eng_part, hin_part, eng_vocab, hin_vocab)
+        loaders.append(DataLoader(
+            dataset, batch_size=config.batch_size,
+            shuffle=shuffle, collate_fn=collate_fn
+        ))
+    train_loader, val_loader = loaders
+    # Save vocabularies for inference
+    for path, vocab in (('eng_vocab.pkl', eng_vocab), ('hin_vocab.pkl', hin_vocab)):
+        with open(path, 'wb') as f:
+            pickle.dump(vocab, f)
+    print(f"English vocabulary size: {len(eng_vocab)}")
+    print(f"Hindi vocabulary size: {len(hin_vocab)}")
+    print(f"Max English index: {max(eng_vocab.values())}")
+    print(f"Max Hindi index: {max(hin_vocab.values())}")
+    return train_loader, val_loader, eng_vocab, hin_vocab

utils/preprocessing.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import pandas as pd
+import re
+from utils.config import config
+from collections import Counter
+def clean_text(text):
+    """Normalise English text for tokenisation by whitespace.
+    Lower-cases and strips the input, puts a space before each ``.``, ``!``
+    or ``?``, then collapses every run of other non-letter characters
+    into a single space.
+    """
+    punctuation = re.compile(r"([.!?])")
+    disallowed = re.compile(r"[^a-zA-Z.!?]+")  # For English
+    spaced = punctuation.sub(r" \1", text.lower().strip())
+    return disallowed.sub(r" ", spaced)
+def clean_hindi(text):
+    """Strip Hindi text and put a space before each sentence mark.
+    The marks handled are the danda (``।``) and ``.``, ``!``, ``?``; no
+    other characters are altered.
+    """
+    return re.sub(r"([।.!?])", r" \1", text.strip())
+def prepare_data():
+    """Load the parallel corpus and return cleaned sentence pairs.
+    Reads ``config.data_path``, keeps the ``english`` and ``hindi``
+    columns, drops rows with missing values, cleans both sides, and wraps
+    each Hindi sentence in ``<start>`` / ``<end>`` markers.
+    """
+    def wrap_hindi(raw):
+        # Clean first, then add start/end tokens to Hindi
+        return '<start> ' + clean_hindi(raw) + ' <end>'
+    frame = pd.read_csv(config.data_path)
+    frame = frame[['english', 'hindi']].dropna()
+    frame['english'] = frame['english'].apply(clean_text)
+    frame['hindi'] = frame['hindi'].apply(wrap_hindi)
+    return frame[['english', 'hindi']]
+def build_vocab(sentences, is_hindi=False):
+    """Build a word -> index vocabulary with contiguous indices.
+    Args:
+        sentences: Iterable of whitespace-tokenisable strings; empty or
+            missing entries are skipped.
+        is_hindi: Accepted for call-site compatibility; it does not affect
+            the result.
+    Returns:
+        A dict mapping ``<pad>``, ``<start>``, ``<end>``, ``<unk>`` to 0-3
+        and every other word, in first-seen order, to 4, 5, ... so that
+        ``max(vocab.values()) == len(vocab) - 1``.
+    """
+    word_counts = Counter()
+    for sentence in sentences:
+        # Skip empty sentences
+        if not sentence or pd.isna(sentence):
+            continue
+        word_counts.update(sentence.split())
+    # Reserve the special tokens first.
+    vocab = {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3}
+    # Include all words regardless of frequency. Words that are already
+    # special tokens (the Hindi sentences carry <start>/<end>) must not
+    # consume an index: enumerating them used to leave holes and pushed
+    # the largest index past len(vocab), beyond an embedding table sized
+    # by len(vocab).
+    for word in word_counts:
+        if word not in vocab:
+            vocab[word] = len(vocab)
+    return vocab