nullHawk committed on
Commit
9a41f63
·
verified ·
1 Parent(s): 23beb2b
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.csv filter=lfs diff=lfs merge=lfs -text
__pycache__/inference.cpython-310.pyc ADDED
Binary file (2.41 kB). View file
 
app.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.config import config
2
+ from inference import main, translate_sentence
3
+ from models.encoder import Encoder
4
+ from models.decoder import Decoder
5
+ from models.seq2seq import Seq2Seq
6
+
7
+ import gradio as gr
8
+ import pickle
9
+ import torch
10
+
11
def translate(inp):
    """Translate one English string to Hindi for the Gradio text box.

    Reads the module-level ``model``, ``eng_vocab`` and ``hin_vocab`` that
    ``main()`` assigns before the interface is launched.
    """
    return translate_sentence(
        inp, model, eng_vocab, hin_vocab, config.device
    )
15
+
16
def main():
    """Load vocabularies and trained weights, then serve the Gradio demo.

    Populates the module-level ``model``, ``eng_vocab`` and ``hin_vocab``
    used by ``translate`` and blocks inside ``launch()``.
    """
    global model, eng_vocab, hin_vocab

    # NOTE: pickle executes arbitrary code on load; these files must be trusted.
    with open('bin/eng_vocab.pkl', 'rb') as eng_file:
        eng_vocab = pickle.load(eng_file)
    with open('bin/hin_vocab.pkl', 'rb') as hin_file:
        hin_vocab = pickle.load(hin_file)

    device = config.device

    # The architecture must match the one the checkpoint was trained with,
    # so every size comes from the shared config and the vocabulary lengths.
    encoder = Encoder(
        len(eng_vocab),
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    ).to(device)
    decoder = Decoder(
        len(hin_vocab),
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    ).to(device)

    model = Seq2Seq(encoder, decoder, device).to(device)
    weights = torch.load("bin/seq2seq.pth", map_location=device)
    model.load_state_dict(weights)

    # Plain text in, plain text out.
    gr.Interface(fn=translate, inputs='textbox', outputs='textbox').launch()


if __name__ == "__main__":
    main()
bin/eng_vocab.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e79a4e87ee83d027731c589031c086e07e5b1acca16be6c4739487ed36910a71
3
+ size 546070
bin/hin_vocab.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7df77cef431e4c96d47b7a1754bb60dd02950852aeb522331003f8185e5f078
3
+ size 1777961
bin/seq2seq.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f82aedf07d90622f29769e19b7e62f14e1ecc98e66f75f2240c8bb26bebc5a49
3
+ size 421474417
data/hindi_english_parallel.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea0c215aab91c26d35c22a2ad878c8ae14332ec480de007c7b9b961ef19d1eb9
3
+ size 400990503
inference.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from utils.config import config
3
+ from utils.preprocessing import clean_text, clean_hindi
4
+ from utils.data_loader import TranslationDataset
5
+ from models.encoder import Encoder
6
+ from models.decoder import Decoder
7
+ from models.seq2seq import Seq2Seq
8
+ import pickle
9
+
10
def translate_sentence(sentence, model, eng_vocab, hin_vocab, device):
    """Greedily decode a Hindi translation for one English sentence.

    Args:
        sentence: Raw English text; it is normalised with ``clean_text``.
        model: Seq2Seq-style module exposing ``encoder`` and ``decoder``.
        eng_vocab: Mapping from English token to id (must contain ``<unk>``).
        hin_vocab: Mapping from Hindi token to id (must contain ``<start>``
            and ``<end>``).
        device: Device the input tensors are moved to.

    Returns:
        The predicted Hindi tokens joined by spaces, without ``<start>`` /
        ``<end>``. An empty string when the cleaned sentence has no tokens.
    """
    model.eval()
    words = clean_text(sentence).split()
    if not words:
        # A zero-length source sequence cannot be fed to the encoder RNN.
        return ''

    src_ids = [eng_vocab.get(word, eng_vocab['<unk>']) for word in words]
    src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(device)

    # Build the id -> token table once instead of scanning the vocabulary for
    # every generated token. setdefault keeps the first token seen for an id.
    id_to_token = {}
    for token, token_id in hin_vocab.items():
        id_to_token.setdefault(token_id, token)

    end_id = hin_vocab['<end>']
    last_id = hin_vocab['<start>']
    predicted_ids = []

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

        for _ in range(config.max_length):
            step_input = torch.LongTensor([last_id]).to(device)
            output, hidden = model.decoder(step_input, hidden, encoder_outputs)
            last_id = output.argmax(1).item()

            if last_id == end_id:
                break
            # Only real tokens are collected, so nothing has to be sliced off
            # afterwards; the final token is kept when max_length is reached
            # without the model emitting <end>.
            predicted_ids.append(last_id)

    # Ids that have no entry in hin_vocab are rendered as <unk> rather than
    # aborting the whole translation.
    return ' '.join(id_to_token.get(i, '<unk>') for i in predicted_ids)
39
+
40
def main():
    """Run an interactive English-to-Hindi translation prompt.

    Loads the pickled vocabularies and the trained weights from ``bin/``,
    then reads sentences from stdin until the user types ``exit``.
    """
    # NOTE: pickle executes arbitrary code on load; these files must be trusted.
    with open('bin/eng_vocab.pkl', 'rb') as eng_file:
        eng_vocab = pickle.load(eng_file)
    with open('bin/hin_vocab.pkl', 'rb') as hin_file:
        hin_vocab = pickle.load(hin_file)

    device = config.device

    # Rebuild the same architecture the checkpoint was saved from.
    encoder = Encoder(
        len(eng_vocab),
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    ).to(device)
    decoder = Decoder(
        len(hin_vocab),
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    ).to(device)

    model = Seq2Seq(encoder, decoder, device).to(device)
    weights = torch.load("bin/seq2seq.pth", map_location=device)
    model.load_state_dict(weights)

    # Read-translate-print loop; 'exit' in any letter case ends it.
    while True:
        sentence = input("Enter English sentence (type 'exit' to quit): ")
        if sentence.lower() == 'exit':
            break
        translation = translate_sentence(
            sentence, model, eng_vocab, hin_vocab, device
        )
        print(f"Hindi Translation: {translation}\n")


if __name__ == "__main__":
    main()
models/__pycache__/attention.cpython-310.pyc ADDED
Binary file (1.03 kB). View file
 
models/__pycache__/decoder.cpython-310.pyc ADDED
Binary file (1.43 kB). View file
 
models/__pycache__/encoder.cpython-310.pyc ADDED
Binary file (1.02 kB). View file
 
models/__pycache__/seq2seq.cpython-310.pyc ADDED
Binary file (1.26 kB). View file
 
models/attention.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
class Attention(nn.Module):
    """Scores every encoder step against the current decoder state.

    The decoder state is concatenated with each encoder output, passed through
    a tanh-activated linear layer and reduced to one scalar per source step;
    the scalars are soft-maxed over the source dimension.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [src_len, batch_size].

        hidden is tiled along a leading axis, so either [1, batch_size,
        hidden_dim] or [batch_size, hidden_dim] works.
        encoder_outputs: [src_len, batch_size, hidden_dim]
        """
        steps = encoder_outputs.shape[0]

        # Copy the decoder state once per source position.
        tiled_hidden = hidden.repeat(steps, 1, 1)

        # [src_len, batch_size, 2 * hidden_dim] -> [src_len, batch_size, hidden_dim]
        paired = torch.cat((tiled_hidden, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(paired))

        # One score per (source step, batch element).
        scores = self.v(energy).squeeze(2)

        # Normalise across source positions.
        return F.softmax(scores, dim=0)
models/decoder.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ from models.attention import Attention
4
+ from utils.config import config
5
+
6
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.attention = Attention(hidden_dim)
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        # nn.GRU only applies dropout between stacked layers.
        inter_layer_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.GRU(
            embedding_dim + hidden_dim,
            hidden_dim,
            num_layers=n_layers,
            dropout=inter_layer_dropout,
        )
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one target step.

        input: [batch_size] token ids of the previous target step
        hidden: [n_layers, batch_size, hidden_dim]
        encoder_outputs: [src_len, batch_size, hidden_dim]

        Returns (prediction [batch_size, output_dim], updated hidden).
        """
        # [batch_size] -> [1, batch_size] -> [1, batch_size, embedding_dim]
        token_embedding = self.dropout(self.embedding(input.unsqueeze(0)))

        # Attend with the top layer's state: [src_len, batch_size]
        attn_weights = self.attention(hidden[-1], encoder_outputs)
        # -> [batch_size, 1, src_len] so it can be batch-multiplied.
        attn_weights = attn_weights.permute(1, 0).unsqueeze(1)

        # [batch_size, src_len, hidden_dim]
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        # Weighted sum of encoder outputs: [1, batch_size, hidden_dim]
        context = torch.bmm(attn_weights, batch_first_outputs).permute(1, 0, 2)

        # [1, batch_size, embedding_dim + hidden_dim]
        rnn_input = torch.cat((token_embedding, context), dim=2)
        rnn_output, hidden = self.rnn(rnn_input, hidden)

        # Drop the length-1 time axis and project to vocabulary scores.
        features = torch.cat((rnn_output.squeeze(0), context.squeeze(0)), dim=1)
        prediction = self.fc_out(features)

        return prediction, hidden
models/encoder.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ from utils.config import config
3
+
4
class Encoder(nn.Module):
    """Embeds source token ids and runs them through a unidirectional GRU."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # nn.GRU only applies dropout between stacked layers.
        inter_layer_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            dropout=inter_layer_dropout,
            bidirectional=False,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        src: [batch_size, src_len]

        Returns (outputs [src_len, batch_size, hidden_dim],
                 hidden [n_layers, batch_size, hidden_dim]).
        """
        # [batch_size, src_len, embedding_dim]
        embedded = self.dropout(self.embedding(src))

        # The GRU is time-major, so move src_len to the front.
        time_major = embedded.permute(1, 0, 2)
        return self.rnn(time_major)
models/seq2seq.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from models.encoder import Encoder
4
+ from models.decoder import Decoder
5
+
6
class Seq2Seq(nn.Module):
    """Wires an encoder and a step-wise decoder into one trainable module."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode the whole target sequence, one step at a time.

        src: [batch_size, src_len]
        trg: [batch_size, trg_len]
        teacher_forcing_ratio: chance, per step, of feeding the ground-truth
            token instead of the model's own prediction.

        Returns scores of shape [batch_size, trg_len, trg_vocab_size]; the
        slice for step 0 is left at zero because decoding starts at step 1.
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        step_scores = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # The first target token is <start>.
        decoder_input = trg[:, 0]

        for step in range(1, trg_len):
            scores, hidden = self.decoder(decoder_input, hidden, encoder_outputs)
            step_scores[step] = scores

            # One random draw per step decides the next decoder input.
            use_teacher = bool(torch.rand(1) < teacher_forcing_ratio)
            if use_teacher:
                decoder_input = trg[:, step]
            else:
                decoder_input = scores.argmax(1)

        return step_scores.permute(1, 0, 2)
requirment.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ pandas
3
+ numpy
4
+ tqdm
train.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ from tqdm import tqdm
5
+ from utils.config import config
6
+ from utils.data_loader import get_data_loaders
7
+ from models.encoder import Encoder
8
+ from models.decoder import Decoder
9
+ from models.seq2seq import Seq2Seq
10
+
11
def init_weights(m):
    """Re-initialise every parameter of ``m`` in place.

    Parameters whose name contains 'weight' are drawn from N(0, 0.01^2);
    all remaining parameters (e.g. biases) are set to zero.
    """
    for param_name, param in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
17
+
18
def train():
    """Train the seq2seq model and write one checkpoint per epoch.

    Builds the data loaders and vocabularies, constructs the encoder/decoder
    from ``config``, then optimises with Adam and cross-entropy. After every
    epoch the state dict is saved as ``seq2seq_epoch_<n>.pth`` in the
    current working directory.
    """
    # val_loader is returned by the data pipeline but not used in this loop.
    train_loader, val_loader, eng_vocab, hin_vocab = get_data_loaders()

    print(f"Final English vocab size: {len(eng_vocab)}")
    print(f"Final Hindi vocab size: {len(hin_vocab)}")

    device = config.device

    # Model initialization
    encoder = Encoder(
        len(eng_vocab),
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    ).to(device)
    decoder = Decoder(
        len(hin_vocab),
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    ).to(device)

    model = Seq2Seq(encoder, decoder, device).to(device)
    model.apply(init_weights)

    # Optimizer and loss; index 0 is the padding id and is excluded.
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        model.train()
        running_loss = 0

        for src, trg in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            src = src.to(device)
            trg = trg.to(device)

            optimizer.zero_grad()
            scores = model(src, trg, config.teacher_forcing_ratio)

            # Skip position 0 (<start>) and flatten batch and time together
            # so the loss sees one row per target token.
            vocab_dim = scores.shape[-1]
            flat_scores = scores[:, 1:].reshape(-1, vocab_dim)
            flat_targets = trg[:, 1:].reshape(-1)

            loss = criterion(flat_scores, flat_targets)
            loss.backward()
            # Cap the gradient norm at 1 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)
        print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")

        # Save model
        torch.save(model.state_dict(), f"seq2seq_epoch_{epoch+1}.pth")


if __name__ == "__main__":
    train()
utils/__pycache__/config.cpython-310.pyc ADDED
Binary file (794 Bytes). View file
 
utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (996 Bytes). View file
 
utils/__pycache__/data_loader.cpython-310.pyc ADDED
Binary file (3.77 kB). View file
 
utils/__pycache__/data_loader.cpython-312.pyc ADDED
Binary file (3.84 kB). View file
 
utils/__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (1.6 kB). View file
 
utils/__pycache__/preprocessing.cpython-312.pyc ADDED
Binary file (2.36 kB). View file
 
utils/config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
class Config:
    """Static paths and hyper-parameters shared by training and inference."""

    # Data
    data_path = "data/hindi_english_parallel.csv"
    train_ratio = 0.8  # fraction of sentence pairs used for training

    # Preprocessing
    # Maximum sentence length in tokens; also caps the number of decoding
    # steps at inference time. (Previously declared twice with the same value.)
    max_length = 20
    # NOTE(review): the three settings below are not referenced by the code
    # visible in this change - confirm before relying on them.
    min_word_count = 3
    max_vocab_english = 5000
    max_vocab_hindi = 10000

    # Model
    embedding_dim = 256
    hidden_size = 512
    num_layers = 2
    dropout = 0.5

    # Training
    batch_size = 64
    learning_rate = 0.001
    epochs = 20
    teacher_forcing_ratio = 0.5

    # Device: use the GPU when one is available, otherwise the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Shared instance imported by the rest of the project.
config = Config()
utils/data_loader.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import torch
3
+ from torch.utils.data import Dataset, DataLoader
4
+ import numpy as np
5
+ from utils.preprocessing import prepare_data, build_vocab
6
+ from utils.config import config
7
+
8
class TranslationDataset(Dataset):
    """Parallel English/Hindi sentences exposed as tensors of token ids."""

    def __init__(self, english_sentences, hindi_sentences, eng_vocab, hin_vocab):
        self.english_sentences = english_sentences
        self.hindi_sentences = hindi_sentences
        self.eng_vocab = eng_vocab
        self.hin_vocab = hin_vocab
        self.eng_vocab_size = len(eng_vocab)
        self.hin_vocab_size = len(hin_vocab)

    def __len__(self):
        return len(self.english_sentences)

    @staticmethod
    def _encode(sentence, vocab, vocab_size):
        """Map whitespace-separated tokens to ids, capped at vocab_size - 1."""
        highest_id = vocab_size - 1
        # Unknown words fall back to <unk>; ids above the cap are clamped.
        return [
            min(vocab.get(word, vocab['<unk>']), highest_id)
            for word in sentence.split()
        ]

    def __getitem__(self, idx):
        """Return {'english': LongTensor, 'hindi': LongTensor} for pair idx."""
        english_ids = self._encode(
            self.english_sentences[idx], self.eng_vocab, self.eng_vocab_size
        )
        hindi_ids = self._encode(
            self.hindi_sentences[idx], self.hin_vocab, self.hin_vocab_size
        )
        return {
            'english': torch.tensor(english_ids, dtype=torch.long),
            'hindi': torch.tensor(hindi_ids, dtype=torch.long),
        }
37
+
38
def collate_fn(batch):
    """Pad a list of dataset items into two batch-first id tensors.

    Each item is a dict with 'english' and 'hindi' 1-D tensors. Shorter
    sequences are right-padded with 0 to the longest one in the batch.

    Returns (english [batch, max_eng_len], hindi [batch, max_hin_len]).
    """
    pad = torch.nn.utils.rnn.pad_sequence

    english = pad(
        [sample['english'] for sample in batch],
        padding_value=0,
        batch_first=True,
    )
    hindi = pad(
        [sample['hindi'] for sample in batch],
        padding_value=0,
        batch_first=True,
    )
    return english, hindi
48
+
49
def get_data_loaders():
    """Build train/validation loaders and vocabularies from the corpus.

    Takes a fixed 10% sample of the prepared data, drops pairs where either
    side has more than ``config.max_length`` tokens, splits by
    ``config.train_ratio``, builds both vocabularies from the training part
    only, and pickles them to the current working directory.

    Returns (train_loader, val_loader, eng_vocab, hin_vocab).
    """
    def count_words(text):
        return len(text.split())

    # Fixed seed keeps the 10% sample reproducible between runs.
    df = prepare_data()
    df = df.sample(frac=0.1, random_state=42)
    df['eng_len'] = df['english'].apply(count_words)
    df['hin_len'] = df['hindi'].apply(count_words)

    short_enough = (
        (df['eng_len'] <= config.max_length)
        & (df['hin_len'] <= config.max_length)
    )
    df = df[short_enough]

    eng_sentences = df['english'].tolist()
    hin_sentences = df['hindi'].tolist()

    # Split data
    split_idx = int(len(eng_sentences) * config.train_ratio)
    train_eng, val_eng = eng_sentences[:split_idx], eng_sentences[split_idx:]
    train_hin, val_hin = hin_sentences[:split_idx], hin_sentences[split_idx:]

    # Vocabularies come from the training split only.
    eng_vocab = build_vocab(train_eng)
    hin_vocab = build_vocab(train_hin, is_hindi=True)

    train_dataset = TranslationDataset(train_eng, train_hin, eng_vocab, hin_vocab)
    val_dataset = TranslationDataset(val_eng, val_hin, eng_vocab, hin_vocab)

    # Only the training loader is shuffled.
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )

    # Save vocabularies for inference
    with open('eng_vocab.pkl', 'wb') as eng_file:
        pickle.dump(eng_vocab, eng_file)
    with open('hin_vocab.pkl', 'wb') as hin_file:
        pickle.dump(hin_vocab, hin_file)

    print(f"English vocabulary size: {len(eng_vocab)}")
    print(f"Hindi vocabulary size: {len(hin_vocab)}")
    print(f"Max English index: {max(eng_vocab.values())}")
    print(f"Max Hindi index: {max(hin_vocab.values())}")

    return train_loader, val_loader, eng_vocab, hin_vocab
utils/preprocessing.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from utils.config import config
4
+ from collections import Counter
5
+
6
def clean_text(text):
    """Normalise an English sentence for tokenisation by whitespace.

    Lower-cases and strips the text, puts a space before each '.', '!' or
    '?', then collapses every run of other non-letter characters into a
    single space.
    """
    lowered = text.lower().strip()
    # Detach sentence punctuation so it becomes its own token.
    spaced = re.sub(r"([.!?])", r" \1", lowered)
    # For English: keep letters and . ! ? only.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)
11
+
12
def clean_hindi(text):
    """Strip a Hindi sentence and put a space before '।', '.', '!' and '?'."""
    return re.sub(r"([।.!?])", r" \1", text.strip())
16
+
17
def prepare_data():
    """Load the parallel corpus and return cleaned sentence pairs.

    Reads ``config.data_path``, keeps the 'english' and 'hindi' columns,
    drops rows with missing values, cleans both sides and wraps every Hindi
    sentence in '<start>' / '<end>' markers.
    """
    columns = ['english', 'hindi']
    frame = pd.read_csv(config.data_path)[columns].dropna()

    # Clean text
    frame['english'] = frame['english'].apply(clean_text)

    # Clean the target side and add the start/end tokens the decoder needs.
    frame['hindi'] = frame['hindi'].apply(
        lambda x: '<start> ' + clean_hindi(x) + ' <end>'
    )

    return frame[columns]
29
+
30
def build_vocab(sentences, is_hindi=False):
    """Build a token -> id mapping with contiguous ids.

    The special tokens always get fixed ids (<pad>=0, <start>=1, <end>=2,
    <unk>=3). Every other distinct word receives the next free id in order of
    first appearance, so the ids are exactly ``0 .. len(vocab) - 1`` and are
    safe to use with an embedding table sized ``len(vocab)``.

    Previously corpus words were numbered first and the special tokens were
    written afterwards. When the corpus itself contained '<start>' / '<end>'
    the ids those words had been given were left unused while the highest ids
    ended up >= len(vocab).

    Args:
        sentences: Iterable of whitespace-tokenisable strings; empty, None
            and NaN entries are skipped.
        is_hindi: Accepted for interface compatibility; currently unused.

    Returns:
        dict mapping each token to its integer id.
    """
    word_counts = Counter()
    for sentence in sentences:
        # Skip empty sentences
        if not sentence or pd.isna(sentence):
            continue
        word_counts.update(sentence.split())

    # Reserve the special ids first so corpus words can never collide with them.
    vocab = {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3}

    # Include all words regardless of frequency. len(vocab) is always the
    # next unused id; words already present (the special tokens) are left
    # untouched.
    for word in word_counts:
        vocab.setdefault(word, len(vocab))

    return vocab