Spaces:
Sleeping
Sleeping
init
Browse files- .gitattributes +1 -0
- __pycache__/inference.cpython-310.pyc +0 -0
- app.py +55 -0
- bin/eng_vocab.pkl +3 -0
- bin/hin_vocab.pkl +3 -0
- bin/seq2seq.pth +3 -0
- data/hindi_english_parallel.csv +3 -0
- inference.py +76 -0
- models/__pycache__/attention.cpython-310.pyc +0 -0
- models/__pycache__/decoder.cpython-310.pyc +0 -0
- models/__pycache__/encoder.cpython-310.pyc +0 -0
- models/__pycache__/seq2seq.cpython-310.pyc +0 -0
- models/attention.py +25 -0
- models/decoder.py +59 -0
- models/encoder.py +26 -0
- models/seq2seq.py +34 -0
- requirment.txt +4 -0
- train.py +77 -0
- utils/__pycache__/config.cpython-310.pyc +0 -0
- utils/__pycache__/config.cpython-312.pyc +0 -0
- utils/__pycache__/data_loader.cpython-310.pyc +0 -0
- utils/__pycache__/data_loader.cpython-312.pyc +0 -0
- utils/__pycache__/preprocessing.cpython-310.pyc +0 -0
- utils/__pycache__/preprocessing.cpython-312.pyc +0 -0
- utils/config.py +31 -0
- utils/data_loader.py +95 -0
- utils/preprocessing.py +48 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
__pycache__/inference.cpython-310.pyc
ADDED
Binary file (2.41 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils.config import config
|
2 |
+
from inference import main, translate_sentence
|
3 |
+
from models.encoder import Encoder
|
4 |
+
from models.decoder import Decoder
|
5 |
+
from models.seq2seq import Seq2Seq
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import pickle
|
9 |
+
import torch
|
10 |
+
|
11 |
+
def translate(inp):
    """Translate an English string to Hindi for the Gradio text box.

    Reads the module-level ``model``, ``eng_vocab`` and ``hin_vocab`` that
    ``main`` assigns, and delegates the work to ``translate_sentence``.
    """
    # The globals are only read here, so no ``global`` declaration is needed.
    return translate_sentence(inp, model, eng_vocab, hin_vocab, config.device)
|
15 |
+
|
16 |
+
def main():
    """Load the vocabularies and trained weights, then serve the Gradio UI.

    The loaded objects are published as module globals because ``translate``
    (the Gradio callback) reads them from module scope.
    """
    global model, eng_vocab, hin_vocab

    # Pickled vocabularies shipped with the app under bin/.
    with open('bin/eng_vocab.pkl', 'rb') as eng_file:
        eng_vocab = pickle.load(eng_file)
    with open('bin/hin_vocab.pkl', 'rb') as hin_file:
        hin_vocab = pickle.load(hin_file)

    device = config.device
    # Encoder and decoder share every hyperparameter except the vocab size.
    shared_args = (
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    )
    encoder = Encoder(len(eng_vocab), *shared_args).to(device)
    decoder = Decoder(len(hin_vocab), *shared_args).to(device)

    model = Seq2Seq(encoder, decoder, device).to(device)
    state = torch.load("bin/seq2seq.pth", map_location=device)
    model.load_state_dict(state)

    app = gr.Interface(fn=translate, inputs='textbox', outputs='textbox')
    app.launch()


if __name__ == "__main__":
    main()
|
bin/eng_vocab.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e79a4e87ee83d027731c589031c086e07e5b1acca16be6c4739487ed36910a71
|
3 |
+
size 546070
|
bin/hin_vocab.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7df77cef431e4c96d47b7a1754bb60dd02950852aeb522331003f8185e5f078
|
3 |
+
size 1777961
|
bin/seq2seq.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f82aedf07d90622f29769e19b7e62f14e1ecc98e66f75f2240c8bb26bebc5a49
|
3 |
+
size 421474417
|
data/hindi_english_parallel.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ea0c215aab91c26d35c22a2ad878c8ae14332ec480de007c7b9b961ef19d1eb9
|
3 |
+
size 400990503
|
inference.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from utils.config import config
|
3 |
+
from utils.preprocessing import clean_text, clean_hindi
|
4 |
+
from utils.data_loader import TranslationDataset
|
5 |
+
from models.encoder import Encoder
|
6 |
+
from models.decoder import Decoder
|
7 |
+
from models.seq2seq import Seq2Seq
|
8 |
+
import pickle
|
9 |
+
|
10 |
+
def translate_sentence(sentence, model, eng_vocab, hin_vocab, device):
    """Greedily translate one English sentence into Hindi.

    Args:
        sentence: Raw English text; it is normalised with ``clean_text``.
        model: Seq2Seq model exposing ``encoder`` and ``decoder`` modules.
        eng_vocab: Mapping of English word -> index; must contain ``'<unk>'``.
        hin_vocab: Mapping of Hindi word -> index; must contain ``'<start>'``
            and ``'<end>'``.
        device: Torch device on which the input tensors are created.

    Returns:
        The predicted Hindi words joined by single spaces, without the
        ``<start>``/``<end>`` markers. An empty string is returned when the
        cleaned sentence contains no tokens.
    """
    model.eval()
    words = clean_text(sentence).split()
    if not words:
        # Without this guard the encoder would receive a zero-length sequence.
        return ''

    src_ids = [eng_vocab.get(word, eng_vocab['<unk>']) for word in words]
    src_tensor = torch.LongTensor(src_ids).unsqueeze(0).to(device)

    end_id = hin_vocab['<end>']

    # Build the index -> word table once instead of scanning the vocabulary
    # for every generated token. setdefault keeps the first word seen for an
    # index, which is what a list.index() based lookup would return.
    index_to_word = {}
    for word, index in hin_vocab.items():
        index_to_word.setdefault(index, word)

    predicted = []
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

        previous = hin_vocab['<start>']
        for _ in range(config.max_length):
            trg_tensor = torch.LongTensor([previous]).to(device)
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
            previous = output.argmax(1).item()

            # Stop on <end> and keep it out of the result; when the length
            # limit is hit instead, every generated word is preserved.
            if previous == end_id:
                break
            predicted.append(previous)

    # An index that is absent from the vocabulary is rendered as <unk>
    # rather than raising.
    return ' '.join(index_to_word.get(index, '<unk>') for index in predicted)
|
39 |
+
|
40 |
+
def main():
    """Load the trained translator and run an interactive console loop.

    Reads the pickled vocabularies and the model weights from ``bin/`` and
    then translates sentences typed by the user until ``exit`` is entered.
    """
    with open('bin/eng_vocab.pkl', 'rb') as eng_file:
        eng_vocab = pickle.load(eng_file)
    with open('bin/hin_vocab.pkl', 'rb') as hin_file:
        hin_vocab = pickle.load(hin_file)

    device = config.device
    # Encoder and decoder share every hyperparameter except the vocab size.
    shared_args = (
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    )
    encoder = Encoder(len(eng_vocab), *shared_args).to(device)
    decoder = Decoder(len(hin_vocab), *shared_args).to(device)

    model = Seq2Seq(encoder, decoder, device).to(device)
    state = torch.load("bin/seq2seq.pth", map_location=device)
    model.load_state_dict(state)

    # Prompt until the user types "exit" (compared case-insensitively).
    prompt = "Enter English sentence (type 'exit' to quit): "
    while (sentence := input(prompt)).lower() != 'exit':
        translation = translate_sentence(
            sentence, model, eng_vocab, hin_vocab, config.device
        )
        print(f"Hindi Translation: {translation}\n")


if __name__ == "__main__":
    main()
|
models/__pycache__/attention.cpython-310.pyc
ADDED
Binary file (1.03 kB). View file
|
|
models/__pycache__/decoder.cpython-310.pyc
ADDED
Binary file (1.43 kB). View file
|
|
models/__pycache__/encoder.cpython-310.pyc
ADDED
Binary file (1.02 kB). View file
|
|
models/__pycache__/seq2seq.cpython-310.pyc
ADDED
Binary file (1.26 kB). View file
|
|
models/attention.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
class Attention(nn.Module):
    """Additive attention that scores every source position against a decoder state."""

    def __init__(self, hidden_dim):
        super().__init__()
        # Projects [decoder state ; encoder output] down to hidden_dim ...
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        # ... and reduces each projected vector to a single score.
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: Decoder state, either [batch_size, hidden_dim] or
                [1, batch_size, hidden_dim]; it is tiled along dim 0.
            encoder_outputs: [src_len, batch_size, hidden_dim].

        Returns:
            Tensor of shape [src_len, batch_size], softmax-normalised over
            the source-length dimension.
        """
        steps = encoder_outputs.shape[0]

        # One copy of the decoder state per source position:
        # [src_len, batch_size, hidden_dim]
        tiled = hidden.repeat(steps, 1, 1)

        combined = torch.cat((tiled, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(combined))

        # [src_len, batch_size, 1] -> [src_len, batch_size]
        scores = self.v(energy).squeeze(2)

        return F.softmax(scores, dim=0)
|
models/decoder.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch
|
3 |
+
from models.attention import Attention
|
4 |
+
from utils.config import config
|
5 |
+
|
6 |
+
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.attention = Attention(hidden_dim)
        self.embedding = nn.Embedding(output_dim, embedding_dim)

        # Inter-layer GRU dropout is only requested when there is more than
        # one layer.
        rnn_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.GRU(
            embedding_dim + hidden_dim,
            hidden_dim,
            num_layers=n_layers,
            dropout=rnn_dropout,
        )
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one target token.

        Args:
            input: Previous target token ids, [batch_size].
            hidden: Decoder state, [n_layers, batch_size, hidden_dim].
            encoder_outputs: [src_len, batch_size, hidden_dim].

        Returns:
            Tuple ``(prediction, hidden)`` with prediction of shape
            [batch_size, output_dim] and the updated hidden state.
        """
        # [batch_size] -> [1, batch_size] -> [1, batch_size, embedding_dim]
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))

        # Attention is driven by the last layer's state: [src_len, batch_size]
        weights = self.attention(hidden[-1], encoder_outputs)
        # -> [batch_size, 1, src_len] so it can be batch-multiplied.
        weights = weights.permute(1, 0).unsqueeze(1)

        # Weighted sum of encoder outputs, back to [1, batch_size, hidden_dim]
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)
        context = torch.bmm(weights, batch_first_outputs).permute(1, 0, 2)

        # [1, batch_size, embedding_dim + hidden_dim]
        rnn_input = torch.cat((embedded, context), dim=2)
        output, hidden = self.rnn(rnn_input, hidden)

        # Classify from the GRU output concatenated with the context vector.
        features = torch.cat((output.squeeze(0), context.squeeze(0)), dim=1)
        prediction = self.fc_out(features)

        return prediction, hidden
|
models/encoder.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
from utils.config import config
|
3 |
+
|
4 |
+
class Encoder(nn.Module):
    """Unidirectional GRU encoder over embedded source tokens."""

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # Inter-layer GRU dropout is only requested when there is more than
        # one layer.
        rnn_dropout = dropout if n_layers > 1 else 0
        self.rnn = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            dropout=rnn_dropout,
            bidirectional=False,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sentences.

        Args:
            src: Token ids, [batch_size, src_len].

        Returns:
            Tuple ``(outputs, hidden)`` where outputs is
            [src_len, batch_size, hidden_dim] and hidden is
            [n_layers, batch_size, hidden_dim].
        """
        # [batch_size, src_len, embedding_dim]
        embedded = self.dropout(self.embedding(src))

        # The GRU is sequence-first, so swap the batch and time axes.
        time_major = embedded.transpose(0, 1)
        outputs, hidden = self.rnn(time_major)

        return outputs, hidden
|
models/seq2seq.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from models.encoder import Encoder
|
4 |
+
from models.decoder import Decoder
|
5 |
+
|
6 |
+
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Run the model over a batch and collect per-step predictions.

        Args:
            src: Source token ids, [batch_size, src_len].
            trg: Target token ids, [batch_size, trg_len]; column 0 is the
                start token.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of
                the model's own prediction.

        Returns:
            Tensor of shape [batch_size, trg_len, vocab_size]. Position 0 is
            left as zeros because decoding starts at step 1.
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Collected time-major; permuted to batch-first on return.
        step_outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        next_input = trg[:, 0]
        for step in range(1, trg_len):
            output, hidden = self.decoder(next_input, hidden, encoder_outputs)
            step_outputs[step] = output

            use_ground_truth = torch.rand(1) < teacher_forcing_ratio
            best_guess = output.argmax(1)
            if use_ground_truth:
                next_input = trg[:, step]
            else:
                next_input = best_guess

        return step_outputs.permute(1, 0, 2)
|
requirment.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
tqdm
|
train.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.optim as optim
|
4 |
+
from tqdm import tqdm
|
5 |
+
from utils.config import config
|
6 |
+
from utils.data_loader import get_data_loaders
|
7 |
+
from models.encoder import Encoder
|
8 |
+
from models.decoder import Decoder
|
9 |
+
from models.seq2seq import Seq2Seq
|
10 |
+
|
11 |
+
def init_weights(m):
    """Re-initialise every parameter of module ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from N(0, 0.01^2);
    all other parameters (e.g. biases) are set to zero.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
|
17 |
+
|
18 |
+
def train():
    """Train the seq2seq translator and write a checkpoint after every epoch."""
    train_loader, _val_loader, eng_vocab, hin_vocab = get_data_loaders()

    print(f"Final English vocab size: {len(eng_vocab)}")
    print(f"Final Hindi vocab size: {len(hin_vocab)}")

    device = config.device
    # Encoder and decoder share every hyperparameter except the vocab size.
    shared_args = (
        config.embedding_dim,
        config.hidden_size,
        config.num_layers,
        config.dropout,
    )
    enc = Encoder(len(eng_vocab), *shared_args).to(device)
    dec = Decoder(len(hin_vocab), *shared_args).to(device)

    model = Seq2Seq(enc, dec, device).to(device)
    model.apply(init_weights)

    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    # Index 0 is the padding value used by the collate function.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(config.epochs):
        model.train()
        running_loss = 0

        for src, trg in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            src = src.to(device)
            trg = trg.to(device)

            optimizer.zero_grad()
            logits = model(src, trg, config.teacher_forcing_ratio)

            # Position 0 is dropped from both predictions and targets, then
            # batch and time are flattened for the loss.
            vocab_size = logits.shape[-1]
            flat_logits = logits[:, 1:].reshape(-1, vocab_size)
            flat_targets = trg[:, 1:].reshape(-1)

            loss = criterion(flat_logits, flat_targets)
            loss.backward()
            # Clip the gradient norm to 1 before the optimiser step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)
        print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")

        # One checkpoint file per epoch.
        torch.save(model.state_dict(), f"seq2seq_epoch_{epoch+1}.pth")


if __name__ == "__main__":
    train()
|
utils/__pycache__/config.cpython-310.pyc
ADDED
Binary file (794 Bytes). View file
|
|
utils/__pycache__/config.cpython-312.pyc
ADDED
Binary file (996 Bytes). View file
|
|
utils/__pycache__/data_loader.cpython-310.pyc
ADDED
Binary file (3.77 kB). View file
|
|
utils/__pycache__/data_loader.cpython-312.pyc
ADDED
Binary file (3.84 kB). View file
|
|
utils/__pycache__/preprocessing.cpython-310.pyc
ADDED
Binary file (1.6 kB). View file
|
|
utils/__pycache__/preprocessing.cpython-312.pyc
ADDED
Binary file (2.36 kB). View file
|
|
utils/config.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
class Config:
    """Paths and hyperparameters shared by preprocessing, training and inference."""

    # Data
    data_path = "data/hindi_english_parallel.csv"
    train_ratio = 0.8  # Fraction of the sentence pairs used for training

    # Preprocessing
    # Maximum sentence length in whitespace-separated tokens. Defined once:
    # a second assignment further down the class would silently override it.
    max_length = 20
    min_word_count = 3

    # Model
    embedding_dim = 256
    hidden_size = 512
    num_layers = 2
    dropout = 0.5

    # Training
    batch_size = 64
    learning_rate = 0.001
    epochs = 20
    teacher_forcing_ratio = 0.5

    # Vocabulary size caps
    max_vocab_english = 5000
    max_vocab_hindi = 10000

    # Device: use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


config = Config()
|
utils/data_loader.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
import torch
|
3 |
+
from torch.utils.data import Dataset, DataLoader
|
4 |
+
import numpy as np
|
5 |
+
from utils.preprocessing import prepare_data, build_vocab
|
6 |
+
from utils.config import config
|
7 |
+
|
8 |
+
class TranslationDataset(Dataset):
    """Parallel English/Hindi sentences served as tensors of vocabulary ids."""

    def __init__(self, english_sentences, hindi_sentences, eng_vocab, hin_vocab):
        self.english_sentences = english_sentences
        self.hindi_sentences = hindi_sentences
        self.eng_vocab = eng_vocab
        self.hin_vocab = hin_vocab
        self.eng_vocab_size = len(eng_vocab)
        self.hin_vocab_size = len(hin_vocab)

    def __len__(self):
        return len(self.english_sentences)

    @staticmethod
    def _encode(sentence, vocab, vocab_size):
        """Map whitespace-separated words to ids, clamped to ``vocab_size - 1``."""
        return [
            min(vocab.get(word, vocab['<unk>']), vocab_size - 1)
            for word in sentence.split()
        ]

    def __getitem__(self, idx):
        """Return ``{'english': LongTensor, 'hindi': LongTensor}`` for pair ``idx``.

        Words missing from a vocabulary map to its ``'<unk>'`` id, and any id
        at or above the vocabulary size is clamped to the last valid id.
        """
        english_ids = self._encode(
            self.english_sentences[idx], self.eng_vocab, self.eng_vocab_size
        )
        hindi_ids = self._encode(
            self.hindi_sentences[idx], self.hin_vocab, self.hin_vocab_size
        )

        return {
            'english': torch.tensor(english_ids, dtype=torch.long),
            'hindi': torch.tensor(hindi_ids, dtype=torch.long),
        }
|
37 |
+
|
38 |
+
def collate_fn(batch):
    """Pad a list of dataset items into two batch-first id tensors.

    Args:
        batch: Items shaped like ``{'english': tensor, 'hindi': tensor}``.

    Returns:
        Tuple ``(english, hindi)``; each tensor is [batch_size, longest_len]
        with shorter sequences right-padded with 0.
    """
    def pad(field):
        sequences = [sample[field] for sample in batch]
        return torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=0
        )

    return pad('english'), pad('hindi')
|
48 |
+
|
49 |
+
def get_data_loaders():
    """Build the train/validation loaders and both vocabularies.

    A fixed 10% sample of the prepared corpus is used; pairs in which either
    side has more than ``config.max_length`` tokens are dropped. The
    vocabularies are built from the training split only and are also pickled
    to ``eng_vocab.pkl`` / ``hin_vocab.pkl`` in the working directory.

    Returns:
        Tuple ``(train_loader, val_loader, eng_vocab, hin_vocab)``.
    """
    frame = prepare_data().sample(frac=0.1, random_state=42)
    frame['eng_len'] = frame['english'].apply(lambda text: len(text.split()))
    frame['hin_len'] = frame['hindi'].apply(lambda text: len(text.split()))
    short_enough = (
        (frame['eng_len'] <= config.max_length)
        & (frame['hin_len'] <= config.max_length)
    )
    frame = frame[short_enough]

    eng_sentences = frame['english'].tolist()
    hin_sentences = frame['hindi'].tolist()

    # Sequential split: the first train_ratio share is used for training.
    cut = int(len(eng_sentences) * config.train_ratio)
    train_eng, val_eng = eng_sentences[:cut], eng_sentences[cut:]
    train_hin, val_hin = hin_sentences[:cut], hin_sentences[cut:]

    eng_vocab = build_vocab(train_eng)
    hin_vocab = build_vocab(train_hin, is_hindi=True)

    train_dataset = TranslationDataset(train_eng, train_hin, eng_vocab, hin_vocab)
    val_dataset = TranslationDataset(val_eng, val_hin, eng_vocab, hin_vocab)

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )

    # Persist the vocabularies so inference can reuse the same mappings.
    # NOTE(review): these land in the working directory, while the inference
    # scripts appear to read them from bin/ -- confirm they are moved there.
    for path, vocab in (('eng_vocab.pkl', eng_vocab), ('hin_vocab.pkl', hin_vocab)):
        with open(path, 'wb') as f:
            pickle.dump(vocab, f)

    print(f"English vocabulary size: {len(eng_vocab)}")
    print(f"Hindi vocabulary size: {len(hin_vocab)}")
    print(f"Max English index: {max(eng_vocab.values())}")
    print(f"Max Hindi index: {max(hin_vocab.values())}")

    return train_loader, val_loader, eng_vocab, hin_vocab
|
utils/preprocessing.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
from utils.config import config
|
4 |
+
from collections import Counter
|
5 |
+
|
6 |
+
def clean_text(text):
    """Normalise an English sentence for tokenisation.

    Lower-cases and strips the text, puts a space before each ``.``, ``!``
    or ``?``, and collapses every run of other non-letter characters into a
    single space.
    """
    lowered = text.lower().strip()
    spaced = re.sub(r"([.!?])", r" \1", lowered)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)  # For English
|
11 |
+
|
12 |
+
def clean_hindi(text):
    """Strip a Hindi sentence and put a space before each ``।``, ``.``, ``!`` or ``?``."""
    return re.sub(r"([।.!?])", r" \1", text.strip())
|
16 |
+
|
17 |
+
def prepare_data():
    """Load the parallel corpus and return cleaned ``english``/``hindi`` columns.

    Rows with a missing value in either column are dropped. Every Hindi
    sentence is wrapped in ``<start>`` / ``<end>`` markers.
    """
    pairs = pd.read_csv(config.data_path)
    pairs = pairs[['english', 'hindi']].dropna()

    pairs['english'] = pairs['english'].apply(clean_text)

    # Clean the Hindi side first, then add the sequence markers.
    cleaned_hindi = pairs['hindi'].apply(clean_hindi)
    pairs['hindi'] = cleaned_hindi.apply(lambda x: '<start> ' + x + ' <end>')

    return pairs[['english', 'hindi']]
|
29 |
+
|
30 |
+
def build_vocab(sentences, is_hindi=False):
    """Build a word -> index vocabulary from whitespace-tokenised sentences.

    The special tokens always take indices 0-3 (``<pad>``, ``<start>``,
    ``<end>``, ``<unk>``). Every other word receives the next free index in
    first-seen order, so the indices are contiguous and always smaller than
    ``len(vocab)``. Special tokens that also occur inside the sentences do
    not consume an extra index.

    Args:
        sentences: Iterable of sentence strings; empty or missing entries
            are skipped.
        is_hindi: Accepted for interface compatibility; not used.

    Returns:
        Dict mapping each word to its integer index. All words are kept,
        regardless of frequency.
    """
    vocab = {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3}

    for sentence in sentences:
        # Skip empty sentences
        if not sentence or pd.isna(sentence):
            continue
        for word in sentence.split():
            if word not in vocab:
                # len(vocab) is always the next unused index.
                vocab[word] = len(vocab)

    return vocab
|