File size: 1,297 Bytes
9a41f63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
import re
from utils.config import config
from collections import Counter

def clean_text(text):
    """Normalize an English sentence for whitespace tokenization.

    The text is lower-cased, each of ``.``, ``!`` and ``?`` is detached
    from the preceding word so it becomes its own token, and every run of
    characters other than ASCII letters and those three punctuation marks
    is collapsed into a single space.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The normalized sentence with no leading or trailing whitespace.
    """
    text = text.lower()
    # Put a space in front of sentence punctuation so it splits off as a token.
    text = re.sub(r"([.!?])", r" \1", text)
    # Digits, commas, other symbols and whitespace runs all become one space.
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)  # For English
    # Strip last: the substitutions above can themselves produce a leading or
    # trailing space (e.g. "42 apples" -> " apples"), which stripping the
    # input up front would not remove.
    return text.strip()

def clean_hindi(text):
    """Trim a Hindi sentence and detach its sentence punctuation.

    Surrounding whitespace is removed, then a space is inserted in front of
    every danda (``।``), ``.``, ``!`` and ``?`` so that each mark is
    separated from the preceding word.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The trimmed sentence with a space before each punctuation mark.
    """
    punctuation = re.compile(r"([।.!?])")
    return punctuation.sub(r" \1", text.strip())

def prepare_data():
    """Load the parallel corpus and return cleaned sentence pairs.

    Reads the CSV at ``config.data_path``, keeps the ``english`` and
    ``hindi`` columns, drops rows with a missing value in either column,
    cleans each column with ``clean_text`` / ``clean_hindi``, and wraps
    every Hindi sentence in ``<start>`` / ``<end>`` tokens.

    Returns:
        A DataFrame with the columns ``english`` and ``hindi``.
    """
    columns = ['english', 'hindi']
    pairs = pd.read_csv(config.data_path)[columns].dropna()

    def add_boundary_tokens(sentence):
        # Mark where the Hindi sentence begins and ends.
        return '<start> ' + sentence + ' <end>'

    english = pairs['english'].apply(clean_text)
    hindi = pairs['hindi'].apply(clean_hindi).apply(add_boundary_tokens)

    pairs = pairs.assign(english=english, hindi=hindi)
    return pairs[columns]

def build_vocab(sentences, is_hindi=False):
    """Build a word-to-index mapping from whitespace-tokenized sentences.

    Ids 0-3 are reserved for ``<pad>``, ``<start>``, ``<end>`` and
    ``<unk>``. Every other distinct word receives the next free id in
    order of first appearance, so the ids always form the contiguous range
    ``0 .. len(vocab) - 1``. This holds even when the sentences themselves
    contain ``<start>`` / ``<end>``; those occurrences reuse the reserved
    ids instead of consuming (and then abandoning) a regular slot.

    Args:
        sentences: Iterable of sentence strings. Empty and missing
            (``None`` / NaN) entries are skipped.
        is_hindi: Currently unused; kept so existing callers that pass it
            continue to work.

    Returns:
        A dict mapping each token to its integer id.
    """
    # Reserve the special tokens first so corpus words can never collide
    # with them or leave holes in the id range.
    vocab = {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3}

    for sentence in sentences:
        # Skip empty sentences
        if not sentence or pd.isna(sentence):
            continue
        # Include all words regardless of frequency
        for word in sentence.split():
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab