"""Text cleaning and vocabulary construction for an English-Hindi corpus."""

import re
from collections import Counter

import pandas as pd

from utils.config import config

# Special vocabulary tokens; their position in the tuple is their reserved id
# (0-3), and ordinary words are numbered from 4 upwards.
# NOTE(review): in the previous revision every one of these literals was the
# empty string (apparently angle-bracket tags stripped as markup), so the four
# assignments collapsed into a single '' -> 3 entry. The names and order below
# are the conventional reconstruction -- confirm them against the code that
# consumes the vocabulary.
_PAD_TOKEN = "<pad>"
_START_TOKEN = "<start>"
_END_TOKEN = "<end>"
_UNK_TOKEN = "<unk>"
_SPECIAL_TOKENS = (_PAD_TOKEN, _START_TOKEN, _END_TOKEN, _UNK_TOKEN)

# Sentence-final punctuation that gets split off into its own token.
_EN_PUNCT = re.compile(r"([.!?])")
# Anything that is not an ASCII letter or sentence punctuation (English only).
_EN_NOISE = re.compile(r"[^a-zA-Z.!?]+")
# \u0964 is DEVANAGARI DANDA, the Hindi full stop. It is written as an escape
# because the literal character had been mis-decoded into three Cyrillic
# letters, which made the pattern match those letters instead of the danda.
_HI_PUNCT = re.compile(r"([\u0964.!?])")


def clean_text(text):
    """Normalize an English sentence for tokenization.

    Lower-cases and strips the text, puts a space before each ``.``, ``!`` or
    ``?``, and replaces every run of other non-letter characters (digits,
    commas, whitespace, ...) with a single space.

    >>> clean_text("Hello, World!")
    'hello world !'
    """
    text = text.lower().strip()
    text = _EN_PUNCT.sub(r" \1", text)
    return _EN_NOISE.sub(" ", text)


def clean_hindi(text):
    """Normalize a Hindi sentence for tokenization.

    Strips surrounding whitespace and puts a space before each danda, ``.``,
    ``!`` or ``?`` so that whitespace splitting yields it as its own token.
    All other characters are left untouched.
    """
    return _HI_PUNCT.sub(r" \1", text.strip())


def prepare_data():
    """Load the parallel corpus and return it cleaned.

    Reads the CSV at ``config.data_path``, keeps the ``english`` and ``hindi``
    columns, drops rows with a missing value in either, cleans both columns,
    and wraps every Hindi sentence in the start/end marker tokens.

    Returns:
        A DataFrame with exactly the columns ``english`` and ``hindi``.
    """
    df = pd.read_csv(config.data_path)
    df = df[["english", "hindi"]].dropna()

    # Cast after dropna (so missing values do not become the string "nan"):
    # a cell that pandas parsed as a number would otherwise crash the
    # string-based cleaners.
    df["english"] = df["english"].astype(str).apply(clean_text)
    df["hindi"] = df["hindi"].astype(str).apply(clean_hindi)

    # Mark sentence boundaries on the Hindi side.
    df["hindi"] = df["hindi"].apply(
        lambda sentence: f"{_START_TOKEN} {sentence} {_END_TOKEN}"
    )

    return df[["english", "hindi"]]


def build_vocab(sentences, is_hindi=False):
    """Build a word -> integer id mapping from whitespace-tokenized sentences.

    Ids 0-3 are reserved for the special tokens; every other distinct word
    receives the next free id (starting at 4) in order of first appearance,
    regardless of how often it occurs.

    Args:
        sentences: Iterable of sentence strings. Empty and missing
            (``None`` / NaN) entries are skipped.
        is_hindi: Currently unused; kept so existing callers keep working.

    Returns:
        dict mapping each token to its id. Ids are contiguous, so the largest
        id is always ``len(vocab) - 1``.
    """
    word_counts = Counter()
    for sentence in sentences:
        # Test for missing values first: truth-testing pd.NA raises.
        if pd.isna(sentence) or not sentence:
            continue
        word_counts.update(sentence.split())

    vocab = {token: idx for idx, token in enumerate(_SPECIAL_TOKENS)}

    # Marker tokens embedded in the sentences keep their reserved ids; leaving
    # them out of the numbering avoids gaps (ids beyond len(vocab) - 1).
    ordinary_words = [word for word in word_counts if word not in vocab]
    for idx, word in enumerate(ordinary_words, start=len(_SPECIAL_TOKENS)):
        vocab[word] = idx

    return vocab