import re
from collections import Counter

import pandas as pd

from utils.config import config
def clean_text(text):
    """Lowercase English text, pad sentence punctuation with spaces, and drop other characters."""
    text = text.lower().strip()
    text = re.sub(r"([.!?])", r" \1", text)       # put a space before . ! ?
    text = re.sub(r"[^a-zA-Z.!?]+", r" ", text)   # keep only English letters and . ! ?
    return text
def clean_hindi(text):
    """Strip Hindi text and pad sentence punctuation (danda, . ! ?) with spaces."""
    text = text.strip()
    text = re.sub(r"([।.!?])", r" \1", text)      # put a space before । . ! ?
    return text
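
# Illustrative examples of the two cleaners above:
#   clean_text("Hello, World!") -> "hello world !"
#   clean_hindi("नमस्ते।")       -> "नमस्ते ।"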
def prepare_data():
    df = pd.read_csv(config.data_path)
    df = df[['english', 'hindi']].dropna()

    # Clean text
    df['english'] = df['english'].apply(clean_text)
    df['hindi'] = df['hindi'].apply(clean_hindi)

    # Add start/end tokens to Hindi
    df['hindi'] = df['hindi'].apply(lambda x: '<start> ' + x + ' <end>')

    return df[['english', 'hindi']]
def build_vocab(sentences, is_hindi=False):
    # Note: `is_hindi` is currently unused; the same vocabulary logic applies to both languages.
    word_counts = Counter()
    for sentence in sentences:
        # Skip empty sentences
        if not sentence or pd.isna(sentence):
            continue
        words = sentence.split()
        word_counts.update(words)

    # Reserve ids 0-3 for the special tokens; include every other word regardless of frequency.
    # Special tokens are filtered out here so they are not assigned a second, unused id.
    special_tokens = ('<pad>', '<start>', '<end>', '<unk>')
    regular_words = [w for w in word_counts if w not in special_tokens]
    vocab = {word: idx + 4 for idx, word in enumerate(regular_words)}

    # Add special tokens
    vocab['<pad>'] = 0
    vocab['<start>'] = 1
    vocab['<end>'] = 2
    vocab['<unk>'] = 3
    return vocab
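
# Minimal usage sketch, assuming config.data_path points to a CSV file with
# 'english' and 'hindi' columns (as prepare_data() expects).
if __name__ == "__main__":
    pairs = prepare_data()
    eng_vocab = build_vocab(pairs['english'])
    hin_vocab = build_vocab(pairs['hindi'], is_hindi=True)
    print(f"{len(pairs)} sentence pairs | "
          f"English vocab: {len(eng_vocab)} | Hindi vocab: {len(hin_vocab)}")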