Spaces:
Sleeping
Sleeping
File size: 1,297 Bytes
9a41f63 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import pandas as pd
import re
from utils.config import config
from collections import Counter
def clean_text(text):
    """Normalize an English sentence for whitespace tokenization.

    The text is lower-cased and stripped, a space is inserted in front of
    every ``.``, ``!`` or ``?``, and each run of characters other than ASCII
    letters and those three marks (digits, commas, whitespace, ...) is
    replaced by a single space.

    Args:
        text: The raw sentence as a string.

    Returns:
        The normalized sentence. It may still begin or end with a single
        space when the input began or ended with removed characters.
    """
    lowered = text.lower().strip()
    # Detach sentence punctuation so each mark becomes its own token.
    spaced = re.sub(r"([.!?])", r" \1", lowered)
    # Everything that is not a letter or kept punctuation collapses to a space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)  # For English
def clean_hindi(text):
    """Normalize a Hindi sentence for whitespace tokenization.

    Surrounding whitespace is removed and a space is inserted in front of
    every danda (``।``), ``.``, ``!`` or ``?`` so the mark splits off as its
    own token. No other characters are altered.

    Args:
        text: The raw sentence as a string.

    Returns:
        The stripped sentence with punctuation separated by a space.
    """
    trimmed = text.strip()
    return re.sub(r"([।.!?])", r" \1", trimmed)
def prepare_data():
    """Load the English/Hindi sentence pairs and prepare them for training.

    Reads the CSV at ``config.data_path``, keeps the ``english`` and ``hindi``
    columns, drops rows where either is missing, cleans both columns, and
    wraps every Hindi sentence in ``<start>`` / ``<end>`` markers.

    Returns:
        A DataFrame with the cleaned ``english`` and ``hindi`` columns.
    """
    raw = pd.read_csv(config.data_path)
    pairs = raw[['english', 'hindi']].dropna()

    # Source side: plain normalization.
    pairs['english'] = pairs['english'].apply(clean_text)

    # Target side: normalize, then add the sequence boundary tokens.
    pairs['hindi'] = pairs['hindi'].apply(
        lambda sentence: '<start> ' + clean_hindi(sentence) + ' <end>'
    )
    return pairs[['english', 'hindi']]
def build_vocab(sentences, is_hindi=False):
    """Build a word-to-index vocabulary from whitespace-tokenized sentences.

    Ids 0-3 are reserved for ``<pad>``, ``<start>``, ``<end>`` and ``<unk>``.
    Every other distinct word receives the next free id in first-seen order,
    so the ids always form the contiguous range ``0 .. len(vocab) - 1``.

    Special tokens that occur inside the sentences (e.g. the ``<start>`` /
    ``<end>`` markers added to target sentences) keep their reserved id and
    do not consume an extra one. Previously such tokens were first given a
    regular id and then overwritten, which left unused ids behind and made
    the largest id exceed ``len(vocab) - 1``.

    Args:
        sentences: Iterable of sentences. Empty, ``None`` and NaN entries
            are skipped.
        is_hindi: Unused; kept so existing callers keep working.

    Returns:
        A dict mapping each token to its integer id.
    """
    vocab = {'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3}

    for sentence in sentences:
        # Skip empty sentences
        if not sentence or pd.isna(sentence):
            continue
        # Include all words regardless of frequency
        for word in sentence.split():
            if word not in vocab:
                # len(vocab) is always the next unused id, keeping ids dense.
                vocab[word] = len(vocab)
    return vocab