Spaces:
Build error
Build error
import re | |
class TextProcessor:
    """Handles text processing operations for translation.

    Every method is a static method: none of them reads instance state, so
    they can be called either on the class or on an instance.
    """

    # Maximum number of characters accepted by validate_input and kept by
    # preprocess_text.
    MAX_LENGTH = 512

    # Mapping from ASCII digits to their Persian digit counterparts.
    PERSIAN_NUMBERS = {
        '0': '۰', '1': '۱', '2': '۲', '3': '۳', '4': '۴',
        '5': '۵', '6': '۶', '7': '۷', '8': '۸', '9': '۹',
    }

    @staticmethod
    def preprocess_text(text: str) -> str:
        """
        Clean and prepare text for translation.

        Args:
            text: Input text to process

        Returns:
            Processed text ready for translation, at most MAX_LENGTH
            characters long. Empty string for empty/None input.
        """
        if not text:
            return ""

        # Drop everything except word characters, whitespace and basic
        # punctuation. This must happen BEFORE whitespace normalization:
        # removing a symbol that sits between two spaces would otherwise
        # leave a double space behind.
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        text = ' '.join(text.split())

        # Truncation may cut right after a word separator; trim it so the
        # result never ends in whitespace.
        return text[:TextProcessor.MAX_LENGTH].rstrip()

    @staticmethod
    def postprocess_translation(text: str) -> str:
        """
        Clean up translated text and normalize numbers.

        Args:
            text: Translated text to process

        Returns:
            Cleaned text with model tokens removed, whitespace collapsed and
            ASCII digits replaced by Persian digits. Empty string for
            empty/None input.
        """
        if not text:
            return ""

        # Clean up model artifacts
        for token in ("<pad>", "</s>", "<s>"):
            text = text.replace(token, "")

        # Remove whitespace that precedes punctuation ("word ." -> "word.")
        text = re.sub(r'\s+([.,!?])', r'\1', text)
        text = ' '.join(text.split())

        # Convert to Persian numbers in a single pass. The table is built at
        # call time so it always reflects the current PERSIAN_NUMBERS mapping.
        digit_table = str.maketrans(TextProcessor.PERSIAN_NUMBERS)
        return text.translate(digit_table).strip()

    @staticmethod
    def detect_language(text: str) -> str:
        """
        Detect if text is primarily English or Farsi.

        Args:
            text: Input text to analyze

        Returns:
            'Farsi' when characters in the U+0600-U+06FF range outnumber
            ASCII letters, otherwise 'English' (including ties and
            empty/None input).
        """
        # Guard like the sibling methods do; re.findall would raise on None.
        if not text:
            return "English"

        farsi_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        english_chars = len(re.findall(r'[a-zA-Z]', text))
        return "Farsi" if farsi_chars > english_chars else "English"

    @staticmethod
    def validate_input(text: str) -> tuple[bool, str]:
        """
        Validate input text length and content.

        Args:
            text: Input text to validate

        Returns:
            Tuple of (is_valid, error_message); error_message is empty when
            the input is valid.
        """
        # Reject missing, empty and whitespace-only input.
        if not text or not text.strip():
            return False, "Please enter text to translate"

        # The length limit applies to the raw (unstripped) input.
        if len(text) > TextProcessor.MAX_LENGTH:
            return False, f"Input text is too long (maximum {TextProcessor.MAX_LENGTH} characters)"

        return True, ""
# Module-level aliases for the TextProcessor helpers, kept so that existing
# callers importing these names directly from the module keep working.
(
    preprocess_text,
    postprocess_translation,
    detect_language,
    validate_input,
) = (
    TextProcessor.preprocess_text,
    TextProcessor.postprocess_translation,
    TextProcessor.detect_language,
    TextProcessor.validate_input,
)