LinguaCanvas / utils.py
S-Dreamer's picture
Upload 12 files
b98a046 verified
raw
history blame
2.81 kB
import re
class TextProcessor:
    """Text clean-up helpers used before and after English/Farsi translation.

    Every helper is a static method; the class acts as a namespace that also
    holds the shared configuration and pre-compiled patterns below.
    """

    # Character budget: preprocess_text truncates to it, validate_input
    # rejects anything longer.
    MAX_LENGTH = 512

    # ASCII digit -> Persian digit, applied by postprocess_translation.
    PERSIAN_NUMBERS = {
        '0': '۰', '1': '۱', '2': '۲', '3': '۳', '4': '۴',
        '5': '۵', '6': '۶', '7': '۷', '8': '۸', '9': '۹',
    }

    # Special tokens stripped from model output. Order matters: the tokens are
    # removed one after another, exactly in this sequence.
    _MODEL_ARTIFACTS = ("<pad>", "</s>", "<s>")

    # Everything preprocess_text deletes. Kept: word characters, whitespace,
    # basic punctuation, the apostrophe (contractions such as "don't"), the
    # Persian comma/semicolon/question mark (U+060C, U+061B, U+061F) and the
    # zero-width non-joiner (U+200C), which is part of Persian spelling.
    _DISALLOWED_CHARS = re.compile(r"[^\w\s.,!?'\-\u060C\u061B\u061F\u200C]")

    # Whitespace that precedes a punctuation mark (ASCII or Persian).
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([.,!?\u060C\u061B\u061F])')

    # Character classes counted by detect_language.
    _FARSI_CHARS = re.compile(r'[\u0600-\u06FF]')
    _LATIN_CHARS = re.compile(r'[a-zA-Z]')

    @staticmethod
    def preprocess_text(text: str) -> str:
        """
        Clean and prepare text for translation.

        Disallowed characters are removed first and whitespace is collapsed
        afterwards, so deleting a symbol never leaves a double space behind.

        Args:
            text: Input text to process

        Returns:
            Cleaned text, truncated to MAX_LENGTH characters; "" for empty
            or None input
        """
        if not text:
            return ""
        stripped = TextProcessor._DISALLOWED_CHARS.sub('', text)
        normalized = ' '.join(stripped.split())
        return normalized[:TextProcessor.MAX_LENGTH]

    @staticmethod
    def postprocess_translation(text: str, *, convert_numbers: bool = True) -> str:
        """
        Clean up translated text and optionally normalize numbers.

        Args:
            text: Translated text to process
            convert_numbers: When True (the default, matching the historical
                behaviour) ASCII digits are rewritten as Persian digits.
                Pass False when the output language is not Farsi.

        Returns:
            Cleaned and normalized text; "" for empty or None input
        """
        if not text:
            return ""
        # Drop special tokens left over from the model's decoder.
        for token in TextProcessor._MODEL_ARTIFACTS:
            text = text.replace(token, "")
        # "word ," -> "word,"
        text = TextProcessor._SPACE_BEFORE_PUNCT.sub(r'\1', text)
        # Collapse whitespace runs; this also trims both ends.
        text = ' '.join(text.split())
        if convert_numbers:
            # Single pass over the text instead of one replace() per digit.
            # The table is built per call so it follows PERSIAN_NUMBERS.
            text = text.translate(str.maketrans(TextProcessor.PERSIAN_NUMBERS))
        return text

    @staticmethod
    def detect_language(text: str) -> str:
        """
        Detect if text is primarily English or Farsi.

        Args:
            text: Input text to analyze

        Returns:
            'Farsi' when characters in U+0600-U+06FF outnumber ASCII letters,
            otherwise 'English' (including ties and empty or None input)
        """
        if not text:
            return "English"
        farsi_count = len(TextProcessor._FARSI_CHARS.findall(text))
        latin_count = len(TextProcessor._LATIN_CHARS.findall(text))
        return "Farsi" if farsi_count > latin_count else "English"

    @staticmethod
    def validate_input(text: str) -> tuple[bool, str]:
        """
        Validate input text length and content.

        Args:
            text: Input text to validate

        Returns:
            Tuple of (is_valid, error_message); error_message is "" when the
            input is valid
        """
        if not text or not text.strip():
            return False, "Please enter text to translate"
        # Length is measured on the raw input, before any clean-up.
        if len(text) > TextProcessor.MAX_LENGTH:
            return False, f"Input text is too long (maximum {TextProcessor.MAX_LENGTH} characters)"
        return True, ""
# Backward compatibility: re-export each TextProcessor static method under its
# bare name so existing callers can keep using them as plain functions.
validate_input = TextProcessor.validate_input
detect_language = TextProcessor.detect_language
postprocess_translation = TextProcessor.postprocess_translation
preprocess_text = TextProcessor.preprocess_text