File size: 2,814 Bytes
b98a046
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95

import re

class TextProcessor:
    """Stateless text helpers used before and after translation.

    Every method is a ``@staticmethod``; the class only serves as a
    namespace for the helpers and the constants they share.
    """

    # Character limit: ``preprocess_text`` truncates to it and
    # ``validate_input`` rejects anything longer.
    MAX_LENGTH = 512

    # ASCII digit -> Persian digit, applied by ``postprocess_translation``.
    PERSIAN_NUMBERS = {
        '0': '۰', '1': '۱', '2': '۲', '3': '۳', '4': '۴',
        '5': '۵', '6': '۶', '7': '۷', '8': '۸', '9': '۹'
    }

    # Special tokens stripped from translated text, in removal order.
    _MODEL_ARTIFACTS = ("<pad>", "</s>", "<s>")

    # Everything NOT in this class is deleted by ``preprocess_text``.
    # Besides word characters, whitespace and ASCII punctuation it keeps:
    #   U+200C zero-width non-joiner (part of correct Persian spelling),
    #   U+060C Arabic comma, U+061B Arabic semicolon, U+061F Arabic
    #   question mark (the Persian counterparts of ``,`` ``;`` ``?``).
    _DISALLOWED_RE = re.compile(r'[^\w\s.,!?\-\u200c\u060c\u061b\u061f]')

    # Whitespace directly in front of a punctuation mark (ASCII or Persian).
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([.,!?\u060c\u061b\u061f])')

    # Character classes counted by ``detect_language``.
    _FARSI_CHAR_RE = re.compile(r'[\u0600-\u06FF]')
    _ENGLISH_CHAR_RE = re.compile(r'[a-zA-Z]')

    @staticmethod
    def preprocess_text(text: str) -> str:
        """
        Clean and prepare text for translation.

        Disallowed characters are removed first and whitespace is collapsed
        afterwards, so deleting a symbol that sat between two spaces cannot
        leave a double space behind. The result is cut to ``MAX_LENGTH``
        characters and any trailing space exposed by the cut is dropped.

        Args:
            text: Input text to process

        Returns:
            Processed text ready for translation ("" for empty/None input)
        """
        if not text:
            return ""

        cleaned = TextProcessor._DISALLOWED_RE.sub('', text)
        cleaned = ' '.join(cleaned.split())

        return cleaned[:TextProcessor.MAX_LENGTH].rstrip()

    @staticmethod
    def postprocess_translation(text: str, *, persian_digits: bool = True) -> str:
        """
        Clean up translated text and normalize numbers.

        Args:
            text: Translated text to process
            persian_digits: When True (the default, matching the historical
                behavior) ASCII digits are replaced by their Persian
                counterparts. Pass False to leave digits untouched, e.g.
                when the output language is English.

        Returns:
            Cleaned and normalized text ("" for empty/None input)
        """
        if not text:
            return ""

        # Clean up model artifacts
        for token in TextProcessor._MODEL_ARTIFACTS:
            text = text.replace(token, "")
        text = TextProcessor._SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)
        # split()/join() also trims both ends, so no extra strip() is needed.
        text = ' '.join(text.split())

        if persian_digits:
            # Table is built per call so changes to PERSIAN_NUMBERS are honored.
            text = text.translate(str.maketrans(TextProcessor.PERSIAN_NUMBERS))

        return text

    @staticmethod
    def detect_language(text: str) -> str:
        """
        Detect if text is primarily English or Farsi.

        Counts characters in the U+0600-U+06FF block against ASCII letters.
        Ties, and input containing neither, are reported as English.

        Args:
            text: Input text to analyze

        Returns:
            'Farsi' or 'English' based on character frequency
            ('English' for empty/None input)
        """
        if not text:
            return "English"

        farsi_count = len(TextProcessor._FARSI_CHAR_RE.findall(text))
        english_count = len(TextProcessor._ENGLISH_CHAR_RE.findall(text))
        return "Farsi" if farsi_count > english_count else "English"

    @staticmethod
    def validate_input(text: str) -> tuple[bool, str]:
        """
        Validate input text length and content.

        The length check is applied to the raw text, including any
        surrounding whitespace.

        Args:
            text: Input text to validate

        Returns:
            Tuple of (is_valid, error_message); the message is "" when valid
        """
        if not text or not text.strip():
            return False, "Please enter text to translate"
        if len(text) > TextProcessor.MAX_LENGTH:
            return False, f"Input text is too long (maximum {TextProcessor.MAX_LENGTH} characters)"
        return True, ""

# Module-level aliases for the TextProcessor static methods, kept so that
# callers importing these names directly from the module keep working.
detect_language = TextProcessor.detect_language
postprocess_translation = TextProcessor.postprocess_translation
preprocess_text = TextProcessor.preprocess_text
validate_input = TextProcessor.validate_input