File size: 6,031 Bytes
76f42d9
66882a0
76f42d9
 
93294e9
66882a0
2d857e8
76f42d9
 
 
 
93294e9
 
 
 
 
 
76f42d9
 
2d857e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93294e9
2d857e8
 
93294e9
2d857e8
 
93294e9
2d857e8
 
93294e9
2d857e8
 
 
 
 
 
 
 
 
93294e9
2d857e8
 
 
 
 
 
93294e9
2d857e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac96a59
2d857e8
 
93294e9
ac96a59
76f42d9
2d857e8
93294e9
2d857e8
 
 
 
 
 
 
76f42d9
2d857e8
93294e9
2d857e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93294e9
76f42d9
 
2d857e8
93294e9
2d857e8
 
93294e9
2d857e8
 
93294e9
2d857e8
 
93294e9
2d857e8
 
 
 
 
93294e9
2d857e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93294e9
 
76f42d9
2d857e8
 
93294e9
2d857e8
 
 
 
 
 
 
 
 
93294e9
 
 
 
 
 
 
 
76f42d9
ac96a59
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import streamlit as st
import PyPDF2
import io
import os
import re

# Preeti-to-Unicode lookup tables consumed by convert_segment().
#
# unicodeatoz: Devanagari output for the ASCII letters 'a'..'z',
# indexed by ord(char) - ord('a').
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
# unicodeAtoZ: Devanagari output for the ASCII letters 'A'..'Z',
# indexed by ord(char) - ord('A').
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
# unicode0to9: Devanagari output for the ASCII digits '0'..'9',
# indexed by ord(char) - ord('0').
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# symbolsDict: output for the remaining characters, keyed by the character
# itself. convert_segment() falls back to the input character when a key is
# absent. Note that some keys ("ˆ", "Î", "å", "÷") are not ASCII.
symbolsDict = {
    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
    "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
    ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
    ">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
    "å": "द्व", "÷": "/"
}

# Multi-character Preeti sequences that are handled as units before the
# per-character conversion runs.
#
# - get_preeti_segment() tries these keys longest-first at each position.
# - normalize_preeti() substitutes each key with its value via str.replace,
#   walking the dictionary in insertion order, so the order of the entries
#   below is significant (e.g. '6«' is listed before the bare '«').
# - A value is either final Devanagari text or, as with 's|', more Preeti
#   codes that convert_segment() maps character by character afterwards.
preeti_compounds = {
    'qm': 's|',
    'f]': 'ो',
    'km': 'फ',
    '0f': 'ण',
    'If': 'क्ष',
    'if': 'ष',
    'cf': 'आ',
    '6«': 'ट्र',
    'g]': 'ने',
    '8f': 'डा',
    '«': '्र',
    'j|m': 'क्र',
    ';+': 'सं'
}

def is_nepali_unicode(char):
    """Return True when *char* sorts inside the range U+0900..U+097F (inclusive)."""
    range_first, range_last = '\u0900', '\u097F'
    # Guard clause: anything ordered before the first code point is outside.
    if char < range_first:
        return False
    return char <= range_last

def get_preeti_segment(text, start_idx):
    """
    Extract one contiguous Preeti segment from *text* beginning at *start_idx*.

    Scanning consumes characters until the end of the text, a whitespace
    character, or a character for which is_nepali_unicode() is true. At each
    position the keys of ``preeti_compounds`` are tried first (longest key
    first) so that a multi-character compound is always consumed whole.

    Args:
        text: The full input string.
        start_idx: Index at which scanning starts.

    Returns:
        A ``(segment, end_idx)`` tuple: the raw (unconverted) segment and the
        index of the first character that was not consumed. When *start_idx*
        is at or past the end of *text* the result is ``("", start_idx)``.
    """
    text_len = len(text)
    if start_idx >= text_len:
        return "", start_idx

    # Sort once per call instead of once per character; longest keys first so
    # a longer compound wins over a shorter one starting at the same position.
    compounds = tuple(sorted(preeti_compounds, key=len, reverse=True))

    pieces = []
    current_idx = start_idx

    while current_idx < text_len:
        for compound in compounds:
            # startswith with an offset avoids copying the remainder of the
            # text for every candidate, which made long inputs quadratic.
            if text.startswith(compound, current_idx):
                pieces.append(compound)
                current_idx += len(compound)
                break
        else:
            # No compound matched here: consume a single character unless it
            # terminates the segment.
            char = text[current_idx]
            if char.isspace() or is_nepali_unicode(char):
                break
            pieces.append(char)
            current_idx += 1

    return "".join(pieces), current_idx

def normalize_preeti(preetitxt):
    """
    Prepare raw Preeti text for per-character conversion.

    Two passes are made:
    1. every key of ``preeti_compounds`` is replaced by its value, in the
       dictionary's iteration order;
    2. each 'l' that has a following character is swapped behind that
       character and emitted as 'ि'. A trailing 'l' is left as is.

    Returns the normalized string.
    """
    text = preetitxt
    for compound, replacement in preeti_compounds.items():
        text = text.replace(compound, replacement)

    pieces = []
    last = len(text) - 1
    pos = 0
    while pos <= last:
        current = text[pos]
        if current == 'l' and pos < last:
            # Preeti types the short-i sign before its consonant; Unicode
            # stores it after, so emit the following character first.
            pieces.append(text[pos + 1])
            pieces.append('ि')
            pos += 2
        else:
            pieces.append(current)
            pos += 1

    return ''.join(pieces)

def convert_segment(segment):
    """
    Convert a single Preeti segment to Unicode.

    The segment is first passed through normalize_preeti(); each resulting
    character is then mapped as follows:
    - characters for which is_nepali_unicode() is true are kept unchanged;
    - 'a'..'z', 'A'..'Z' and '0'..'9' are looked up by position in
      ``unicodeatoz``, ``unicodeAtoZ`` and ``unicode0to9`` respectively;
    - every other character is looked up in ``symbolsDict`` and kept
      unchanged when it has no entry there.

    Args:
        segment: Raw text of one segment.

    Returns:
        The converted string. Blank segments, and segments whose
        non-whitespace characters are all already Nepali Unicode, are
        returned untouched.
    """
    if not segment.strip():
        return segment

    # If already in Nepali Unicode, return as is
    if all(is_nepali_unicode(char) for char in segment if char.strip()):
        return segment

    converted = []
    for char in normalize_preeti(segment):
        if is_nepali_unicode(char):
            converted.append(char)
        elif 'a' <= char <= 'z':
            converted.append(unicodeatoz[ord(char) - ord('a')])
        elif 'A' <= char <= 'Z':
            converted.append(unicodeAtoZ[ord(char) - ord('A')])
        elif '0' <= char <= '9':
            converted.append(unicode0to9[ord(char) - ord('0')])
        else:
            # symbolsDict also holds non-ASCII Preeti codes ("ˆ", "Î", "å",
            # "÷"), so the lookup must not be restricted to ASCII characters.
            converted.append(symbolsDict.get(char, char))

    return ''.join(converted)

def smart_convert_mixed(text):
    """
    Convert Preeti runs in *text* while leaving whitespace and characters
    already in Nepali Unicode untouched.

    The text is walked left to right. Whitespace and characters for which
    is_nepali_unicode() is true are copied through. Any other character
    starts a segment: get_preeti_segment() collects the run and
    convert_segment() converts it. Segments may start with a non-ASCII
    character, because some Preeti codes (for example "å" or "«") are not
    ASCII.

    Note: ASCII letters are always treated as Preeti codes; plain English
    words are not detected and are converted like any other segment.

    Args:
        text: Input string, possibly mixing Preeti and Unicode text.

    Returns:
        The converted string.
    """
    pieces = []
    idx = 0
    text_len = len(text)

    while idx < text_len:
        char = text[idx]

        # Whitespace and existing Nepali Unicode are preserved verbatim.
        if char.isspace() or is_nepali_unicode(char):
            pieces.append(char)
            idx += 1
            continue

        preeti_segment, new_idx = get_preeti_segment(text, idx)
        if preeti_segment and new_idx > idx:
            pieces.append(convert_segment(preeti_segment))
            idx = new_idx
        else:
            # Defensive fallback: always make progress even if no segment
            # could be extracted at this position.
            pieces.append(char)
            idx += 1

    return "".join(pieces)

def main():
    """
    Render the Streamlit page.

    Shows an input text area and a "Convert" button. When the button is
    pressed with non-blank input, the text is run through
    smart_convert_mixed() and the result is shown in a second text area
    together with a button that downloads it as a UTF-8 text file. Blank
    input produces a warning instead of silently doing nothing.
    """
    st.title("Advanced Mixed Text Converter")
    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")

    # Input area
    input_text = st.text_area("Enter text to convert", height=200)

    if not st.button("Convert"):
        return

    if not input_text or not input_text.strip():
        st.warning("Please enter some text to convert.")
        return

    converted_text = smart_convert_mixed(input_text)

    st.subheader("Converted Text")
    # Streamlit asks for a non-empty label; it is hidden because the
    # subheader above already names this field.
    st.text_area(
        "Converted Text",
        value=converted_text,
        height=200,
        label_visibility="collapsed",
    )

    st.download_button(
        label="Download Converted Text",
        data=converted_text.encode("utf-8"),
        file_name="converted_text.txt",
        mime="text/plain",
    )


if __name__ == "__main__":
    main()