File size: 7,205 Bytes
76f42d9
66882a0
76f42d9
 
93294e9
66882a0
93294e9
76f42d9
 
 
 
93294e9
 
 
 
 
 
76f42d9
 
93294e9
 
 
 
 
 
 
 
 
 
 
 
76f42d9
93294e9
ac96a59
 
93294e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac96a59
 
 
 
93294e9
ac96a59
93294e9
ac96a59
 
 
 
93294e9
 
ac96a59
 
 
 
 
 
93294e9
ac96a59
 
 
 
 
 
93294e9
ac96a59
76f42d9
93294e9
 
76f42d9
 
93294e9
 
76f42d9
93294e9
 
 
 
 
 
76f42d9
93294e9
 
76f42d9
93294e9
76f42d9
 
93294e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76f42d9
93294e9
76f42d9
93294e9
 
 
 
 
 
 
 
66882a0
 
76f42d9
93294e9
 
76f42d9
 
 
 
93294e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76f42d9
 
ac96a59
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import streamlit as st
import PyPDF2
import io
import os
import re

# Preeti keystroke -> Devanagari lookup tables used by convert_preeti_segment.
#
# The three lists are positional: the replacement for a key is found at
# ord(key) - ord(first_key_of_range), so the ORDER of entries is significant.

# Lowercase keys 'a'..'z' (26 entries, index 0 == 'a').
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
# Uppercase keys 'A'..'Z' (26 entries, index 0 == 'A').
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
# Digit keys '0'..'9' (10 entries, index 0 == '0').
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# Every other key (punctuation and a few extended characters). Characters
# missing from this dict are passed through unchanged by the converter.
symbolsDict = {
    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
    "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
    ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
    ">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
    "å": "द्व", "÷": "/"
}

def is_preeti_text(text):
    """
    Heuristically decide whether ``text`` looks like Preeti-encoded Nepali.

    The text is searched for a handful of two-key sequences and for a run of
    two or more ASCII letters immediately followed by one of ``\\ | [ ] { }``.
    Returns True as soon as any of those is found, otherwise False.
    """
    # Two-key sequences that are frequent in Preeti-typed text.
    digraph_markers = (r'cf', r'qm', r'If', r'0f', r'km', r'f]')
    # Letters directly followed by a key used for a sign or conjunct.
    letters_then_sign = r'[a-zA-Z]{2,}[\\|\[\]{}]'

    for candidate in (*digraph_markers, letters_then_sign):
        if re.search(candidate, text):
            return True
    return False

def normalizePreeti(preetitxt):
    """Rewrite Preeti keystrokes into the order needed for Unicode output.

    Three things happen, in this order:

    1. A few multi-key sequences are replaced up front (e.g. ``cf`` -> ``आ``).
    2. The key ``{`` (mapped to ``र्`` by the symbol table) is typed AFTER the
       consonant it belongs to, optionally with ``f``/``ो`` in between; it is
       moved in front of that consonant.
    3. The key ``l`` (mapped to ``ि``) is typed BEFORE its consonant; it is
       held back and emitted right after the next character.

    A held-back ``l`` is never discarded: it is emitted after the consonant
    even when that consonant is also reordered because of a following ``{``,
    and it is appended at the end if the input stops before any consonant
    follows it.

    Args:
        preetitxt: Preeti-encoded text (str).

    Returns:
        The reordered text, still in Preeti key codes apart from the
        sequences already replaced in step 1.
    """
    # Multi-key sequences that must be resolved before per-character work.
    # Applied in insertion order.
    replacements = {
        'qm': 's|',
        'f]': 'ो',
        'km': 'फ',
        '0f': 'ण',
        'If': 'क्ष',
        'if': 'ष',
        'cf': 'आ'
    }
    for old, new in replacements.items():
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    pending = ''  # an 'l' waiting for the character it attaches to
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        following = preetitxt[index + 1] if index + 1 < length else ''

        # consonant + (f | ो) + '{'  ->  '{' + consonant + vowel sign
        if (index + 2 < length and preetitxt[index + 2] == '{'
                and following in ('f', 'ो')):
            pieces.append('{' + character + pending + following)
            pending = ''
            index += 3
            continue

        # consonant + '{'  ->  '{' + consonant  (a bare 'f' is not a consonant)
        if following == '{' and character != 'f':
            pieces.append('{' + character + pending)
            pending = ''
            index += 2
            continue

        if character == 'l':
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    # Keep a trailing 'l' instead of silently losing it.
    pieces.append(pending)
    return ''.join(pieces)

def convert_preeti_segment(preeti):
    """Translate one Preeti-encoded segment into Devanagari Unicode.

    The segment is first reordered by ``normalizePreeti``; each resulting
    character is then looked up in the table for its range (a-z, A-Z, 0-9)
    or in ``symbolsDict``. Characters with no mapping are copied unchanged.
    """
    glyphs = []
    for ch in normalizePreeti(preeti):
        try:
            if 'a' <= ch <= 'z':
                glyph = unicodeatoz[ord(ch) - ord('a')]
            elif 'A' <= ch <= 'Z':
                glyph = unicodeAtoZ[ord(ch) - ord('A')]
            elif '0' <= ch <= '9':
                glyph = unicode0to9[ord(ch) - ord('0')]
            else:
                glyph = symbolsDict.get(ch, ch)
        except (KeyError, IndexError):
            # Fall back to the raw character if a table lookup fails.
            glyph = ch
        glyphs.append(glyph)

    return ''.join(glyphs)

def smart_convert(text):
    """
    Convert Preeti text to Unicode while leaving English-looking spans alone.

    The text is scanned for spans that should be preserved verbatim (e-mail
    addresses, URLs, dates, runs of three or more ASCII letters, and numbers
    followed by letters). The text between those spans is converted with
    ``convert_preeti_segment`` when ``is_preeti_text`` accepts it, and copied
    unchanged otherwise. Whitespace-only gaps are always copied unchanged, so
    spaces and line breaks between preserved words survive.

    NOTE: Preeti is typed with ASCII keys, so a Preeti word that contains
    three or more consecutive letters between word boundaries is matched by
    the "English word" pattern and is therefore left unconverted.

    Args:
        text: Input text (str), possibly mixing Preeti and English.

    Returns:
        The text with convertible gaps replaced by their Unicode form.
    """
    # Patterns to identify spans that must be preserved as-is
    patterns = [
        # Email addresses
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
        # URLs
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Date patterns
        r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b',
        # Common English words (3 or more characters)
        r'\b[A-Za-z]{3,}\b',
        # Numbers with units
        r'\b\d+\s*[A-Za-z]+\b',
    ]
    combined_pattern = '|'.join(patterns)

    def convert_gap(gap):
        # Whitespace-only gaps carry the document's spacing: keep them verbatim.
        if gap.strip() and is_preeti_text(gap):
            return convert_preeti_segment(gap)
        return gap

    pieces = []
    last_end = 0

    for match in re.finditer(combined_pattern, text):
        start, end = match.span()

        # Text between the previous preserved span and this one
        if start > last_end:
            pieces.append(convert_gap(text[last_end:start]))

        # The matched span itself is preserved untouched
        pieces.append(match.group())
        last_end = end

    # Text after the last preserved span
    if last_end < len(text):
        pieces.append(convert_gap(text[last_end:]))

    return ''.join(pieces)

def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of the PDF at ``pdf_file``.

    ``pdf_file`` is handed to ``open`` in binary mode. Pages for which the
    reader yields no text contribute an empty string. On any failure the
    error is shown through Streamlit and an empty string is returned.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages
            return ''.join(page.extract_text() or '' for page in pages)
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ''

def main():
    """Streamlit entry point: upload a PDF/TXT file, convert it, show and offer the result.

    The uploaded file is read fully into memory. PDFs are parsed with PyPDF2;
    TXT files are decoded as UTF-8 (a leading byte-order mark is skipped).
    The original and converted texts are shown side by side and the converted
    text can be downloaded as UTF-8. Any error while reading or converting is
    reported in the page instead of raising.
    """
    st.title("Smart Preeti to Unicode Converter")
    st.write("This converter preserves English text while converting Preeti to Unicode")

    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])

    if uploaded_file is None:
        return

    try:
        if uploaded_file.name.lower().endswith('.pdf'):
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
            text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
        else:  # .txt file
            # utf-8-sig decodes plain UTF-8 too, but drops a leading BOM so it
            # does not end up inside the converted output.
            text = uploaded_file.getvalue().decode("utf-8-sig")

        converted_text = smart_convert(text)

        col1, col2 = st.columns(2)

        # Both text areas share the same (empty) label; explicit keys keep
        # their widget identities distinct even when the converted text is
        # identical to the original.
        with col1:
            st.subheader("Original Text")
            st.text_area("", value=text, height=300, key="original_text")

        with col2:
            st.subheader("Converted Text")
            st.text_area("", value=converted_text, height=300, key="converted_text")

        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain"
        )

    except Exception as e:
        # UI boundary: surface the problem to the user rather than crash the app.
        st.error(f"An error occurred: {str(e)}")

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()