rockerritesh committed on
Commit
2d857e8
·
verified ·
1 Parent(s): 93294e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -154
app.py CHANGED
@@ -4,7 +4,7 @@ import io
4
  import os
5
  import re
6
 
7
- # Existing mapping dictionaries
8
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
9
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
10
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
@@ -17,182 +17,159 @@ symbolsDict = {
17
  "å": "द्व", "÷": "/"
18
  }
19
 
20
def is_preeti_text(text):
    """
    Heuristically decide whether ``text`` looks like Preeti-encoded Nepali.

    Returns True as soon as one of the known Preeti patterns is found
    anywhere in ``text``; returns False when none of them match.
    """
    # Two-key sequences treated as typical of Preeti input.
    digraphs = (r'cf', r'qm', r'If', r'0f', r'km', r'f]')
    # A run of two or more letters immediately followed by \ | [ ] { or }.
    letters_then_symbol = r'[a-zA-Z]{2,}[\\|\[\]{}]'

    for candidate in digraphs + (letters_then_symbol,):
        if re.search(candidate, text) is not None:
            return True
    return False
31
-
32
def normalizePreeti(preetitxt):
    """Reorder a Preeti string so it can be mapped symbol by symbol.

    Steps performed:

    1. A fixed set of two-key sequences is substituted first.
    2. ``{`` is moved in front of the symbol it follows (or in front of
       ``symbol + f`` / ``symbol + ो``).
    3. ``l`` is moved behind the symbol that follows it.

    A pending ``l`` is never discarded: it is attached to the symbol that
    followed it even when that symbol takes the ``{`` reordering path, and
    an ``l`` that ends the input is kept at the end of the result.

    Args:
        preetitxt: Preeti-encoded text.

    Returns:
        The reordered text (still Preeti keys, plus the Devanagari produced
        by the substitutions in step 1).
    """
    # Common Preeti substitutions
    replacements = {
        'qm': 's|',
        'f]': 'ो',
        'km': 'फ',
        '0f': 'ण',
        'If': 'क्ष',
        'if': 'ष',
        'cf': 'आ'
    }
    for old, new in replacements.items():
        preetitxt = preetitxt.replace(old, new)

    length = len(preetitxt)
    parts = []
    pending = ''  # an 'l' waiting for the symbol it has to follow
    index = 0

    while index < length:
        character = preetitxt[index]
        following = preetitxt[index + 1] if index + 1 < length else ''

        # symbol + (f | ो) + '{'  ->  '{' + symbol + (f | ो)
        if (index + 2 < length and preetitxt[index + 2] == '{'
                and following in ('f', 'ो')):
            parts.append('{' + character + following + pending)
            pending = ''
            index += 3
            continue

        # symbol + '{'  ->  '{' + symbol   (a bare 'f' is left where it is)
        if following == '{' and character != 'f':
            parts.append('{' + character + pending)
            pending = ''
            index += 2
            continue

        if character == 'l':
            pending = 'l'
        else:
            parts.append(character + pending)
            pending = ''
        index += 1

    # Keep a trailing 'l' instead of silently dropping it.
    parts.append(pending)
    return ''.join(parts)
79
 
80
def convert_preeti_segment(preeti):
    """Map one Preeti segment to Unicode.

    The segment is first passed through ``normalizePreeti``; each resulting
    symbol is then looked up in the a-z, A-Z or 0-9 table by offset, or in
    ``symbolsDict`` otherwise. Symbols with no mapping are copied unchanged.
    """
    pieces = []
    for symbol in normalizePreeti(preeti):
        try:
            if 'a' <= symbol <= 'z':
                mapped = unicodeatoz[ord(symbol) - ord('a')]
            elif 'A' <= symbol <= 'Z':
                mapped = unicodeAtoZ[ord(symbol) - ord('A')]
            elif '0' <= symbol <= '9':
                mapped = unicode0to9[ord(symbol) - ord('0')]
            else:
                mapped = symbolsDict.get(symbol, symbol)
        except (KeyError, IndexError):
            # Fall back to the raw symbol if a table lookup fails.
            mapped = symbol
        pieces.append(mapped)

    return ''.join(pieces)
99
 
100
def smart_convert(text):
    """
    Convert Preeti text to Unicode while leaving English-looking spans alone.

    Spans matching one of the patterns below (e-mail addresses, URLs, dates,
    words of three or more Latin letters, numbers followed by letters) are
    copied verbatim. Each stretch of text between two such spans is converted
    with ``convert_preeti_segment`` when ``is_preeti_text`` accepts it, and is
    copied verbatim otherwise. Whitespace-only stretches are always copied
    verbatim so that spacing and line breaks between preserved spans survive.

    Args:
        text: Input text, possibly mixing Preeti and English.

    Returns:
        The converted text.
    """
    # Patterns to identify spans that must be preserved
    patterns = [
        # Email addresses
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
        # URLs
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Date patterns
        r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b',
        # Common English words (3 or more characters)
        r'\b[A-Za-z]{3,}\b',
        # Numbers with units
        r'\b\d+\s*[A-Za-z]+\b',
    ]
    combined_pattern = '|'.join(patterns)

    def convert_gap(gap):
        # Whitespace-only gaps are kept as they are; dropping them would glue
        # neighbouring preserved spans together.
        if gap.strip() and is_preeti_text(gap):
            return convert_preeti_segment(gap)
        return gap

    pieces = []
    last_end = 0
    for match in re.finditer(combined_pattern, text):
        start, end = match.span()
        if start > last_end:
            pieces.append(convert_gap(text[last_end:start]))
        # The matched span itself is preserved untouched.
        pieces.append(match.group())
        last_end = end

    # Text after the final match (or the whole text when nothing matched).
    if last_end < len(text):
        pieces.append(convert_gap(text[last_end:]))

    return ''.join(pieces)
154
 
155
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of the PDF at ``pdf_file``.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. If anything goes wrong while opening or reading the file, the
    error is shown through ``st.error`` and an empty string is returned.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages
            return ''.join(page.extract_text() or '' for page in pages)
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ''
167
-
168
  def main():
169
- st.title("Smart Preeti to Unicode Converter")
170
- st.write("This converter preserves English text while converting Preeti to Unicode")
171
-
172
- uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
173
-
174
- if uploaded_file is not None:
175
- try:
176
- if uploaded_file.name.lower().endswith('.pdf'):
177
- pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
178
- text = ""
179
- for page in pdf_reader.pages:
180
- text += page.extract_text() or ''
181
- else: # .txt file
182
- text = uploaded_file.getvalue().decode("utf-8")
183
 
184
- converted_text = smart_convert(text)
185
-
186
- col1, col2 = st.columns(2)
 
 
 
 
 
 
187
 
188
- with col1:
189
- st.subheader("Original Text")
190
- st.text_area("", value=text, height=300)
191
-
192
- with col2:
193
- st.subheader("Converted Text")
194
- st.text_area("", value=converted_text, height=300)
195
-
196
  st.download_button(
197
  label="Download Converted Text",
198
  data=converted_text.encode("utf-8"),
@@ -200,8 +177,5 @@ def main():
200
  mime="text/plain"
201
  )
202
 
203
- except Exception as e:
204
- st.error(f"An error occurred: {str(e)}")
205
-
206
  if __name__ == "__main__":
207
  main()
 
4
  import os
5
  import re
6
 
7
+ # Preeti-to-Unicode lookup tables, indexed by letter/digit offset
8
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
9
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
10
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
 
17
  "å": "द्व", "÷": "/"
18
  }
19
 
20
# Multi-key Preeti sequences that are handled as single units, paired with
# their replacement. The pairs are listed in the order in which they are
# stored, so iteration over the mapping follows this order.
preeti_compounds = dict((
    ('qm', 's|'),
    ('f]', 'ो'),
    ('km', 'फ'),
    ('0f', 'ण'),
    ('If', 'क्ष'),
    ('if', 'ष'),
    ('cf', 'आ'),
    ('6«', 'ट्र'),
    ('g]', 'ने'),
    ('8f', 'डा'),
    ('«', '्र'),
    ('j|m', 'क्र'),
    (';+', 'सं'),
))
36
+
37
def is_nepali_unicode(char):
    """Report whether ``char`` compares within U+0900..U+097F (inclusive)."""
    block_start, block_end = '\u0900', '\u097F'
    below_block = char < block_start
    above_block = char > block_end
    return not (below_block or above_block)
40
+
41
def get_preeti_segment(text, start_idx):
    """
    Extract the run of non-space, non-Devanagari text starting at ``start_idx``.

    Scanning stops at the first whitespace character or character in the
    U+0900..U+097F range. Keys of ``preeti_compounds`` are consumed as whole
    units, longest key first, so a compound is never split by the scan.

    Args:
        text: The full input string.
        start_idx: Index at which the segment starts.

    Returns:
        A ``(segment, end_idx)`` tuple where ``segment`` is
        ``text[start_idx:end_idx]`` and ``end_idx`` is the index of the first
        character after the segment. ``("", start_idx)`` is returned when
        ``start_idx`` is at or beyond the end of ``text``.
    """
    text_len = len(text)
    if start_idx >= text_len:
        return "", start_idx

    # Sort once per call (not once per character); longest first so that a
    # short key cannot shadow a longer one that starts with it.
    compounds = sorted(preeti_compounds, key=len, reverse=True)

    current_idx = start_idx
    while current_idx < text_len:
        for compound in compounds:
            # startswith with an offset avoids copying the tail of the text.
            if text.startswith(compound, current_idx):
                current_idx += len(compound)
                break
        else:
            char = text[current_idx]
            if char.isspace() or is_nepali_unicode(char):
                break
            current_idx += 1

    # Everything consumed is contiguous, so the segment is a plain slice.
    return text[start_idx:current_idx], current_idx
70
+
71
def normalize_preeti(preetitxt):
    """Prepare Preeti text for symbol-by-symbol mapping.

    Every key of ``preeti_compounds`` is replaced by its value, in the
    mapping's iteration order. Afterwards each ``l`` that is not the last
    character is removed and ``ि`` is written after the character that
    followed it; an ``l`` in final position is left unchanged.
    """
    for preeti_seq, replacement in preeti_compounds.items():
        preetitxt = preetitxt.replace(preeti_seq, replacement)

    pieces = []
    pos = 0
    last = len(preetitxt) - 1
    while pos <= last:
        current = preetitxt[pos]
        if current == 'l' and pos < last:
            # Swap: emit the following character first, then the matra.
            pieces.append(preetitxt[pos + 1])
            pieces.append('ि')
            pos += 2
            continue
        pieces.append(current)
        pos += 1

    return ''.join(pieces)
89
 
90
def convert_segment(segment):
    """Convert a single Preeti segment to Unicode.

    Whitespace-only segments, and segments whose non-whitespace characters
    are all within U+0900..U+097F, are returned unchanged. Otherwise the
    segment is normalised with ``normalize_preeti`` and mapped character by
    character:

    * characters in U+0900..U+097F are kept,
    * ``a-z``, ``A-Z`` and ``0-9`` are looked up by offset in the three
      lookup tables,
    * every other character, ASCII or not, is looked up in ``symbolsDict``
      and kept as is when it has no entry. Non-ASCII keys of ``symbolsDict``
      such as "å" and "÷" are therefore converted as well.

    Args:
        segment: Text without embedded structure assumptions; may be empty.

    Returns:
        The converted string.
    """
    if not segment.strip():
        return segment

    # If already in Nepali Unicode, return as is
    if all(is_nepali_unicode(char) for char in segment if char.strip()):
        return segment

    pieces = []
    for char in normalize_preeti(segment):
        if is_nepali_unicode(char):
            pieces.append(char)
        elif 'a' <= char <= 'z':
            pieces.append(unicodeatoz[ord(char) - ord('a')])
        elif 'A' <= char <= 'Z':
            pieces.append(unicodeAtoZ[ord(char) - ord('A')])
        elif '0' <= char <= '9':
            pieces.append(unicode0to9[ord(char) - ord('0')])
        else:
            # No ASCII restriction here: the symbol table also has keys
            # outside the ASCII range.
            pieces.append(symbolsDict.get(char, char))

    return ''.join(pieces)
121
 
122
def smart_convert_mixed(text):
    """
    Convert the Preeti parts of ``text`` and keep existing Devanagari as is.

    The text is scanned left to right. Whitespace and characters within
    U+0900..U+097F are copied unchanged. Any other character starts a segment
    that is extracted with ``get_preeti_segment`` and converted with
    ``convert_segment``. This includes non-ASCII characters, so a segment that
    begins with a non-ASCII Preeti symbol is handed to the converter instead
    of being copied verbatim.

    Note that no attempt is made here to recognise English: every run of
    Latin letters is passed to ``convert_segment``.

    Args:
        text: Input text, possibly mixing Preeti and Devanagari.

    Returns:
        The converted text.
    """
    pieces = []
    idx = 0
    text_len = len(text)

    while idx < text_len:
        char = text[idx]

        # Whitespace and existing Devanagari are preserved verbatim.
        if char.isspace() or is_nepali_unicode(char):
            pieces.append(char)
            idx += 1
            continue

        preeti_segment, new_idx = get_preeti_segment(text, idx)
        if preeti_segment and new_idx > idx:
            pieces.append(convert_segment(preeti_segment))
            idx = new_idx
        else:
            # Defensive fallback so the scan always advances.
            pieces.append(char)
            idx += 1

    return ''.join(pieces)
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  def main():
160
+ st.title("Advanced Mixed Text Converter")
161
+ st.write("Converts Preeti text while preserving existing Nepali Unicode and English")
 
 
 
 
 
 
 
 
 
 
 
 
162
 
163
+ # Input area
164
+ input_text = st.text_area("Enter text to convert", height=200)
165
+
166
+ if st.button("Convert"):
167
+ if input_text:
168
+ converted_text = smart_convert_mixed(input_text)
169
+
170
+ st.subheader("Converted Text")
171
+ st.text_area("", value=converted_text, height=200)
172
 
 
 
 
 
 
 
 
 
173
  st.download_button(
174
  label="Download Converted Text",
175
  data=converted_text.encode("utf-8"),
 
177
  mime="text/plain"
178
  )
179
 
 
 
 
180
  if __name__ == "__main__":
181
  main()