rockerritesh committed on
Commit
d0a3b36
·
verified ·
1 Parent(s): 2d857e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -158
app.py CHANGED
@@ -2,180 +2,151 @@ import streamlit as st
2
  import PyPDF2
3
  import io
4
  import os
5
- import re
6
 
7
- # Existing mapping dictionaries remain the same
8
# Preeti-to-Unicode translation tables.
# unicodeatoz[i] is the Devanagari output for the i-th lowercase ASCII letter
# ('a' is index 0), unicodeAtoZ the same for uppercase letters and
# unicode0to9 for the ASCII digits.
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]

# Output for every character that is not an ASCII letter or digit.
# Characters missing from this table are left unchanged by the converter.
symbolsDict = {
    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
    "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
    ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
    ">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
    "å": "द्व", "÷": "/",
}
19
-
20
# Multi-character Preeti sequences that must be treated as one unit instead
# of being converted character by character. Insertion order matters to
# callers that apply the replacements sequentially: '6«' is listed before
# the bare '«' so the longer sequence is replaced first.
preeti_compounds = {
    'qm': 's|',
    'f]': 'ो',
    'km': 'फ',
    '0f': 'ण',
    'If': 'क्ष',
    'if': 'ष',
    'cf': 'आ',
    '6«': 'ट्र',
    'g]': 'ने',
    '8f': 'डा',
    '«': '्र',
    'j|m': 'क्र',
    ';+': 'सं',
}
36
 
37
def is_nepali_unicode(char):
    """Return True if *char* lies in the Devanagari block U+0900-U+097F.

    The comparison is a plain string comparison, so callers are expected to
    pass a single character.
    """
    return '\u0900' <= char <= '\u097F'
40
-
41
def get_preeti_segment(text, start_idx):
    """Collect one run of Preeti-encoded text beginning at *start_idx*.

    The run grows until whitespace, a Devanagari character, or the end of
    *text* is reached. Keys of ``preeti_compounds`` are consumed whole, even
    when they contain characters that would otherwise end the run.

    Returns:
        A ``(segment, end_idx)`` tuple where ``end_idx`` is the index of the
        first character not included in ``segment``. When *start_idx* is past
        the end of *text* the result is ``("", start_idx)``.
    """
    if start_idx >= len(text):
        return "", start_idx

    # Longest keys first so a long compound is preferred over a shorter one
    # that is its prefix. Sorted once here instead of on every iteration.
    compounds = sorted(preeti_compounds, key=len, reverse=True)

    parts = []
    current_idx = start_idx
    while current_idx < len(text):
        # startswith(prefix, pos) avoids copying the tail of the string.
        compound = next(
            (c for c in compounds if text.startswith(c, current_idx)), None
        )
        if compound is not None:
            parts.append(compound)
            current_idx += len(compound)
            continue

        char = text[current_idx]
        if char.isspace() or is_nepali_unicode(char):
            break
        parts.append(char)
        current_idx += 1

    return "".join(parts), current_idx
70
-
71
def normalize_preeti(preetitxt):
    """Rewrite Preeti text so it can be converted one character at a time.

    Two passes are made:

    1. Every key of ``preeti_compounds`` is replaced by its value, in the
       dictionary's iteration order.
    2. Each ``'l'`` that is followed by another character is moved behind
       that character and emitted as the short-i sign ``'ि'``. An ``'l'``
       that is the last character is left as is.
    """
    for old, new in preeti_compounds.items():
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    idx = 0
    length = len(preetitxt)
    while idx < length:
        if preetitxt[idx] == 'l' and idx + 1 < length:
            # Swap: the following character first, then the vowel sign.
            pieces.append(preetitxt[idx + 1] + 'ि')
            idx += 2
        else:
            pieces.append(preetitxt[idx])
            idx += 1
    return ''.join(pieces)
89
 
90
def convert_segment(segment):
    """Convert a single Preeti segment to Unicode.

    The segment is returned untouched when it is empty, whitespace only, or
    when every non-whitespace character is already Devanagari. Otherwise it
    is normalised with ``normalize_preeti`` and mapped character by
    character: ASCII letters and digits through the three positional tables,
    other ASCII characters through ``symbolsDict`` (unchanged when absent),
    and every non-ASCII character unchanged.
    """
    if not segment.strip():
        return segment

    # If already in Nepali Unicode, return as is.
    if all(is_nepali_unicode(char) for char in segment if char.strip()):
        return segment

    pieces = []
    for char in normalize_preeti(segment):
        # The range checks keep every index inside its table and .get()
        # never raises, so no exception handling is needed here.
        if 'a' <= char <= 'z':
            pieces.append(unicodeatoz[ord(char) - ord('a')])
        elif 'A' <= char <= 'Z':
            pieces.append(unicodeAtoZ[ord(char) - ord('A')])
        elif '0' <= char <= '9':
            pieces.append(unicode0to9[ord(char) - ord('0')])
        elif char.isascii():
            pieces.append(symbolsDict.get(char, char))
        else:
            # Devanagari and any other non-ASCII character passes through.
            pieces.append(char)
    return ''.join(pieces)
121
 
122
def smart_convert_mixed(text):
    """Convert the Preeti parts of *text* and keep everything else.

    The text is scanned left to right. Whitespace and non-ASCII characters
    (which includes Devanagari) are copied unchanged. At any other ASCII
    character a segment is taken with ``get_preeti_segment`` and converted
    with ``convert_segment``.
    """
    pieces = []
    idx = 0
    while idx < len(text):
        char = text[idx]

        if char.isascii() and not char.isspace():
            preeti_segment, new_idx = get_preeti_segment(text, idx)
            if preeti_segment:
                pieces.append(convert_segment(preeti_segment))
                idx = new_idx
                continue

        # Whitespace, non-ASCII characters, or an empty segment: keep as is.
        pieces.append(char)
        idx += 1

    return "".join(pieces)
158
 
159
def main():
    """Render the Streamlit page: a text box, a Convert button and the result.

    Nothing is shown below the button until it is pressed with a non-empty
    input; the converted text is then displayed and offered as a UTF-8
    ``converted_text.txt`` download.
    """
    st.title("Advanced Mixed Text Converter")
    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")

    # Input area
    input_text = st.text_area("Enter text to convert", height=200)

    if not st.button("Convert") or not input_text:
        return

    converted_text = smart_convert_mixed(input_text)

    st.subheader("Converted Text")
    st.text_area("", value=converted_text, height=200)

    st.download_button(
        label="Download Converted Text",
        data=converted_text.encode("utf-8"),
        file_name="converted_text.txt",
        mime="text/plain",
    )


if __name__ == "__main__":
    main()
 
2
  import PyPDF2
3
  import io
4
  import os
 
5
 
6
+
7
# Preeti-to-Unicode translation tables.
# unicodeatoz[i] is the Devanagari output for the i-th lowercase ASCII letter
# ('a' is index 0), unicodeAtoZ the same for uppercase letters and
# unicode0to9 for the ASCII digits.
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]

# Output for every character that is not an ASCII letter or digit.
# Characters missing from this table are left unchanged by the converter.
symbolsDict = {
    "~": "ञ्",
    "`": "ञ",
    "!": "१",
    "@": "२",
    "#": "३",
    "$": "४",
    "%": "५",
    "^": "६",
    "&": "७",
    "*": "८",
    "(": "९",
    ")": "०",
    "-": "(",
    "_": ")",
    "+": "ं",
    "[": "ृ",
    "{": "र्",
    "]": "े",
    "}": "ै",
    "\\": "्",
    "|": "्र",
    ";": "स",
    ":": "स्",
    "'": "ु",
    "\"": "ू",
    ",": ",",
    "<": "?",
    ".": "।",
    ">": "श्र",
    "/": "र",
    "?": "रु",
    "=": ".",
    "ˆ": "फ्",
    "Î": "ङ्ख",
    "å": "द्व",
    "÷": "/",
}
48
 
49
def normalizePreeti(preetitxt):
    """Reorder Preeti text so it can be converted one character at a time.

    Steps:

    1. A fixed list of multi-character Preeti sequences is replaced by a
       single equivalent ('f]' becomes 'ो', 'km' becomes 'फ', ...).
    2. A '{' that follows a character (or a character plus 'f'/'ो') is moved
       in front of that character, because its Unicode output has to come
       first. The exception is a '{' directly after 'f', which stays put.
    3. An 'l' is held back and emitted after the next character.

    Bounds are checked explicitly so that a '{' at the very end of the text
    is reordered like any other. A held-back 'l' is attached to the
    character it precedes even when that character is also followed by '{',
    and is emitted on its own if the text ends before another character.
    """
    replacements = (
        ('qm', 's|'),
        ('f]', 'ो'),
        ('km', 'फ'),
        ('0f', 'ण'),
        ('If', 'क्ष'),
        ('if', 'ष'),
        ('cf', 'आ'),
    )
    for sequence, replacement in replacements:
        preetitxt = preetitxt.replace(sequence, replacement)

    pieces = []
    previoussymbol = ''
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        # '' stands for "past the end" and never equals a real character.
        following = preetitxt[index + 1] if index + 1 < length else ''
        after_following = preetitxt[index + 2] if index + 2 < length else ''

        if after_following == '{' and following in ('f', 'ो'):
            # character + vowel sign + '{'  ->  '{' + character + vowel sign
            pieces.append('{' + character + following + previoussymbol)
            previoussymbol = ''
            index += 3
            continue

        if following == '{' and character != 'f':
            # character + '{'  ->  '{' + character
            pieces.append('{' + character + previoussymbol)
            previoussymbol = ''
            index += 2
            continue

        if character == 'l':
            previoussymbol = 'l'
        else:
            pieces.append(character + previoussymbol)
            previoussymbol = ''
        index += 1

    # Keep a trailing 'l' instead of silently dropping it.
    pieces.append(previoussymbol)
    return ''.join(pieces)
83
 
84
def convert(preeti):
    """Convert Preeti-encoded text to Unicode Devanagari.

    The text is first passed through ``normalizePreeti``. Each resulting
    character is then mapped: lowercase ASCII letters through
    ``unicodeatoz``, uppercase through ``unicodeAtoZ``, ASCII digits through
    ``unicode0to9`` and anything else through ``symbolsDict``. A character
    with no entry in ``symbolsDict`` is copied unchanged.
    """
    pieces = []
    for character in normalizePreeti(preeti):
        code = ord(character)
        if 97 <= code <= 122:       # 'a'..'z'
            pieces.append(unicodeatoz[code - 97])
        elif 65 <= code <= 90:      # 'A'..'Z'
            pieces.append(unicodeAtoZ[code - 65])
        elif 48 <= code <= 57:      # '0'..'9'
            pieces.append(unicode0to9[code - 48])
        else:
            pieces.append(symbolsDict.get(character, character))
    return ''.join(pieces)
101
 
102
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of the PDF at path *pdf_file*.

    The per-page results of ``extract_text()`` are concatenated in page
    order with no separator.
    """
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extract while the file is still open.
        return ''.join(page.extract_text() for page in reader.pages)
109
+
110
def process_file(inputfile):
    """Read the file at path *inputfile* and return its text converted to Unicode.

    A path ending in ``.pdf`` (any letter case) is read with
    ``extract_text_from_pdf``; any other file is read as UTF-8 text.
    """
    ext = os.path.splitext(inputfile)[1].lower()
    if ext == '.pdf':
        preeti = extract_text_from_pdf(inputfile)
    else:
        # Explicit encoding: the platform default differs between systems,
        # and uploaded .txt files are decoded as UTF-8 elsewhere in this app.
        with open(inputfile, "r", encoding="utf-8") as fp:
            preeti = fp.read()
    return convert(preeti)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
def main():
    """Render the Streamlit page: upload a PDF/TXT file and show its conversion.

    Once a file is uploaded its text is extracted (PDF pages via PyPDF2, TXT
    decoded as UTF-8), converted with ``convert`` and shown next to the
    original, together with a UTF-8 ``converted_text.txt`` download.
    """
    st.title("PDF/TXT to Unicode Converter")

    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    if file_extension == ".pdf":
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
        text = "".join(page.extract_text() for page in pdf_reader.pages)
    else:  # .txt file
        text = uploaded_file.getvalue().decode("utf-8")

    converted_text = convert(text)

    # The two text areas need different labels: with identical labels they
    # become identical widgets whenever the converted text equals the
    # original, which Streamlit rejects as a duplicate. The labels are hidden
    # because the subheaders already title each box.
    st.subheader("Original Text")
    st.text_area(
        "Original Text", value=text, height=200, label_visibility="collapsed"
    )

    st.subheader("Converted Text")
    st.text_area(
        "Converted Text",
        value=converted_text,
        height=200,
        label_visibility="collapsed",
    )

    # Create a download button for the converted text
    st.download_button(
        label="Download Converted Text",
        data=converted_text.encode("utf-8"),
        file_name="converted_text.txt",
        mime="text/plain",
    )


if __name__ == "__main__":
    main()