rockerritesh committed on
Commit
49bb6dc
·
verified ·
1 Parent(s): 5bddd29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -68
app.py CHANGED
@@ -7,54 +7,16 @@ unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्",
7
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
8
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
9
  symbolsDict = {
10
- "~": "ञ्",
11
- "`": "",
12
- "!": "",
13
- "@": "",
14
- "#": "३",
15
- "$": "४",
16
- "%": "५",
17
- "^": "६",
18
- "&": "७",
19
- "*": "८",
20
- "(": "९",
21
- ")": "०",
22
- "-": "(",
23
- "_": ")",
24
- "+": "ं",
25
- "[": "ृ",
26
- "{": "र्",
27
- "]": "े",
28
- "}": "ै",
29
- "\\": "्",
30
- "|": "्र",
31
- ";": "स",
32
- ":": "स्",
33
- "'": "ु",
34
- "\"": "ू",
35
- ",": ",",
36
- "<": "?",
37
- ".": "।",
38
- ">": "श्र",
39
- "/": "र",
40
- "?": "रु",
41
- "=": ".",
42
- "ˆ": "फ्",
43
- "Î": "ङ्ख",
44
- "å": "द्व",
45
- "÷": "/"
46
  }
47
 
48
  def normalizePreeti(preetitxt):
49
  normalized = ''
50
  previoussymbol = ''
51
- preetitxt = preetitxt.replace('qm', 's|')
52
- preetitxt = preetitxt.replace('f]', 'ो')
53
- preetitxt = preetitxt.replace('km', 'फ')
54
- preetitxt = preetitxt.replace('0f', 'ण')
55
- preetitxt = preetitxt.replace('If', 'क्ष')
56
- preetitxt = preetitxt.replace('if', 'ष')
57
- preetitxt = preetitxt.replace('cf', 'आ')
58
  index = -1
59
  while index + 1 < len(preetitxt):
60
  index += 1
@@ -83,19 +45,18 @@ def normalizePreeti(preetitxt):
83
  def convert(preeti):
84
  converted = ''
85
  normalizedpreeti = normalizePreeti(preeti)
86
- for index, character in enumerate(normalizedpreeti):
87
  try:
88
- if ord(character) >= 97 and ord(character) <= 122:
89
  converted += unicodeatoz[ord(character) - 97]
90
- elif ord(character) >= 65 and ord(character) <= 90:
91
  converted += unicodeAtoZ[ord(character) - 65]
92
- elif ord(character) >= 48 and ord(character) <= 57:
93
  converted += unicode0to9[ord(character) - 48]
94
  else:
95
  converted += symbolsDict[character]
96
  except KeyError:
97
  converted += character
98
-
99
  return converted
100
 
101
  def extract_text_from_pdf(pdf_file):
@@ -105,16 +66,12 @@ def extract_text_from_pdf(pdf_file):
105
  extracted_text = page.extract_text()
106
  if extracted_text:
107
  text += extracted_text
108
-
109
  return handle_vertical_text(text)
110
 
111
  def handle_vertical_text(text):
112
- # If the text is vertical, it's likely arranged with one character per line.
113
- # We'll attempt to reformat the text by concatenating characters that are stacked vertically.
114
  lines = text.split('\n')
115
  vertical_lines = []
116
  horizontal_line = ''
117
-
118
  for line in lines:
119
  if len(line) == 1: # Possible vertical arrangement (single character per line)
120
  horizontal_line += line
@@ -123,23 +80,12 @@ def handle_vertical_text(text):
123
  vertical_lines.append(horizontal_line)
124
  horizontal_line = ''
125
  vertical_lines.append(line) # Add the full line if it's not vertical.
126
-
127
  if horizontal_line:
128
  vertical_lines.append(horizontal_line)
129
-
130
  return ' '.join(vertical_lines)
131
 
132
- def process_file(inputfile):
133
- ext = os.path.splitext(inputfile)[1].lower()
134
- if ext == '.pdf':
135
- preeti = extract_text_from_pdf(inputfile)
136
- else:
137
- with open(inputfile, "r") as fp:
138
- preeti = fp.read()
139
- return convert(preeti)
140
-
141
  def main():
142
- st.title("PDF/TXT to Unicode Converter(Nepali RAG)")
143
 
144
  uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
145
 
@@ -148,10 +94,8 @@ def main():
148
 
149
  if file_extension == ".pdf":
150
  text = extract_text_from_pdf(io.BytesIO(uploaded_file.read()))
151
-
152
  else: # .txt file
153
- with open(inputfile, "r") as fp:
154
- text = fp.read()
155
 
156
  converted_text = convert(text)
157
 
@@ -173,4 +117,4 @@ def main():
173
  st.markdown("Made with ❤️ by Sumit Yadav(https://sumityadav.com.np)")
174
 
175
  if __name__ == "__main__":
176
- main()
 
7
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
8
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
9
  symbolsDict = {
10
+ "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५", "^": "६", "&": "७", "*": "८",
11
+ "(": "", ")": "०", "-": "(", "_": ")", "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्",
12
+ "|": "्र", ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।", ">": "श्र", "/": "र",
13
+ "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  }
15
 
16
  def normalizePreeti(preetitxt):
17
  normalized = ''
18
  previoussymbol = ''
19
+ preetitxt = preetitxt.replace('qm', 's|').replace('f]', 'ो').replace('km', 'फ').replace('0f', 'ण').replace('If', 'क्ष').replace('if', 'ष').replace('cf', 'आ')
 
 
 
 
 
 
20
  index = -1
21
  while index + 1 < len(preetitxt):
22
  index += 1
 
45
  def convert(preeti):
46
  converted = ''
47
  normalizedpreeti = normalizePreeti(preeti)
48
+ for character in normalizedpreeti:
49
  try:
50
+ if 97 <= ord(character) <= 122:
51
  converted += unicodeatoz[ord(character) - 97]
52
+ elif 65 <= ord(character) <= 90:
53
  converted += unicodeAtoZ[ord(character) - 65]
54
+ elif 48 <= ord(character) <= 57:
55
  converted += unicode0to9[ord(character) - 48]
56
  else:
57
  converted += symbolsDict[character]
58
  except KeyError:
59
  converted += character
 
60
  return converted
61
 
62
  def extract_text_from_pdf(pdf_file):
 
66
  extracted_text = page.extract_text()
67
  if extracted_text:
68
  text += extracted_text
 
69
  return handle_vertical_text(text)
70
 
71
  def handle_vertical_text(text):
 
 
72
  lines = text.split('\n')
73
  vertical_lines = []
74
  horizontal_line = ''
 
75
  for line in lines:
76
  if len(line) == 1: # Possible vertical arrangement (single character per line)
77
  horizontal_line += line
 
80
  vertical_lines.append(horizontal_line)
81
  horizontal_line = ''
82
  vertical_lines.append(line) # Add the full line if it's not vertical.
 
83
  if horizontal_line:
84
  vertical_lines.append(horizontal_line)
 
85
  return ' '.join(vertical_lines)
86
 
 
 
 
 
 
 
 
 
 
87
  def main():
88
+ st.title("PDF/TXT to Unicode Converter (Nepali RAG)")
89
 
90
  uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
91
 
 
94
 
95
  if file_extension == ".pdf":
96
  text = extract_text_from_pdf(io.BytesIO(uploaded_file.read()))
 
97
  else: # .txt file
98
+ text = uploaded_file.getvalue().decode("utf-8")
 
99
 
100
  converted_text = convert(text)
101
 
 
117
  st.markdown("Made with ❤️ by Sumit Yadav(https://sumityadav.com.np)")
118
 
119
  if __name__ == "__main__":
120
+ main()