rockerritesh committed on
Commit
5f4cce3
·
verified ·
1 Parent(s): 66882a0

add new feature

Browse files
Files changed (1) hide show
  1. app.py +22 -25
app.py CHANGED
@@ -3,7 +3,7 @@ import PyPDF2
3
  import io
4
  import os
5
 
6
-
7
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
8
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
9
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
@@ -49,28 +49,22 @@ symbolsDict = {
49
  def normalizePreeti(preetitxt):
50
  normalized = ''
51
  previoussymbol = ''
52
- preetitxt = preetitxt.replace('qm', 's|')
53
- preetitxt = preetitxt.replace('f]', 'ो')
54
- preetitxt = preetitxt.replace('km', 'फ')
55
- preetitxt = preetitxt.replace('0f', 'ण')
56
- preetitxt = preetitxt.replace('If', 'क्ष')
57
- preetitxt = preetitxt.replace('if', 'ष')
58
- preetitxt = preetitxt.replace('cf', 'आ')
59
  index = -1
60
  while index + 1 < len(preetitxt):
61
  index += 1
62
  character = preetitxt[index]
63
  try:
64
- if preetitxt[index + 2] == '{':
65
- if preetitxt[index + 1] == 'f' or preetitxt[index + 1] == 'ो':
66
- normalized += '{' + character + preetitxt[index + 1]
67
- index += 2
68
- continue
69
- if preetitxt[index + 1] == '{':
70
- if character != 'f':
71
- normalized += '{' + character
72
- index += 1
73
- continue
74
  except IndexError:
75
  pass
76
  if character == 'l':
@@ -101,10 +95,13 @@ def convert(preeti):
101
 
102
  def extract_text_from_pdf(pdf_file):
103
  text = ''
104
- with open(pdf_file, 'rb') as file:
105
- reader = PyPDF2.PdfReader(file)
106
- for page in reader.pages:
107
- text += page.extract_text()
 
 
 
108
  return text
109
 
110
  def process_file(inputfile):
@@ -112,12 +109,12 @@ def process_file(inputfile):
112
  if ext == '.pdf':
113
  preeti = extract_text_from_pdf(inputfile)
114
  else:
115
- with open(inputfile, "r") as fp:
116
  preeti = fp.read()
117
  return convert(preeti)
118
 
119
  def main():
120
- st.title("PDF/TXT to Unicode Converter")
121
 
122
  uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
123
 
@@ -149,4 +146,4 @@ def main():
149
  )
150
 
151
  if __name__ == "__main__":
152
- main()
 
3
  import io
4
  import os
5
 
6
+ # Updated Unicode mappings
7
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
8
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
9
  unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
 
49
  def normalizePreeti(preetitxt):
50
  normalized = ''
51
  previoussymbol = ''
52
+ # Additional normalization for complex combinations
53
+ preetitxt = preetitxt.replace('qm', 's|').replace('f]', 'ो').replace('km', 'फ').replace('0f', 'ण').replace('If', 'क्ष').replace('if', 'ष').replace('cf', 'आ')
54
+
 
 
 
 
55
  index = -1
56
  while index + 1 < len(preetitxt):
57
  index += 1
58
  character = preetitxt[index]
59
  try:
60
+ if preetitxt[index + 2] == '{' and (preetitxt[index + 1] == 'f' or preetitxt[index + 1] == 'ो'):
61
+ normalized += '{' + character + preetitxt[index + 1]
62
+ index += 2
63
+ continue
64
+ if preetitxt[index + 1] == '{' and character != 'f':
65
+ normalized += '{' + character
66
+ index += 1
67
+ continue
 
 
68
  except IndexError:
69
  pass
70
  if character == 'l':
 
95
 
96
  def extract_text_from_pdf(pdf_file):
97
  text = ''
98
+ try:
99
+ with open(pdf_file, 'rb') as file:
100
+ reader = PyPDF2.PdfReader(file)
101
+ for page in reader.pages:
102
+ text += page.extract_text()
103
+ except Exception as e:
104
+ text = f"Error extracting text: {e}"
105
  return text
106
 
107
  def process_file(inputfile):
 
109
  if ext == '.pdf':
110
  preeti = extract_text_from_pdf(inputfile)
111
  else:
112
+ with open(inputfile, "r", encoding="utf-8") as fp:
113
  preeti = fp.read()
114
  return convert(preeti)
115
 
116
  def main():
117
+ st.title("Preeti to Unicode Converter")
118
 
119
  uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
120
 
 
146
  )
147
 
148
  if __name__ == "__main__":
149
+ main()