Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,17 +2,8 @@ import streamlit as st
|
|
2 |
import PyPDF2
|
3 |
import io
|
4 |
import os
|
5 |
-
import re
|
6 |
-
import nltk
|
7 |
-
from nltk.corpus import words
|
8 |
|
9 |
-
# Download the words corpus if not already downloaded
|
10 |
-
nltk.download('words')
|
11 |
|
12 |
-
# Create a set of English words for quick lookup
|
13 |
-
english_words_set = set(words.words())
|
14 |
-
|
15 |
-
# Your existing mappings
|
16 |
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
|
17 |
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
|
18 |
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
|
@@ -90,43 +81,22 @@ def normalizePreeti(preetitxt):
|
|
90 |
previoussymbol = ''
|
91 |
return normalized
|
92 |
|
93 |
-
def is_english_word(word):
|
94 |
-
# Remove punctuation and convert to lowercase
|
95 |
-
word_clean = re.sub(r'\W+', '', word).lower()
|
96 |
-
return word_clean in english_words_set
|
97 |
-
|
98 |
def convert(preeti):
|
99 |
converted = ''
|
100 |
normalizedpreeti = normalizePreeti(preeti)
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
# English word, skip conversion
|
110 |
-
converted += token
|
111 |
else:
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
if ord(character) >= 97 and ord(character) <= 122:
|
117 |
-
converted_word += unicodeatoz[ord(character) - 97]
|
118 |
-
elif ord(character) >= 65 and ord(character) <= 90:
|
119 |
-
converted_word += unicodeAtoZ[ord(character) - 65]
|
120 |
-
elif ord(character) >= 48 and ord(character) <= 57:
|
121 |
-
converted_word += unicode0to9[ord(character) - 48]
|
122 |
-
else:
|
123 |
-
converted_word += symbolsDict[character]
|
124 |
-
except KeyError:
|
125 |
-
converted_word += character
|
126 |
-
converted += converted_word
|
127 |
-
else:
|
128 |
-
# Non-word token (punctuation, whitespace)
|
129 |
-
converted += token
|
130 |
return converted
|
131 |
|
132 |
def extract_text_from_pdf(pdf_file):
|
@@ -179,4 +149,4 @@ def main():
|
|
179 |
)
|
180 |
|
181 |
if __name__ == "__main__":
|
182 |
-
main()
|
|
|
2 |
import PyPDF2
|
3 |
import io
|
4 |
import os
|
|
|
|
|
|
|
5 |
|
|
|
|
|
6 |
|
|
|
|
|
|
|
|
|
7 |
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
|
8 |
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
|
9 |
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
|
|
|
81 |
previoussymbol = ''
|
82 |
return normalized
|
83 |
|
|
|
|
|
|
|
|
|
|
|
84 |
def convert(preeti):
|
85 |
converted = ''
|
86 |
normalizedpreeti = normalizePreeti(preeti)
|
87 |
+
for index, character in enumerate(normalizedpreeti):
|
88 |
+
try:
|
89 |
+
if ord(character) >= 97 and ord(character) <= 122:
|
90 |
+
converted += unicodeatoz[ord(character) - 97]
|
91 |
+
elif ord(character) >= 65 and ord(character) <= 90:
|
92 |
+
converted += unicodeAtoZ[ord(character) - 65]
|
93 |
+
elif ord(character) >= 48 and ord(character) <= 57:
|
94 |
+
converted += unicode0to9[ord(character) - 48]
|
|
|
|
|
95 |
else:
|
96 |
+
converted += symbolsDict[character]
|
97 |
+
except KeyError:
|
98 |
+
converted += character
|
99 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
return converted
|
101 |
|
102 |
def extract_text_from_pdf(pdf_file):
|
|
|
149 |
)
|
150 |
|
151 |
if __name__ == "__main__":
|
152 |
+
main()
|