# Spaces: Sleeping
# (Status banner captured from the hosting page; it is not part of the program.)
import streamlit as st | |
import io | |
import os | |
import pdfplumber | |
# Lookup tables used by convert(): each Preeti keystroke maps to a Unicode
# Devanagari string. The three lists are indexed by the offset of the character
# from 'a', 'A' and '0' respectively; everything else goes through symbolsDict.

# 'a' .. 'z'
unicodeatoz = [
    "ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्",
    "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च",
    "क", "त", "ग", "ख", "ध", "ह", "थ", "श",
]

# 'A' .. 'Z'
unicodeAtoZ = [
    "ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्",
    "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्",
    "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्",
]

# '0' .. '9'
unicode0to9 = [
    "ण्", "ज्ञ", "द्द", "घ", "द्ध",
    "छ", "ट", "ठ", "ड", "ढ",
]

# Punctuation and the few non-ASCII keys. Characters missing from this table
# are passed through unchanged by convert().
symbolsDict = {
    "~": "ञ्", "`": "ञ",
    "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०",
    "-": "(", "_": ")",
    "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै",
    "\\": "्", "|": "्र",
    ";": "स", ":": "स्",
    "'": "ु", "\"": "ू",
    ",": ",", "<": "?", ".": "।", ">": "श्र", "/": "र",
    "?": "रु", "=": ".",
    "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/",
}
def normalizePreeti(preetitxt: str) -> str:
    """Reorder Preeti keystrokes so they can be mapped one character at a time.

    Three things happen, in this order:

    1. A fixed list of multi-key sequences is collapsed (for example
       ``'qm'`` becomes ``'s|'`` and ``'f]'`` becomes ``'ो'``).
    2. ``'{'`` (mapped to ``'र्'``), which Preeti types *after* a character,
       is moved in front of that character. If the character is followed by
       ``'f'`` or ``'ो'`` and then ``'{'``, the ``'{'`` is moved in front of
       both. An ``'f'`` directly before ``'{'`` is never treated as the
       character to move.
    3. ``'l'`` (mapped to ``'ि'``), which Preeti types *before* a character,
       is held back and emitted right after the next character.

    Args:
        preetitxt: Text typed with the Preeti key layout.

    Returns:
        The reordered text, still in Preeti keystrokes (plus the Unicode
        strings inserted by step 1).

    Note:
        An ``'l'`` that is the very last character has nothing to attach to
        and is dropped.
    """
    # Order matters: later replacements see the output of earlier ones.
    preetitxt = (
        preetitxt.replace('qm', 's|')
        .replace('f]', 'ो')
        .replace('km', 'फ')
        .replace('0f', 'ण')
        .replace('If', 'क्ष')
        .replace('if', 'ष')
        .replace('cf', 'आ')
    )

    pieces = []
    pending_sign = ''  # an 'l' waiting for the character it belongs to
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        # Explicit bounds checks: an out-of-range look-ahead at index + 2 must
        # not prevent the index + 1 check from running near the end of the text.
        following = preetitxt[index + 1] if index + 1 < length else ''
        after_following = preetitxt[index + 2] if index + 2 < length else ''

        if after_following == '{' and following in ('f', 'ो'):
            # character + vowel sign + '{'  ->  '{' + character + vowel sign
            pieces.append('{' + character + pending_sign + following)
            pending_sign = ''
            index += 3
            continue

        if following == '{' and character != 'f':
            # character + '{'  ->  '{' + character
            # The pending 'l' belongs to this character, not to whatever
            # comes after the '{'.
            pieces.append('{' + character + pending_sign)
            pending_sign = ''
            index += 2
            continue

        if character == 'l':
            pending_sign = 'l'
        else:
            pieces.append(character + pending_sign)
            pending_sign = ''
        index += 1

    return ''.join(pieces)
def convert(preeti):
    """Translate Preeti-encoded text into Unicode Devanagari.

    The input is first reordered by normalizePreeti(); each resulting
    character is then looked up in the table that matches its range
    (lowercase letter, uppercase letter, digit) or in symbolsDict.
    Characters that appear in no table are copied through unchanged.
    """

    def _translate(ch):
        code = ord(ch)
        if 97 <= code <= 122:  # 'a' .. 'z'
            return unicodeatoz[code - 97]
        if 65 <= code <= 90:  # 'A' .. 'Z'
            return unicodeAtoZ[code - 65]
        if 48 <= code <= 57:  # '0' .. '9'
            return unicode0to9[code - 48]
        # Unknown symbols fall back to themselves.
        return symbolsDict.get(ch, ch)

    return ''.join(_translate(ch) for ch in normalizePreeti(preeti))
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_file: Anything accepted by ``pdfplumber.open`` (main() passes an
            in-memory ``io.BytesIO``).

    Returns:
        The text of all pages after handle_vertical_text() has flattened it.
        Pages for which no text is returned are skipped.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)
    # Join pages with a newline so the last line of one page is not fused
    # with the first line of the next before the line-based post-processing.
    return handle_vertical_text('\n'.join(page_texts))
def handle_vertical_text(text):
    """Flatten multi-line text into one space-separated string.

    Consecutive lines that hold exactly one character are treated as a
    vertically laid-out run and glued together without separators; every
    other line (including empty ones) is kept as its own piece. All pieces
    are finally joined with a single space.
    """
    pieces = []
    run = []  # single-character lines collected so far

    for row in text.split('\n'):
        if len(row) == 1:
            run.append(row)
            continue
        # A longer (or empty) line ends the current vertical run.
        if run:
            pieces.append(''.join(run))
            run = []
        pieces.append(row)

    # Flush a run that reaches the end of the text.
    if run:
        pieces.append(''.join(run))

    return ' '.join(pieces)
def _decode_text_upload(raw_bytes):
    """Decode the bytes of an uploaded .txt file without raising."""
    try:
        # utf-8-sig behaves like utf-8 but also drops a leading byte-order
        # mark, which would otherwise reach the converter as a stray character.
        return raw_bytes.decode("utf-8-sig")
    except UnicodeDecodeError:
        # NOTE(review): cp1252 is an assumption for legacy files; it covers
        # the non-ASCII keys in symbolsDict ("ˆ", "Î", "å", "÷"). Undefined
        # bytes become U+FFFD instead of aborting the page.
        return raw_bytes.decode("cp1252", errors="replace")


def main():
    """Render the Streamlit page: upload a file, convert it, offer a download.

    A PDF upload is read through extract_text_from_pdf(); any other accepted
    upload is decoded as text. The text is run through convert() and both the
    original and the converted text are shown, followed by a download button.
    """
    st.title("PDF/TXT to Unicode Converter (Nepali RAG)")
    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])

    if uploaded_file is not None:
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()
        if file_extension == ".pdf":
            text = extract_text_from_pdf(io.BytesIO(uploaded_file.read()))
        else:  # .txt file
            text = _decode_text_upload(uploaded_file.getvalue())
        converted_text = convert(text)

        # The two text areas need distinct labels: with identical (empty)
        # labels they become identical widgets whenever the converted text
        # equals the input (e.g. a PDF with no extractable text), which
        # Streamlit rejects as a duplicate. The labels are hidden because the
        # subheaders already name each box (label_visibility needs
        # Streamlit >= 1.13).
        st.subheader("Original Text")
        st.text_area(
            "Original text",
            value=text,
            height=200,
            label_visibility="collapsed",
        )
        st.subheader("Converted Text")
        st.text_area(
            "Converted text",
            value=converted_text,
            height=200,
            label_visibility="collapsed",
        )

        # Create a download button for the converted text
        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain",
        )

    # Write footer
    # NOTE(review): assumed to render whether or not a file is uploaded;
    # confirm against the deployed app.
    st.markdown("Made with ❤️ by Sumit Yadav(https://sumityadav.com.np)")


if __name__ == "__main__":
    main()