Spaces:
Sleeping
Sleeping
File size: 4,370 Bytes
76f42d9 66882a0 76f42d9 5f99735 66882a0 5f99735 d0a3b36 5f99735 76f42d9 5f99735 2d857e8 d0a3b36 5f99735 76f42d9 d0a3b36 5f99735 d0a3b36 76f42d9 5f99735 d0a3b36 5f99735 d0a3b36 5f99735 d0a3b36 5f99735 d0a3b36 5f99735 d0a3b36 5f99735 d0a3b36 5f99735 d0a3b36 93294e9 76f42d9 5f99735 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import streamlit as st
import PyPDF2
import io
import os
import re
import string
import nltk
# Make sure the NLTK "words" corpus is available locally. The lookup is tried
# first so the downloader is only invoked when the corpus is actually missing,
# instead of on every execution of this script.
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words', quiet=True)

# English vocabulary used by is_english_word(). Entries are lower-cased because
# that lookup lower-cases its argument before testing membership; entries that
# the corpus stores capitalised would otherwise never match.
english_words = {entry.lower() for entry in nltk.corpus.words.words()}
# The ten Devanagari numerals, plus the two-character numeral '१०'.
# NOTE(review): '१०' is not a single digit -- confirm it is meant to be here.
DEVANAGARI_DIGITS = set('०१२३४५६७८९') | {'१०'}

# Whole-string match: one or more runs of Devanagari digits, with a single
# '.', ',' or '/' allowed between consecutive runs.
DEVANAGARI_PATTERN = re.compile(
    r'^[०-९]+(?:[.,/][०-९]+)*$'
)

# Same shape as above, but built on \d (for str patterns this covers every
# Unicode decimal digit, not only ASCII 0-9).
NUMERIC_PATTERN = re.compile(
    r'^\d+(?:[.,/]\d+)*$'
)
# Preeti -> Unicode lookup tables.
# NOTE(review): the names suggest each list is indexed by the position of the
# Preeti character within a-z / A-Z / 0-9; the code that consumes them is not
# visible here -- confirm against convert().

# 26 entries (presumably 'a' .. 'z').
unicodeatoz = [
    "ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ",
    "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श",
]

# 26 entries (presumably 'A' .. 'Z').
unicodeAtoZ = [
    "ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः",
    "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्",
]

# 10 entries (presumably '0' .. '9').
unicode0to9 = [
    "ण्", "ज्ञ", "द्द", "घ", "द्ध",
    "छ", "ट", "ठ", "ड", "ढ",
]

# Punctuation and other symbol keys mapped to their replacement text.
symbolsDict = {
    "~": "ञ्",
    "`": "ञ",
    "!": "१",
    "@": "२",
    "#": "३",
    "$": "४",
    "%": "५",
    "^": "६",
    "&": "७",
    "*": "८",
    "(": "९",
    ")": "०",
    "-": "(",
    "_": ")",
    "+": "ं",
    "[": "ृ",
    "{": "र्",
    "]": "े",
    "}": "ै",
    "\\": "्",
    "|": "्र",
    ";": "स",
    ":": "स्",
    "'": "ु",
    "\"": "ू",
    ",": ",",
    "<": "?",
    ".": "।",
    ">": "श्र",
    "/": "र",
    "?": "रु",
    "=": ".",
    "ˆ": "फ्",
    "Î": "ङ्ख",
    "å": "द्व",
    "÷": "/",
}
def normalizePreeti(preetitxt):
    """Normalize Preeti text for consistent conversion.

    NOTE(review): the body below is a placeholder -- it returns ``preetitxt``
    unchanged. The inline comment points at an earlier revision for the real
    normalization logic; that logic is not present in this file.
    """
    # (same function as before)
    return preetitxt
def convert(preeti):
    """Convert Preeti text to Unicode.

    NOTE(review): the body below is a placeholder -- it returns ``preeti``
    unchanged and does not consult any of the module's lookup tables. The
    inline comment points at an earlier revision for the real conversion
    logic; that logic is not present in this file.
    """
    # (same function as before)
    return preeti
def is_english_word(word):
    """Report whether ``word`` is found in the ``english_words`` set.

    The word is lower-cased, then leading and trailing ASCII punctuation is
    removed, before the membership test.
    """
    lowered = word.lower()
    candidate = lowered.strip(string.punctuation)
    if candidate in english_words:
        return True
    return False
def is_valid_numeric(word):
    """Report whether ``word`` matches ``NUMERIC_PATTERN`` from its start.

    That pattern accepts runs of digits joined by a single '.', ',' or '/'.
    """
    hit = NUMERIC_PATTERN.match(word)
    return hit is not None
def is_devanagari_digit(word):
    """Report whether ``word`` matches ``DEVANAGARI_PATTERN`` from its start.

    That pattern accepts runs of Devanagari digits joined by a single '.',
    ',' or '/', so separators are allowed between digit groups.
    """
    hit = DEVANAGARI_PATTERN.match(word)
    return hit is not None
def process_text_word_by_word(page_text):
    """Convert the non-English, non-numeric words of ``page_text``.

    Each whitespace-separated word is classified after stripping surrounding
    ASCII punctuation. Words recognised by ``is_english_word``,
    ``is_devanagari_digit`` or ``is_valid_numeric`` are kept verbatim
    (punctuation included); every other word is passed through ``convert``.

    Line breaks are preserved: the text is handled one line at a time, the
    words of a line are re-joined with single spaces, and the lines are
    re-joined with newline characters. Text without line breaks therefore
    produces the same result as joining all words with single spaces.

    Args:
        page_text: Text to process. ``None`` or an empty string yields an
            empty string.

    Returns:
        The processed text as one string.
    """
    if not page_text:
        return ""

    processed_lines = []
    for line in page_text.splitlines():
        processed_words = []
        for word in line.split():
            word_cleaned = word.strip(string.punctuation)
            # English words, Devanagari numbers and plain numbers are all
            # retained as-is; the checks run in that order and short-circuit.
            keep_verbatim = (
                is_english_word(word_cleaned)
                or is_devanagari_digit(word_cleaned)
                or is_valid_numeric(word_cleaned)
            )
            processed_words.append(word if keep_verbatim else convert(word))
        processed_lines.append(" ".join(processed_words))
    return "\n".join(processed_lines)
def text_both_english_and_nepali(pdf_file):
    """Extract the text of every page of a PDF and process it word by word.

    Args:
        pdf_file: Handed straight to ``PyPDF2.PdfReader``; ``main`` passes the
            object returned by the Streamlit file uploader.

    Returns:
        One string in which each page contributes a newline, a ``Page N:``
        header line (N starting at 1), and that page's text after
        ``process_text_word_by_word``. Empty string for a PDF with no pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    sections = []
    for page_number, page in enumerate(reader.pages, start=1):
        # Treat a page with no extractable text as empty rather than failing.
        page_text = page.extract_text() or ""
        processed_text = process_text_word_by_word(page_text)
        sections.append(f"\nPage {page_number}:\n{processed_text}")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(sections)
def main():
    """Render the Streamlit page for the converter.

    Shows a PDF/TXT uploader. Once a file is uploaded, its text is processed
    (PDF via ``text_both_english_and_nepali``, TXT via
    ``process_text_word_by_word``), displayed in a text area, and offered as a
    UTF-8 ``processed_text.txt`` download. A TXT file that cannot be decoded
    as UTF-8 results in an on-page error message instead of an exception.
    """
    st.title("Advanced PDF/TXT to Unicode Converter")
    uploaded_file = st.file_uploader("Upload a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file is None:
        # Nothing uploaded yet: only the title and uploader are shown.
        return

    text = ""
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    if file_extension == ".pdf":
        text = text_both_english_and_nepali(uploaded_file)
    elif file_extension == ".txt":
        try:
            # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise stay attached to the first word.
            raw_text = uploaded_file.getvalue().decode("utf-8-sig")
        except UnicodeDecodeError:
            st.error(
                "The uploaded TXT file is not valid UTF-8 text. "
                "Please save it as UTF-8 and upload it again."
            )
            return
        text = process_text_word_by_word(raw_text)

    st.subheader("Processed Text")
    st.text_area("", value=text, height=400)
    # Download button for the processed text
    st.download_button(
        label="Download Processed Text",
        data=text.encode("utf-8"),
        file_name="processed_text.txt",
        mime="text/plain"
    )
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|