# Spaces: rockerritesh / preeti-unicode (Sleeping) - File size: 4,370 Bytes

import streamlit as st
import PyPDF2
import io
import os
import re
import string
import nltk

# Download NLTK resources
# NOTE: this call sits at module level, so it runs every time the module is
# imported or the script is executed.
nltk.download('words')

# English words from NLTK corpus
# Stored as a set; is_english_word() tests membership against it.
# NOTE(review): is_english_word() lowercases its input before the lookup,
# while the entries here are kept exactly as the corpus provides them. If the
# corpus contains capitalised entries they can never match - confirm intended.
english_words = set(nltk.corpus.words.words())

# Define Devanagari digits and patterns for matching
# NOTE(review): the last element ('१०') is a two-character string, not a single
# digit, and this set is not referenced by any function visible in this file.
DEVANAGARI_DIGITS = {'०', '१', '२', '३', '४', '५', '६', '७', '८', '९', '१०'}
# Both patterns accept one or more digits, optionally followed by further
# digit groups that are each introduced by a single '.', ',' or '/' separator.
DEVANAGARI_PATTERN = re.compile(r'^[०-९]+(?:[.,/][०-९]+)*$')  # Match Devanagari digits
# For str patterns `\d` matches any Unicode decimal digit, not only ASCII 0-9.
NUMERIC_PATTERN = re.compile(r'^\d+(?:[.,/]\d+)*$')  # Match numeric patterns

# Unicode conversion mappings
# Three positional tables (26, 26 and 10 entries) plus a symbol dictionary.
# Presumably they are indexed by the Preeti keystrokes a-z, A-Z and 0-9
# respectively; the lookup code is not shown here (convert() is a stub) - TODO
# confirm against the full implementation.
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# Maps individual punctuation/symbol characters to their Devanagari (or
# replacement punctuation) output strings.
symbolsDict = {
    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५", "^": "६", "&": "७", "*": "८", "(": "९",
    ")": "०", "-": "(", "_": ")", "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र", ";": "स",
    ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।", ">": "श्र", "/": "र", "?": "रु", "=": ".",
    "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/"
}

def normalizePreeti(preetitxt):
    """Return Preeti-encoded text prepared for conversion.

    In this version of the file the normalization rules are elided (only a
    placeholder remains), so the argument is handed back unchanged.
    """
    # Placeholder: no normalization rules are applied here.
    normalized = preetitxt
    return normalized

def convert(preeti):
    """Return the Unicode rendering of Preeti-encoded text.

    In this version of the file the conversion logic is elided (only a
    placeholder remains), so the argument is handed back unchanged.
    """
    # Placeholder: the mapping tables are not consulted here.
    converted = preeti
    return converted

def is_english_word(word):
    """Report whether *word* appears in the English word list.

    The word is lowercased and stripped of leading/trailing ASCII punctuation
    before it is looked up in ``english_words``.
    """
    candidate = word.lower()
    candidate = candidate.strip(string.punctuation)
    if candidate in english_words:
        return True
    return False

def is_valid_numeric(word):
    """Report whether *word* matches ``NUMERIC_PATTERN``.

    Returns True when the pattern matches from the start of the word, and
    False otherwise.
    """
    matched = NUMERIC_PATTERN.match(word)
    return matched is not None

def is_devanagari_digit(word):
    """Report whether *word* matches ``DEVANAGARI_PATTERN``.

    Returns True when the pattern matches from the start of the word, and
    False otherwise.
    """
    matched = DEVANAGARI_PATTERN.match(word)
    return matched is not None

def process_text_word_by_word(page_text):
    """Retain or convert each word of *page_text*, keeping its line breaks.

    Every whitespace-separated word is classified after stripping
    leading/trailing ASCII punctuation:

    * English words, Devanagari digit strings and numeric expressions are
      kept exactly as they appeared (punctuation included);
    * anything else is passed, unstripped, through ``convert``.

    Words on the same line are re-joined with single spaces and lines are
    re-joined with ``"\n"``, so the line structure of the input survives
    (runs of spaces/tabs inside a line are still collapsed to one space).

    Args:
        page_text: Text to process. ``None`` or an empty string is accepted.

    Returns:
        The processed text; an empty string when there is nothing to process.
    """
    # Nothing to do for None / empty input; avoids AttributeError on None.
    if not page_text:
        return ""

    processed_lines = []
    # Work line by line: splitting the whole text on whitespace and joining
    # with spaces would flatten the document into a single line.
    for line in page_text.splitlines():
        processed_words = []
        for word in line.split():
            word_cleaned = word.strip(string.punctuation)
            keep_as_is = (
                is_english_word(word_cleaned)          # English word
                or is_devanagari_digit(word_cleaned)   # Devanagari digits
                or is_valid_numeric(word_cleaned)      # numeric expression
            )
            processed_words.append(word if keep_as_is else convert(word))
        processed_lines.append(' '.join(processed_words))

    return '\n'.join(processed_lines)

def text_both_english_and_nepali(pdf_file):
    """Extract and process the text of every page in a PDF.

    Each page's text is run through ``process_text_word_by_word`` and emitted
    as a section of the form ``"\nPage <n>:\n<processed text>"``, with pages
    numbered from 1.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source (the
            caller in this file passes the Streamlit uploaded-file object).

    Returns:
        The concatenated page sections as one string; an empty string when
        the PDF has no pages.
    """
    reader = PyPDF2.PdfReader(pdf_file)

    sections = []
    for page_num, page in enumerate(reader.pages, start=1):
        # Fall back to "" so a page that yields no text still gets a header
        # instead of breaking the word splitter.
        page_text = page.extract_text() or ""
        processed_text = process_text_word_by_word(page_text)
        sections.append(f"\nPage {page_num}:\n{processed_text}")

    # Join once rather than growing a string inside the loop.
    return "".join(sections)

def main():
    """Render the Streamlit page: upload a file, process it, offer a download.

    A PDF upload is processed page by page via
    ``text_both_english_and_nepali``; a TXT upload is decoded as UTF-8 and
    passed to ``process_text_word_by_word``. The result is shown in a text
    area and offered as ``processed_text.txt``. A TXT file that is not valid
    UTF-8 produces an on-page error message instead of an unhandled exception.
    """
    st.title("Advanced PDF/TXT to Unicode Converter")

    uploaded_file = st.file_uploader("Upload a PDF or TXT file", type=["pdf", "txt"])

    # Nothing uploaded yet: only the title and uploader are rendered.
    if uploaded_file is None:
        return

    text = ""
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()

    if file_extension == ".pdf":
        text = text_both_english_and_nepali(uploaded_file)
    elif file_extension == ".txt":
        try:
            raw_text = uploaded_file.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            # Report the problem to the user rather than crashing the app.
            st.error(
                "The uploaded TXT file is not valid UTF-8. "
                "Please re-save it as UTF-8 and try again."
            )
            return
        text = process_text_word_by_word(raw_text)

    st.subheader("Processed Text")
    st.text_area("", value=text, height=400)

    # Download button for the processed text
    st.download_button(
        label="Download Processed Text",
        data=text.encode("utf-8"),
        file_name="processed_text.txt",
        mime="text/plain"
    )

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()