import streamlit as st import PyPDF2 import io import os import re # Existing mapping dictionaries unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"] unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"] unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"] symbolsDict = { "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५", "^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")", "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र", ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।", ">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/" } def is_preeti_text(text): """ Check if text segment is likely to be Preeti-encoded Nepali. Returns True if the text contains common Preeti patterns. """ preeti_patterns = [ r'cf', r'qm', r'If', r'0f', r'km', r'f]', # Common Preeti combinations r'[a-zA-Z]{2,}[\\|\[\]{}]', # Preeti vowel signs and consonants ] return any(re.search(pattern, text) for pattern in preeti_patterns) def normalizePreeti(preetitxt): """Normalized Preeti text with improved handling""" normalized = '' previoussymbol = '' # Common Preeti substitutions replacements = { 'qm': 's|', 'f]': 'ो', 'km': 'फ', '0f': 'ण', 'If': 'क्ष', 'if': 'ष', 'cf': 'आ' } for old, new in replacements.items(): preetitxt = preetitxt.replace(old, new) index = -1 while index + 1 < len(preetitxt): index += 1 character = preetitxt[index] try: if index + 2 < len(preetitxt) and preetitxt[index + 2] == '{': if preetitxt[index + 1] == 'f' or preetitxt[index + 1] == 'ो': normalized += '{' + character + preetitxt[index + 1] index += 2 continue if index + 1 < len(preetitxt) and preetitxt[index + 1] == '{': if character != 'f': normalized += '{' + character index += 1 continue except IndexError: pass if character == 'l': previoussymbol = 'l' continue else: normalized += character + previoussymbol previoussymbol = '' return normalized def convert_preeti_segment(preeti): """Convert a single Preeti segment to Unicode""" converted = '' normalizedpreeti = normalizePreeti(preeti) for character in normalizedpreeti: try: if ord('a') <= ord(character) <= ord('z'): converted += unicodeatoz[ord(character) - ord('a')] elif ord('A') <= ord(character) <= ord('Z'): converted += unicodeAtoZ[ord(character) - ord('A')] elif ord('0') <= ord(character) <= ord('9'): converted += unicode0to9[ord(character) - ord('0')] else: converted += symbolsDict.get(character, character) except (KeyError, IndexError): converted += character return converted def smart_convert(text): """ Convert text while preserving English segments. Uses pattern matching to identify and preserve English text. """ # Patterns to identify different text segments patterns = [ # Email addresses r'\b[\w\.-]+@[\w\.-]+\.\w+\b', # URLs r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', # Date patterns r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b', # Common English words (3 or more characters) r'\b[A-Za-z]{3,}\b', # Numbers with units r'\b\d+\s*[A-Za-z]+\b', ] # Combine patterns combined_pattern = '|'.join(patterns) # Split text into segments while preserving delimiters segments = [] last_end = 0 for match in re.finditer(combined_pattern, text): start, end = match.span() # Add text before match if start > last_end: segment = text[last_end:start] if segment.strip(): segments.append((segment, is_preeti_text(segment))) # Add matched text (preserve it) segments.append((match.group(), False)) last_end = end # Add remaining text if last_end < len(text): segment = text[last_end:] if segment.strip(): segments.append((segment, is_preeti_text(segment))) # Convert segments result = '' for segment, is_preeti in segments: if is_preeti: result += convert_preeti_segment(segment) else: result += segment return result def extract_text_from_pdf(pdf_file): """Extract text from PDF with improved encoding handling""" text = '' try: with open(pdf_file, 'rb') as file: reader = PyPDF2.PdfReader(file) for page in reader.pages: text += page.extract_text() or '' except Exception as e: st.error(f"Error reading PDF: {str(e)}") return '' return text def main(): st.title("Smart Preeti to Unicode Converter") st.write("This converter preserves English text while converting Preeti to Unicode") uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"]) if uploaded_file is not None: try: if uploaded_file.name.lower().endswith('.pdf'): pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read())) text = "" for page in pdf_reader.pages: text += page.extract_text() or '' else: # .txt file text = uploaded_file.getvalue().decode("utf-8") converted_text = smart_convert(text) col1, col2 = st.columns(2) with col1: st.subheader("Original Text") st.text_area("", value=text, height=300) with col2: st.subheader("Converted Text") st.text_area("", value=converted_text, height=300) st.download_button( label="Download Converted Text", data=converted_text.encode("utf-8"), file_name="converted_text.txt", mime="text/plain" ) except Exception as e: st.error(f"An error occurred: {str(e)}") if __name__ == "__main__": main()