"""Streamlit front end and helpers for converting Preeti-encoded Nepali text.

Preeti is a legacy font encoding in which ASCII keystrokes stand for
Devanagari glyphs.  The helpers below map such text to Unicode while leaving
whitespace and characters that are already in the Devanagari block untouched.
"""

import io
import os
import re

# The third-party packages are only needed by the UI (``main``).  They are
# imported defensively so the pure conversion helpers stay importable in
# environments where they are not installed (tests, batch scripts).
try:
    import PyPDF2  # not referenced by the code visible in this file
except ImportError:
    PyPDF2 = None

try:
    import streamlit as st
except ImportError:
    st = None

# Unicode text for each Preeti lowercase key, indexed by ord(ch) - ord('a').
unicodeatoz = [
    "ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ",
    "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श",
]

# Unicode text for each Preeti uppercase key, indexed by ord(ch) - ord('A').
unicodeAtoZ = [
    "ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः",
    "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्",
]

# Unicode text for each Preeti digit key, indexed by ord(ch) - ord('0').
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]

# Unicode text for the remaining Preeti keys (punctuation and a few
# non-ASCII characters).
symbolsDict = {
    "~": "ञ्",
    "`": "ञ",
    "!": "१",
    "@": "२",
    "#": "३",
    "$": "४",
    "%": "५",
    "^": "६",
    "&": "७",
    "*": "८",
    "(": "९",
    ")": "०",
    "-": "(",
    "_": ")",
    "+": "ं",
    "[": "ृ",
    "{": "र्",
    "]": "े",
    "}": "ै",
    "\\": "्",
    "|": "्र",
    ";": "स",
    ":": "स्",
    "'": "ु",
    "\"": "ू",
    ",": ",",
    "<": "?",
    ".": "।",
    ">": "श्र",
    "/": "र",
    "?": "रु",
    "=": ".",
    "ˆ": "फ्",
    "Î": "ङ्ख",
    "å": "द्व",
    "÷": "/",
}

# Common Preeti patterns that need to be preserved as units.  They are
# substituted before the per-character mapping, in insertion order.
preeti_compounds = {
    'qm': 's|',
    'f]': 'ो',
    'km': 'फ',
    '0f': 'ण',
    'If': 'क्ष',
    'if': 'ष',
    'cf': 'आ',
    '6«': 'ट्र',
    'g]': 'ने',
    '8f': 'डा',
    '«': '्र',
    'j|m': 'क्र',
    ';+': 'सं',
}

# Compound keys, longest first, computed once instead of on every character.
_COMPOUNDS_LONGEST_FIRST = tuple(
    sorted(preeti_compounds, key=len, reverse=True)
)

_VIRAMA = "\u094d"   # Devanagari sign virama (halant)
_I_MATRA = "\u093f"  # Devanagari vowel sign I


def is_nepali_unicode(char):
    """Check if character is already in the Devanagari block (U+0900-U+097F)."""
    return '\u0900' <= char <= '\u097F'


def _preeti_char_to_unicode(char):
    """Return the Unicode text for one character of normalized Preeti input.

    Devanagari characters and characters without a table entry are returned
    unchanged.
    """
    if is_nepali_unicode(char):
        return char
    if 'a' <= char <= 'z':
        return unicodeatoz[ord(char) - ord('a')]
    if 'A' <= char <= 'Z':
        return unicodeAtoZ[ord(char) - ord('A')]
    if '0' <= char <= '9':
        return unicode0to9[ord(char) - ord('0')]
    # Covers ASCII punctuation as well as the non-ASCII keys in symbolsDict.
    return symbolsDict.get(char, char)


def _is_half_form(char):
    """True if *char* maps to a consonant that ends in a virama (e.g. 'S')."""
    mapped = _preeti_char_to_unicode(char)
    return len(mapped) > 1 and mapped.endswith(_VIRAMA)


def _is_virama_led(char):
    """True if *char* maps to a virama plus consonant tail (e.g. '|')."""
    mapped = _preeti_char_to_unicode(char)
    return len(mapped) > 1 and mapped.startswith(_VIRAMA)


def _cluster_end(text, start):
    """Return the index just past the consonant cluster beginning at *start*.

    A cluster is: any leading half-forms, one base character, then any
    virama-joined tails (either a Preeti key such as '|' or a Devanagari
    virama followed by a Devanagari consonant left by compound substitution).
    """
    length = len(text)
    end = start
    # Leading half-forms belong to the consonant that follows them; the last
    # character of the text is always treated as the base.
    while end < length - 1 and _is_half_form(text[end]):
        end += 1
    end += 1  # the base character itself
    while end < length:
        if (
            text[end] == _VIRAMA
            and end + 1 < length
            and '\u0915' <= text[end + 1] <= '\u0939'
        ):
            end += 2
        elif _is_virama_led(text[end]):
            end += 1
        else:
            break
    return end


def get_preeti_segment(text, start_idx):
    """Extract a complete Preeti segment starting from the given index.

    The segment runs until the first whitespace or Devanagari character (or
    the end of *text*); known compounds are consumed as whole units.

    Returns:
        A ``(segment, end_index)`` tuple.  ``("", start_idx)`` is returned
        when *start_idx* is at or past the end of *text*.
    """
    length = len(text)
    if start_idx >= length:
        return "", start_idx

    current_idx = start_idx
    while current_idx < length:
        # startswith(prefix, start) avoids copying the tail of the text.
        compound = next(
            (c for c in _COMPOUNDS_LONGEST_FIRST
             if text.startswith(c, current_idx)),
            None,
        )
        if compound is not None:
            current_idx += len(compound)
            continue

        char = text[current_idx]
        if char.isspace() or is_nepali_unicode(char):
            break
        current_idx += 1

    return text[start_idx:current_idx], current_idx


def normalize_preeti(preetitxt):
    """Normalize Preeti text so it can be mapped character by character.

    Two things happen:

    1. Every key of ``preeti_compounds`` is replaced by its value, in the
       dictionary's insertion order.
    2. The Preeti key ``l`` (short i, typed *before* its consonant) is moved
       *after* the consonant cluster that follows it and emitted as the
       Unicode vowel sign.  A trailing ``l`` with nothing after it is left
       as is.

    Reph (``{``) is not reordered here.
    """
    # First handle the compound characters.
    for old, new in preeti_compounds.items():
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    idx = 0
    length = len(preetitxt)
    while idx < length:
        if preetitxt[idx] == 'l' and idx + 1 < length:
            # Place the vowel sign after the whole cluster, not merely after
            # the next code point, so conjuncts are not split.
            end = _cluster_end(preetitxt, idx + 1)
            pieces.append(preetitxt[idx + 1:end])
            pieces.append(_I_MATRA)
            idx = end
        else:
            pieces.append(preetitxt[idx])
            idx += 1
    return "".join(pieces)


def convert_segment(segment):
    """Convert a single Preeti segment to Unicode.

    Blank segments and segments whose non-space characters are all already
    Devanagari are returned unchanged.  Characters with no mapping pass
    through as they are.
    """
    if not segment.strip():
        return segment

    # If already in Nepali Unicode, return as is.
    if all(is_nepali_unicode(char) for char in segment if char.strip()):
        return segment

    return "".join(
        _preeti_char_to_unicode(char) for char in normalize_preeti(segment)
    )


def smart_convert_mixed(text):
    """Convert text that mixes Preeti with existing Nepali Unicode.

    Whitespace and Devanagari characters are copied through unchanged; every
    other run of characters is treated as a Preeti segment and converted.
    Non-ASCII Preeti keys (those in ``symbolsDict`` and ``preeti_compounds``)
    are converted too; unknown characters are preserved.
    """
    pieces = []
    idx = 0
    length = len(text)
    while idx < length:
        char = text[idx]
        if char.isspace() or is_nepali_unicode(char):
            pieces.append(char)
            idx += 1
            continue

        # The current character is neither whitespace nor Devanagari, so the
        # segment is non-empty and idx always advances.
        segment, idx = get_preeti_segment(text, idx)
        pieces.append(convert_segment(segment))
    return "".join(pieces)


def main():
    """Render the Streamlit UI: input box, convert button, result, download.

    Raises:
        ImportError: if Streamlit is not installed.
    """
    if st is None:
        raise ImportError("streamlit is required to run the converter UI")

    st.title("Advanced Mixed Text Converter")
    # NOTE(review): the banner says English is preserved, but
    # smart_convert_mixed treats every ASCII character as Preeti - confirm
    # the intended behaviour.
    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")

    # Input area
    input_text = st.text_area("Enter text to convert", height=200)

    if st.button("Convert"):
        if input_text:
            converted_text = smart_convert_mixed(input_text)
            st.subheader("Converted Text")
            st.text_area("", value=converted_text, height=200)
            st.download_button(
                label="Download Converted Text",
                data=converted_text.encode("utf-8"),
                file_name="converted_text.txt",
                mime="text/plain",
            )


if __name__ == "__main__":
    main()