# NOTE: scraped page header ("Spaces: Sleeping"); not part of the program.
import streamlit as st | |
import PyPDF2 | |
import io | |
import os | |
import re | |
# Per-character lookup tables used when converting Preeti keystrokes.
# Each list is indexed by the character's offset inside its ASCII range.

# Lowercase letters: index = ord(char) - ord('a').
unicodeatoz = [
    "ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व",  # a-j
    "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त",  # k-t
    "ग", "ख", "ध", "ह", "थ", "श",  # u-z
]

# Uppercase letters: index = ord(char) - ord('A').
unicodeAtoZ = [
    "ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्",  # A-J
    "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्",  # K-T
    "ग्", "ख्", "ध्", "ह्", "थ्", "श्",  # U-Z
]

# Digits: index = ord(char) - ord('0').
unicode0to9 = [
    "ण्", "ज्ञ", "द्द", "घ", "द्ध",  # 0-4
    "छ", "ट", "ठ", "ड", "ढ",  # 5-9
]
# Lookup for every Preeti keystroke that is not a letter or a digit.
# Keys are the typed characters, values are the Unicode text they stand for.
symbolsDict = {
    # Shifted number row and backtick/tilde.
    "~": "ञ्",
    "`": "ञ",
    "!": "१",
    "@": "२",
    "#": "३",
    "$": "४",
    "%": "५",
    "^": "६",
    "&": "७",
    "*": "८",
    "(": "९",
    ")": "०",
    "-": "(",
    "_": ")",
    # Vowel signs, halant and ra-forms on the bracket keys.
    "+": "ं",
    "[": "ृ",
    "{": "र्",
    "]": "े",
    "}": "ै",
    "\\": "्",
    "|": "्र",
    # Remaining punctuation keys.
    ";": "स",
    ":": "स्",
    "'": "ु",
    '"': "ू",
    ",": ",",
    "<": "?",
    ".": "।",
    ">": "श्र",
    "/": "र",
    "?": "रु",
    "=": ".",
    # Non-ASCII keystrokes.
    "ˆ": "फ्",
    "Î": "ङ्ख",
    "å": "द्व",
    "÷": "/",
}
# Multi-keystroke Preeti sequences that must be rewritten as a unit before
# the per-character tables are applied. Insertion order matters: the
# replacements are applied one after another in this order.
preeti_compounds = {
    # Rewritten to other Preeti keystrokes ('s' + '|'), which the
    # per-character tables then convert.
    "qm": "s|",
    "f]": "ो",
    "km": "फ",
    "0f": "ण",
    "If": "क्ष",
    "if": "ष",
    "cf": "आ",
    # Must stay ahead of the bare "«" entry so the pair is replaced first.
    "6«": "ट्र",
    "g]": "ने",
    "8f": "डा",
    "«": "्र",
    "j|m": "क्र",
    ";+": "सं",
}
def is_nepali_unicode(char):
    """Return True when *char* lies in the range U+0900 through U+097F."""
    range_start = '\u0900'
    range_end = '\u097F'
    # Guard clause for everything below the range; the upper bound decides
    # the rest.
    if char < range_start:
        return False
    return char <= range_end
def get_preeti_segment(text, start_idx):
    """
    Extract a complete Preeti segment starting from the given index.

    The segment runs from ``start_idx`` up to (not including) the first
    whitespace character or character already in the Nepali Unicode range.
    Keys of ``preeti_compounds`` are consumed whole so a compound is never
    split across two segments.

    Args:
        text: The full input string.
        start_idx: Index in ``text`` at which the segment begins.

    Returns:
        A ``(segment, end_idx)`` tuple, where ``end_idx`` is the index of the
        first character after the segment. If ``start_idx`` is at or past
        the end of ``text``, returns ``("", start_idx)``.
    """
    text_len = len(text)
    if start_idx >= text_len:
        return "", start_idx

    # Longest keys first so a longer compound wins over a shorter one that
    # shares its prefix. Sorted once here rather than on every iteration.
    compounds = sorted(preeti_compounds, key=len, reverse=True)

    pieces = []
    current_idx = start_idx
    while current_idx < text_len:
        # Check for compound characters first. startswith() with an offset
        # avoids copying the tail of the string for every candidate.
        for compound in compounds:
            if text.startswith(compound, current_idx):
                pieces.append(compound)
                current_idx += len(compound)
                break
        else:
            char = text[current_idx]
            if char.isspace() or is_nepali_unicode(char):
                break
            pieces.append(char)
            current_idx += 1

    return "".join(pieces), current_idx
def normalize_preeti(preetitxt):
    """Rewrite compound sequences and move the Preeti 'l' sign after its letter.

    Every key of ``preeti_compounds`` is replaced by its value, in the
    dictionary's iteration order. Afterwards each 'l' that has a following
    character is emitted as that following character plus 'ि'; an 'l' at
    the very end of the text is kept unchanged.
    """
    # Step 1: compound sequences.
    for compound, replacement in preeti_compounds.items():
        preetitxt = preetitxt.replace(compound, replacement)

    # Step 2: reorder 'l' with the character that follows it.
    pieces = []
    last_pos = len(preetitxt) - 1
    pos = 0
    while pos <= last_pos:
        current = preetitxt[pos]
        if current == 'l' and pos < last_pos:
            pieces.append(preetitxt[pos + 1])
            pieces.append('ि')
            pos += 2
            continue
        pieces.append(current)
        pos += 1
    return ''.join(pieces)
def convert_segment(segment):
    """Convert a single Preeti segment to Unicode.

    Whitespace-only segments, and segments whose non-whitespace characters
    are all already in the Nepali Unicode range, are returned unchanged.
    Otherwise the segment is normalized with ``normalize_preeti`` and each
    remaining character is mapped through the letter, digit and symbol
    tables. Characters with no mapping are kept as they are.

    Args:
        segment: Text to convert.

    Returns:
        The converted string.
    """
    if not segment.strip():
        return segment

    # If already in Nepali Unicode, return as is
    if all(is_nepali_unicode(char) for char in segment if char.strip()):
        return segment

    pieces = []
    for char in normalize_preeti(segment):
        if is_nepali_unicode(char):
            pieces.append(char)
        elif 'a' <= char <= 'z':
            pieces.append(unicodeatoz[ord(char) - ord('a')])
        elif 'A' <= char <= 'Z':
            pieces.append(unicodeAtoZ[ord(char) - ord('A')])
        elif '0' <= char <= '9':
            pieces.append(unicode0to9[ord(char) - ord('0')])
        else:
            # Look up every other character, ASCII or not, in symbolsDict:
            # it also holds non-ASCII keys, which an ASCII-only check would
            # never reach.
            pieces.append(symbolsDict.get(char, char))
    return ''.join(pieces)
def smart_convert_mixed(text):
    """
    Convert Preeti text while leaving existing Nepali Unicode untouched.

    The text is scanned left to right. Whitespace and characters already in
    the Nepali Unicode range are copied through. Any other character starts
    a segment (see ``get_preeti_segment``) that is passed to
    ``convert_segment``.

    Note: Latin-script text is not detected. ASCII letters, digits and
    symbols are always converted as Preeti keystrokes.

    Args:
        text: Input that may mix Preeti-encoded text with Nepali Unicode.

    Returns:
        The converted string.
    """
    parts = []
    idx = 0
    text_len = len(text)
    while idx < text_len:
        char = text[idx]

        # Whitespace and existing Nepali Unicode are preserved verbatim.
        if char.isspace() or is_nepali_unicode(char):
            parts.append(char)
            idx += 1
            continue

        # Anything else may start a Preeti segment. This includes non-ASCII
        # keystrokes such as "«", so they convert the same way at the start
        # of a segment as in the middle of one.
        preeti_segment, new_idx = get_preeti_segment(text, idx)
        if preeti_segment and new_idx > idx:
            parts.append(convert_segment(preeti_segment))
            idx = new_idx
            continue

        # Fallback: keep the character and always move forward.
        parts.append(char)
        idx += 1
    return "".join(parts)
def main():
    """Render the Streamlit page: input box, Convert button, result, download.

    The converted text is kept in ``st.session_state`` so it stays on screen
    after the rerun that follows a click on the download button. A result
    drawn only while ``st.button`` returns True would vanish on that rerun.
    """
    st.title("Advanced Mixed Text Converter")
    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")

    # Input area
    input_text = st.text_area("Enter text to convert", height=200)

    if st.button("Convert"):
        # Empty input clears any earlier result, so nothing is shown.
        st.session_state["converted_text"] = (
            smart_convert_mixed(input_text) if input_text else ""
        )

    converted_text = st.session_state.get("converted_text", "")
    if converted_text:
        st.subheader("Converted Text")
        # Streamlit discourages empty widget labels; give a real label and
        # hide it, since the subheader already titles this area.
        st.text_area(
            "Converted text",
            value=converted_text,
            height=200,
            label_visibility="collapsed",
        )
        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain",
        )


if __name__ == "__main__":
    main()