# preeti-unicode / app.py
# rockerritesh's picture
# Update app.py
# 2d857e8 verified
# raw
# history blame
# 6.03 kB
import streamlit as st
import PyPDF2
import io
import os
import re
# Preeti -> Devanagari lookup tables used by convert_segment().
# The three lists are indexed by the offset of the typed ASCII character
# inside its range, so both the order and the length of each list matter.
# Lowercase 'a'..'z': index = ord(char) - ord('a').
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
# Uppercase 'A'..'Z': index = ord(char) - ord('A').
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
# Digits '0'..'9': index = ord(char) - ord('0').
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# Every other typed character is looked up here by the character itself;
# convert_segment() passes a character through unchanged when it has no entry.
# The last four keys ("ˆ", "Î", "å", "÷") are not ASCII.
symbolsDict = {
"~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
"^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
"+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
"å": "द्व", "÷": "/"
}
# Multi-character Preeti sequences that must be rewritten as a unit before the
# per-character tables are applied.
# - normalize_preeti() applies them with str.replace in the insertion order
#   below, so an earlier entry can consume characters a later entry needs.
# - get_preeti_segment() tries the keys longest-first instead.
# Most values are already Devanagari; 'qm' is rewritten to the Preeti pair
# 's|', which the per-character pass then converts.
# NOTE(review): 'f]' is replaced before '0f', 'If' and 'cf', so in "0f]",
# "If]" or "cf]" the later keys no longer match - confirm this is intended.
preeti_compounds = {
'qm': 's|',
'f]': 'ो',
'km': 'फ',
'0f': 'ण',
'If': 'क्ष',
'if': 'ष',
'cf': 'आ',
'6«': 'ट्र',
'g]': 'ने',
'8f': 'डा',
'«': '्र',
'j|m': 'क्र',
';+': 'सं'
}
def is_nepali_unicode(char):
    """Return True when *char* sorts inside U+0900..U+097F (inclusive).

    That range is the Devanagari Unicode block, which this module treats as
    "already converted" text.
    """
    block_first, block_last = '\u0900', '\u097F'
    return not (char < block_first or char > block_last)
def get_preeti_segment(text, start_idx):
    """Extract one run of Preeti-encoded text beginning at *start_idx*.

    The run is extended one step at a time: a key of ``preeti_compounds``
    found at the current position is consumed whole (longest key first);
    otherwise a single character is consumed. The run stops at the first
    whitespace or Devanagari character that is not part of a compound key,
    or at the end of *text*.

    Args:
        text: The full input string.
        start_idx: Index in *text* at which the run starts.

    Returns:
        A ``(segment, end_idx)`` tuple where *end_idx* is the index of the
        first character after the run. When *start_idx* is at or beyond the
        end of *text* the result is ``("", start_idx)``.
    """
    if start_idx >= len(text):
        return "", start_idx

    # The key order does not depend on the position in the text, so sort once
    # instead of once per character.
    compounds = sorted(preeti_compounds, key=len, reverse=True)
    text_len = len(text)
    parts = []
    current_idx = start_idx
    while current_idx < text_len:
        # startswith(prefix, start) tests in place; slicing text[current_idx:]
        # for every key would copy the rest of the string each time.
        compound = next(
            (key for key in compounds if text.startswith(key, current_idx)),
            None,
        )
        if compound is not None:
            parts.append(compound)
            current_idx += len(compound)
            continue

        char = text[current_idx]
        if char.isspace() or is_nepali_unicode(char):
            break
        parts.append(char)
        current_idx += 1
    return "".join(parts), current_idx
def normalize_preeti(preetitxt):
    """Rewrite Preeti text so it can be converted one character at a time.

    Two passes are made:

    1. Every key of ``preeti_compounds`` is replaced by its value, in the
       dictionary's insertion order.
    2. Preeti types the short-i sign (``'l'``) *before* the consonant it
       belongs to, while Unicode stores the sign after it. Each ``'l'`` that
       has a following character is therefore dropped and ``'ि'`` is emitted
       after the cluster that follows.

    The cluster is the next character plus any directly following
    "virama + letter" pairs (as produced by pass 1, e.g. ``'क्ष'``) and any
    Preeti ``'|'`` (which the symbol table maps to ``'्र'``). Moving the sign
    behind only the first code point would leave it in the middle of the
    conjunct, e.g. ``'lIf'`` would become ``'कि्ष'`` instead of ``'क्षि'``.

    An ``'l'`` that is the last character is left untouched.

    Args:
        preetitxt: A Preeti-encoded string.

    Returns:
        The normalised string, a mix of Preeti characters and Devanagari.
    """
    # First handle the compound characters
    for old, new in preeti_compounds.items():
        preetitxt = preetitxt.replace(old, new)

    virama = '\u094d'
    length = len(preetitxt)
    pieces = []
    idx = 0
    while idx < length:
        if preetitxt[idx] == 'l' and idx + 1 < length:
            # Find the end of the cluster the sign has to follow.
            end = idx + 2
            while end < length:
                if preetitxt[end] == '|':
                    end += 1
                elif preetitxt[end] == virama and end + 1 < length:
                    end += 2
                else:
                    break
            pieces.append(preetitxt[idx + 1:end])
            pieces.append('ि')
            idx = end
        else:
            pieces.append(preetitxt[idx])
            idx += 1
    return ''.join(pieces)
def convert_segment(segment):
    """Convert a single Preeti segment to Unicode.

    The segment is returned unchanged when it is empty or whitespace only, or
    when every non-whitespace character is already Devanagari. Otherwise it
    is passed through ``normalize_preeti`` and mapped character by character:

    * Devanagari characters are kept,
    * ``a``-``z``, ``A``-``Z`` and ``0``-``9`` are looked up by position in
      ``unicodeatoz``, ``unicodeAtoZ`` and ``unicode0to9``,
    * every other character is looked up in ``symbolsDict`` and kept as-is
      when it has no entry.

    The ``symbolsDict`` lookup deliberately covers non-ASCII characters too:
    the table has non-ASCII keys (``'ˆ'``, ``'Î'``, ``'å'``, ``'÷'``) that
    would otherwise never be converted.

    Args:
        segment: Text to convert.

    Returns:
        The converted text.
    """
    if not segment.strip():
        return segment

    # If already in Nepali Unicode, return as is
    if all(is_nepali_unicode(char) for char in segment if char.strip()):
        return segment

    converted = []
    for char in normalize_preeti(segment):
        if is_nepali_unicode(char):
            converted.append(char)
        elif 'a' <= char <= 'z':
            converted.append(unicodeatoz[ord(char) - ord('a')])
        elif 'A' <= char <= 'Z':
            converted.append(unicodeAtoZ[ord(char) - ord('A')])
        elif '0' <= char <= '9':
            converted.append(unicode0to9[ord(char) - ord('0')])
        else:
            # Punctuation and the non-ASCII Preeti keys; unknown characters
            # fall through unchanged.
            converted.append(symbolsDict.get(char, char))
    return ''.join(converted)
def smart_convert_mixed(text):
    """Convert Preeti runs in *text* while keeping existing Devanagari.

    The text is scanned left to right:

    * whitespace and Devanagari characters are copied unchanged,
    * at any ASCII character, or any character that is a key of
      ``symbolsDict`` or ``preeti_compounds``, a run is cut out with
      ``get_preeti_segment`` and converted with ``convert_segment``,
    * any other character is copied unchanged.

    Checking the two tables as well as ``isascii()`` lets a run start at one
    of the non-ASCII Preeti keys (for example ``'å'`` or ``'«'``); with an
    ASCII-only check such a leading character is skipped over.

    Args:
        text: Input that may mix Preeti, Devanagari and other characters.

    Returns:
        The converted text.
    """
    pieces = []
    idx = 0
    length = len(text)
    while idx < length:
        char = text[idx]

        # Whitespace and already-converted characters are preserved.
        if char.isspace() or is_nepali_unicode(char):
            pieces.append(char)
            idx += 1
            continue

        # If it's a potential Preeti character, get the complete segment
        if char.isascii() or char in symbolsDict or char in preeti_compounds:
            preeti_segment, new_idx = get_preeti_segment(text, idx)
            if preeti_segment:
                pieces.append(convert_segment(preeti_segment))
                idx = new_idx
                continue

        # Default case: preserve the character
        pieces.append(char)
        idx += 1
    return "".join(pieces)
def main():
    """Render the Streamlit page.

    Shows an input box and a "Convert" button. When the button is pressed and
    the box is not empty, the text is run through ``smart_convert_mixed`` and
    the result is shown in a second text area together with a download
    button that serves it as UTF-8 encoded ``converted_text.txt``.
    """
    st.title("Advanced Mixed Text Converter")
    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")

    # Input area
    input_text = st.text_area("Enter text to convert", height=200)

    # Nothing more to render until the button is pressed with some input.
    convert_clicked = st.button("Convert")
    if not (convert_clicked and input_text):
        return

    converted_text = smart_convert_mixed(input_text)
    st.subheader("Converted Text")
    st.text_area("", value=converted_text, height=200)

    download_options = {
        "label": "Download Converted Text",
        "data": converted_text.encode("utf-8"),
        "file_name": "converted_text.txt",
        "mime": "text/plain",
    }
    st.download_button(**download_options)


# Run the page only when the file is executed as a script.
if __name__ == "__main__":
    main()