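# preeti-unicode / app.py
# Uploaded by rockerritesh
# Commit 93294e9 (verified): "better way to handle english"
# File size: 7.21 kB
# (Hugging Face file-viewer links "raw" and "history blame" were captured with this listing.)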
import streamlit as st
import PyPDF2
import io
import os
import re
# Preeti-to-Unicode lookup tables consumed by convert_preeti_segment().
# Replacement for each lowercase ASCII letter, indexed by ord(ch) - ord('a').
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
# Replacement for each uppercase ASCII letter, indexed by ord(ch) - ord('A').
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
# Replacement for each ASCII digit, indexed by ord(ch) - ord('0').
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# Replacement for punctuation and other symbols, keyed by the input character.
# The converter looks characters up with .get(ch, ch), so anything missing
# from this dict is passed through unchanged.
symbolsDict = {
"~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
"^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
"+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
"å": "द्व", "÷": "/"
}
def is_preeti_text(text):
    """
    Heuristically decide whether *text* looks like Preeti-encoded Nepali.

    Returns True as soon as any known signature is found anywhere in the
    text, and False when none of them occurs.
    """
    signatures = (
        # Two-character sequences treated as typical Preeti combinations.
        r'cf', r'qm', r'If', r'0f', r'km', r'f]',
        # Two or more ASCII letters directly followed by one of \ | [ ] { }.
        r'[a-zA-Z]{2,}[\\|\[\]{}]',
    )
    for signature in signatures:
        if re.search(signature, text) is not None:
            return True
    return False
def normalizePreeti(preetitxt):
    """Reorder Preeti input so it can be converted one character at a time.

    Three things happen, in this order:

    1. A fixed set of two-character sequences is replaced up front (some
       directly by their Unicode result, one by another Preeti sequence).
    2. ``'{'`` is moved in front of the character it follows, or in front of
       ``character + 'f'`` / ``character + 'ो'`` when one of those sits
       between them. A ``'{'`` that directly follows ``'f'`` is left in place.
    3. ``'l'`` is held back and emitted after the next character instead of
       before it.

    (Presumably this mirrors Preeti typing order, where the short-i sign is
    typed before its consonant and the reph after it; the code only shows
    the reordering itself.)

    A held-back ``'l'`` is never discarded: it is emitted after the character
    it attaches to even when that character is also preceded by a moved
    ``'{'``, and it is flushed at the end of the input if nothing follows it.

    Args:
        preetitxt: The Preeti-encoded string to normalize.

    Returns:
        The reordered string (still Preeti-encoded apart from the step 1
        replacements).
    """
    # Common Preeti substitutions. Order matters: they are applied one after
    # another to the whole string.
    replacements = {
        'qm': 's|',
        'f]': 'ो',
        'km': 'फ',
        '0f': 'ण',
        'If': 'क्ष',
        'if': 'ष',
        'cf': 'आ'
    }
    for old, new in replacements.items():
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    pending = ''  # an 'l' waiting for the character it must follow
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        following = preetitxt[index + 1] if index + 1 < length else ''
        after_following = preetitxt[index + 2] if index + 2 < length else ''

        # character + ('f' | 'ो') + '{'  ->  '{' + character + ('f' | 'ो')
        if after_following == '{' and following in ('f', 'ो'):
            pieces.append('{' + character + pending + following)
            pending = ''
            index += 3
            continue

        # character + '{'  ->  '{' + character   (unless character is 'f')
        if following == '{' and character != 'f':
            pieces.append('{' + character + pending)
            pending = ''
            index += 2
            continue

        if character == 'l':
            # Emit any earlier unattached 'l' rather than overwriting it.
            pieces.append(pending)
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    # A trailing 'l' has nothing to attach to; keep it instead of losing it.
    pieces.append(pending)
    return ''.join(pieces)
def convert_preeti_segment(preeti):
    """Translate one Preeti-encoded segment into Unicode text.

    The segment is first reordered by normalizePreeti(), then each character
    is mapped through the table for its class: lowercase letters, uppercase
    letters, digits, or (for everything else) the symbol dictionary. A
    character with no mapping is copied through unchanged.
    """
    # (first char, last char, table) for each contiguous ASCII range.
    range_tables = (
        ('a', 'z', unicodeatoz),
        ('A', 'Z', unicodeAtoZ),
        ('0', '9', unicode0to9),
    )
    pieces = []
    for glyph in normalizePreeti(preeti):
        try:
            for first, last, table in range_tables:
                if first <= glyph <= last:
                    mapped = table[ord(glyph) - ord(first)]
                    break
            else:
                mapped = symbolsDict.get(glyph, glyph)
        except (KeyError, IndexError):
            # Fall back to the raw character if a table lookup fails.
            mapped = glyph
        pieces.append(mapped)
    return ''.join(pieces)
def smart_convert(text):
    """
    Convert Preeti text to Unicode while leaving English-looking parts alone.

    The text is scanned for "preserve" patterns (e-mail addresses, URLs,
    dates, runs of three or more ASCII letters, numbers followed by letters).
    Every match is copied through unchanged. Each gap between matches is
    converted with convert_preeti_segment() only if is_preeti_text() accepts
    it; otherwise the gap is copied through unchanged as well.

    Whitespace-only gaps (spaces, newlines) are always kept verbatim, so the
    spacing and line structure of the input survive the conversion.

    Note that any run of three or more ASCII letters counts as English here,
    so purely alphabetic Preeti words of that length are not converted.

    Args:
        text: The mixed Preeti/English input string.

    Returns:
        The text with the Preeti-looking gaps converted to Unicode.
    """
    # Patterns to identify segments that must be preserved as-is
    patterns = [
        # Email addresses
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
        # URLs
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Date patterns
        r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b',
        # Common English words (3 or more characters)
        r'\b[A-Za-z]{3,}\b',
        # Numbers with units
        r'\b\d+\s*[A-Za-z]+\b',
    ]
    combined_pattern = '|'.join(patterns)

    def convert_gap(gap):
        # Only gaps with real content are candidates for conversion;
        # whitespace-only (or empty) gaps are returned untouched.
        if gap.strip() and is_preeti_text(gap):
            return convert_preeti_segment(gap)
        return gap

    pieces = []
    cursor = 0
    for match in re.finditer(combined_pattern, text):
        start, end = match.span()
        if start > cursor:
            pieces.append(convert_gap(text[cursor:start]))
        # Matched text is preserved exactly.
        pieces.append(match.group())
        cursor = end
    # Whatever follows the last match (possibly the whole text).
    pieces.append(convert_gap(text[cursor:]))
    return ''.join(pieces)
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of the PDF at *pdf_file*.

    Pages for which no text is extracted contribute an empty string. If
    anything goes wrong while opening or reading the file, the error is shown
    through Streamlit and an empty string is returned.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages
            return ''.join(page.extract_text() or '' for page in pages)
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ''
def main():
    """Run the Streamlit UI: upload a file, convert it, show and offer the result.

    Accepts a PDF or TXT upload, extracts its text, runs smart_convert() on
    it, shows the original and converted text side by side and offers the
    converted text as a UTF-8 download. Any failure along the way is reported
    with st.error instead of crashing the app.
    """
    st.title("Smart Preeti to Unicode Converter")
    st.write("This converter preserves English text while converting Preeti to Unicode")
    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    try:
        if uploaded_file.name.lower().endswith('.pdf'):
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
            text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
        else:  # .txt file
            text = uploaded_file.getvalue().decode("utf-8")
        converted_text = smart_convert(text)

        col1, col2 = st.columns(2)
        # The two text areas need distinct labels: with identical (empty)
        # labels and no key they become the same widget whenever the
        # conversion leaves the text unchanged, which Streamlit rejects as a
        # duplicate. The labels are hidden because the subheaders already
        # title each column.
        with col1:
            st.subheader("Original Text")
            st.text_area(
                "Original Text",
                value=text,
                height=300,
                label_visibility="collapsed",
            )
        with col2:
            st.subheader("Converted Text")
            st.text_area(
                "Converted Text",
                value=converted_text,
                height=300,
                label_visibility="collapsed",
            )
        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain"
        )
    except Exception as e:
        # Top-level UI boundary: surface the problem to the user.
        st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()