Spaces:
Sleeping
Sleeping
import streamlit as st | |
import PyPDF2 | |
import io | |
import os | |
import re | |
# Preeti keystroke -> Unicode Devanagari lookup tables.
#
# The three lists are positional: entry i of ``unicodeatoz`` belongs to the
# i-th lowercase ASCII letter, entry i of ``unicodeAtoZ`` to the i-th
# uppercase letter, and entry i of ``unicode0to9`` to the digit i.
unicodeatoz = [
    "ब", "द", "अ", "म", "भ", "ा", "न",      # a b c d e f g
    "ज", "ष्", "व", "प", "ि", "फ", "ल",     # h i j k l m n
    "य", "उ", "त्र", "च", "क", "त", "ग",    # o p q r s t u
    "ख", "ध", "ह", "थ", "श",                # v w x y z
]
unicodeAtoZ = [
    "ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्",      # A B C D E F G
    "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्",   # H I J K L M N
    "इ", "ए", "त्त", "च्", "क्", "त्", "ग्",    # O P Q R S T U
    "ख्", "ध्", "ह्", "थ्", "श्",               # V W X Y Z
]
unicode0to9 = [
    "ण्", "ज्ञ", "द्द", "घ", "द्ध",   # 0 1 2 3 4
    "छ", "ट", "ठ", "ड", "ढ",          # 5 6 7 8 9
]

# Every remaining key (punctuation and a few extended Latin-1 glyphs) is
# looked up by the character itself rather than by position.
symbolsDict = {
    "~": "ञ्",
    "`": "ञ",
    "!": "१",
    "@": "२",
    "#": "३",
    "$": "४",
    "%": "५",
    "^": "६",
    "&": "७",
    "*": "८",
    "(": "९",
    ")": "०",
    "-": "(",
    "_": ")",
    "+": "ं",
    "[": "ृ",
    "{": "र्",
    "]": "े",
    "}": "ै",
    "\\": "्",
    "|": "्र",
    ";": "स",
    ":": "स्",
    "'": "ु",
    "\"": "ू",
    ",": ",",
    "<": "?",
    ".": "।",
    ">": "श्र",
    "/": "र",
    "?": "रु",
    "=": ".",
    "ˆ": "फ्",
    "Î": "ङ्ख",
    "å": "द्व",
    "÷": "/",
}
def is_preeti_text(text):
    """
    Heuristically decide whether ``text`` looks like Preeti-encoded Nepali.

    The text is scanned for a handful of two-character sequences and for a
    run of two or more ASCII letters immediately followed by one of
    ``\\ | [ ] { }``.  Returns True as soon as any marker is found,
    False when none is present.
    """
    markers = (
        # Two-character sequences that are frequent in Preeti input
        r'cf', r'qm', r'If', r'0f', r'km', r'f]',
        # Letters directly followed by a bracket, brace, pipe or backslash
        r'[a-zA-Z]{2,}[\\|\[\]{}]',
    )
    for marker in markers:
        if re.search(marker, text):
            return True
    return False
def normalizePreeti(preetitxt):
    """
    Reorder raw Preeti keystrokes into the order Unicode expects.

    Three things happen, in this order:

    1. A fixed set of two-character Preeti sequences is replaced
       (``qm`` -> ``s|``, ``f]`` -> ``ो``, ``km`` -> ``फ``, ``0f`` -> ``ण``,
       ``If`` -> ``क्ष``, ``if`` -> ``ष``, ``cf`` -> ``आ``).
    2. A ``{`` that follows a character (optionally with ``f`` or ``ो`` in
       between) is moved in front of that character.
    3. An ``l`` is held back and emitted after the character that follows it.

    A held-back ``l`` is never discarded: it is attached to the character
    moved by step 2 when that is the next character, and it is emitted
    unchanged if the input ends before any character can receive it.

    Args:
        preetitxt: Preeti-encoded text.

    Returns:
        The reordered string, still in Preeti keystrokes apart from the
        replacements made in step 1.
    """
    # Common Preeti substitutions (applied in insertion order)
    replacements = {
        'qm': 's|',
        'f]': 'ो',
        'km': 'फ',
        '0f': 'ण',
        'If': 'क्ष',
        'if': 'ष',
        'cf': 'आ'
    }
    for old, new in replacements.items():
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    pending = ''  # an 'l' waiting for the character it must follow
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        following = preetitxt[index + 1] if index + 1 < length else ''

        # <char><f|ो>{  ->  {<char><f|ो>
        if (index + 2 < length and preetitxt[index + 2] == '{'
                and following in ('f', 'ो')):
            pieces.append('{' + character + pending + following)
            pending = ''
            index += 3
            continue

        # <char>{  ->  {<char>   (a bare 'f' is never a valid carrier)
        if following == '{' and character != 'f':
            pieces.append('{' + character + pending)
            pending = ''
            index += 2
            continue

        if character == 'l':
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    # Keep a trailing 'l' instead of silently losing it.
    pieces.append(pending)
    return ''.join(pieces)
def convert_preeti_segment(preeti):
    """
    Translate one Preeti-encoded segment into Unicode text.

    The segment is first passed through ``normalizePreeti``; each resulting
    character is then mapped through ``unicodeatoz`` (a-z), ``unicodeAtoZ``
    (A-Z), ``unicode0to9`` (0-9) or ``symbolsDict`` (anything else).
    Characters with no mapping are copied through unchanged.
    """
    lower_base, upper_base, digit_base = ord('a'), ord('A'), ord('0')
    pieces = []
    for glyph in normalizePreeti(preeti):
        try:
            if 'a' <= glyph <= 'z':
                mapped = unicodeatoz[ord(glyph) - lower_base]
            elif 'A' <= glyph <= 'Z':
                mapped = unicodeAtoZ[ord(glyph) - upper_base]
            elif '0' <= glyph <= '9':
                mapped = unicode0to9[ord(glyph) - digit_base]
            else:
                mapped = symbolsDict.get(glyph, glyph)
        except (KeyError, IndexError):
            # Fall back to the raw character if a table lookup fails
            mapped = glyph
        pieces.append(mapped)
    return ''.join(pieces)
def smart_convert(text):
    """
    Convert Preeti text to Unicode while leaving English-looking spans alone.

    Spans matching any of the "preserve" patterns (e-mail addresses, URLs,
    dates, ASCII words of three or more letters, numbers followed by
    letters) are copied through verbatim.  The text between those spans is
    converted with ``convert_preeti_segment`` only when ``is_preeti_text``
    recognises it; otherwise it is copied through as well.

    Whitespace between spans (spaces, tabs, newlines) is always kept, so
    the layout of the input survives the conversion.

    NOTE(review): the "3 or more ASCII letters" pattern also matches Preeti
    words that consist only of letters, so such words are preserved rather
    than converted.

    Args:
        text: Mixed Preeti/English input.

    Returns:
        The text with the recognised Preeti segments converted.
    """
    # Patterns to identify spans that must be preserved as-is
    patterns = [
        # Email addresses
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
        # URLs
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Date patterns
        r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b',
        # Common English words (3 or more characters)
        r'\b[A-Za-z]{3,}\b',
        # Numbers with units
        r'\b\d+\s*[A-Za-z]+\b',
    ]
    preserve = re.compile('|'.join(patterns))

    def looks_like_preeti(segment):
        # Whitespace-only gaps can never be Preeti; they are kept verbatim
        # so that word and line breaks are not lost.
        return bool(segment.strip()) and is_preeti_text(segment)

    # (segment, needs_conversion) pairs in document order
    segments = []
    last_end = 0
    for match in preserve.finditer(text):
        start, end = match.span()
        # Text between the previous match and this one
        if start > last_end:
            gap = text[last_end:start]
            segments.append((gap, looks_like_preeti(gap)))
        # The matched span itself is always preserved
        segments.append((match.group(), False))
        last_end = end

    # Text after the final match
    if last_end < len(text):
        tail = text[last_end:]
        segments.append((tail, looks_like_preeti(tail)))

    return ''.join(
        convert_preeti_segment(segment) if is_preeti else segment
        for segment, is_preeti in segments
    )
def extract_text_from_pdf(pdf_file):
    """
    Return the concatenated text of every page of the PDF at ``pdf_file``.

    Pages for which PyPDF2 yields no text contribute an empty string.  If
    anything goes wrong while opening or reading the file, the error is
    reported through ``st.error`` and an empty string is returned.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            page_texts = [
                page.extract_text() or ''
                for page in PyPDF2.PdfReader(handle).pages
            ]
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ''
    return ''.join(page_texts)
def main():
    """
    Render the Streamlit page.

    Lets the user upload a PDF or TXT file, extracts its text (PyPDF2 for
    PDFs, UTF-8 decoding for text files), runs it through ``smart_convert``
    and shows the original and converted text side by side together with a
    download button for the converted result.  Any exception raised while
    processing the upload is shown via ``st.error``.
    """
    st.title("Smart Preeti to Unicode Converter")
    st.write("This converter preserves English text while converting Preeti to Unicode")
    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    try:
        if uploaded_file.name.lower().endswith('.pdf'):
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
            text = "".join(page.extract_text() or '' for page in pdf_reader.pages)
        else:  # .txt file
            text = uploaded_file.getvalue().decode("utf-8")
        converted_text = smart_convert(text)

        col1, col2 = st.columns(2)
        # The two text areas need distinct labels: with identical labels and
        # no key, Streamlit raises DuplicateWidgetID whenever the converted
        # text equals the original.  The labels are collapsed because the
        # subheaders already title each column.
        with col1:
            st.subheader("Original Text")
            st.text_area(
                "Original Text",
                value=text,
                height=300,
                label_visibility="collapsed",
            )
        with col2:
            st.subheader("Converted Text")
            st.text_area(
                "Converted Text",
                value=converted_text,
                height=300,
                label_visibility="collapsed",
            )
        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain"
        )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
# Build the page only when this file is executed as a script, so importing
# the module (e.g. to reuse the conversion helpers) does not render the UI.
if __name__ == "__main__":
    main()