# NOTE: The lines that originally stood here were residue from a web file
# viewer (status "Sleeping", "File size: 7,205 Bytes", a column of per-line
# commit hashes and a 1-207 line-number gutter). They are not part of the
# program and have been reduced to this comment so the module parses.
import streamlit as st
import PyPDF2
import io
import os
import re
# Character mapping tables used by convert_preeti_segment.
#
# unicodeatoz: replacement strings for the lowercase ASCII letters; entry i is
# the replacement for chr(ord('a') + i).
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
# unicodeAtoZ: replacement strings for the uppercase ASCII letters; entry i is
# the replacement for chr(ord('A') + i).
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
# unicode0to9: replacement strings for the ASCII digits; entry i is the
# replacement for the digit character str(i).
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# symbolsDict: replacements for every other character (punctuation and a few
# non-ASCII symbols). Characters that are not keys here are passed through
# unchanged by convert_preeti_segment.
symbolsDict = {
"~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
"^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
"+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
"å": "द्व", "÷": "/"
}
def is_preeti_text(text):
    """
    Heuristically decide whether *text* looks like Preeti-encoded input.

    The text is searched for a handful of two-character sequences and for a
    run of two or more ASCII letters directly followed by one of the
    characters ``\\ | [ ] { }``. Returns True as soon as any of these is found
    anywhere in *text*, otherwise False.
    """
    # Two-character sequences that are treated as Preeti indicators.
    digraph_markers = (r'cf', r'qm', r'If', r'0f', r'km', r'f]')
    # Letters immediately followed by a bracket, brace, pipe or backslash.
    letters_then_sign = r'[a-zA-Z]{2,}[\\|\[\]{}]'

    for marker in digraph_markers + (letters_then_sign,):
        if re.search(marker, text) is not None:
            return True
    return False
def normalizePreeti(preetitxt):
    """Reorder Preeti keystrokes so they can be mapped one character at a time.

    Three things happen, in this order:

    1. A fixed set of two-character sequences is replaced (for example
       ``'qm'`` becomes ``'s|'`` and ``'cf'`` becomes ``'आ'``).
    2. ``'{'`` is moved in front of the character that precedes it. If that
       character is followed by ``'f'`` or ``'ो'`` before the ``'{'``, the
       brace is moved in front of both. A lone ``'f'`` directly before
       ``'{'`` is left where it is.
    3. ``'l'`` is moved behind the character that follows it.

    Compared with the previous implementation, a pending ``'l'`` is no longer
    lost: it is attached to the character handled by the ``'{'`` rules, it is
    emitted when another ``'l'`` follows immediately, and it is emitted at the
    end of the input when nothing follows it.

    NOTE(review): replacements that expand to several code points (such as
    ``'If'`` -> ``'क्ष'``) are walked code point by code point, so a pending
    ``'l'`` is placed after the first code point only - confirm whether that
    is intended.

    Args:
        preetitxt: The raw Preeti-encoded string.

    Returns:
        The reordered string; an empty input yields an empty string.
    """
    # Applied sequentially, in this order, exactly as before.
    replacements = (
        ('qm', 's|'),
        ('f]', 'ो'),
        ('km', 'फ'),
        ('0f', 'ण'),
        ('If', 'क्ष'),
        ('if', 'ष'),
        ('cf', 'आ'),
    )
    for old, new in replacements:
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    pending = ''  # an 'l' waiting for the character it belongs behind
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        # Explicit bounds checks replace the old try/except IndexError.
        following = preetitxt[index + 1] if index + 1 < length else ''
        after_following = preetitxt[index + 2] if index + 2 < length else ''

        # "<char><f|ो>{"  ->  "{<char><f|ो>"
        if after_following == '{' and following in ('f', 'ो'):
            pieces.append('{' + character + pending + following)
            pending = ''
            index += 3
            continue

        # "<char>{"  ->  "{<char>"   (but never for a bare 'f')
        if following == '{' and character != 'f':
            pieces.append('{' + character + pending)
            pending = ''
            index += 2
            continue

        if character == 'l':
            # Keep an earlier, still unplaced 'l' instead of overwriting it.
            if pending:
                pieces.append(pending)
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    # A trailing 'l' has nothing to attach to; emit it rather than drop it.
    if pending:
        pieces.append(pending)
    return ''.join(pieces)
def convert_preeti_segment(preeti):
    """Translate one Preeti-encoded segment to Unicode.

    The segment is first reordered by normalizePreeti. Each resulting
    character is then replaced: lowercase ASCII letters via ``unicodeatoz``,
    uppercase ASCII letters via ``unicodeAtoZ``, ASCII digits via
    ``unicode0to9`` and everything else via ``symbolsDict``. A character with
    no mapping is copied through unchanged.
    """
    pieces = []
    for ch in normalizePreeti(preeti):
        try:
            if 'a' <= ch <= 'z':
                mapped = unicodeatoz[ord(ch) - ord('a')]
            elif 'A' <= ch <= 'Z':
                mapped = unicodeAtoZ[ord(ch) - ord('A')]
            elif '0' <= ch <= '9':
                mapped = unicode0to9[ord(ch) - ord('0')]
            else:
                mapped = symbolsDict.get(ch, ch)
        except (KeyError, IndexError):
            # Fall back to the raw character if a table lookup fails.
            mapped = ch
        pieces.append(mapped)
    return ''.join(pieces)
def smart_convert(text):
    """
    Convert Preeti text to Unicode while leaving English-looking spans alone.

    The input is scanned for e-mail addresses, URLs, dates, ASCII words of
    three or more letters, and numbers followed by letters. Those matches are
    copied to the output unchanged. The text between matches is converted
    with convert_preeti_segment when is_preeti_text says it looks like
    Preeti, and copied unchanged otherwise.

    Whitespace-only gaps between matches are now preserved verbatim. The
    previous implementation discarded them, which glued neighbouring words
    together and removed line breaks.

    NOTE(review): the ``[A-Za-z]{3,}`` pattern also matches runs of three or
    more letters inside Preeti-encoded input, and such runs are preserved
    rather than converted - confirm this is the intended trade-off.

    Args:
        text: The full input text.

    Returns:
        The text with Preeti-looking gaps converted and everything else kept.
    """
    # Patterns to identify spans that must be preserved as-is
    patterns = [
        # Email addresses
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
        # URLs
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Date patterns
        r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b',
        # Common English words (3 or more characters)
        r'\b[A-Za-z]{3,}\b',
        # Numbers with units
        r'\b\d+\s*[A-Za-z]+\b',
    ]
    combined_pattern = '|'.join(patterns)

    output = []

    def emit_gap(gap):
        # Blank gaps (spaces, tabs, newlines) carry layout only: keep them
        # untouched and skip the Preeti check entirely.
        if gap.strip() and is_preeti_text(gap):
            output.append(convert_preeti_segment(gap))
        else:
            output.append(gap)

    last_end = 0
    for match in re.finditer(combined_pattern, text):
        start, end = match.span()
        if start > last_end:
            emit_gap(text[last_end:start])
        # Matched spans are always preserved.
        output.append(match.group())
        last_end = end

    if last_end < len(text):
        emit_gap(text[last_end:])

    return ''.join(output)
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF file.

    *pdf_file* is opened in binary mode with ``open`` and read through
    ``PyPDF2.PdfReader``. Pages for which ``extract_text()`` returns a falsy
    value contribute an empty string. If anything raises while opening or
    reading, the error is reported with ``st.error`` and an empty string is
    returned.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages
            return ''.join(page.extract_text() or '' for page in pages)
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ''
def main():
    """Render the Streamlit page: upload a file, convert it, show and offer it.

    A PDF upload is read with ``PyPDF2.PdfReader``; any other upload is
    decoded as UTF-8 text. The extracted text is passed through
    smart_convert, the original and converted text are shown side by side,
    and the converted text is offered as a UTF-8 ``converted_text.txt``
    download. Any exception raised along the way is shown with ``st.error``.

    The two text areas now carry distinct labels, hidden with
    ``label_visibility="collapsed"``. Previously both used an empty label
    and no key, so whenever the converted text equalled the original the two
    widgets had identical parameters and Streamlit rejected the second one
    as a duplicate.
    """
    st.title("Smart Preeti to Unicode Converter")
    st.write("This converter preserves English text while converting Preeti to Unicode")
    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    # Top-level UI boundary: report any failure to the user instead of
    # letting the script crash.
    try:
        if uploaded_file.name.lower().endswith('.pdf'):
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
            text = "".join(page.extract_text() or '' for page in pdf_reader.pages)
        else:  # .txt file
            text = uploaded_file.getvalue().decode("utf-8")

        converted_text = smart_convert(text)

        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Original Text")
            # The subheader already names the box, so the label is hidden.
            st.text_area(
                "Original Text",
                value=text,
                height=300,
                label_visibility="collapsed",
            )
        with col2:
            st.subheader("Converted Text")
            st.text_area(
                "Converted Text",
                value=converted_text,
                height=300,
                label_visibility="collapsed",
            )

        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain",
        )
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
# Script entry point. The stray "|" that followed the call was not valid
# Python and has been removed.
if __name__ == "__main__":
    main()