Spaces:

rockerritesh
/

preeti-unicode

Sleeping

App Files Files Community

preeti-unicode / app.py

rockerritesh

Update app.py

2d857e8 verified 8 months ago

raw

history blame

6.03 kB

	import streamlit as st
	import PyPDF2
	import io
	import os
	import re

	# Preeti-to-Unicode lookup tables, indexed by the offset of the typed key:
	# unicodeatoz[i] is the output for chr(ord('a') + i), unicodeAtoZ[i] for
	# chr(ord('A') + i) and unicode0to9[i] for chr(ord('0') + i).
	unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
	unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
	unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]

	# Punctuation and extended-Latin keys. Every key must be exactly one
	# character because the table is consulted one character at a time; the
	# pipe key is written as "|" (a "\|" literal would be a two-character key
	# that can never match).
	symbolsDict = {
	    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
	    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०", "-": "(", "_": ")",
	    "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्", "|": "्र",
	    ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।",
	    ">": "श्र", "/": "र", "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख",
	    "å": "द्व", "÷": "/",
	}

	# Multi-key Preeti sequences that are rewritten as a unit before the
	# per-character tables are applied. A replacement may itself be Preeti
	# keystrokes (e.g. 'qm' -> 's|') that are converted afterwards, or final
	# Unicode text. Insertion order is the order in which replacements run.
	preeti_compounds = {
	    'qm': 's|',
	    'f]': 'ो',
	    'km': 'फ',
	    '0f': 'ण',
	    'If': 'क्ष',
	    'if': 'ष',
	    'cf': 'आ',
	    '6«': 'ट्र',
	    'g]': 'ने',
	    '8f': 'डा',
	    '«': '्र',
	    'j|m': 'क्र',
	    ';+': 'सं',
	}

	def is_nepali_unicode(char):
	    """Report whether ``char`` falls inside the U+0900..U+097F range.

	    Both ends of the range are inclusive. The check is an ordinary string
	    comparison against the two boundary characters.
	    """
	    range_start = '\u0900'
	    range_end = '\u097F'
	    if char < range_start:
	        return False
	    return char <= range_end

	def get_preeti_segment(text, start_idx):
	    """Extract one run of Preeti text beginning at ``start_idx``.

	    The run grows until whitespace, a character for which
	    ``is_nepali_unicode`` is true, or the end of ``text`` is reached. At
	    every position the keys of ``preeti_compounds`` are tried first,
	    longest key first, and a matching key is consumed as a whole before the
	    stop check is applied to a single character.

	    Args:
	        text: The full input string.
	        start_idx: Index in ``text`` at which the segment starts.

	    Returns:
	        A ``(segment, end_idx)`` tuple, where ``end_idx`` is the index of the
	        first character after the segment. ``("", start_idx)`` is returned
	        when ``start_idx`` is at or past the end of ``text``.
	    """
	    text_len = len(text)
	    if start_idx >= text_len:
	        return "", start_idx

	    # Sort once per call instead of once per character; the table does not
	    # change while a segment is being scanned.
	    compounds = sorted(preeti_compounds, key=len, reverse=True)

	    pieces = []
	    current_idx = start_idx
	    while current_idx < text_len:
	        matched = None
	        for compound in compounds:
	            # startswith(prefix, start) tests in place, so no copy of the
	            # remaining text is made for every probe.
	            if text.startswith(compound, current_idx):
	                matched = compound
	                break

	        if matched is not None:
	            pieces.append(matched)
	            current_idx += len(matched)
	            continue

	        char = text[current_idx]
	        if char.isspace() or is_nepali_unicode(char):
	            break
	        pieces.append(char)
	        current_idx += 1

	    return "".join(pieces), current_idx

	def normalize_preeti(preetitxt):
	    """Rewrite compound sequences, then reorder around the 'l' key.

	    Every key of ``preeti_compounds`` is replaced by its value, in the
	    table's iteration order. Afterwards each 'l' that has a following
	    character is dropped and that following character is emitted with 'ि'
	    appended; an 'l' at the very end of the text is kept unchanged.
	    """
	    for sequence, replacement in preeti_compounds.items():
	        preetitxt = preetitxt.replace(sequence, replacement)

	    pieces = []
	    swap_pending = False  # True right after an 'l' that awaits its partner
	    for symbol in preetitxt:
	        if swap_pending:
	            pieces.append(symbol + 'ि')
	            swap_pending = False
	        elif symbol == 'l':
	            swap_pending = True
	        else:
	            pieces.append(symbol)

	    # An 'l' with nothing after it stays in the output as typed.
	    if swap_pending:
	        pieces.append('l')

	    return ''.join(pieces)

	def convert_segment(segment):
	    """Convert a single Preeti segment to Unicode.

	    A segment that is empty or whitespace-only, or whose non-whitespace
	    characters all satisfy ``is_nepali_unicode``, is returned unchanged.
	    Otherwise the segment is passed through ``normalize_preeti`` and each
	    resulting character is mapped:

	    * 'a'-'z', 'A'-'Z' and '0'-'9' index into ``unicodeatoz``,
	      ``unicodeAtoZ`` and ``unicode0to9`` respectively;
	    * any other character is looked up in ``symbolsDict`` and kept as is
	      when it has no entry. This includes the non-ASCII keys of
	      ``symbolsDict``, which an ASCII-only lookup would never reach.

	    Args:
	        segment: Text to convert.

	    Returns:
	        The converted string.
	    """
	    if not segment.strip():
	        return segment

	    # Nothing to do when every visible character is already Devanagari.
	    if all(is_nepali_unicode(char) for char in segment if char.strip()):
	        return segment

	    converted = []
	    for char in normalize_preeti(segment):
	        # The range checks keep each index inside its table, so no
	        # exception handling is needed around the lookups.
	        if 'a' <= char <= 'z':
	            converted.append(unicodeatoz[ord(char) - ord('a')])
	        elif 'A' <= char <= 'Z':
	            converted.append(unicodeAtoZ[ord(char) - ord('A')])
	        elif '0' <= char <= '9':
	            converted.append(unicode0to9[ord(char) - ord('0')])
	        else:
	            # Covers ASCII punctuation and the extended-Latin keys alike;
	            # characters without an entry fall through unchanged.
	            converted.append(symbolsDict.get(char, char))

	    return ''.join(converted)

	def smart_convert_mixed(text):
	    """Convert text that may mix Preeti keystrokes with Nepali Unicode.

	    The text is scanned left to right:

	    * whitespace and characters for which ``is_nepali_unicode`` is true are
	      copied through unchanged;
	    * an ASCII character starts a segment obtained from
	      ``get_preeti_segment``, which is converted with ``convert_segment``;
	    * any other character is looked up in ``symbolsDict`` so that the
	      non-ASCII keys of that table are converted when they begin a word,
	      and is copied unchanged when it has no entry.

	    Args:
	        text: The input string.

	    Returns:
	        The converted string.
	    """
	    pieces = []
	    idx = 0
	    text_len = len(text)

	    while idx < text_len:
	        char = text[idx]

	        # Whitespace and existing Devanagari are preserved verbatim.
	        if char.isspace() or is_nepali_unicode(char):
	            pieces.append(char)
	            idx += 1
	            continue

	        # An ASCII character may begin a Preeti run; convert the whole run.
	        if char.isascii():
	            preeti_segment, new_idx = get_preeti_segment(text, idx)
	            if preeti_segment:
	                pieces.append(convert_segment(preeti_segment))
	                idx = new_idx
	                continue

	        # Remaining characters: translate the ones the symbol table knows
	        # and keep everything else as typed.
	        pieces.append(symbolsDict.get(char, char))
	        idx += 1

	    return "".join(pieces)

	def main():
	    """Render the page: title, input box, Convert button and results.

	    When the button is pressed and the input is non-empty, the converted
	    text is shown in a second text area and offered as a UTF-8 encoded
	    download.
	    """
	    st.title("Advanced Mixed Text Converter")
	    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")

	    # Input area
	    input_text = st.text_area("Enter text to convert", height=200)

	    convert_clicked = st.button("Convert")
	    if not convert_clicked or not input_text:
	        return

	    converted_text = smart_convert_mixed(input_text)

	    st.subheader("Converted Text")
	    st.text_area("", value=converted_text, height=200)

	    download_options = {
	        "label": "Download Converted Text",
	        "data": converted_text.encode("utf-8"),
	        "file_name": "converted_text.txt",
	        "mime": "text/plain",
	    }
	    st.download_button(**download_options)


	# Call main() when this file is executed as the main program.
	if __name__ == "__main__":
	    main()