# Page header captured with the file (not part of the program):
# Hugging Face Space: rockerritesh / preeti-unicode (status: Sleeping)
# File size: 6,031 Bytes

import streamlit as st
import PyPDF2
import io
import os
import re

# Preeti keystroke -> Devanagari lookup tables.
# The three lists are indexed by a character's offset inside its ASCII range
# (see convert_segment); symbolsDict covers the remaining individual keys.

# Lowercase letters 'a'..'z'
unicodeatoz = [
    "ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्",  # a-i
    "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च",  # j-r
    "क", "त", "ग", "ख", "ध", "ह", "थ", "श",  # s-z
]

# Uppercase letters 'A'..'Z'
unicodeAtoZ = [
    "ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्",  # A-I
    "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्",  # J-R
    "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्",  # S-Z
]

# Digits '0'..'9'
unicode0to9 = [
    "ण्", "ज्ञ", "द्द", "घ", "द्ध",  # 0-4
    "छ", "ट", "ठ", "ड", "ढ",  # 5-9
]

# Punctuation and other single keys, including a few non-ASCII ones.
symbolsDict = {
    "~": "ञ्", "`": "ञ",
    "!": "१", "@": "२", "#": "३", "$": "४", "%": "५",
    "^": "६", "&": "७", "*": "८", "(": "९", ")": "०",
    "-": "(", "_": ")",
    "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै",
    "\\": "्", "|": "्र",
    ";": "स", ":": "स्", "'": "ु", "\"": "ू",
    ",": ",", "<": "?", ".": "।",
    ">": "श्र", "/": "र", "?": "रु", "=": ".",
    "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/",
}

# Multi-key Preeti sequences that are handled as a unit.
# get_preeti_segment matches these keys longest-first; normalize_preeti
# substitutes them in the insertion order below, so keep the order stable.
# Note that "qm" is rewritten to another Preeti sequence ("s|"), which is
# then converted character by character like any other input.
preeti_compounds = {
    "qm": "s|",
    "f]": "ो", "km": "फ", "0f": "ण",
    "If": "क्ष", "if": "ष", "cf": "आ",
    "6«": "ट्र", "g]": "ने", "8f": "डा",
    "«": "्र", "j|m": "क्र", ";+": "सं",
}

def is_nepali_unicode(char):
    """Report whether ``char`` falls in the range U+0900 through U+097F."""
    range_start, range_end = '\u0900', '\u097F'
    return not (char < range_start or char > range_end)

def get_preeti_segment(text, start_idx):
    """
    Extract one contiguous Preeti segment from ``text``.

    Scanning begins at ``start_idx`` and consumes characters until the end
    of the text, a whitespace character, or a character for which
    ``is_nepali_unicode`` is true. Keys of ``preeti_compounds`` are consumed
    whole, and are checked before the whitespace/Unicode stop test.

    Args:
        text: The full input string.
        start_idx: Index at which to start scanning.

    Returns:
        A ``(segment, end_idx)`` tuple, where ``end_idx`` is the index of the
        first character not consumed. If ``start_idx`` is at or past the end
        of ``text``, returns ``("", start_idx)``.
    """
    if start_idx >= len(text):
        return "", start_idx

    # Longest keys first so a longer compound wins over a shorter one that
    # matches at the same position. Sorted once instead of per character.
    compounds = sorted(preeti_compounds, key=len, reverse=True)

    pieces = []
    idx = start_idx
    text_len = len(text)

    while idx < text_len:
        # startswith(prefix, start) tests in place, avoiding a copy of the
        # remaining text at every position.
        compound = next((c for c in compounds if text.startswith(c, idx)), None)
        if compound is not None:
            pieces.append(compound)
            idx += len(compound)
            continue

        char = text[idx]
        if char.isspace() or is_nepali_unicode(char):
            break
        pieces.append(char)
        idx += 1

    return "".join(pieces), idx

def normalize_preeti(preetitxt):
    """
    Pre-process Preeti text before per-character conversion.

    First every key of ``preeti_compounds`` is substituted (in the dict's
    iteration order). Then each 'l' that has a following character is
    dropped and 'ि' is emitted *after* that following character; an 'l'
    at the very end of the text is left untouched.
    """
    for preeti_seq, replacement in preeti_compounds.items():
        preetitxt = preetitxt.replace(preeti_seq, replacement)

    # NOTE(review): only the single following character is moved ahead of
    # the vowel sign; multi-character clusters are not treated specially.
    pieces = []
    pos = 0
    last = len(preetitxt) - 1
    while pos <= last:
        current = preetitxt[pos]
        if current == 'l' and pos < last:
            pieces.append(preetitxt[pos + 1] + 'ि')
            pos += 2
        else:
            pieces.append(current)
            pos += 1

    return ''.join(pieces)

def convert_segment(segment):
    """
    Convert a single Preeti segment to Unicode.

    The segment is returned unchanged when it is empty/whitespace-only or
    when every non-whitespace character already satisfies
    ``is_nepali_unicode``. Otherwise it is passed through
    ``normalize_preeti`` and each resulting character is mapped:

    * 'a'-'z', 'A'-'Z' and '0'-'9' via the positional tables
      ``unicodeatoz``, ``unicodeAtoZ`` and ``unicode0to9``;
    * anything else via ``symbolsDict``, falling back to the character
      itself when it has no entry.

    Args:
        segment: Text to convert.

    Returns:
        The converted string.
    """
    if not segment.strip():
        return segment

    # Already-converted text must not be run through the tables again.
    if all(is_nepali_unicode(ch) for ch in segment if not ch.isspace()):
        return segment

    pieces = []
    for ch in normalize_preeti(segment):
        if 'a' <= ch <= 'z':
            pieces.append(unicodeatoz[ord(ch) - ord('a')])
        elif 'A' <= ch <= 'Z':
            pieces.append(unicodeAtoZ[ord(ch) - ord('A')])
        elif '0' <= ch <= '9':
            pieces.append(unicode0to9[ord(ch) - ord('0')])
        else:
            # Look up every remaining character, not just ASCII ones:
            # symbolsDict also has non-ASCII keys ("ˆ", "Î", "å", "÷") that
            # an ASCII-only check would never reach. Characters without an
            # entry (including existing Devanagari) pass through as-is.
            pieces.append(symbolsDict.get(ch, ch))

    return ''.join(pieces)

def smart_convert_mixed(text):
    """
    Convert text that mixes Preeti input with existing Nepali Unicode.

    The text is walked left to right. Whitespace and characters for which
    ``is_nepali_unicode`` is true are copied verbatim. At any other
    character that can begin Preeti input, a whole segment is pulled out
    with ``get_preeti_segment`` and handed to ``convert_segment``. Any
    remaining character is copied unchanged.

    A character can begin Preeti input when it is ASCII, is a key of
    ``symbolsDict``, or starts a key of ``preeti_compounds``. The last two
    cover the non-ASCII Preeti glyphs, so a word that opens with one of
    them is processed as a single segment rather than being split after
    its first character.

    Args:
        text: The input string.

    Returns:
        The converted string.
    """
    compound_prefixes = tuple(preeti_compounds)
    pieces = []
    idx = 0
    text_len = len(text)

    while idx < text_len:
        char = text[idx]

        # Whitespace and existing Devanagari are preserved as-is.
        if char.isspace() or is_nepali_unicode(char):
            pieces.append(char)
            idx += 1
            continue

        starts_preeti = (
            char.isascii()
            or char in symbolsDict
            or text.startswith(compound_prefixes, idx)
        )
        if starts_preeti:
            preeti_segment, new_idx = get_preeti_segment(text, idx)
            if preeti_segment:
                pieces.append(convert_segment(preeti_segment))
                idx = new_idx
                continue

        # Anything else is outside the Preeti tables; keep it untouched.
        pieces.append(char)
        idx += 1

    return "".join(pieces)

def main():
    """
    Render the Streamlit page.

    Shows a title, a description and an input text area. When the
    "Convert" button is pressed with non-empty input, the text is run
    through ``smart_convert_mixed`` and the result is shown in a second
    text area together with a download button for a UTF-8 text file.
    """
    st.title("Advanced Mixed Text Converter")
    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")

    # Input area
    input_text = st.text_area("Enter text to convert", height=200)

    convert_clicked = st.button("Convert")
    if not (convert_clicked and input_text):
        return

    converted_text = smart_convert_mixed(input_text)

    st.subheader("Converted Text")
    # NOTE(review): the output area is created with an empty label; check
    # whether the installed Streamlit version warns about that.
    st.text_area("", value=converted_text, height=200)

    payload = converted_text.encode("utf-8")
    st.download_button(
        label="Download Converted Text",
        data=payload,
        file_name="converted_text.txt",
        mime="text/plain",
    )


if __name__ == "__main__":
    main()