File size: 4,825 Bytes
76f42d9
 
 
75bc1c1
76f42d9
 
 
 
 
49bb6dc
 
 
 
76f42d9
 
 
 
 
49bb6dc
76f42d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49bb6dc
76f42d9
49bb6dc
76f42d9
49bb6dc
76f42d9
49bb6dc
76f42d9
 
 
 
 
 
 
 
 
75bc1c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76f42d9
 
49bb6dc
76f42d9
 
 
 
 
 
 
b6a8855
76f42d9
49bb6dc
76f42d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4fffe7
76f42d9
 
49bb6dc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import streamlit as st
import io
import os
import pdfplumber

# Replacement strings for the lowercase ASCII letters 'a'..'z'; convert() indexes
# this list with ord(character) - 97.
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
# Replacement strings for the uppercase ASCII letters 'A'..'Z'; convert() indexes
# this list with ord(character) - 65.
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
# Replacement strings for the ASCII digits '0'..'9'; convert() indexes this list
# with ord(character) - 48.
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
# Replacements for every other character convert() encounters (punctuation and a
# few non-ASCII symbols). Characters missing from this dict are emitted unchanged.
symbolsDict = {
    "~": "ञ्", "`": "ञ", "!": "१", "@": "२", "#": "३", "$": "४", "%": "५", "^": "६", "&": "७", "*": "८",
    "(": "९", ")": "०", "-": "(", "_": ")", "+": "ं", "[": "ृ", "{": "र्", "]": "े", "}": "ै", "\\": "्",
    "|": "्र", ";": "स", ":": "स्", "'": "ु", "\"": "ू", ",": ",", "<": "?", ".": "।", ">": "श्र", "/": "र",
    "?": "रु", "=": ".", "ˆ": "फ्", "Î": "ङ्ख", "å": "द्व", "÷": "/"
}

def normalizePreeti(preetitxt):
    """Reorder Preeti-encoded text so it can be mapped character by character.

    Three things happen, in this order:

    1. A fixed list of two-character sequences is collapsed (e.g. ``'cf'``
       becomes ``'आ'``). The replacements run sequentially, so their order
       matters.
    2. A ``'{'`` (mapped to ``'र्'`` by the symbol table) is typed *after* the
       syllable it belongs to; it is moved in front of that syllable. The
       syllable is either one character, or one character followed by
       ``'f'`` / ``'ो'``.
    3. An ``'l'`` (mapped to ``'ि'``) is typed *before* its consonant; it is
       held back and emitted right after the next syllable instead.

    Args:
        preetitxt: Text in Preeti key encoding.

    Returns:
        The reordered text, still in Preeti encoding apart from the sequences
        replaced in step 1.
    """
    digraphs = (
        ('qm', 's|'), ('f]', 'ो'), ('km', 'फ'), ('0f', 'ण'),
        ('If', 'क्ष'), ('if', 'ष'), ('cf', 'आ'),
    )
    for old, new in digraphs:
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    pending = ''  # an 'l' waiting for the syllable it must follow
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]
        # Explicit bounds checks instead of catching IndexError: probing
        # index + 2 first used to abort the index + 1 check as well, so a
        # trailing '{' at the very end of the text was never reordered.
        following = preetitxt[index + 1] if index + 1 < length else ''
        after_following = preetitxt[index + 2] if index + 2 < length else ''

        if after_following == '{' and following in ('f', 'ो'):
            # consonant + vowel sign + '{'  ->  '{' + consonant + vowel sign
            pieces.append('{' + character + following + pending)
            pending = ''
            index += 3
            continue
        if following == '{' and character != 'f':
            # consonant + '{'  ->  '{' + consonant
            pieces.append('{' + character + pending)
            pending = ''
            index += 2
            continue
        if character == 'l':
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    if pending:
        # A dangling 'l' with nothing after it is kept rather than dropped.
        pieces.append(pending)
    return ''.join(pieces)

def convert(preeti):
    """Convert Preeti-encoded text to its Unicode replacement text.

    The input is first passed through normalizePreeti(); each resulting
    character is then replaced via the lookup tables: lowercase ASCII letters
    through unicodeatoz, uppercase through unicodeAtoZ, digits through
    unicode0to9, and everything else through symbolsDict. A character with no
    entry in symbolsDict is copied to the output unchanged.
    """
    pieces = []
    for glyph in normalizePreeti(preeti):
        code = ord(glyph)
        if 97 <= code <= 122:      # 'a'..'z'
            pieces.append(unicodeatoz[code - 97])
        elif 65 <= code <= 90:     # 'A'..'Z'
            pieces.append(unicodeAtoZ[code - 65])
        elif 48 <= code <= 57:     # '0'..'9'
            pieces.append(unicode0to9[code - 48])
        else:
            pieces.append(symbolsDict.get(glyph, glyph))
    return ''.join(pieces)

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF as one string.

    Args:
        pdf_file: Anything pdfplumber.open() accepts; main() passes an
            io.BytesIO holding the uploaded bytes.

    Returns:
        The text of all pages, post-processed by handle_vertical_text().
        Pages for which pdfplumber yields no text are skipped.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            extracted_text = page.extract_text()
            if extracted_text:
                page_texts.append(extracted_text)
    # Join pages with a newline so the last line of one page is not fused
    # with the first line of the next before the text is split into lines.
    return handle_vertical_text('\n'.join(page_texts))

def handle_vertical_text(text):
    """Flatten text to a single line, merging runs of one-character lines.

    The text is split on newlines. Consecutive lines that hold exactly one
    character are treated as a possibly vertical arrangement and are glued
    together into one token; every other line (including an empty one) is
    kept as is. All resulting pieces are joined with single spaces.
    """
    merged = []
    run = []  # single-character lines collected since the last longer line
    for row in text.split('\n'):
        if len(row) == 1:
            run.append(row)
            continue
        if run:
            merged.append(''.join(run))
            run = []
        merged.append(row)
    if run:
        merged.append(''.join(run))
    return ' '.join(merged)

def main():
    """Render the Streamlit page: upload a file, convert it, show and offer the result.

    A PDF upload is read through extract_text_from_pdf(); any other upload is
    treated as a text file. The text is passed to convert(), both versions are
    displayed, and the converted text is offered as a UTF-8 download.
    """
    st.title("PDF/TXT to Unicode Converter (Nepali RAG)")

    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])

    if uploaded_file is not None:
        file_extension = os.path.splitext(uploaded_file.name)[1].lower()

        if file_extension == ".pdf":
            text = extract_text_from_pdf(io.BytesIO(uploaded_file.read()))
        else:  # .txt file
            raw = uploaded_file.getvalue()
            try:
                # utf-8-sig also strips a leading byte-order mark if present.
                text = raw.decode("utf-8-sig")
            except UnicodeDecodeError:
                # The symbol table contains characters such as 'ˆ', 'Î', 'å'
                # and '÷', which are single bytes in Windows-1252; fall back
                # to that encoding instead of crashing on non-UTF-8 uploads.
                text = raw.decode("cp1252", errors="replace")

        converted_text = convert(text)

        # The two text areas get distinct (hidden) labels: with identical
        # labels and identical content (e.g. a PDF with no extractable text)
        # Streamlit would see two indistinguishable widgets.
        st.subheader("Original Text")
        st.text_area("Original Text", value=text, height=200, label_visibility="collapsed")

        st.subheader("Converted Text")
        st.text_area("Converted Text", value=converted_text, height=200, label_visibility="collapsed")

        # Create a download button for the converted text
        st.download_button(
            label="Download Converted Text",
            data=converted_text.encode("utf-8"),
            file_name="converted_text.txt",
            mime="text/plain"
        )

    # Write footer
    st.markdown("Made with ❤️ by Sumit Yadav(https://sumityadav.com.np)")

# Build the page only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()