Spaces:

rockerritesh
/

preeti-unicode

Sleeping

App Files Files Community

rockerritesh committed on Sep 27, 2024

Commit

75bc1c1

verified ·

1 Parent(s): ac9b066

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -6

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import streamlit as st
 import PyPDF2
 import io
 import os
 unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
 unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
@@ -101,11 +101,34 @@ def convert(preeti):
 def extract_text_from_pdf(pdf_file):
     text = ''
-    with open(pdf_file, 'rb') as file:
-        reader = PyPDF2.PdfReader(file)
-        for page in reader.pages:
-            text += page.extract_text()
-    return text
 def process_file(inputfile):
     ext = os.path.splitext(inputfile)[1].lower()

 import PyPDF2
 import io
 import os
+import pdfplumber
 # Two parallel 26-entry lookup tables of Devanagari strings.
 # NOTE(review): the names suggest position i corresponds to the i-th lowercase
 # (unicodeatoz) / uppercase (unicodeAtoZ) Latin letter of the Preeti-encoded
 # input; the code that indexes them is not visible here, so confirm against
 # convert().
 unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
 unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
 def extract_text_from_pdf(pdf_file):
     """Extract the text of every page of a PDF and normalise vertical runs.

     Args:
         pdf_file: Whatever ``pdfplumber.open`` accepts for the PDF to read.

     Returns:
         The text of all pages, passed through ``handle_vertical_text``.
     """
     page_texts = []
     with pdfplumber.open(pdf_file) as pdf:
         for page in pdf.pages:
             extracted_text = page.extract_text()
             # Skip pages that yield no text at all.
             if extracted_text:
                 page_texts.append(extracted_text)
     # Separate pages with a newline so the last line of one page does not
     # fuse with the first line of the next before the line-based clean-up.
     return handle_vertical_text('\n'.join(page_texts))
def handle_vertical_text(text):
    """Rejoin characters that were extracted one per line into single runs.

    Vertically set text tends to come out of a PDF as one character per
    line. Consecutive single-character lines are concatenated into one
    piece; every other non-blank line is kept verbatim. All pieces are then
    joined with a single space.

    Args:
        text: Newline-separated text.

    Returns:
        One string with the pieces separated by single spaces. Blank lines
        end the current run but add nothing to the output, so they no
        longer produce doubled or trailing spaces.
    """
    pieces = []
    stacked = []  # characters of the vertical run currently being rebuilt

    for line in text.split('\n'):
        glyph = line.strip()
        # Strip before measuring so a lone character padded with whitespace
        # still counts as part of a vertical run.
        if len(glyph) == 1:
            stacked.append(glyph)
            continue

        # Any other line (including a blank one) ends the current run.
        if stacked:
            pieces.append(''.join(stacked))
            stacked = []
        if glyph:
            pieces.append(line)

    if stacked:
        pieces.append(''.join(stacked))
    return ' '.join(pieces)
 def process_file(inputfile):
     ext = os.path.splitext(inputfile)[1].lower()