Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
|
|
2 |
import PyPDF2
|
3 |
import io
|
4 |
import os
|
5 |
-
|
6 |
|
7 |
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
|
8 |
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
|
@@ -101,11 +101,34 @@ def convert(preeti):
|
|
101 |
|
102 |
def extract_text_from_pdf(pdf_file):
|
103 |
text = ''
|
104 |
-
with open(pdf_file
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
|
110 |
def process_file(inputfile):
|
111 |
ext = os.path.splitext(inputfile)[1].lower()
|
|
|
2 |
import PyPDF2
|
3 |
import io
|
4 |
import os
|
5 |
+
import pdfplumber
|
6 |
|
7 |
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
|
8 |
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
|
|
|
101 |
|
102 |
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and normalise vertical runs.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        The result of ``handle_vertical_text`` applied to the combined text
        of all pages that produced any text.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            extracted_text = page.extract_text()
            # Skip pages for which extraction produced nothing (falsy result).
            if extracted_text:
                page_texts.append(extracted_text)

    # Join pages with a newline so the last line of one page is not fused
    # with the first line of the next; handle_vertical_text splits on '\n',
    # so the page boundary is treated like any other line break.
    return handle_vertical_text('\n'.join(page_texts))
|
111 |
+
|
112 |
+
def handle_vertical_text(text):
    """Collapse vertically stacked characters back into horizontal runs.

    Text that was laid out vertically tends to be extracted with one
    character per line. Consecutive one-character lines are concatenated
    into a single segment; every other non-empty line is kept as its own
    segment. All segments are then joined with a single space.

    Args:
        text: The extracted text, with lines separated by ``'\\n'`` (a
            trailing ``'\\r'`` on each line, as in CRLF input, is tolerated).

    Returns:
        A single-line string of the segments separated by spaces. Returns
        ``''`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ''

    segments = []  # output pieces, in reading order
    stacked = []   # characters collected from the current vertical run

    for raw_line in text.split('\n'):
        # Strip the carriage return left behind by CRLF line endings;
        # otherwise a one-character line would have length 2 and never match.
        line = raw_line.rstrip('\r')

        if len(line) == 1:
            # Possible vertical arrangement: a single character on its line.
            stacked.append(line)
            continue

        # Any other line ends the current vertical run.
        if stacked:
            segments.append(''.join(stacked))
            stacked = []

        # Blank lines only terminate a run; appending them would produce
        # doubled or trailing spaces in the joined result.
        if line:
            segments.append(line)

    if stacked:
        segments.append(''.join(stacked))

    return ' '.join(segments)
|
132 |
|
133 |
def process_file(inputfile):
|
134 |
ext = os.path.splitext(inputfile)[1].lower()
|