rockerritesh committed on
Commit
75bc1c1
·
verified ·
1 Parent(s): ac9b066

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -6
app.py CHANGED
@@ -2,7 +2,7 @@ import streamlit as st
2
  import PyPDF2
3
  import io
4
  import os
5
-
6
 
7
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
8
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
@@ -101,11 +101,34 @@ def convert(preeti):
101
 
102
  def extract_text_from_pdf(pdf_file):
103
  text = ''
104
- with open(pdf_file, 'rb') as file:
105
- reader = PyPDF2.PdfReader(file)
106
- for page in reader.pages:
107
- text += page.extract_text()
108
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  def process_file(inputfile):
111
  ext = os.path.splitext(inputfile)[1].lower()
 
2
  import PyPDF2
3
  import io
4
  import os
5
+ import pdfplumber
6
 
7
  unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
8
  unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
 
101
 
102
  def extract_text_from_pdf(pdf_file):
103
  text = ''
104
+ with pdfplumber.open(pdf_file) as pdf:
105
+ for page in pdf.pages:
106
+ extracted_text = page.extract_text()
107
+ if extracted_text:
108
+ text += extracted_text
109
+
110
+ return handle_vertical_text(text)
111
+
112
def handle_vertical_text(text):
    """Rejoin vertically laid-out text (one character per line) into words.

    The text is processed line by line. A run of consecutive lines that
    each hold exactly one character (after trimming surrounding
    whitespace) is treated as vertical text and concatenated into a single
    token. Every other non-blank line is kept as its own token. Blank
    lines end the current vertical run but add no token, so the result
    never contains doubled or trailing spaces caused by empty lines.

    Args:
        text: Text whose lines are separated by line breaks.

    Returns:
        All tokens joined by single spaces; an empty string when ``text``
        contains no non-blank lines.
    """
    tokens = []
    vertical_run = []

    def flush_run():
        # Emit the characters collected so far as one horizontal token.
        if vertical_run:
            tokens.append(''.join(vertical_run))
            vertical_run.clear()

    for raw_line in text.splitlines():
        # Trim so stray spaces or a leftover '\r' do not hide a
        # single-character line from the vertical-text check.
        line = raw_line.strip()
        if len(line) == 1:
            vertical_run.append(line)
            continue
        flush_run()
        if line:
            tokens.append(line)

    flush_run()
    return ' '.join(tokens)
132
 
133
  def process_file(inputfile):
134
  ext = os.path.splitext(inputfile)[1].lower()