Spaces:
Sleeping
Sleeping
app changes
Browse files- app.py +170 -168
- cropped_image.pdf +0 -0
app.py
CHANGED
@@ -155,178 +155,180 @@ st.markdown("Link to the app - [PDF to extract loadn details app on 🤗 Spaces]
|
|
155 |
#image uploader
|
156 |
file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
|
157 |
print(file_name)
|
158 |
-
# Find the PDF path
|
159 |
-
pdf_path = file_name # '/content/data/'+file_name+".pdf"
|
160 |
-
#text_file_path = '/content/data/'+file_name+".txt"
|
161 |
-
# Create a pdf file object
|
162 |
-
pdfFileObj = open(pdf_path, 'rb')
|
163 |
-
# Create a pdf reader object
|
164 |
-
pdfReaded = PyPDF2.PdfReader(pdfFileObj)
|
165 |
-
|
166 |
-
# Create the dictionary that stores the extracted text of each page
|
167 |
-
text_per_page = {}
|
168 |
-
# Create a boolean variable for image detection
|
169 |
-
image_flag = False
|
170 |
-
number_of_pages = len(list(extract_pages(pdfFileObj)))
|
171 |
-
result = ''
|
172 |
-
|
173 |
-
# We extract the pages from the PDF
|
174 |
-
for pagenum, page in enumerate(extract_pages(pdf_path)):
|
175 |
-
|
176 |
-
# Initialize the variables needed for the text extraction from the page
|
177 |
-
pageObj = pdfReaded.pages[pagenum]
|
178 |
-
page_text = []
|
179 |
-
line_format = []
|
180 |
-
text_from_images = []
|
181 |
-
text_from_tables = []
|
182 |
-
page_content = []
|
183 |
-
# Initialize the number of the examined tables
|
184 |
-
table_in_page= -1
|
185 |
-
# Open the pdf file
|
186 |
-
pdf = pdfplumber.open(pdf_path)
|
187 |
-
# Find the examined page
|
188 |
-
page_tables = pdf.pages[pagenum]
|
189 |
-
# Find the number of tables in the page
|
190 |
-
tables = page_tables.find_tables()
|
191 |
-
if len(tables)!=0:
|
192 |
-
table_in_page = 0
|
193 |
-
|
194 |
-
# Extracting the tables of the page
|
195 |
-
for table_num in range(len(tables)):
|
196 |
-
# Extract the information of the table
|
197 |
-
table = extract_table(pdf_path, pagenum, table_num)
|
198 |
-
# Convert the table information in structured string format
|
199 |
-
table_string = table_converter(table)
|
200 |
-
# Append the table string into a list
|
201 |
-
text_from_tables.append(table_string)
|
202 |
-
|
203 |
-
# Find all the elements
|
204 |
-
page_elements = [(element.y1, element) for element in page._objs]
|
205 |
-
# Sort the elements in the order they appear on the page (descending y1)
|
206 |
-
page_elements.sort(key=lambda a: a[0], reverse=True)
|
207 |
-
|
208 |
-
|
209 |
-
# Iterate over the elements that compose the page
|
210 |
-
for i,component in enumerate(page_elements):
|
211 |
-
# Extract the element of the page layout
|
212 |
-
element = component[1]
|
213 |
-
|
214 |
-
# Check the elements for tables
|
215 |
-
if table_in_page == -1:
|
216 |
-
pass
|
217 |
-
else:
|
218 |
-
if is_element_inside_any_table(element, page ,tables):
|
219 |
-
table_found = find_table_for_element(element,page ,tables)
|
220 |
-
if table_found == table_in_page and table_found != None:
|
221 |
-
page_content.append(text_from_tables[table_in_page])
|
222 |
-
#page_text.append('table')
|
223 |
-
#line_format.append('table')
|
224 |
-
table_in_page+=1
|
225 |
-
# Skip this iteration because the content of this element was extracted from the tables
|
226 |
-
continue
|
227 |
-
|
228 |
-
if not is_element_inside_any_table(element,page,tables):
|
229 |
-
|
230 |
-
# Check if the element is text element
|
231 |
-
if isinstance(element, LTTextContainer):
|
232 |
-
# Use the function to extract the text and format for each text element
|
233 |
-
(line_text, format_per_line) = text_extraction(element)
|
234 |
-
# Append the text of each line to the page text
|
235 |
-
page_text.append(line_text)
|
236 |
-
# Append the format for each line containing text
|
237 |
-
line_format.append(format_per_line)
|
238 |
-
page_content.append(line_text)
|
239 |
-
|
240 |
-
|
241 |
-
# Check the elements for images
|
242 |
-
if isinstance(element, LTFigure):
|
243 |
-
# Crop the image from PDF
|
244 |
-
crop_image(element, pageObj)
|
245 |
-
# Convert the cropped PDF to an image
|
246 |
-
convert_to_images('cropped_image.pdf')
|
247 |
-
# Extract the text from image
|
248 |
-
image_text = image_to_text('PDF_image.png')
|
249 |
-
image_text = "" # removed to remove the errors with image
|
250 |
-
text_from_images.append(image_text)
|
251 |
-
page_content.append(image_text)
|
252 |
-
# Add a placeholder in the text and format lists
|
253 |
-
#page_text.append('image')
|
254 |
-
#line_format.append('image')
|
255 |
-
# Update the flag for image detection
|
256 |
-
image_flag = True
|
257 |
-
|
258 |
-
|
259 |
-
# Create the key of the dictionary
|
260 |
-
dctkey = 'Page_'+str(pagenum)
|
261 |
-
|
262 |
-
# Add the list of list as value of the page key
|
263 |
-
#text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
|
264 |
-
text_per_page[dctkey]= [page_text, text_from_images,text_from_tables, page_content]
|
265 |
-
#result = result.join(page_text).join(line_format).join(text_from_images).join(text_from_tables).join(page_content)
|
266 |
-
|
267 |
-
|
268 |
-
result = " "
|
269 |
-
for t in range(number_of_pages):
|
270 |
-
page = 'Page_'+str(t)
|
271 |
-
#result = result.join(map(str, text_per_page[page]))
|
272 |
-
for q in range(len(text_per_page[page])):
|
273 |
-
#print(f"{''.join(map(str, text_per_page[page][q]))}")
|
274 |
-
result = result + f"{''.join(map(str, text_per_page[page][q]))}"
|
275 |
-
|
276 |
-
#paid key
|
277 |
-
os.environ["OPENAI_API_KEY"]="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"
|
278 |
-
|
279 |
-
from dotenv import load_dotenv, find_dotenv
|
280 |
-
_ = load_dotenv(find_dotenv()) # read local .env file
|
281 |
-
openai.api_key = os.environ['OPENAI_API_KEY']
|
282 |
-
|
283 |
-
|
284 |
-
template="You are a helpful assistant that annalyses a bank statement annd provides answers"
|
285 |
-
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
|
286 |
-
human_template= "{text}"
|
287 |
-
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
|
288 |
-
|
289 |
-
prompt_1 = """Loan status include details like Total Outstanding or Total Loan Amount,
|
290 |
-
Start Month, Tenure in Months, Rate of interest and EMI.
|
291 |
-
|
292 |
-
Extract the details from text from triple tick marks and return a JSON object ONLY with keys Total Loan Amount as Number, Start Month in format mmm-yyyy, Tenure in Months, ROI, EMI as Number.
|
293 |
-
|
294 |
-
Only return the JSON.
|
295 |
-
"""
|
296 |
-
|
297 |
-
|
298 |
-
prompt_template_1 = PromptTemplate.from_template(
|
299 |
-
prompt_1 + "```{loan_data} ```"
|
300 |
-
)
|
301 |
-
#prompt_template_1.format(loan_data=result.lower())
|
302 |
-
response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
|
303 |
-
|
304 |
-
prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
|
305 |
-
details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.
|
306 |
-
|
307 |
-
Return a table of ALL transactions by
|
308 |
-
|
309 |
-
1. COMBININNG monthly transactions for each month
|
310 |
-
2. WITHOUT missing rows for ANY month
|
311 |
-
3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount
|
312 |
-
|
313 |
-
from text in triple tick marks.
|
314 |
-
|
315 |
-
Just return the table"""
|
316 |
-
|
317 |
-
prompt_template_2 = PromptTemplate.from_template(
|
318 |
-
prompt_2 + "```{response_1} {loan_data} ```"
|
319 |
-
)
|
320 |
-
#prompt_template_2.format(response_1 =response_1, loan_data=result.lower())
|
321 |
-
|
322 |
-
|
323 |
-
response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=result.lower()))
|
324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
326 |
|
327 |
-
reader = load_model() #load model
|
328 |
|
329 |
-
|
330 |
|
331 |
with st.spinner("🤖 AI is at Work! "):
|
332 |
st.write(response_2)
|
|
|
155 |
#image uploader
|
156 |
file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
|
157 |
print(file_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
+
if file_name is not None:
|
160 |
+
st.write(file_name.name)
|
161 |
+
|
162 |
+
file_details = {"FileName":file_name.name,"FileType":file_name.type}
|
163 |
+
st.write(file_details)
|
164 |
+
# Find the PDF path
|
165 |
+
pdf_path = file_name # '/content/data/'+file_name+".pdf"
|
166 |
+
st.write(pdf_path)
|
167 |
+
#text_file_path = '/content/data/'+file_name+".txt"
|
168 |
+
# Create a pdf file object
|
169 |
+
#pdfFileObj = open(+pdf_path, 'rb')
|
170 |
+
# Create a pdf reader object
|
171 |
+
pdfReaded = PyPDF2.PdfReader(file_name)
|
172 |
+
|
173 |
+
# Create the dictionary that stores the extracted text of each page
|
174 |
+
text_per_page = {}
|
175 |
+
# Create a boolean variable for image detection
|
176 |
+
image_flag = False
|
177 |
+
|
178 |
+
number_of_pages = len(list(extract_pages(file_name)))
|
179 |
+
result = ''
|
180 |
+
|
181 |
+
# We extract the pages from the PDF
|
182 |
+
for pagenum, page in enumerate(extract_pages(file_name)):
|
183 |
+
|
184 |
+
# Initialize the variables needed for the text extraction from the page
|
185 |
+
pageObj = pdfReaded.pages[pagenum]
|
186 |
+
page_text = []
|
187 |
+
line_format = []
|
188 |
+
text_from_images = []
|
189 |
+
text_from_tables = []
|
190 |
+
page_content = []
|
191 |
+
# Initialize the number of the examined tables
|
192 |
+
table_in_page= -1
|
193 |
+
# Open the pdf file
|
194 |
+
pdf = pdfplumber.open(pdf_path)
|
195 |
+
# Find the examined page
|
196 |
+
page_tables = pdf.pages[pagenum]
|
197 |
+
# Find the number of tables in the page
|
198 |
+
tables = page_tables.find_tables()
|
199 |
+
if len(tables)!=0:
|
200 |
+
table_in_page = 0
|
201 |
+
|
202 |
+
# Extracting the tables of the page
|
203 |
+
for table_num in range(len(tables)):
|
204 |
+
# Extract the information of the table
|
205 |
+
table = extract_table(pdf_path, pagenum, table_num)
|
206 |
+
# Convert the table information in structured string format
|
207 |
+
table_string = table_converter(table)
|
208 |
+
# Append the table string into a list
|
209 |
+
text_from_tables.append(table_string)
|
210 |
+
|
211 |
+
# Find all the elements
|
212 |
+
page_elements = [(element.y1, element) for element in page._objs]
|
213 |
+
# Sort the elements in the order they appear on the page (descending y1)
|
214 |
+
page_elements.sort(key=lambda a: a[0], reverse=True)
|
215 |
+
|
216 |
+
|
217 |
+
# Iterate over the elements that compose the page
|
218 |
+
for i,component in enumerate(page_elements):
|
219 |
+
# Extract the element of the page layout
|
220 |
+
element = component[1]
|
221 |
+
|
222 |
+
# Check the elements for tables
|
223 |
+
if table_in_page == -1:
|
224 |
+
pass
|
225 |
+
else:
|
226 |
+
if is_element_inside_any_table(element, page ,tables):
|
227 |
+
table_found = find_table_for_element(element,page ,tables)
|
228 |
+
if table_found == table_in_page and table_found != None:
|
229 |
+
page_content.append(text_from_tables[table_in_page])
|
230 |
+
#page_text.append('table')
|
231 |
+
#line_format.append('table')
|
232 |
+
table_in_page+=1
|
233 |
+
# Skip this iteration because the content of this element was extracted from the tables
|
234 |
+
continue
|
235 |
+
|
236 |
+
if not is_element_inside_any_table(element,page,tables):
|
237 |
+
|
238 |
+
# Check if the element is text element
|
239 |
+
if isinstance(element, LTTextContainer):
|
240 |
+
# Use the function to extract the text and format for each text element
|
241 |
+
(line_text, format_per_line) = text_extraction(element)
|
242 |
+
# Append the text of each line to the page text
|
243 |
+
page_text.append(line_text)
|
244 |
+
# Append the format for each line containing text
|
245 |
+
line_format.append(format_per_line)
|
246 |
+
page_content.append(line_text)
|
247 |
+
|
248 |
+
|
249 |
+
# Check the elements for images
|
250 |
+
if isinstance(element, LTFigure):
|
251 |
+
# Crop the image from PDF
|
252 |
+
crop_image(element, pageObj)
|
253 |
+
# Convert the cropped PDF to an image
|
254 |
+
convert_to_images('cropped_image.pdf')
|
255 |
+
# Extract the text from image
|
256 |
+
image_text = image_to_text('PDF_image.png')
|
257 |
+
image_text = "" # removed to remove the errors with image
|
258 |
+
text_from_images.append(image_text)
|
259 |
+
page_content.append(image_text)
|
260 |
+
# Add a placeholder in the text and format lists
|
261 |
+
#page_text.append('image')
|
262 |
+
#line_format.append('image')
|
263 |
+
# Update the flag for image detection
|
264 |
+
image_flag = True
|
265 |
+
|
266 |
+
|
267 |
+
# Create the key of the dictionary
|
268 |
+
dctkey = 'Page_'+str(pagenum)
|
269 |
+
|
270 |
+
# Add the list of list as value of the page key
|
271 |
+
#text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
|
272 |
+
text_per_page[dctkey]= [page_text, text_from_images,text_from_tables, page_content]
|
273 |
+
#result = result.join(page_text).join(line_format).join(text_from_images).join(text_from_tables).join(page_content)
|
274 |
+
|
275 |
+
|
276 |
+
result = " "
|
277 |
+
for t in range(number_of_pages):
|
278 |
+
page = 'Page_'+str(t)
|
279 |
+
#result = result.join(map(str, text_per_page[page]))
|
280 |
+
for q in range(len(text_per_page[page])):
|
281 |
+
#print(f"{''.join(map(str, text_per_page[page][q]))}")
|
282 |
+
result = result + f"{''.join(map(str, text_per_page[page][q]))}"
|
283 |
+
|
284 |
+
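# Paid key. NOTE(review): the secret is hard-coded on the next line and is now public in this commit; load it from the environment/.env (or the Space's secrets) instead and revoke/rotate the exposed key.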
|
285 |
+
os.environ["OPENAI_API_KEY"]="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"
|
286 |
+
|
287 |
+
from dotenv import load_dotenv, find_dotenv
|
288 |
+
_ = load_dotenv(find_dotenv()) # read local .env file
|
289 |
+
openai.api_key = os.environ['OPENAI_API_KEY']
|
290 |
+
|
291 |
+
|
292 |
+
template="You are a helpful assistant that annalyses a bank statement annd provides answers"
|
293 |
+
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
|
294 |
+
human_template= "{text}"
|
295 |
+
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
|
296 |
+
|
297 |
+
prompt_1 = """Loan status include details like Total Outstanding or Total Loan Amount,
|
298 |
+
Start Month, Tenure in Months, Rate of interest and EMI.
|
299 |
+
|
300 |
+
Extract the details from text from triple tick marks and return a JSON object ONLY with keys Total Loan Amount as Number, Start Month in format mmm-yyyy, Tenure in Months, ROI, EMI as Number.
|
301 |
+
|
302 |
+
Only return the JSON.
|
303 |
+
"""
|
304 |
+
|
305 |
+
|
306 |
+
prompt_template_1 = PromptTemplate.from_template(
|
307 |
+
prompt_1 + "```{loan_data} ```"
|
308 |
+
)
|
309 |
+
#prompt_template_1.format(loan_data=result.lower())
|
310 |
+
response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
|
311 |
+
|
312 |
+
prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
|
313 |
+
details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.
|
314 |
|
315 |
+
Return a table of ALL transactions by
|
316 |
+
|
317 |
+
1. COMBININNG monthly transactions for each month
|
318 |
+
2. WITHOUT missing rows for ANY month
|
319 |
+
3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount
|
320 |
+
|
321 |
+
from text in triple tick marks.
|
322 |
+
|
323 |
+
Just return the table"""
|
324 |
+
|
325 |
+
prompt_template_2 = PromptTemplate.from_template(
|
326 |
+
prompt_2 + "```{response_1} {loan_data} ```"
|
327 |
+
)
|
328 |
+
#prompt_template_2.format(response_1 =response_1, loan_data=result.lower())
|
329 |
|
|
|
330 |
|
331 |
+
response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=result.lower()))
|
332 |
|
333 |
with st.spinner("🤖 AI is at Work! "):
|
334 |
st.write(response_2)
|
cropped_image.pdf
ADDED
Binary file (164 kB). View file
|
|