ravi259 committed on
Commit
52d663a
·
1 Parent(s): e0adc8a
Files changed (2) hide show
  1. .gitignore +3 -0
  2. app.py +175 -169
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Compiled source #
2
+ ###################
3
+ *.env
app.py CHANGED
@@ -148,7 +148,7 @@ def image_to_text(image_path):
148
  st.title("Extract Loan details from PDF or Image")
149
 
150
  #subtitle
151
- st.markdown("## Loan detail extractor using OpenAI `streamlit` - hosted on 🤗 Spaces")
152
 
153
  st.markdown("Link to the app - [PDF to extract loadn details app on 🤗 Spaces](https://huggingface.co/spaces/ravi259/Loan-details-extraction-app)")
154
 
@@ -156,183 +156,189 @@ st.markdown("Link to the app - [PDF to extract loadn details app on 🤗 Spaces]
156
  file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
157
  print(file_name)
158
 
159
- if file_name is not None:
160
- st.write(file_name.name)
161
-
162
- file_details = {"FileName":file_name.name,"FileType":file_name.type}
163
- st.write(file_details)
164
- # Find the PDF path
165
- pdf_path = file_name # '/content/data/'+file_name+".pdf"
166
- st.write(pdf_path)
167
- #text_file_path = '/content/data/'+file_name+".txt"
168
- # Create a pdf file object
169
- #pdfFileObj = open(+pdf_path, 'rb')
170
- # Create a pdf reader object
171
- pdfReaded = PyPDF2.PdfReader(file_name)
172
-
173
- # Create the dictionary to extract text from each image
174
- text_per_page = {}
175
- # Create a boolean variable for image detection
176
- image_flag = False
177
-
178
- number_of_pages = len(list(extract_pages(file_name)))
179
- result = ''
180
-
181
- # We extract the pages from the PDF
182
- for pagenum, page in enumerate(extract_pages(file_name)):
183
-
184
- # Initialize the variables needed for the text extraction from the page
185
- pageObj = pdfReaded.pages[pagenum]
186
- page_text = []
187
- line_format = []
188
- text_from_images = []
189
- text_from_tables = []
190
- page_content = []
191
- # Initialize the number of the examined tables
192
- table_in_page= -1
193
- # Open the pdf file
194
- pdf = pdfplumber.open(pdf_path)
195
- # Find the examined page
196
- page_tables = pdf.pages[pagenum]
197
- # Find the number of tables in the page
198
- tables = page_tables.find_tables()
199
- if len(tables)!=0:
200
- table_in_page = 0
201
-
202
- # Extracting the tables of the page
203
- for table_num in range(len(tables)):
204
- # Extract the information of the table
205
- table = extract_table(pdf_path, pagenum, table_num)
206
- # Convert the table information in structured string format
207
- table_string = table_converter(table)
208
- # Append the table string into a list
209
- text_from_tables.append(table_string)
210
-
211
- # Find all the elements
212
- page_elements = [(element.y1, element) for element in page._objs]
213
- # Sort all the element as they appear in the page
214
- page_elements.sort(key=lambda a: a[0], reverse=True)
215
-
216
-
217
- # Find the elements that composed a page
218
- for i,component in enumerate(page_elements):
219
- # Extract the element of the page layout
220
- element = component[1]
221
-
222
- # Check the elements for tables
223
- if table_in_page == -1:
224
- pass
225
- else:
226
- if is_element_inside_any_table(element, page ,tables):
227
- table_found = find_table_for_element(element,page ,tables)
228
- if table_found == table_in_page and table_found != None:
229
- page_content.append(text_from_tables[table_in_page])
230
- #page_text.append('table')
231
- #line_format.append('table')
232
- table_in_page+=1
233
- # Pass this iteration because the content of this element was extracted from the tables
234
- continue
235
-
236
- if not is_element_inside_any_table(element,page,tables):
237
-
238
- # Check if the element is text element
239
- if isinstance(element, LTTextContainer):
240
- # Use the function to extract the text and format for each text element
241
- (line_text, format_per_line) = text_extraction(element)
242
- # Append the text of each line to the page text
243
- page_text.append(line_text)
244
- # Append the format for each line containing text
245
- line_format.append(format_per_line)
246
- page_content.append(line_text)
247
-
248
-
249
- # Check the elements for images
250
- if isinstance(element, LTFigure):
251
- # Crop the image from PDF
252
- crop_image(element, pageObj)
253
- # Convert the croped pdf to image
254
- convert_to_images('cropped_image.pdf')
255
- # Extract the text from image
256
- image_text = image_to_text('PDF_image.png')
257
- image_text = "" # removed to remove the errors with image
258
- text_from_images.append(image_text)
259
- page_content.append(image_text)
260
- # Add a placeholder in the text and format lists
261
- #page_text.append('image')
262
- #line_format.append('image')
263
- # Update the flag for image detection
264
- image_flag = True
265
-
266
-
267
- # Create the key of the dictionary
268
- dctkey = 'Page_'+str(pagenum)
269
-
270
- # Add the list of list as value of the page key
271
- #text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
272
- text_per_page[dctkey]= [page_text, text_from_images,text_from_tables, page_content]
273
- #result = result.join(page_text).join(line_format).join(text_from_images).join(text_from_tables).join(page_content)
274
-
275
-
276
- result = " "
277
- for t in range(number_of_pages):
278
- page = 'Page_'+str(t)
279
- #result = result.join(map(str, text_per_page[page]))
280
- for q in range(len(text_per_page[page])):
281
- #print(f"{''.join(map(str, text_per_page[page][q]))}")
282
- result = result + f"{''.join(map(str, text_per_page[page][q]))}"
283
-
284
-
285
- from dotenv import load_dotenv, find_dotenv
286
- _ = load_dotenv(find_dotenv()) # read local .env file
287
- openai.api_key = os.environ['OPENAI_API_KEY']
288
-
289
-
290
- template="You are a helpful assistant that annalyses a bank statement annd provides answers"
291
- system_message_prompt = SystemMessagePromptTemplate.from_template(template)
292
- human_template= "{text}"
293
- human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
294
-
295
- prompt_1 = """Loan status include details like Total Outstanding or Total Loan Amount,
296
- Start Month, Tenure in Months, Rate of interest and EMI.
297
-
298
- Extract the details from text from triple tick marks and return a JSON object ONLY with keys Total Loan Amount as Number, Start Month in format mmm-yyyy, Tenure in Months, ROI, EMI as Number.
299
-
300
- Only return the JSON.
301
- """
302
-
303
-
304
- prompt_template_1 = PromptTemplate.from_template(
305
- prompt_1 + "```{loan_data} ```"
306
- )
307
- #prompt_template_1.format(loan_data=result.lower())
308
- response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
309
 
310
- prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
311
- details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.
312
 
313
- Return a table of ALL transactions by
 
314
 
315
- 1. COMBININNG monthly transactions for each month
316
- 2. WITHOUT missing rows for ANY month
317
- 3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount
318
 
319
- from text in triple tick marks.
 
 
320
 
321
- Just return the table"""
322
 
323
- prompt_template_2 = PromptTemplate.from_template(
324
- prompt_2 + "```{response_1} {loan_data} ```"
325
- )
326
- #prompt_template_2.format(response_1 =response_1, loan_data=result.lower())
327
 
 
 
 
 
 
328
 
329
- response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=result.lower()))
330
 
 
331
  with st.spinner("🤖 AI is at Work! "):
 
 
 
 
 
 
 
 
 
 
332
  st.write(response_2)
333
- #st.success("Here you go!")
334
- st.balloons()
335
- else:
336
- st.write("Upload an Image")
337
 
338
- st.caption("Made with ❤️ by @1littlecoder. Credits to 🤗 Spaces for Hosting this ")
 
148
  st.title("Extract Loan details from PDF or Image")
149
 
150
  #subtitle
151
+ st.markdown("## Loan detail extractor using `OpenAI` and `streamlit` - hosted on 🤗 Spaces")
152
 
153
  st.markdown("Link to the app - [PDF to extract loadn details app on 🤗 Spaces](https://huggingface.co/spaces/ravi259/Loan-details-extraction-app)")
154
 
 
156
  file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
157
  print(file_name)
158
 
159
+ def read_file_get_prompts(file_name):
160
+ if file_name is not None:
161
+ st.write(file_name.name)
162
+
163
+ file_details = {"FileName":file_name.name,"FileType":file_name.type}
164
+ st.write(file_details)
165
+ # Find the PDF path
166
+ pdf_path = file_name # '/content/data/'+file_name+".pdf"
167
+ st.write(pdf_path)
168
+ #text_file_path = '/content/data/'+file_name+".txt"
169
+ # Create a pdf file object
170
+ #pdfFileObj = open(+pdf_path, 'rb')
171
+ # Create a pdf reader object
172
+ pdfReaded = PyPDF2.PdfReader(file_name)
173
+
174
+ # Create the dictionary to extract text from each image
175
+ text_per_page = {}
176
+ # Create a boolean variable for image detection
177
+ image_flag = False
178
+
179
+ number_of_pages = len(list(extract_pages(file_name)))
180
+ result = ''
181
+
182
+ # We extract the pages from the PDF
183
+ for pagenum, page in enumerate(extract_pages(file_name)):
184
+
185
+ # Initialize the variables needed for the text extraction from the page
186
+ pageObj = pdfReaded.pages[pagenum]
187
+ page_text = []
188
+ line_format = []
189
+ text_from_images = []
190
+ text_from_tables = []
191
+ page_content = []
192
+ # Initialize the number of the examined tables
193
+ table_in_page= -1
194
+ # Open the pdf file
195
+ pdf = pdfplumber.open(pdf_path)
196
+ # Find the examined page
197
+ page_tables = pdf.pages[pagenum]
198
+ # Find the number of tables in the page
199
+ tables = page_tables.find_tables()
200
+ if len(tables)!=0:
201
+ table_in_page = 0
202
+
203
+ # Extracting the tables of the page
204
+ for table_num in range(len(tables)):
205
+ # Extract the information of the table
206
+ table = extract_table(pdf_path, pagenum, table_num)
207
+ # Convert the table information in structured string format
208
+ table_string = table_converter(table)
209
+ # Append the table string into a list
210
+ text_from_tables.append(table_string)
211
+
212
+ # Find all the elements
213
+ page_elements = [(element.y1, element) for element in page._objs]
214
+ # Sort all the element as they appear in the page
215
+ page_elements.sort(key=lambda a: a[0], reverse=True)
216
+
217
+
218
+ # Find the elements that composed a page
219
+ for i,component in enumerate(page_elements):
220
+ # Extract the element of the page layout
221
+ element = component[1]
222
+
223
+ # Check the elements for tables
224
+ if table_in_page == -1:
225
+ pass
226
+ else:
227
+ if is_element_inside_any_table(element, page ,tables):
228
+ table_found = find_table_for_element(element,page ,tables)
229
+ if table_found == table_in_page and table_found != None:
230
+ page_content.append(text_from_tables[table_in_page])
231
+ #page_text.append('table')
232
+ #line_format.append('table')
233
+ table_in_page+=1
234
+ # Pass this iteration because the content of this element was extracted from the tables
235
+ continue
236
+
237
+ if not is_element_inside_any_table(element,page,tables):
238
+
239
+ # Check if the element is text element
240
+ if isinstance(element, LTTextContainer):
241
+ # Use the function to extract the text and format for each text element
242
+ (line_text, format_per_line) = text_extraction(element)
243
+ # Append the text of each line to the page text
244
+ page_text.append(line_text)
245
+ # Append the format for each line containing text
246
+ line_format.append(format_per_line)
247
+ page_content.append(line_text)
248
+
249
+
250
+ # Check the elements for images
251
+ if isinstance(element, LTFigure):
252
+ # Crop the image from PDF
253
+ crop_image(element, pageObj)
254
+ # Convert the croped pdf to image
255
+ convert_to_images('cropped_image.pdf')
256
+ # Extract the text from image
257
+ image_text = image_to_text('PDF_image.png')
258
+ image_text = "" # removed to remove the errors with image
259
+ text_from_images.append(image_text)
260
+ page_content.append(image_text)
261
+ # Add a placeholder in the text and format lists
262
+ #page_text.append('image')
263
+ #line_format.append('image')
264
+ # Update the flag for image detection
265
+ image_flag = True
266
+
267
+
268
+ # Create the key of the dictionary
269
+ dctkey = 'Page_'+str(pagenum)
270
+
271
+ # Add the list of list as value of the page key
272
+ #text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
273
+ text_per_page[dctkey]= [page_text, text_from_images,text_from_tables, page_content]
274
+ #result = result.join(page_text).join(line_format).join(text_from_images).join(text_from_tables).join(page_content)
275
+
276
+
277
+ result = " "
278
+ for t in range(number_of_pages):
279
+ page = 'Page_'+str(t)
280
+ #result = result.join(map(str, text_per_page[page]))
281
+ for q in range(len(text_per_page[page])):
282
+ #print(f"{''.join(map(str, text_per_page[page][q]))}")
283
+ result = result + f"{''.join(map(str, text_per_page[page][q]))}"
284
+ return result
285
+
286
+
287
+
288
+ template="You are a helpful assistant that annalyses a bank statement annd provides answers"
289
+ system_message_prompt = SystemMessagePromptTemplate.from_template(template)
290
+ human_template= "{text}"
291
+ human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
292
+
293
+ prompt_1 = """Loan status include details like Total Outstanding or Total Loan Amount,
294
+ Start Month, Tenure in Months, Rate of interest and EMI.
295
+
296
+ Extract the details from text from triple tick marks and return a JSON object ONLY with keys Total Loan Amount as Number, Start Month in format mmm-yyyy, Tenure in Months, ROI, EMI as Number.
297
+
298
+ Only return the JSON.
299
+ """
300
+
301
+
302
+ prompt_template_1 = PromptTemplate.from_template(
303
+ prompt_1 + "```{loan_data} ```"
304
+ )
305
+ #prompt_template_1.format(loan_data=result.lower())
 
 
 
306
 
 
 
307
 
308
+ prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
309
+ details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.
310
 
311
+ Return a table of ALL transactions by
 
 
312
 
313
+ 1. COMBININNG monthly transactions for each month
314
+ 2. WITHOUT missing rows for ANY month
315
+ 3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount
316
 
317
+ from text in triple tick marks.
318
 
319
+ Just return the table"""
 
 
 
320
 
321
+ prompt_template_2 = PromptTemplate.from_template(
322
+ prompt_2 + "```{response_1} {loan_data} ```"
323
+ )
324
+ #prompt_template_2.format(response_1 =response_1, loan_data=result.lower())
325
+
326
 
 
327
 
328
+ if st.button('Get Loan Details'):
329
  with st.spinner("🤖 AI is at Work! "):
330
+ result = read_file_get_prompts(file_name)
331
+ response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
332
+ st.write(response_1)
333
+ st.balloons()
334
+
335
+ if st.button('Get Loan Transactions'):
336
+ with st.spinner("🤖 AI is at Work! "):
337
+ result = read_file_get_prompts(file_name)
338
+ #response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
339
+ response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=result.lower()))
340
  st.write(response_2)
341
+ st.balloons()
342
+
343
+ st.caption("Made with ❤️ by @ravi259. Credits to 🤗 Spaces for Hosting this ")
 
344