ravi259 committed on
Commit
24d7dac
·
1 Parent(s): a6117d9

app changes

Browse files
Files changed (2) hide show
  1. app.py +170 -168
  2. cropped_image.pdf +0 -0
app.py CHANGED
@@ -155,178 +155,180 @@ st.markdown("Link to the app - [PDF to extract loadn details app on 🤗 Spaces]
155
  #image uploader
156
  file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
157
  print(file_name)
158
- # Find the PDF path
159
- pdf_path = file_name # '/content/data/'+file_name+".pdf"
160
- #text_file_path = '/content/data/'+file_name+".txt"
161
- # Create a pdf file object
162
- pdfFileObj = open(pdf_path, 'rb')
163
- # Create a pdf reader object
164
- pdfReaded = PyPDF2.PdfReader(pdfFileObj)
165
-
166
- # Create the dictionary to extract text from each image
167
- text_per_page = {}
168
- # Create a boolean variable for image detection
169
- image_flag = False
170
- number_of_pages = len(list(extract_pages(pdfFileObj)))
171
- result = ''
172
-
173
- # We extract the pages from the PDF
174
- for pagenum, page in enumerate(extract_pages(pdf_path)):
175
-
176
- # Initialize the variables needed for the text extraction from the page
177
- pageObj = pdfReaded.pages[pagenum]
178
- page_text = []
179
- line_format = []
180
- text_from_images = []
181
- text_from_tables = []
182
- page_content = []
183
- # Initialize the number of the examined tables
184
- table_in_page= -1
185
- # Open the pdf file
186
- pdf = pdfplumber.open(pdf_path)
187
- # Find the examined page
188
- page_tables = pdf.pages[pagenum]
189
- # Find the number of tables in the page
190
- tables = page_tables.find_tables()
191
- if len(tables)!=0:
192
- table_in_page = 0
193
-
194
- # Extracting the tables of the page
195
- for table_num in range(len(tables)):
196
- # Extract the information of the table
197
- table = extract_table(pdf_path, pagenum, table_num)
198
- # Convert the table information in structured string format
199
- table_string = table_converter(table)
200
- # Append the table string into a list
201
- text_from_tables.append(table_string)
202
-
203
- # Find all the elements
204
- page_elements = [(element.y1, element) for element in page._objs]
205
- # Sort all the element as they appear in the page
206
- page_elements.sort(key=lambda a: a[0], reverse=True)
207
-
208
-
209
- # Find the elements that composed a page
210
- for i,component in enumerate(page_elements):
211
- # Extract the element of the page layout
212
- element = component[1]
213
-
214
- # Check the elements for tables
215
- if table_in_page == -1:
216
- pass
217
- else:
218
- if is_element_inside_any_table(element, page ,tables):
219
- table_found = find_table_for_element(element,page ,tables)
220
- if table_found == table_in_page and table_found != None:
221
- page_content.append(text_from_tables[table_in_page])
222
- #page_text.append('table')
223
- #line_format.append('table')
224
- table_in_page+=1
225
- # Pass this iteration because the content of this element was extracted from the tables
226
- continue
227
-
228
- if not is_element_inside_any_table(element,page,tables):
229
-
230
- # Check if the element is text element
231
- if isinstance(element, LTTextContainer):
232
- # Use the function to extract the text and format for each text element
233
- (line_text, format_per_line) = text_extraction(element)
234
- # Append the text of each line to the page text
235
- page_text.append(line_text)
236
- # Append the format for each line containing text
237
- line_format.append(format_per_line)
238
- page_content.append(line_text)
239
-
240
-
241
- # Check the elements for images
242
- if isinstance(element, LTFigure):
243
- # Crop the image from PDF
244
- crop_image(element, pageObj)
245
- # Convert the cropped pdf to image
246
- convert_to_images('cropped_image.pdf')
247
- # Extract the text from image
248
- image_text = image_to_text('PDF_image.png')
249
- image_text = "" # removed to remove the errors with image
250
- text_from_images.append(image_text)
251
- page_content.append(image_text)
252
- # Add a placeholder in the text and format lists
253
- #page_text.append('image')
254
- #line_format.append('image')
255
- # Update the flag for image detection
256
- image_flag = True
257
-
258
-
259
- # Create the key of the dictionary
260
- dctkey = 'Page_'+str(pagenum)
261
-
262
- # Add the list of list as value of the page key
263
- #text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
264
- text_per_page[dctkey]= [page_text, text_from_images,text_from_tables, page_content]
265
- #result = result.join(page_text).join(line_format).join(text_from_images).join(text_from_tables).join(page_content)
266
-
267
-
268
- result = " "
269
- for t in range(number_of_pages):
270
- page = 'Page_'+str(t)
271
- #result = result.join(map(str, text_per_page[page]))
272
- for q in range(len(text_per_page[page])):
273
- #print(f"{''.join(map(str, text_per_page[page][q]))}")
274
- result = result + f"{''.join(map(str, text_per_page[page][q]))}"
275
-
276
- #paid key
277
- os.environ["OPENAI_API_KEY"]="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"
278
-
279
- from dotenv import load_dotenv, find_dotenv
280
- _ = load_dotenv(find_dotenv()) # read local .env file
281
- openai.api_key = os.environ['OPENAI_API_KEY']
282
-
283
-
284
- template="You are a helpful assistant that annalyses a bank statement annd provides answers"
285
- system_message_prompt = SystemMessagePromptTemplate.from_template(template)
286
- human_template= "{text}"
287
- human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
288
-
289
- prompt_1 = """Loan status include details like Total Outstanding or Total Loan Amount,
290
- Start Month, Tenure in Months, Rate of interest and EMI.
291
-
292
- Extract the details from text from triple tick marks and return a JSON object ONLY with keys Total Loan Amount as Number, Start Month in format mmm-yyyy, Tenure in Months, ROI, EMI as Number.
293
-
294
- Only return the JSON.
295
- """
296
-
297
-
298
- prompt_template_1 = PromptTemplate.from_template(
299
- prompt_1 + "```{loan_data} ```"
300
- )
301
- #prompt_template_1.format(loan_data=result.lower())
302
- response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
303
-
304
- prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
305
- details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.
306
-
307
- Return a table of ALL transactions by
308
-
309
- 1. COMBININNG monthly transactions for each month
310
- 2. WITHOUT missing rows for ANY month
311
- 3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount
312
-
313
- from text in triple tick marks.
314
-
315
- Just return the table"""
316
-
317
- prompt_template_2 = PromptTemplate.from_template(
318
- prompt_2 + "```{response_1} {loan_data} ```"
319
- )
320
- #prompt_template_2.format(response_1 =response_1, loan_data=result.lower())
321
-
322
-
323
- response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=result.lower()))
324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
 
327
- reader = load_model() #load model
328
 
329
- if file_name is not None:
330
 
331
  with st.spinner("🤖 AI is at Work! "):
332
  st.write(response_2)
 
155
  #image uploader
156
  file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
157
  print(file_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
+ if file_name is not None:
160
+ st.write(file_name.name)
161
+
162
+ file_details = {"FileName":file_name.name,"FileType":file_name.type}
163
+ st.write(file_details)
164
+ # Find the PDF path
165
+ pdf_path = file_name # '/content/data/'+file_name+".pdf"
166
+ st.write(pdf_path)
167
+ #text_file_path = '/content/data/'+file_name+".txt"
168
+ # Create a pdf file object
169
+ #pdfFileObj = open(+pdf_path, 'rb')
170
+ # Create a pdf reader object
171
+ pdfReaded = PyPDF2.PdfReader(file_name)
172
+
173
+ # Create the dictionary to extract text from each image
174
+ text_per_page = {}
175
+ # Create a boolean variable for image detection
176
+ image_flag = False
177
+
178
+ number_of_pages = len(list(extract_pages(file_name)))
179
+ result = ''
180
+
181
+ # We extract the pages from the PDF
182
+ for pagenum, page in enumerate(extract_pages(file_name)):
183
+
184
+ # Initialize the variables needed for the text extraction from the page
185
+ pageObj = pdfReaded.pages[pagenum]
186
+ page_text = []
187
+ line_format = []
188
+ text_from_images = []
189
+ text_from_tables = []
190
+ page_content = []
191
+ # Initialize the number of the examined tables
192
+ table_in_page= -1
193
+ # Open the pdf file
194
+ pdf = pdfplumber.open(pdf_path)
195
+ # Find the examined page
196
+ page_tables = pdf.pages[pagenum]
197
+ # Find the number of tables in the page
198
+ tables = page_tables.find_tables()
199
+ if len(tables)!=0:
200
+ table_in_page = 0
201
+
202
+ # Extracting the tables of the page
203
+ for table_num in range(len(tables)):
204
+ # Extract the information of the table
205
+ table = extract_table(pdf_path, pagenum, table_num)
206
+ # Convert the table information in structured string format
207
+ table_string = table_converter(table)
208
+ # Append the table string into a list
209
+ text_from_tables.append(table_string)
210
+
211
+ # Find all the elements
212
+ page_elements = [(element.y1, element) for element in page._objs]
213
+ # Sort all the element as they appear in the page
214
+ page_elements.sort(key=lambda a: a[0], reverse=True)
215
+
216
+
217
+ # Find the elements that composed a page
218
+ for i,component in enumerate(page_elements):
219
+ # Extract the element of the page layout
220
+ element = component[1]
221
+
222
+ # Check the elements for tables
223
+ if table_in_page == -1:
224
+ pass
225
+ else:
226
+ if is_element_inside_any_table(element, page ,tables):
227
+ table_found = find_table_for_element(element,page ,tables)
228
+ if table_found == table_in_page and table_found != None:
229
+ page_content.append(text_from_tables[table_in_page])
230
+ #page_text.append('table')
231
+ #line_format.append('table')
232
+ table_in_page+=1
233
+ # Pass this iteration because the content of this element was extracted from the tables
234
+ continue
235
+
236
+ if not is_element_inside_any_table(element,page,tables):
237
+
238
+ # Check if the element is text element
239
+ if isinstance(element, LTTextContainer):
240
+ # Use the function to extract the text and format for each text element
241
+ (line_text, format_per_line) = text_extraction(element)
242
+ # Append the text of each line to the page text
243
+ page_text.append(line_text)
244
+ # Append the format for each line containing text
245
+ line_format.append(format_per_line)
246
+ page_content.append(line_text)
247
+
248
+
249
+ # Check the elements for images
250
+ if isinstance(element, LTFigure):
251
+ # Crop the image from PDF
252
+ crop_image(element, pageObj)
253
+ # Convert the cropped pdf to image
254
+ convert_to_images('cropped_image.pdf')
255
+ # Extract the text from image
256
+ image_text = image_to_text('PDF_image.png')
257
+ image_text = "" # removed to remove the errors with image
258
+ text_from_images.append(image_text)
259
+ page_content.append(image_text)
260
+ # Add a placeholder in the text and format lists
261
+ #page_text.append('image')
262
+ #line_format.append('image')
263
+ # Update the flag for image detection
264
+ image_flag = True
265
+
266
+
267
+ # Create the key of the dictionary
268
+ dctkey = 'Page_'+str(pagenum)
269
+
270
+ # Add the list of list as value of the page key
271
+ #text_per_page[dctkey]= [page_text, line_format, text_from_images,text_from_tables, page_content]
272
+ text_per_page[dctkey]= [page_text, text_from_images,text_from_tables, page_content]
273
+ #result = result.join(page_text).join(line_format).join(text_from_images).join(text_from_tables).join(page_content)
274
+
275
+
276
+ result = " "
277
+ for t in range(number_of_pages):
278
+ page = 'Page_'+str(t)
279
+ #result = result.join(map(str, text_per_page[page]))
280
+ for q in range(len(text_per_page[page])):
281
+ #print(f"{''.join(map(str, text_per_page[page][q]))}")
282
+ result = result + f"{''.join(map(str, text_per_page[page][q]))}"
283
+
284
+ #paid key
285
+ os.environ["OPENAI_API_KEY"]="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"
286
+
287
+ from dotenv import load_dotenv, find_dotenv
288
+ _ = load_dotenv(find_dotenv()) # read local .env file
289
+ openai.api_key = os.environ['OPENAI_API_KEY']
290
+
291
+
292
+ template="You are a helpful assistant that annalyses a bank statement annd provides answers"
293
+ system_message_prompt = SystemMessagePromptTemplate.from_template(template)
294
+ human_template= "{text}"
295
+ human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
296
+
297
+ prompt_1 = """Loan status include details like Total Outstanding or Total Loan Amount,
298
+ Start Month, Tenure in Months, Rate of interest and EMI.
299
+
300
+ Extract the details from text from triple tick marks and return a JSON object ONLY with keys Total Loan Amount as Number, Start Month in format mmm-yyyy, Tenure in Months, ROI, EMI as Number.
301
+
302
+ Only return the JSON.
303
+ """
304
+
305
+
306
+ prompt_template_1 = PromptTemplate.from_template(
307
+ prompt_1 + "```{loan_data} ```"
308
+ )
309
+ #prompt_template_1.format(loan_data=result.lower())
310
+ response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
311
+
312
+ prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
313
+ details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.
314
 
315
+ Return a table of ALL transactions by
316
+
317
+ 1. COMBININNG monthly transactions for each month
318
+ 2. WITHOUT missing rows for ANY month
319
+ 3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount
320
+
321
+ from text in triple tick marks.
322
+
323
+ Just return the table"""
324
+
325
+ prompt_template_2 = PromptTemplate.from_template(
326
+ prompt_2 + "```{response_1} {loan_data} ```"
327
+ )
328
+ #prompt_template_2.format(response_1 =response_1, loan_data=result.lower())
329
 
 
330
 
331
+ response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=result.lower()))
332
 
333
  with st.spinner("🤖 AI is at Work! "):
334
  st.write(response_2)
cropped_image.pdf ADDED
Binary file (164 kB). View file