Spaces:
Sleeping
Sleeping
final
Browse files- .gitignore +3 -0
- app.py +175 -169
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Compiled source #
|
2 |
+
###################
|
3 |
+
*.env
|
app.py
CHANGED
@@ -148,7 +148,7 @@ def image_to_text(image_path):
|
|
148 |
st.title("Extract Loan details from PDF or Image")
|
149 |
|
150 |
#subtitle
|
151 |
-
st.markdown("## Loan detail extractor using OpenAI `streamlit` - hosted on π€ Spaces")
|
152 |
|
153 |
st.markdown("Link to the app - [PDF to extract loadn details app on π€ Spaces](https://huggingface.co/spaces/ravi259/Loan-details-extraction-app)")
|
154 |
|
@@ -156,183 +156,189 @@ st.markdown("Link to the app - [PDF to extract loadn details app on π€ Spaces]
|
|
156 |
file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
|
157 |
print(file_name)
|
158 |
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
)
|
307 |
-
#prompt_template_1.format(loan_data=result.lower())
|
308 |
-
response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
|
309 |
|
310 |
-
prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
|
311 |
-
details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.
|
312 |
|
313 |
-
|
|
|
314 |
|
315 |
-
|
316 |
-
2. WITHOUT missing rows for ANY month
|
317 |
-
3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount
|
318 |
|
319 |
-
|
|
|
|
|
320 |
|
321 |
-
|
322 |
|
323 |
-
|
324 |
-
prompt_2 + "```{response_1} {loan_data} ```"
|
325 |
-
)
|
326 |
-
#prompt_template_2.format(response_1 =response_1, loan_data=result.lower())
|
327 |
|
|
|
|
|
|
|
|
|
|
|
328 |
|
329 |
-
response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=result.lower()))
|
330 |
|
|
|
331 |
with st.spinner("π€ AI is at Work! "):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
st.write(response_2)
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
st.write("Upload an Image")
|
337 |
|
338 |
-
st.caption("Made with β€οΈ by @1littlecoder. Credits to π€ Spaces for Hosting this ")
|
|
|
148 |
# Page header: title, subtitle and a link back to the hosted Space.
st.title("Extract Loan details from PDF or Image")

# subtitle
st.markdown("## Loan detail extractor using `OpenAI` and `streamlit` - hosted on 🤗 Spaces")

st.markdown("Link to the app - [PDF to extract loan details app on 🤗 Spaces](https://huggingface.co/spaces/ravi259/Loan-details-extraction-app)")

# The uploader result is passed to read_file_get_prompts(), which treats
# None as "nothing uploaded yet".
file_name = st.file_uploader(label = "Upload your PDF file here",type=['pdf','png','jpg','jpeg'])
# Debug trace of the uploader result on stdout.
print(file_name)
|
158 |
|
159 |
+
def read_file_get_prompts(file_name):
    """Extract the text content of an uploaded PDF as a single string.

    Walks every page of the PDF, collecting free text (via
    ``text_extraction``) and tables (via ``extract_table`` and
    ``table_converter``), and concatenates everything into the string that
    is later substituted into the prompt templates.

    As a side effect the file name, a small details dict and the uploaded
    file object itself are echoed to the page with ``st.write``.

    Args:
        file_name: The value returned by ``st.file_uploader``. It is handed
            directly to ``PyPDF2.PdfReader``, ``extract_pages`` and
            ``pdfplumber.open``; ``None`` means nothing was uploaded.

    Returns:
        The concatenated text (it always starts with a single space), or
        ``None`` when ``file_name`` is ``None``.
    """
    if file_name is None:
        return None

    st.write(file_name.name)

    file_details = {"FileName": file_name.name, "FileType": file_name.type}
    st.write(file_details)

    # The uploaded file object is used directly in place of a path on disk.
    pdf_path = file_name
    st.write(pdf_path)

    pdf_reader = PyPDF2.PdfReader(file_name)

    # Parse the layout once; the original parsed the whole document a second
    # time just to count its pages.
    layout_pages = list(extract_pages(file_name))

    # NOTE(review): for every page the output contains page_text,
    # text_from_images, text_from_tables AND page_content, so plain text and
    # matched tables appear twice. Kept as-is because the prompts currently
    # receive exactly this string.
    chunks = [" "]

    # Open the pdfplumber document once for the whole run and close it
    # afterwards, instead of opening a new, never-closed handle per page.
    with pdfplumber.open(pdf_path) as pdf:
        for pagenum, page in enumerate(layout_pages):
            # PyPDF2 page object, needed only for cropping figures.
            page_obj = pdf_reader.pages[pagenum]
            page_text = []
            text_from_images = []
            page_content = []

            # Locate the tables on this page.
            tables = pdf.pages[pagenum].find_tables()
            # Index of the next table expected in reading order; -1 means the
            # page has no tables at all.
            table_in_page = 0 if tables else -1

            # Convert every table on the page to its string form up front.
            text_from_tables = [
                table_converter(extract_table(pdf_path, pagenum, table_num))
                for table_num in range(len(tables))
            ]

            # Visit the layout elements from the top of the page to the
            # bottom (largest y1 first).
            page_elements = sorted(
                page._objs, key=lambda element: element.y1, reverse=True
            )

            for element in page_elements:
                if is_element_inside_any_table(element, page, tables):
                    if table_in_page != -1:
                        table_found = find_table_for_element(element, page, tables)
                        # Emit each table once, the first time one of its
                        # elements is reached.
                        if table_found is not None and table_found == table_in_page:
                            page_content.append(text_from_tables[table_in_page])
                            table_in_page += 1
                    # Content inside a table comes from the table extraction,
                    # so the element itself is skipped.
                    continue

                if isinstance(element, LTTextContainer):
                    # The per-line format information is not used.
                    line_text, _ = text_extraction(element)
                    page_text.append(line_text)
                    page_content.append(line_text)

                if isinstance(element, LTFigure):
                    # NOTE(review): the crop/convert/OCR helpers are still
                    # invoked, but the OCR result is deliberately discarded
                    # and replaced by an empty string (the original comment
                    # says this was done to avoid image errors). Confirm
                    # whether these calls can be dropped entirely.
                    crop_image(element, page_obj)
                    convert_to_images('cropped_image.pdf')
                    image_to_text('PDF_image.png')
                    image_text = ""
                    text_from_images.append(image_text)
                    page_content.append(image_text)

            for section in (page_text, text_from_images, text_from_tables, page_content):
                chunks.extend(str(item) for item in section)

    return "".join(chunks)
|
285 |
+
|
286 |
+
|
287 |
+
|
288 |
+
# Chat-style system/human message templates.
template="You are a helpful assistant that analyses a bank statement and provides answers"
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
human_template= "{text}"
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

# Prompt 1: ask for the loan summary as a JSON object.
prompt_1 = """Loan status include details like Total Outstanding or Total Loan Amount,
Start Month, Tenure in Months, Rate of interest and EMI.

Extract the details from text from triple tick marks and return a JSON object ONLY with keys Total Loan Amount as Number, Start Month in format mmm-yyyy, Tenure in Months, ROI, EMI as Number.

Only return the JSON.
"""


# {loan_data} is filled with the lower-cased text extracted from the upload.
prompt_template_1 = PromptTemplate.from_template(
    prompt_1 + "```{loan_data} ```"
)


# Prompt 2: ask for the month-by-month transaction table.
prompt_2 = """Loan transaction details are the information of transaction happened during a period and contains
details like Month, EMI as monthly amount paid, Payment status as Paid or Unpaid, outstanding Balance after payment of EMI.

Return a table of ALL transactions by

1. COMBINING monthly transactions for each month
2. WITHOUT missing rows for ANY month
3. with columns Month, EMI Paid, Payment Status, Interest Amount, Principal Amount, Balance Amount

from text in triple tick marks.

Just return the table"""

# {response_1} is the model's answer to prompt 1; {loan_data} is the same
# extracted text that prompt 1 receives.
prompt_template_2 = PromptTemplate.from_template(
    prompt_2 + "```{response_1} {loan_data} ```"
)
|
325 |
+
|
326 |
|
|
|
327 |
|
328 |
+
# Streamlit re-runs this script from the top on every widget interaction and
# st.button() is True only for the run triggered by its own click, so each
# handler must compute everything it needs itself.
if st.button('Get Loan Details'):
    if file_name is None:
        # Without an upload read_file_get_prompts() returns None and
        # result.lower() would raise AttributeError.
        st.warning("Please upload a file first.")
    else:
        with st.spinner("🤗 AI is at Work! "):
            result = read_file_get_prompts(file_name)
            response_1 = OpenAI().complete(prompt_template_1.format(loan_data=result.lower()))
            st.write(response_1)
            st.balloons()

if st.button('Get Loan Transactions'):
    if file_name is None:
        st.warning("Please upload a file first.")
    else:
        with st.spinner("🤗 AI is at Work! "):
            result = read_file_get_prompts(file_name)
            loan_data = result.lower()
            # response_1 from the other button's run does not exist in this
            # run; previously this branch raised NameError. Recompute it here.
            response_1 = OpenAI().complete(prompt_template_1.format(loan_data=loan_data))
            response_2 = OpenAI().complete(prompt_template_2.format(response_1 =response_1, loan_data=loan_data))
            st.write(response_2)
            st.balloons()

st.caption("Made with ❤️ by @ravi259. Credits to 🤗 Spaces for Hosting this ")
|
|
|
344 |
|
|