Bhaskar2611 committed
Commit 3c9f1bc · verified · 1 Parent(s): 60fde0c

Update app.py

Files changed (1):
  1. app.py +442 -16
app.py CHANGED
@@ -1,3 +1,355 @@
+ # import os
+ # import re
+ # import json
+ # import gradio as gr
+ # import pandas as pd
+ # import pdfplumber
+ # import pytesseract
+ # from pdf2image import convert_from_path
+ # from huggingface_hub import InferenceClient
+
+ # # Initialize with reliable free model
+ # hf_token = os.getenv("HF_TOKEN")
+ # client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
+
+ # def extract_excel_data(file_path):
+ # """Extract text from Excel file"""
+ # df = pd.read_excel(file_path, engine='openpyxl')
+ # return df.to_string(index=False)
+
+ # def extract_text_from_pdf(pdf_path, is_scanned=False):
+ # """Extract text from PDF with fallback OCR"""
+ # try:
+ # # Try native PDF extraction first
+ # with pdfplumber.open(pdf_path) as pdf:
+ # text = ""
+ # for page in pdf.pages:
+ # # Extract tables first for structured data
+ # tables = page.extract_tables()
+ # for table in tables:
+ # for row in table:
+ # text += " | ".join(str(cell) for cell in row) + "\n"
+ # text += "\n"
+
+ # # Extract text for unstructured data
+ # page_text = page.extract_text()
+ # if page_text:
+ # text += page_text + "\n\n"
+ # return text
+ # except Exception as e:
+ # print(f"Native PDF extraction failed: {str(e)}")
+ # # Fallback to OCR for scanned PDFs
+ # images = convert_from_path(pdf_path, dpi=200)
+ # text = ""
+ # for image in images:
+ # text += pytesseract.image_to_string(image) + "\n"
+ # return text
+
+ # def parse_bank_statement(text, file_type):
+ # """Parse bank statement using LLM with fallback to rule-based parser"""
+ # # Clean text differently based on file type
+ # cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+
+ # if file_type == 'pdf':
+ # # PDF-specific cleaning
+ # cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
+ # cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
+
+ # # Keep only lines that look like transactions
+ # transaction_lines = []
+ # for line in cleaned_text.split('\n'):
+ # if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
+ # transaction_lines.append(line)
+ # elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
+ # transaction_lines.append(line)
+
+ # cleaned_text = "\n".join(transaction_lines)
+
+ # print(f"Cleaned text sample: {cleaned_text[:200]}...")
+
+ # # Try rule-based parsing first for structured data
+ # rule_based_data = rule_based_parser(cleaned_text)
+ # if rule_based_data["transactions"]:
+ # print("Using rule-based parser results")
+ # return rule_based_data
+
+ # # Fallback to LLM for unstructured data
+ # print("Falling back to LLM parsing")
+ # return llm_parser(cleaned_text)
+
+ # def llm_parser(text):
+ # """LLM parser for unstructured text"""
+ # # Craft precise prompt with strict JSON formatting instructions
+ # prompt = f"""
+ # <|system|>
+ # You are a financial data parser. Extract transactions from bank statements and return ONLY valid JSON.
+ # </s>
+ # <|user|>
+ # Extract all transactions from this bank statement with these exact fields:
+ # - date (format: YYYY-MM-DD)
+ # - description
+ # - amount (format: 0.00)
+ # - debit (format: 0.00)
+ # - credit (format: 0.00)
+ # - closing_balance (format: 0.00 or -0.00 for negative)
+ # - category
+
+ # Statement text:
+ # {text[:3000]} [truncated if too long]
+
+ # Return JSON with this exact structure:
+ # {{
+ # "transactions": [
+ # {{
+ # "date": "2025-05-08",
+ # "description": "Company XYZ Payroll",
+ # "amount": "8315.40",
+ # "debit": "0.00",
+ # "credit": "8315.40",
+ # "closing_balance": "38315.40",
+ # "category": "Salary"
+ # }}
+ # ]
+ # }}
+
+ # RULES:
+ # 1. Output ONLY the JSON object with no additional text
+ # 2. Keep amounts as strings with 2 decimal places
+ # 3. For missing values, use empty strings
+ # 4. Convert negative amounts to format "-123.45"
+ # 5. Map categories to: Salary, Groceries, Medical, Utilities, Entertainment, Dining, Misc
+ # </s>
+ # <|assistant|>
+ # """
+
+ # try:
+ # # Call LLM via Hugging Face Inference API
+ # response = client.text_generation(
+ # prompt,
+ # max_new_tokens=2000,
+ # temperature=0.01,
+ # stop=["</s>"] # Updated to 'stop' parameter
+ # )
+ # print(f"LLM Response: {response}")
+
+ # # Validate and clean JSON response
+ # response = response.strip()
+ # if not response.startswith('{'):
+ # # Find the first { and last } to extract JSON
+ # start_idx = response.find('{')
+ # end_idx = response.rfind('}')
+ # if start_idx != -1 and end_idx != -1:
+ # response = response[start_idx:end_idx+1]
+
+ # # Parse JSON and validate structure
+ # data = json.loads(response)
+ # if "transactions" not in data:
+ # raise ValueError("Missing 'transactions' key in JSON")
+
+ # return data
+ # except Exception as e:
+ # print(f"LLM Error: {str(e)}")
+ # return {"transactions": []}
+
+ # def rule_based_parser(text):
+ # """Enhanced fallback parser for structured tables"""
+ # lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+ # # Find header line - more flexible detection
+ # header_index = None
+ # header_patterns = [
+ # r'Date\b', r'Description\b', r'Amount\b',
+ # r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
+ # ]
+
+ # # First try: Look for a full header line
+ # for i, line in enumerate(lines):
+ # if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns[:3]):
+ # header_index = i
+ # break
+
+ # # Second try: Look for any header indicators
+ # if header_index is None:
+ # for i, line in enumerate(lines):
+ # if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
+ # header_index = i
+ # break
+
+ # # Third try: Look for pipe-delimited headers
+ # if header_index is None:
+ # for i, line in enumerate(lines):
+ # if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
+ # header_index = i
+ # break
+
+ # if header_index is None:
+ # return {"transactions": []}
+
+ # data_lines = lines[header_index + 1:]
+ # transactions = []
+
+ # for line in data_lines:
+ # # Handle both pipe-delimited and space-delimited formats
+ # if '|' in line:
+ # parts = [p.strip() for p in line.split('|') if p.strip()]
+ # else:
+ # # Space-delimited format - split by 2+ spaces
+ # parts = re.split(r'\s{2,}', line)
+
+ # # Skip lines that don't have enough parts
+ # if len(parts) < 7:
+ # continue
+
+ # try:
+ # # Handle transaction date validation
+ # if not re.match(r'\d{4}-\d{2}-\d{2}', parts[0]):
+ # continue
+
+ # transactions.append({
+ # "date": parts[0],
+ # "description": parts[1],
+ # "amount": format_number(parts[2]),
+ # "debit": format_number(parts[3]),
+ # "credit": format_number(parts[4]),
+ # "closing_balance": format_number(parts[5]),
+ # "category": parts[6]
+ # })
+ # except Exception as e:
+ # print(f"Error parsing line: {str(e)}")
+
+ # return {"transactions": transactions}
+
+ # def format_number(value):
+ # """Format numeric values consistently"""
+ # if not value or str(value).lower() in ['nan', 'nat']:
+ # return "0.00"
+
+ # # If it's already a number, format directly
+ # if isinstance(value, (int, float)):
+ # return f"{value:.2f}"
+
+ # # Clean string values
+ # value = str(value).replace(',', '').replace('$', '').strip()
+
+ # # Handle negative numbers in parentheses
+ # if '(' in value and ')' in value:
+ # value = '-' + value.replace('(', '').replace(')', '')
+
+ # # Handle empty values
+ # if not value:
+ # return "0.00"
+
+ # # Standardize decimal format
+ # if '.' not in value:
+ # value += '.00'
+
+ # # Ensure two decimal places
+ # try:
+ # num_value = float(value)
+ # return f"{num_value:.2f}"
+ # except ValueError:
+ # # If we can't convert to float, return original but clean it
+ # return value.split('.')[0] + '.' + value.split('.')[1][:2].ljust(2, '0')
+
+ # def process_file(file, is_scanned):
+ # """Main processing function"""
+ # if not file:
+ # return empty_df()
+
+ # file_path = file.name
+ # file_ext = os.path.splitext(file_path)[1].lower()
+
+ # try:
+ # if file_ext == '.xlsx':
+ # # Directly process Excel files without text conversion
+ # df = pd.read_excel(file_path, engine='openpyxl')
+
+ # # Normalize column names
+ # df.columns = df.columns.str.strip().str.lower()
+
+ # # Create mapping to expected columns
+ # col_mapping = {
+ # 'date': 'date',
+ # 'description': 'description',
+ # 'amount': 'amount',
+ # 'debit': 'debit',
+ # 'credit': 'credit',
+ # 'closing balance': 'closing_balance',
+ # 'closing': 'closing_balance',
+ # 'balance': 'closing_balance',
+ # 'category': 'category'
+ # }
+
+ # # Create output DataFrame with required columns
+ # output_df = pd.DataFrame()
+ # for col in ['date', 'description', 'amount', 'debit', 'credit', 'closing_balance', 'category']:
+ # if col in df.columns:
+ # output_df[col] = df[col]
+ # elif any(alias in col_mapping and col_mapping[alias] == col for alias in df.columns):
+ # # Find alias
+ # for alias in df.columns:
+ # if alias in col_mapping and col_mapping[alias] == col:
+ # output_df[col] = df[alias]
+ # break
+ # else:
+ # output_df[col] = ""
+
+ # # Format numeric columns
+ # for col in ['amount', 'debit', 'credit', 'closing_balance']:
+ # output_df[col] = output_df[col].apply(format_number)
+
+ # # Rename columns for display
+ # output_df.columns = ["Date", "Description", "Amount", "Debit",
+ # "Credit", "Closing Balance", "Category"]
+ # return output_df
+
+ # elif file_ext == '.pdf':
+ # text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
+ # parsed_data = parse_bank_statement(text, 'pdf')
+ # df = pd.DataFrame(parsed_data["transactions"])
+
+ # # Ensure all required columns exist
+ # required_cols = ["date", "description", "amount", "debit",
+ # "credit", "closing_balance", "category"]
+ # for col in required_cols:
+ # if col not in df.columns:
+ # df[col] = ""
+
+ # # Format columns properly
+ # df.columns = ["Date", "Description", "Amount", "Debit",
+ # "Credit", "Closing Balance", "Category"]
+ # return df
+
+ # else:
+ # return empty_df()
+
+ # except Exception as e:
+ # print(f"Processing error: {str(e)}")
+ # return empty_df()
+
+ # def empty_df():
+ # """Return empty DataFrame with correct columns"""
+ # return pd.DataFrame(columns=["Date", "Description", "Amount", "Debit",
+ # "Credit", "Closing Balance", "Category"])
+
+ # # Gradio Interface
+ # interface = gr.Interface(
+ # fn=process_file,
+ # inputs=[
+ # gr.File(label="Upload Bank Statement (PDF/Excel)")
+ # ],
+ # outputs=gr.Dataframe(
+ # label="Parsed Transactions",
+ # headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
+ # datatype=["date", "str", "number", "number", "number", "number", "str"]
+ # ),
+ # title="AI Bank Statement Parser",
+ # description="Extract structured transaction data from PDF/Excel bank statements",
+ # allow_flagging="never"
+ # )
+
+ # if __name__ == "__main__":
+ # interface.launch()
import os
import re
import json
@@ -7,6 +359,8 @@ import pdfplumber
import pytesseract
from pdf2image import convert_from_path
from huggingface_hub import InferenceClient
+ from fpdf import FPDF # Added for PDF generation
+ import tempfile # Added for temporary file handling

# Initialize with reliable free model
hf_token = os.getenv("HF_TOKEN")
@@ -93,10 +447,8 @@ Extract all transactions from this bank statement with these exact fields:
- credit (format: 0.00)
- closing_balance (format: 0.00 or -0.00 for negative)
- category
-
Statement text:
{text[:3000]} [truncated if too long]
-
Return JSON with this exact structure:
{{
"transactions": [
@@ -111,7 +463,6 @@ Return JSON with this exact structure:
}}
]
}}
-
RULES:
1. Output ONLY the JSON object with no additional text
2. Keep amounts as strings with 2 decimal places
@@ -251,7 +602,7 @@ def format_number(value):
# If we can't convert to float, return original but clean it
return value.split('.')[0] + '.' + value.split('.')[1][:2].ljust(2, '0')

- def process_file(file, is_scanned):
+ def process_file(file, is_scanned=False):
"""Main processing function"""
if not file:
return empty_df()
@@ -332,21 +683,96 @@ def empty_df():
return pd.DataFrame(columns=["Date", "Description", "Amount", "Debit",
"Credit", "Closing Balance", "Category"])

- # Gradio Interface
- interface = gr.Interface(
- fn=process_file,
- inputs=[
- gr.File(label="Upload Bank Statement (PDF/Excel)")
- ],
- outputs=gr.Dataframe(
+ # New function to generate PDF from DataFrame
+ def generate_pdf(df):
+ """Generate PDF from DataFrame and return file path"""
+ if df.empty:
+ return None
+
+ # Create a PDF
+ pdf = FPDF()
+ pdf.add_page()
+ pdf.set_font("Arial", size=8) # Smaller font to fit more data
+
+ # Set column widths
+ col_widths = [22, 65, 20, 15, 15, 25, 20] # Adjusted to fit all columns
+
+ # Headers
+ headers = df.columns.tolist()
+ for i, header in enumerate(headers):
+ pdf.cell(col_widths[i], 10, header, border=1)
+ pdf.ln()
+
+ # Data
+ for _, row in df.iterrows():
+ for i, col in enumerate(headers):
+ # Truncate long descriptions
+ value = str(row[col])
+ if headers[i] == "Description" and len(value) > 30:
+ value = value[:27] + "..."
+ pdf.cell(col_widths[i], 10, value, border=1)
+ pdf.ln()
+
+ # Save to temporary file
+ temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
+ temp_file.close()
+ pdf.output(temp_file.name)
+ return temp_file.name
+
+ # Modified Gradio Interface
+ with gr.Blocks() as interface: # Changed to Blocks for more control
+ gr.Markdown("## AI Bank Statement Parser")
+ gr.Markdown("Extract structured transaction data from PDF/Excel bank statements")
+
+ # File input
+ file_input = gr.File(label="Upload Bank Statement (PDF/Excel)")
+
+ # Output dataframe
+ output_df = gr.Dataframe(
label="Parsed Transactions",
headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
datatype=["date", "str", "number", "number", "number", "number", "str"]
- ),
- title="AI Bank Statement Parser",
- description="Extract structured transaction data from PDF/Excel bank statements",
- allow_flagging="never"
- )
+ )
+
+ # State to store the processed DataFrame
+ state_df = gr.State(value=pd.DataFrame())
+
+ # Download button (initially hidden)
+ download_btn = gr.DownloadButton(
+ "Download as PDF",
+ visible=False,
+ elem_classes="download-btn"
+ )
+
+ # Process file and update state
+ def process_and_store(file):
+ df = process_file(file)
+ return df, df, gr.DownloadButton(visible=not df.empty)
+
+ # Connect components
+ file_input.change(
+ process_and_store,
+ inputs=[file_input],
+ outputs=[output_df, state_df, download_btn]
+ )
+
+ # Generate PDF when download button is clicked
+ def on_download_click(df):
+ return generate_pdf(df)
+
+ download_btn.click(
+ on_download_click,
+ inputs=[state_df],
+ outputs=[download_btn]
+ )
+
+ # Add custom CSS for the download button position
+ interface.css = """
+ .download-btn {
+ margin-top: 20px !important;
+ margin-bottom: 30px !important;
+ }
+ """

if __name__ == "__main__":
interface.launch()
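
For context, the core of the new download path is the fpdf table-export pattern used by generate_pdf(): one bordered cell per column, one row per transaction, written to a temp file whose path feeds the DownloadButton. Below is a minimal standalone sketch of that pattern, not part of the commit; the sample row values are copied from the prompt example in app.py, and it assumes pandas and the fpdf package are installed.

import tempfile

import pandas as pd
from fpdf import FPDF

# Sample rows shaped like the parser's output DataFrame (values from the app's prompt example).
df = pd.DataFrame(
    [["2025-05-08", "Company XYZ Payroll", "8315.40", "0.00", "8315.40", "38315.40", "Salary"]],
    columns=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
)

pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=8)
col_widths = [22, 65, 20, 15, 15, 25, 20]  # same widths as generate_pdf()

# Header row, then one bordered cell per value and one table row per transaction.
for i, header in enumerate(df.columns):
    pdf.cell(col_widths[i], 10, header, border=1)
pdf.ln()
for _, row in df.iterrows():
    for i, col in enumerate(df.columns):
        pdf.cell(col_widths[i], 10, str(row[col]), border=1)
    pdf.ln()

# Write to a temporary file, as generate_pdf() does before handing the path to the DownloadButton.
out = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
out.close()
pdf.output(out.name)
print("wrote", out.name)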