Bhaskar2611 committed on
Commit
1dff96b
·
verified ·
1 Parent(s): c70b653

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +420 -66
app.py CHANGED
@@ -1,3 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import json
@@ -45,22 +343,26 @@ def extract_text_from_pdf(pdf_path, is_scanned=False):
45
  text += pytesseract.image_to_string(image) + "\n"
46
  return text
47
 
48
- def parse_bank_statement(text):
49
  """Parse bank statement using LLM with fallback to rule-based parser"""
50
- # Clean text and remove non-essential lines
51
  cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
52
- cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
53
- cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
54
-
55
- # Keep only lines that look like transactions
56
- transaction_lines = []
57
- for line in cleaned_text.split('\n'):
58
- if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
59
- transaction_lines.append(line)
60
- elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
61
- transaction_lines.append(line)
62
-
63
- cleaned_text = "\n".join(transaction_lines)
 
 
 
 
64
  print(f"Cleaned text sample: {cleaned_text[:200]}...")
65
 
66
  # Try rule-based parsing first for structured data
@@ -158,11 +460,26 @@ def rule_based_parser(text):
158
  r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
159
  ]
160
 
 
161
  for i, line in enumerate(lines):
162
- if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
163
  header_index = i
164
  break
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  if header_index is None:
167
  return {"transactions": []}
168
 
@@ -174,27 +491,18 @@ def rule_based_parser(text):
174
  if '|' in line:
175
  parts = [p.strip() for p in line.split('|') if p.strip()]
176
  else:
177
- # Space-delimited format - split while preserving multi-word descriptions
178
- parts = []
179
- current = ""
180
- in_description = False
181
- for char in line:
182
- if char == ' ' and not in_description:
183
- if current:
184
- parts.append(current)
185
- current = ""
186
- # After date field, we're in description
187
- if len(parts) == 1:
188
- in_description = True
189
- else:
190
- current += char
191
- if current:
192
- parts.append(current)
193
 
 
194
  if len(parts) < 7:
195
  continue
196
 
197
  try:
 
 
 
 
198
  transactions.append({
199
  "date": parts[0],
200
  "description": parts[1],
@@ -211,70 +519,116 @@ def rule_based_parser(text):
211
 
212
  def format_number(value):
213
  """Format numeric values consistently"""
214
- if not value:
215
  return "0.00"
216
 
217
- # Clean numeric values
218
- value = value.replace(',', '').replace('$', '').strip()
 
 
 
 
219
 
220
  # Handle negative numbers in parentheses
221
  if '(' in value and ')' in value:
222
  value = '-' + value.replace('(', '').replace(')', '')
223
 
 
 
 
 
224
  # Standardize decimal format
225
  if '.' not in value:
226
  value += '.00'
227
 
228
  # Ensure two decimal places
229
  try:
230
- return f"{float(value):.2f}"
231
- except:
232
- return value
 
 
233
 
234
  def process_file(file, is_scanned):
235
  """Main processing function"""
236
  if not file:
237
- return pd.DataFrame(columns=[
238
- "Date", "Description", "Amount", "Debit",
239
- "Credit", "Closing Balance", "Category"
240
- ])
241
 
242
  file_path = file.name
243
  file_ext = os.path.splitext(file_path)[1].lower()
244
 
245
  try:
246
  if file_ext == '.xlsx':
247
- text = extract_excel_data(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  elif file_ext == '.pdf':
249
  text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
250
- else:
251
- return pd.DataFrame(columns=[
252
- "Date", "Description", "Amount", "Debit",
253
- "Credit", "Closing Balance", "Category"
254
- ])
255
-
256
- parsed_data = parse_bank_statement(text)
257
- df = pd.DataFrame(parsed_data["transactions"])
 
 
 
 
 
 
258
 
259
- # Ensure all required columns exist
260
- required_cols = ["date", "description", "amount", "debit",
261
- "credit", "closing_balance", "category"]
262
- for col in required_cols:
263
- if col not in df.columns:
264
- df[col] = ""
265
-
266
- # Format columns properly
267
- df.columns = ["Date", "Description", "Amount", "Debit",
268
- "Credit", "Closing Balance", "Category"]
269
- return df
270
 
271
  except Exception as e:
272
  print(f"Processing error: {str(e)}")
273
- # Return empty DataFrame with correct columns on error
274
- return pd.DataFrame(columns=[
275
- "Date", "Description", "Amount", "Debit",
276
- "Credit", "Closing Balance", "Category"
277
- ])
 
278
 
279
  # Gradio Interface
280
  interface = gr.Interface(
 
1
+ # import os
2
+ # import re
3
+ # import json
4
+ # import gradio as gr
5
+ # import pandas as pd
6
+ # import pdfplumber
7
+ # import pytesseract
8
+ # from pdf2image import convert_from_path
9
+ # from huggingface_hub import InferenceClient
10
+
11
+ # # Initialize with reliable free model
12
+ # hf_token = os.getenv("HF_TOKEN")
13
+ # client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
14
+
15
+ # def extract_excel_data(file_path):
16
+ # """Extract text from Excel file"""
17
+ # df = pd.read_excel(file_path, engine='openpyxl')
18
+ # return df.to_string(index=False)
19
+
20
+ # def extract_text_from_pdf(pdf_path, is_scanned=False):
21
+ # """Extract text from PDF with fallback OCR"""
22
+ # try:
23
+ # # Try native PDF extraction first
24
+ # with pdfplumber.open(pdf_path) as pdf:
25
+ # text = ""
26
+ # for page in pdf.pages:
27
+ # # Extract tables first for structured data
28
+ # tables = page.extract_tables()
29
+ # for table in tables:
30
+ # for row in table:
31
+ # text += " | ".join(str(cell) for cell in row) + "\n"
32
+ # text += "\n"
33
+
34
+ # # Extract text for unstructured data
35
+ # page_text = page.extract_text()
36
+ # if page_text:
37
+ # text += page_text + "\n\n"
38
+ # return text
39
+ # except Exception as e:
40
+ # print(f"Native PDF extraction failed: {str(e)}")
41
+ # # Fallback to OCR for scanned PDFs
42
+ # images = convert_from_path(pdf_path, dpi=200)
43
+ # text = ""
44
+ # for image in images:
45
+ # text += pytesseract.image_to_string(image) + "\n"
46
+ # return text
47
+
48
+ # def parse_bank_statement(text):
49
+ # """Parse bank statement using LLM with fallback to rule-based parser"""
50
+ # # Clean text and remove non-essential lines
51
+ # cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
52
+ # cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
53
+ # cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
54
+
55
+ # # Keep only lines that look like transactions
56
+ # transaction_lines = []
57
+ # for line in cleaned_text.split('\n'):
58
+ # if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
59
+ # transaction_lines.append(line)
60
+ # elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
61
+ # transaction_lines.append(line)
62
+
63
+ # cleaned_text = "\n".join(transaction_lines)
64
+ # print(f"Cleaned text sample: {cleaned_text[:200]}...")
65
+
66
+ # # Try rule-based parsing first for structured data
67
+ # rule_based_data = rule_based_parser(cleaned_text)
68
+ # if rule_based_data["transactions"]:
69
+ # print("Using rule-based parser results")
70
+ # return rule_based_data
71
+
72
+ # # Fallback to LLM for unstructured data
73
+ # print("Falling back to LLM parsing")
74
+ # return llm_parser(cleaned_text)
75
+
76
+ # def llm_parser(text):
77
+ # """LLM parser for unstructured text"""
78
+ # # Craft precise prompt with strict JSON formatting instructions
79
+ # prompt = f"""
80
+ # <|system|>
81
+ # You are a financial data parser. Extract transactions from bank statements and return ONLY valid JSON.
82
+ # </s>
83
+ # <|user|>
84
+ # Extract all transactions from this bank statement with these exact fields:
85
+ # - date (format: YYYY-MM-DD)
86
+ # - description
87
+ # - amount (format: 0.00)
88
+ # - debit (format: 0.00)
89
+ # - credit (format: 0.00)
90
+ # - closing_balance (format: 0.00 or -0.00 for negative)
91
+ # - category
92
+
93
+ # Statement text:
94
+ # {text[:3000]} [truncated if too long]
95
+
96
+ # Return JSON with this exact structure:
97
+ # {{
98
+ # "transactions": [
99
+ # {{
100
+ # "date": "2025-05-08",
101
+ # "description": "Company XYZ Payroll",
102
+ # "amount": "8315.40",
103
+ # "debit": "0.00",
104
+ # "credit": "8315.40",
105
+ # "closing_balance": "38315.40",
106
+ # "category": "Salary"
107
+ # }}
108
+ # ]
109
+ # }}
110
+
111
+ # RULES:
112
+ # 1. Output ONLY the JSON object with no additional text
113
+ # 2. Keep amounts as strings with 2 decimal places
114
+ # 3. For missing values, use empty strings
115
+ # 4. Convert negative amounts to format "-123.45"
116
+ # 5. Map categories to: Salary, Groceries, Medical, Utilities, Entertainment, Dining, Misc
117
+ # </s>
118
+ # <|assistant|>
119
+ # """
120
+
121
+ # try:
122
+ # # Call LLM via Hugging Face Inference API
123
+ # response = client.text_generation(
124
+ # prompt,
125
+ # max_new_tokens=2000,
126
+ # temperature=0.01,
127
+ # stop=["</s>"] # Updated to 'stop' parameter
128
+ # )
129
+ # print(f"LLM Response: {response}")
130
+
131
+ # # Validate and clean JSON response
132
+ # response = response.strip()
133
+ # if not response.startswith('{'):
134
+ # # Find the first { and last } to extract JSON
135
+ # start_idx = response.find('{')
136
+ # end_idx = response.rfind('}')
137
+ # if start_idx != -1 and end_idx != -1:
138
+ # response = response[start_idx:end_idx+1]
139
+
140
+ # # Parse JSON and validate structure
141
+ # data = json.loads(response)
142
+ # if "transactions" not in data:
143
+ # raise ValueError("Missing 'transactions' key in JSON")
144
+
145
+ # return data
146
+ # except Exception as e:
147
+ # print(f"LLM Error: {str(e)}")
148
+ # return {"transactions": []}
149
+
150
+ # def rule_based_parser(text):
151
+ # """Enhanced fallback parser for structured tables"""
152
+ # lines = [line.strip() for line in text.split('\n') if line.strip()]
153
+
154
+ # # Find header line - more flexible detection
155
+ # header_index = None
156
+ # header_patterns = [
157
+ # r'Date\b', r'Description\b', r'Amount\b',
158
+ # r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
159
+ # ]
160
+
161
+ # for i, line in enumerate(lines):
162
+ # if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
163
+ # header_index = i
164
+ # break
165
+
166
+ # if header_index is None:
167
+ # return {"transactions": []}
168
+
169
+ # data_lines = lines[header_index + 1:]
170
+ # transactions = []
171
+
172
+ # for line in data_lines:
173
+ # # Handle both pipe-delimited and space-delimited formats
174
+ # if '|' in line:
175
+ # parts = [p.strip() for p in line.split('|') if p.strip()]
176
+ # else:
177
+ # # Space-delimited format - split while preserving multi-word descriptions
178
+ # parts = []
179
+ # current = ""
180
+ # in_description = False
181
+ # for char in line:
182
+ # if char == ' ' and not in_description:
183
+ # if current:
184
+ # parts.append(current)
185
+ # current = ""
186
+ # # After date field, we're in description
187
+ # if len(parts) == 1:
188
+ # in_description = True
189
+ # else:
190
+ # current += char
191
+ # if current:
192
+ # parts.append(current)
193
+
194
+ # if len(parts) < 7:
195
+ # continue
196
+
197
+ # try:
198
+ # transactions.append({
199
+ # "date": parts[0],
200
+ # "description": parts[1],
201
+ # "amount": format_number(parts[2]),
202
+ # "debit": format_number(parts[3]),
203
+ # "credit": format_number(parts[4]),
204
+ # "closing_balance": format_number(parts[5]),
205
+ # "category": parts[6]
206
+ # })
207
+ # except Exception as e:
208
+ # print(f"Error parsing line: {str(e)}")
209
+
210
+ # return {"transactions": transactions}
211
+
212
+ # def format_number(value):
213
+ # """Format numeric values consistently"""
214
+ # if not value:
215
+ # return "0.00"
216
+
217
+ # # Clean numeric values
218
+ # value = value.replace(',', '').replace('$', '').strip()
219
+
220
+ # # Handle negative numbers in parentheses
221
+ # if '(' in value and ')' in value:
222
+ # value = '-' + value.replace('(', '').replace(')', '')
223
+
224
+ # # Standardize decimal format
225
+ # if '.' not in value:
226
+ # value += '.00'
227
+
228
+ # # Ensure two decimal places
229
+ # try:
230
+ # return f"{float(value):.2f}"
231
+ # except:
232
+ # return value
233
+
234
+ # def process_file(file, is_scanned):
235
+ # """Main processing function"""
236
+ # if not file:
237
+ # return pd.DataFrame(columns=[
238
+ # "Date", "Description", "Amount", "Debit",
239
+ # "Credit", "Closing Balance", "Category"
240
+ # ])
241
+
242
+ # file_path = file.name
243
+ # file_ext = os.path.splitext(file_path)[1].lower()
244
+
245
+ # try:
246
+ # if file_ext == '.xlsx':
247
+ # text = extract_excel_data(file_path)
248
+ # elif file_ext == '.pdf':
249
+ # text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
250
+ # else:
251
+ # return pd.DataFrame(columns=[
252
+ # "Date", "Description", "Amount", "Debit",
253
+ # "Credit", "Closing Balance", "Category"
254
+ # ])
255
+
256
+ # parsed_data = parse_bank_statement(text)
257
+ # df = pd.DataFrame(parsed_data["transactions"])
258
+
259
+ # # Ensure all required columns exist
260
+ # required_cols = ["date", "description", "amount", "debit",
261
+ # "credit", "closing_balance", "category"]
262
+ # for col in required_cols:
263
+ # if col not in df.columns:
264
+ # df[col] = ""
265
+
266
+ # # Format columns properly
267
+ # df.columns = ["Date", "Description", "Amount", "Debit",
268
+ # "Credit", "Closing Balance", "Category"]
269
+ # return df
270
+
271
+ # except Exception as e:
272
+ # print(f"Processing error: {str(e)}")
273
+ # # Return empty DataFrame with correct columns on error
274
+ # return pd.DataFrame(columns=[
275
+ # "Date", "Description", "Amount", "Debit",
276
+ # "Credit", "Closing Balance", "Category"
277
+ # ])
278
+
279
+ # # Gradio Interface
280
+ # interface = gr.Interface(
281
+ # fn=process_file,
282
+ # inputs=[
283
+ # gr.File(label="Upload Bank Statement (PDF/Excel)"),
284
+ # gr.Checkbox(label="Is Scanned PDF? (Use OCR)")
285
+ # ],
286
+ # outputs=gr.Dataframe(
287
+ # label="Parsed Transactions",
288
+ # headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
289
+ # datatype=["date", "str", "number", "number", "number", "number", "str"]
290
+ # ),
291
+ # title="AI Bank Statement Parser",
292
+ # description="Extract structured transaction data from PDF/Excel bank statements",
293
+ # allow_flagging="never"
294
+ # )
295
+
296
+ # if __name__ == "__main__":
297
+ # interface.launch()
298
+
299
  import os
300
  import re
301
  import json
 
343
  text += pytesseract.image_to_string(image) + "\n"
344
  return text
345
 
346
+ def parse_bank_statement(text, file_type):
347
  """Parse bank statement using LLM with fallback to rule-based parser"""
348
+ # Clean text differently based on file type
349
  cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
350
+
351
+ if file_type == 'pdf':
352
+ # PDF-specific cleaning
353
+ cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
354
+ cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
355
+
356
+ # Keep only lines that look like transactions
357
+ transaction_lines = []
358
+ for line in cleaned_text.split('\n'):
359
+ if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
360
+ transaction_lines.append(line)
361
+ elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
362
+ transaction_lines.append(line)
363
+
364
+ cleaned_text = "\n".join(transaction_lines)
365
+
366
  print(f"Cleaned text sample: {cleaned_text[:200]}...")
367
 
368
  # Try rule-based parsing first for structured data
 
460
  r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
461
  ]
462
 
463
+ # First try: Look for a full header line
464
  for i, line in enumerate(lines):
465
+ if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns[:3]):
466
  header_index = i
467
  break
468
 
469
+ # Second try: Look for any header indicators
470
+ if header_index is None:
471
+ for i, line in enumerate(lines):
472
+ if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
473
+ header_index = i
474
+ break
475
+
476
+ # Third try: Look for pipe-delimited headers
477
+ if header_index is None:
478
+ for i, line in enumerate(lines):
479
+ if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
480
+ header_index = i
481
+ break
482
+
483
  if header_index is None:
484
  return {"transactions": []}
485
 
 
491
  if '|' in line:
492
  parts = [p.strip() for p in line.split('|') if p.strip()]
493
  else:
494
+ # Space-delimited format - split by 2+ spaces
495
+ parts = re.split(r'\s{2,}', line)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
 
497
+ # Skip lines that don't have enough parts
498
  if len(parts) < 7:
499
  continue
500
 
501
  try:
502
+ # Handle transaction date validation
503
+ if not re.match(r'\d{4}-\d{2}-\d{2}', parts[0]):
504
+ continue
505
+
506
  transactions.append({
507
  "date": parts[0],
508
  "description": parts[1],
 
519
 
520
  def format_number(value):
521
  """Format numeric values consistently"""
522
+ if not value or str(value).lower() in ['nan', 'nat']:
523
  return "0.00"
524
 
525
+ # If it's already a number, format directly
526
+ if isinstance(value, (int, float)):
527
+ return f"{value:.2f}"
528
+
529
+ # Clean string values
530
+ value = str(value).replace(',', '').replace('$', '').strip()
531
 
532
  # Handle negative numbers in parentheses
533
  if '(' in value and ')' in value:
534
  value = '-' + value.replace('(', '').replace(')', '')
535
 
536
+ # Handle empty values
537
+ if not value:
538
+ return "0.00"
539
+
540
  # Standardize decimal format
541
  if '.' not in value:
542
  value += '.00'
543
 
544
  # Ensure two decimal places
545
  try:
546
+ num_value = float(value)
547
+ return f"{num_value:.2f}"
548
+ except ValueError:
549
+ # If we can't convert to float, return original but clean it
550
+ return value.split('.')[0] + '.' + value.split('.')[1][:2].ljust(2, '0')
551
 
552
  def process_file(file, is_scanned):
553
  """Main processing function"""
554
  if not file:
555
+ return empty_df()
 
 
 
556
 
557
  file_path = file.name
558
  file_ext = os.path.splitext(file_path)[1].lower()
559
 
560
  try:
561
  if file_ext == '.xlsx':
562
+ # Directly process Excel files without text conversion
563
+ df = pd.read_excel(file_path, engine='openpyxl')
564
+
565
+ # Normalize column names
566
+ df.columns = df.columns.str.strip().str.lower()
567
+
568
+ # Create mapping to expected columns
569
+ col_mapping = {
570
+ 'date': 'date',
571
+ 'description': 'description',
572
+ 'amount': 'amount',
573
+ 'debit': 'debit',
574
+ 'credit': 'credit',
575
+ 'closing balance': 'closing_balance',
576
+ 'closing': 'closing_balance',
577
+ 'balance': 'closing_balance',
578
+ 'category': 'category'
579
+ }
580
+
581
+ # Create output DataFrame with required columns
582
+ output_df = pd.DataFrame()
583
+ for col in ['date', 'description', 'amount', 'debit', 'credit', 'closing_balance', 'category']:
584
+ if col in df.columns:
585
+ output_df[col] = df[col]
586
+ elif any(alias in col_mapping and col_mapping[alias] == col for alias in df.columns):
587
+ # Find alias
588
+ for alias in df.columns:
589
+ if alias in col_mapping and col_mapping[alias] == col:
590
+ output_df[col] = df[alias]
591
+ break
592
+ else:
593
+ output_df[col] = ""
594
+
595
+ # Format numeric columns
596
+ for col in ['amount', 'debit', 'credit', 'closing_balance']:
597
+ output_df[col] = output_df[col].apply(format_number)
598
+
599
+ # Rename columns for display
600
+ output_df.columns = ["Date", "Description", "Amount", "Debit",
601
+ "Credit", "Closing Balance", "Category"]
602
+ return output_df
603
+
604
  elif file_ext == '.pdf':
605
  text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
606
+ parsed_data = parse_bank_statement(text, 'pdf')
607
+ df = pd.DataFrame(parsed_data["transactions"])
608
+
609
+ # Ensure all required columns exist
610
+ required_cols = ["date", "description", "amount", "debit",
611
+ "credit", "closing_balance", "category"]
612
+ for col in required_cols:
613
+ if col not in df.columns:
614
+ df[col] = ""
615
+
616
+ # Format columns properly
617
+ df.columns = ["Date", "Description", "Amount", "Debit",
618
+ "Credit", "Closing Balance", "Category"]
619
+ return df
620
 
621
+ else:
622
+ return empty_df()
 
 
 
 
 
 
 
 
 
623
 
624
  except Exception as e:
625
  print(f"Processing error: {str(e)}")
626
+ return empty_df()
627
+
628
+ def empty_df():
629
+ """Return empty DataFrame with correct columns"""
630
+ return pd.DataFrame(columns=["Date", "Description", "Amount", "Debit",
631
+ "Credit", "Closing Balance", "Category"])
632
 
633
  # Gradio Interface
634
  interface = gr.Interface(