Bhaskar2611 committed on
Commit
27a375e
·
verified ·
1 Parent(s): 2912b90

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -79
app.py CHANGED
@@ -1,132 +1,157 @@
1
  import os
 
 
2
  import gradio as gr
 
3
  import pdfplumber
4
  import pytesseract
5
- from PIL import Image
6
  from pdf2image import convert_from_path
7
- import pandas as pd
8
- import numpy as np
9
- import re
 
 
10
 
11
- # For Excel files
12
  def extract_excel_data(file_path):
 
13
  df = pd.read_excel(file_path, engine='openpyxl')
14
- return df.to_string()
15
 
16
- # For PDF files with fallback OCR
17
  def extract_text_from_pdf(pdf_path, is_scanned=False):
 
18
  try:
19
- # First try native PDF extraction
20
  with pdfplumber.open(pdf_path) as pdf:
21
  text = ""
22
  for page in pdf.pages:
23
  text += page.extract_text() + "\n"
24
  return text
25
  except Exception as e:
26
- # Fallback to OCR if PDF is invalid
27
  print(f"Native PDF extraction failed: {str(e)}")
28
- print("Trying OCR fallback...")
29
  images = convert_from_path(pdf_path, dpi=200)
30
  text = ""
31
  for image in images:
32
  text += pytesseract.image_to_string(image) + "\n"
33
  return text
34
 
35
- # Prompt engineering for structured extraction
36
  def parse_bank_statement(text):
37
- # Clean up text from PDF/OCR artifacts
38
  cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
39
 
 
40
  prompt = f"""
41
- Extract the following details from the bank statement text:
42
- - Transaction Date
43
- - Description / Merchant
44
- - Amount
45
- - Debit / Credit
46
- - Closing Balance
47
- - Expense Type (if available)
48
 
49
- Return the results in JSON format with keys:
50
- ["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
51
 
52
- Example:
53
- {{
54
- "transactions": [
55
- {{
56
- "date": "2025-06-01",
57
- "description": "Grocery Store",
58
- "amount": "150.00",
59
- "debit_credit": "Debit",
60
- "closing_balance": "1200.00",
61
- "expense_type": "Food"
62
- }}
63
- ]
64
- }}
65
 
66
- Bank Statement Text:
67
- {cleaned_text}
68
- """
69
-
70
- # Simulate LLM response with deterministic parsing for demo
71
- # Replace this with actual LLM inference in production
72
- return simulate_llm_parsing(cleaned_text)
73
 
74
- def simulate_llm_parsing(text):
75
- """Mock LLM response for demo purposes"""
76
- # Simple regex-based parsing for demonstration
77
- transactions = []
78
- lines = text.split('\n')
79
-
80
- # Skip header lines
81
- data_lines = lines[lines.index('Date') + 1:]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- for i in range(0, len(data_lines), 7): # Process in chunks of 7
84
- if i+6 >= len(data_lines):
 
 
 
85
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  try:
88
  transactions.append({
89
- "date": data_lines[i].strip(),
90
- "description": data_lines[i+1].strip(),
91
- "amount": data_lines[i+2].strip(),
92
- "debit_credit": data_lines[i+3].strip(),
93
- "closing_balance": data_lines[i+5].strip(),
94
- "expense_type": data_lines[i+6].strip()
 
95
  })
96
  except Exception as e:
97
- print(f"Error parsing line {i}: {str(e)}")
98
- continue
99
-
100
  return {"transactions": transactions}
101
 
102
- # Main function
103
  def process_file(file, is_scanned):
 
 
 
 
104
  file_path = file.name
105
  file_ext = os.path.splitext(file_path)[1].lower()
106
 
107
- if file_ext == '.xlsx':
108
- text = extract_excel_data(file_path)
109
- elif file_ext == '.pdf':
110
- text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
111
- else:
112
- return "Unsupported file format. Please upload PDF or Excel."
113
-
114
- parsed_data = parse_bank_statement(text)
115
-
116
- # Convert to DataFrame for display
117
- df = pd.DataFrame(parsed_data["transactions"])
118
- return df
 
119
 
120
- # Gradio interface
121
  interface = gr.Interface(
122
  fn=process_file,
123
  inputs=[
124
- gr.File(label="Upload PDF/Excel"),
125
- gr.Checkbox(label="Is Scanned PDF?")
126
  ],
127
- outputs=gr.Dataframe(label="Extracted Transactions"),
128
- title="Bank Statement Parser",
129
- description="Convert PDF/Excel bank statements into structured data using hybrid parsing techniques.",
 
 
 
130
  allow_flagging="never"
131
  )
132
 
 
1
  import os
2
+ import re
3
+ import json
4
  import gradio as gr
5
+ import pandas as pd
6
  import pdfplumber
7
  import pytesseract
 
8
  from pdf2image import convert_from_path
9
+ from huggingface_hub import InferenceClient
10
+
# Module-wide Hugging Face Inference API client used by the statement parser.
# The access token comes from the HF_TOKEN environment variable and is None
# when that variable is not set.
hf_token = os.environ.get("HF_TOKEN")
client = InferenceClient(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    token=hf_token,
)
14
 
 
def extract_excel_data(file_path):
    """Load an Excel workbook with openpyxl and render it as plain text.

    Args:
        file_path: Path of the workbook handed to ``pd.read_excel``.

    Returns:
        The loaded table formatted by ``DataFrame.to_string`` with the row
        index column omitted.
    """
    sheet = pd.read_excel(file_path, engine='openpyxl')
    rendered = sheet.to_string(index=False)
    return rendered
19
 
 
def _ocr_pdf_text(pdf_path, dpi=200):
    """Rasterize each PDF page and return the OCR text, one newline after each page."""
    images = convert_from_path(pdf_path, dpi=dpi)
    return "".join(pytesseract.image_to_string(image) + "\n" for image in images)


def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Extract text from a PDF, using OCR when native extraction is not usable.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open`` and
            ``convert_from_path``.
        is_scanned: When true, skip native extraction and go straight to OCR.

    Returns:
        The extracted text with a newline appended after every page.

    Raises:
        Exception: Whatever the OCR path (``convert_from_path`` /
            ``pytesseract``) raises; native-extraction failures are logged
            and trigger the OCR fallback instead of propagating.
    """
    if not is_scanned:
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # `or ""` guards against a page with no text layer yielding
                # None, which would otherwise break the concatenation.
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in pdf.pages
                )
        except Exception as e:
            print(f"Native PDF extraction failed: {str(e)}")
        else:
            # A PDF that opens fine but carries no text (e.g. page images
            # only) produces blank output; fall through to OCR in that case.
            if text.strip():
                return text

    return _ocr_pdf_text(pdf_path)
37
 
 
# Prompt text placed before the statement. These are plain (non-f) strings so
# the literal braces of the JSON example need no escaping.
_PROMPT_INTRO = """
You are a financial data parser. Extract transactions from bank statements.

Given this bank statement text:
"""

# Prompt text placed after the statement.
_PROMPT_INSTRUCTIONS = """
Extract all transactions with these fields:
- Date
- Description
- Amount
- Debit
- Credit
- Closing Balance
- Category

Return JSON with "transactions" array containing these fields.

Example format:
{"transactions": [
{"date": "2025-05-08", "description": "Company XYZ Payroll", "amount": "8315.40", "debit": "0.00", "credit": "8315.40", "closing_balance": "38315.40", "category": "Salary"},
...
]}

Rules:
1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
2. Convert negative balances to standard format (e.g., "-2421.72")
3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
"""


def _extract_json_object(raw):
    """Return the first JSON object that can be decoded from *raw*, or None."""
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(raw, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = raw.find("{", start + 1)
    return None


def parse_bank_statement(text):
    """Parse bank statement text with the LLM, falling back to the rule-based parser.

    Args:
        text: Raw statement text as extracted from the uploaded file.

    Returns:
        A dict with a "transactions" list: the LLM's JSON answer when it can
        be decoded and has that shape, otherwise the result of
        ``rule_based_parser`` on the cleaned text.
    """
    # Drop control characters (everything below 0x20 except tab/LF/CR, plus DEL).
    cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    # The statement itself must be part of the prompt, between the intro and
    # the extraction instructions.
    prompt = _PROMPT_INTRO + cleaned_text + "\n" + _PROMPT_INSTRUCTIONS

    try:
        # Call LLM via Hugging Face Inference API
        response = client.text_generation(prompt, max_new_tokens=1000, temperature=0.1)
        # The reply may wrap the JSON in extra text, so locate the object
        # instead of decoding the whole reply.
        parsed = _extract_json_object(response)
        if parsed is not None and isinstance(parsed.get("transactions"), list):
            return parsed
        print("LLM Error: response did not contain a valid transactions object")
    except Exception as e:
        print(f"LLM Error: {str(e)}")

    # Fallback to rule-based parser
    return rule_based_parser(cleaned_text)
79
+
def rule_based_parser(text):
    """Parse a pipe-delimited transaction table out of statement text.

    Only lines after the first line containing the word "Date" are
    considered. A row is accepted when it starts with ``|`` and has at least
    seven cells; the first seven map, in order, to date, description, amount,
    debit, credit, closing_balance and category. Empty cells are kept as
    empty strings so later columns stay aligned. Separator rows made only of
    dashes/colons and repeated header rows (first cell "Date") are skipped.

    Args:
        text: Statement text, one table row per line.

    Returns:
        ``{"transactions": [...]}`` with one dict of strings per accepted
        row; the list is empty when no header line or no rows are found.
    """
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Find header line containing 'Date'
    header_index = next(
        (i for i, line in enumerate(lines) if re.search(r'\bDate\b', line)),
        None,
    )
    if header_index is None:
        return {"transactions": []}

    fields = (
        "date",
        "description",
        "amount",
        "debit",
        "credit",
        "closing_balance",
        "category",
    )
    transactions = []

    for line in lines[header_index + 1:]:
        if not line.startswith('|'):
            continue

        # Remove exactly one leading and one trailing pipe so that empty
        # cells at either edge survive the split.
        row = line[1:]
        if row.endswith('|'):
            row = row[:-1]
        cells = [cell.strip() for cell in row.split('|')]

        if len(cells) < len(fields):
            continue
        # Markdown-style separator row such as |---|:---:|
        if all(re.fullmatch(r':?-+:?', cell) for cell in cells):
            continue
        # The table's own header row (or a repeat of it further down)
        if cells[0].lower() == 'date':
            continue

        transactions.append(dict(zip(fields, cells)))

    return {"transactions": transactions}
119
 
 
def process_file(file, is_scanned):
    """Turn an uploaded bank statement into a DataFrame of transactions.

    Args:
        file: The upload from the ``gr.File`` input; either a plain path
            string or an object exposing the path as ``.name``.
        is_scanned: Value of the "Is Scanned PDF?" checkbox, forwarded to
            ``extract_text_from_pdf``.

    Returns:
        A ``pandas.DataFrame`` built from the parsed "transactions" list
        (empty when none were found).

    Raises:
        gr.Error: When no file was given, the extension is neither ``.xlsx``
            nor ``.pdf``, or extraction/parsing fails. Raising keeps the
            return type uniform for the Dataframe output instead of mixing
            strings and dicts into it.
    """
    if not file:
        raise gr.Error("No file uploaded")

    # Accept a plain path string as well as an object carrying `.name`
    # (the only form the previous code handled).
    file_path = file if isinstance(file, str) else file.name
    file_ext = os.path.splitext(file_path)[1].lower()

    # Validated outside the try block so this error is not re-wrapped below.
    if file_ext not in ('.xlsx', '.pdf'):
        raise gr.Error("Unsupported file format")

    try:
        if file_ext == '.xlsx':
            text = extract_excel_data(file_path)
        else:
            text = extract_text_from_pdf(file_path, is_scanned=is_scanned)

        parsed_data = parse_bank_statement(text)
        # A result without the key is treated as "no transactions".
        return pd.DataFrame(parsed_data.get("transactions", []))
    except Exception as e:
        raise gr.Error(f"Error: {str(e)}") from e
141
 
# Gradio Interface: one file upload plus an OCR checkbox in, one table out.
_statement_upload = gr.File(label="Upload Bank Statement (PDF/Excel)")
_ocr_toggle = gr.Checkbox(label="Is Scanned PDF? (Use OCR)")
_transactions_table = gr.Dataframe(
    label="Parsed Transactions",
    headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
)

interface = gr.Interface(
    fn=process_file,
    inputs=[_statement_upload, _ocr_toggle],
    outputs=_transactions_table,
    title="AI Bank Statement Parser",
    description="Extract structured transaction data from PDF/Excel bank statements using LLM and hybrid parsing techniques.",
    allow_flagging="never",
)
157