Bhaskar2611 committed on
Commit 6255a6d · verified · 1 Parent(s): aca59c0

Update app.py

Files changed (1)
  1. app.py +94 -44
app.py CHANGED
@@ -8,9 +8,9 @@ import pytesseract
 from pdf2image import convert_from_path
 from huggingface_hub import InferenceClient
 
-# Initialize Hugging Face Inference Client with a free model
+# Initialize Hugging Face Inference Client with a better free model
 hf_token = os.getenv("HF_TOKEN")
-client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=hf_token)
+client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
 
 def extract_excel_data(file_path):
     """Extract text from Excel file"""
@@ -40,25 +40,25 @@ def parse_bank_statement(text):
     cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
     print(f"Original text sample: {cleaned_text[:200]}...")
 
-    # Craft precise prompt for LLM with proper JSON escaping
+    # Craft precise prompt with strict JSON formatting instructions
     prompt = f"""
-You are a financial data parser. Extract transactions from bank statements.
+<|system|>
+You are a financial data parser. Extract transactions from bank statements and return ONLY valid JSON.
+</s>
+<|user|>
+Extract all transactions from this bank statement with these exact fields:
+- date (format: YYYY-MM-DD)
+- description
+- amount (format: 0.00)
+- debit (format: 0.00)
+- credit (format: 0.00)
+- closing_balance (format: 0.00 or -0.00 for negative)
+- category
 
-Given this bank statement text:
-{cleaned_text}
+Statement text:
+{cleaned_text[:3000]} [truncated if too long]
 
-Extract all transactions with these fields:
-- Date
-- Description
-- Amount
-- Debit
-- Credit
-- Closing Balance
-- Category
-
-Return JSON with "transactions" array containing these fields.
-
-Example format:
+Return JSON with this exact structure:
 {{
     "transactions": [
         {{
@@ -82,44 +82,69 @@ Example format:
     ]
 }}
 
-Rules:
-1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
-2. Convert negative balances to standard format (e.g., "-2421.72")
-3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
-4. Only return valid JSON with no additional text
+RULES:
+1. Output ONLY the JSON object with no additional text
+2. Keep amounts as strings with 2 decimal places
+3. For missing values, use empty strings
+4. Convert negative amounts to format "-123.45"
+5. Map categories to: Salary, Groceries, Medical, Utilities, Entertainment, Dining, Misc
+</s>
+<|assistant|>
 """
-
+
     try:
-        # Call LLM via Hugging Face Inference API
+        # Call LLM with strict parameters
        response = client.text_generation(
            prompt,
            max_new_tokens=2000,
-            temperature=0.1,
+            temperature=0.01,  # Lower temperature for more deterministic output
            stop_sequences=["</s>"]
        )
        print(f"LLM Response: {response}")
 
-        # Extract JSON from response (remove non-JSON prefixes/suffixes)
-        json_match = re.search(r'\{.*\}', response, re.DOTALL)
-        if json_match:
-            return json.loads(json_match.group())
-        return json.loads(response)
+        # Validate and clean JSON response
+        response = response.strip()
+        if not response.startswith('{'):
+            # Find the first { and last } to extract JSON
+            start_idx = response.find('{')
+            end_idx = response.rfind('}')
+            if start_idx != -1 and end_idx != -1:
+                response = response[start_idx:end_idx+1]
+
+        # Parse JSON and validate structure
+        data = json.loads(response)
+        if "transactions" not in data:
+            raise ValueError("Missing 'transactions' key in JSON")
+
+        return data
    except Exception as e:
        print(f"LLM Error: {str(e)}")
        # Fallback to rule-based parser
        return rule_based_parser(cleaned_text)
 
 def rule_based_parser(text):
-    """Fallback parser for structured tables with pipe delimiters"""
+    """Enhanced fallback parser for structured tables"""
    lines = [line.strip() for line in text.split('\n') if line.strip()]
 
-    # Find header line containing '| Date'
+    # Find header line - more flexible detection
    header_index = None
+    header_patterns = [
+        r'Date\b', r'Description\b', r'Amount\b',
+        r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
+    ]
+
    for i, line in enumerate(lines):
-        if re.search(r'\|Date|Date\|', line, re.IGNORECASE):
+        if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
            header_index = i
            break
 
+    if header_index is None:
+        # Try pipe-delimited format as fallback
+        for i, line in enumerate(lines):
+            if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
+                header_index = i
+                break
+
    if header_index is None or header_index + 1 >= len(lines):
        return {"transactions": []}
 
@@ -127,15 +152,17 @@ def rule_based_parser(text):
    transactions = []
 
    for line in data_lines:
-        if not '|' in line:
-            continue
-
-        parts = [p.strip() for p in line.split('|') if p.strip()]
+        # Handle both pipe-delimited and space-aligned formats
+        if '|' in line:
+            parts = [p.strip() for p in line.split('|') if p.strip()]
+        else:
+            # Space-aligned format - split by 2+ spaces
+            parts = re.split(r'\s{2,}', line)
+
        if len(parts) < 7:
            continue
 
        try:
-            # Handle numeric values consistently
            transactions.append({
                "date": parts[0],
                "description": parts[1],
@@ -152,9 +179,30 @@ def rule_based_parser(text):
 
 def format_number(value):
    """Format numeric values consistently"""
-    value = value.replace(',', '')
-    if re.match(r'^-?\d+(\.\d+)?$', value):
-        return f"{float(value):.2f}"
+    if not value:
+        return "0.00"
+
+    # Clean numeric values
+    value = value.replace(',', '').replace('$', '').strip()
+
+    # Handle negative numbers in parentheses
+    if '(' in value and ')' in value:
+        value = '-' + value.replace('(', '').replace(')', '')
+
+    # Standardize decimal format
+    if '.' not in value:
+        value += '.00'
+
+    # Ensure two decimal places
+    parts = value.split('.')
+    if len(parts) == 2:
+        integer = parts[0].lstrip('0') or '0'
+        decimal = parts[1][:2].ljust(2, '0')
+        value = f"{integer}.{decimal}"
+
+    # Handle negative signs
+    if value.startswith('-'):
+        return f"-{value[1:].lstrip('0')}" if value[1:] != '0.00' else '0.00'
    return value
 
 def process_file(file, is_scanned):
@@ -189,6 +237,7 @@ def process_file(file, is_scanned):
        if col not in df.columns:
            df[col] = ""
 
+    # Format columns properly
    df.columns = ["Date", "Description", "Amount", "Debit",
                  "Credit", "Closing Balance", "Category"]
    return df
@@ -210,10 +259,11 @@ interface = gr.Interface(
    ],
    outputs=gr.Dataframe(
        label="Parsed Transactions",
-        headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"]
+        headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
+        datatype=["date", "str", "number", "number", "number", "number", "str"]
    ),
    title="AI Bank Statement Parser",
-    description="Extract structured transaction data from PDF/Excel bank statements using LLM and hybrid parsing techniques.",
+    description="Extract structured transaction data from PDF/Excel bank statements",
    allow_flagging="never"
 )
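
For reference, a minimal standalone sketch of the JSON-recovery step this commit adds to parse_bank_statement: it applies the same slice-and-validate logic shown in the diff to a hypothetical noisy model response (the sample string below is invented for illustration, not output from the model).

```python
import json

# Hypothetical raw LLM output with chatter around the JSON payload (invented sample).
raw = 'Sure, here is the data:\n{"transactions": [{"date": "2024-01-05", "amount": "12.50"}]}\nDone.'

response = raw.strip()
if not response.startswith('{'):
    # Same recovery as in the commit: keep only the outermost {...} span.
    start_idx = response.find('{')
    end_idx = response.rfind('}')
    if start_idx != -1 and end_idx != -1:
        response = response[start_idx:end_idx + 1]

data = json.loads(response)
if "transactions" not in data:
    raise ValueError("Missing 'transactions' key in JSON")

print(data["transactions"][0]["amount"])  # prints: 12.50
```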