Bhaskar2611 committed on
Commit
bca7a79
·
verified ·
1 Parent(s): a58d36b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -352
app.py CHANGED
@@ -1,355 +1,3 @@
1
- # import os
2
- # import re
3
- # import json
4
- # import gradio as gr
5
- # import pandas as pd
6
- # import pdfplumber
7
- # import pytesseract
8
- # from pdf2image import convert_from_path
9
- # from huggingface_hub import InferenceClient
10
-
11
- # # Initialize with reliable free model
12
- # hf_token = os.getenv("HF_TOKEN")
13
- # client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
14
-
15
- # def extract_excel_data(file_path):
16
- # """Extract text from Excel file"""
17
- # df = pd.read_excel(file_path, engine='openpyxl')
18
- # return df.to_string(index=False)
19
-
20
- # def extract_text_from_pdf(pdf_path, is_scanned=False):
21
- # """Extract text from PDF with fallback OCR"""
22
- # try:
23
- # # Try native PDF extraction first
24
- # with pdfplumber.open(pdf_path) as pdf:
25
- # text = ""
26
- # for page in pdf.pages:
27
- # # Extract tables first for structured data
28
- # tables = page.extract_tables()
29
- # for table in tables:
30
- # for row in table:
31
- # text += " | ".join(str(cell) for cell in row) + "\n"
32
- # text += "\n"
33
-
34
- # # Extract text for unstructured data
35
- # page_text = page.extract_text()
36
- # if page_text:
37
- # text += page_text + "\n\n"
38
- # return text
39
- # except Exception as e:
40
- # print(f"Native PDF extraction failed: {str(e)}")
41
- # # Fallback to OCR for scanned PDFs
42
- # images = convert_from_path(pdf_path, dpi=200)
43
- # text = ""
44
- # for image in images:
45
- # text += pytesseract.image_to_string(image) + "\n"
46
- # return text
47
-
48
- # def parse_bank_statement(text, file_type):
49
- # """Parse bank statement using LLM with fallback to rule-based parser"""
50
- # # Clean text differently based on file type
51
- # cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
52
-
53
- # if file_type == 'pdf':
54
- # # PDF-specific cleaning
55
- # cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
56
- # cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
57
-
58
- # # Keep only lines that look like transactions
59
- # transaction_lines = []
60
- # for line in cleaned_text.split('\n'):
61
- # if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
62
- # transaction_lines.append(line)
63
- # elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
64
- # transaction_lines.append(line)
65
-
66
- # cleaned_text = "\n".join(transaction_lines)
67
-
68
- # print(f"Cleaned text sample: {cleaned_text[:200]}...")
69
-
70
- # # Try rule-based parsing first for structured data
71
- # rule_based_data = rule_based_parser(cleaned_text)
72
- # if rule_based_data["transactions"]:
73
- # print("Using rule-based parser results")
74
- # return rule_based_data
75
-
76
- # # Fallback to LLM for unstructured data
77
- # print("Falling back to LLM parsing")
78
- # return llm_parser(cleaned_text)
79
-
80
- # def llm_parser(text):
81
- # """LLM parser for unstructured text"""
82
- # # Craft precise prompt with strict JSON formatting instructions
83
- # prompt = f"""
84
- # <|system|>
85
- # You are a financial data parser. Extract transactions from bank statements and return ONLY valid JSON.
86
- # </s>
87
- # <|user|>
88
- # Extract all transactions from this bank statement with these exact fields:
89
- # - date (format: YYYY-MM-DD)
90
- # - description
91
- # - amount (format: 0.00)
92
- # - debit (format: 0.00)
93
- # - credit (format: 0.00)
94
- # - closing_balance (format: 0.00 or -0.00 for negative)
95
- # - category
96
-
97
- # Statement text:
98
- # {text[:3000]} [truncated if too long]
99
-
100
- # Return JSON with this exact structure:
101
- # {{
102
- # "transactions": [
103
- # {{
104
- # "date": "2025-05-08",
105
- # "description": "Company XYZ Payroll",
106
- # "amount": "8315.40",
107
- # "debit": "0.00",
108
- # "credit": "8315.40",
109
- # "closing_balance": "38315.40",
110
- # "category": "Salary"
111
- # }}
112
- # ]
113
- # }}
114
-
115
- # RULES:
116
- # 1. Output ONLY the JSON object with no additional text
117
- # 2. Keep amounts as strings with 2 decimal places
118
- # 3. For missing values, use empty strings
119
- # 4. Convert negative amounts to format "-123.45"
120
- # 5. Map categories to: Salary, Groceries, Medical, Utilities, Entertainment, Dining, Misc
121
- # </s>
122
- # <|assistant|>
123
- # """
124
-
125
- # try:
126
- # # Call LLM via Hugging Face Inference API
127
- # response = client.text_generation(
128
- # prompt,
129
- # max_new_tokens=2000,
130
- # temperature=0.01,
131
- # stop=["</s>"] # Updated to 'stop' parameter
132
- # )
133
- # print(f"LLM Response: {response}")
134
-
135
- # # Validate and clean JSON response
136
- # response = response.strip()
137
- # if not response.startswith('{'):
138
- # # Find the first { and last } to extract JSON
139
- # start_idx = response.find('{')
140
- # end_idx = response.rfind('}')
141
- # if start_idx != -1 and end_idx != -1:
142
- # response = response[start_idx:end_idx+1]
143
-
144
- # # Parse JSON and validate structure
145
- # data = json.loads(response)
146
- # if "transactions" not in data:
147
- # raise ValueError("Missing 'transactions' key in JSON")
148
-
149
- # return data
150
- # except Exception as e:
151
- # print(f"LLM Error: {str(e)}")
152
- # return {"transactions": []}
153
-
154
- # def rule_based_parser(text):
155
- # """Enhanced fallback parser for structured tables"""
156
- # lines = [line.strip() for line in text.split('\n') if line.strip()]
157
-
158
- # # Find header line - more flexible detection
159
- # header_index = None
160
- # header_patterns = [
161
- # r'Date\b', r'Description\b', r'Amount\b',
162
- # r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
163
- # ]
164
-
165
- # # First try: Look for a full header line
166
- # for i, line in enumerate(lines):
167
- # if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns[:3]):
168
- # header_index = i
169
- # break
170
-
171
- # # Second try: Look for any header indicators
172
- # if header_index is None:
173
- # for i, line in enumerate(lines):
174
- # if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
175
- # header_index = i
176
- # break
177
-
178
- # # Third try: Look for pipe-delimited headers
179
- # if header_index is None:
180
- # for i, line in enumerate(lines):
181
- # if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
182
- # header_index = i
183
- # break
184
-
185
- # if header_index is None:
186
- # return {"transactions": []}
187
-
188
- # data_lines = lines[header_index + 1:]
189
- # transactions = []
190
-
191
- # for line in data_lines:
192
- # # Handle both pipe-delimited and space-delimited formats
193
- # if '|' in line:
194
- # parts = [p.strip() for p in line.split('|') if p.strip()]
195
- # else:
196
- # # Space-delimited format - split by 2+ spaces
197
- # parts = re.split(r'\s{2,}', line)
198
-
199
- # # Skip lines that don't have enough parts
200
- # if len(parts) < 7:
201
- # continue
202
-
203
- # try:
204
- # # Handle transaction date validation
205
- # if not re.match(r'\d{4}-\d{2}-\d{2}', parts[0]):
206
- # continue
207
-
208
- # transactions.append({
209
- # "date": parts[0],
210
- # "description": parts[1],
211
- # "amount": format_number(parts[2]),
212
- # "debit": format_number(parts[3]),
213
- # "credit": format_number(parts[4]),
214
- # "closing_balance": format_number(parts[5]),
215
- # "category": parts[6]
216
- # })
217
- # except Exception as e:
218
- # print(f"Error parsing line: {str(e)}")
219
-
220
- # return {"transactions": transactions}
221
-
222
- # def format_number(value):
223
- # """Format numeric values consistently"""
224
- # if not value or str(value).lower() in ['nan', 'nat']:
225
- # return "0.00"
226
-
227
- # # If it's already a number, format directly
228
- # if isinstance(value, (int, float)):
229
- # return f"{value:.2f}"
230
-
231
- # # Clean string values
232
- # value = str(value).replace(',', '').replace('$', '').strip()
233
-
234
- # # Handle negative numbers in parentheses
235
- # if '(' in value and ')' in value:
236
- # value = '-' + value.replace('(', '').replace(')', '')
237
-
238
- # # Handle empty values
239
- # if not value:
240
- # return "0.00"
241
-
242
- # # Standardize decimal format
243
- # if '.' not in value:
244
- # value += '.00'
245
-
246
- # # Ensure two decimal places
247
- # try:
248
- # num_value = float(value)
249
- # return f"{num_value:.2f}"
250
- # except ValueError:
251
- # # If we can't convert to float, return original but clean it
252
- # return value.split('.')[0] + '.' + value.split('.')[1][:2].ljust(2, '0')
253
-
254
- # def process_file(file, is_scanned):
255
- # """Main processing function"""
256
- # if not file:
257
- # return empty_df()
258
-
259
- # file_path = file.name
260
- # file_ext = os.path.splitext(file_path)[1].lower()
261
-
262
- # try:
263
- # if file_ext == '.xlsx':
264
- # # Directly process Excel files without text conversion
265
- # df = pd.read_excel(file_path, engine='openpyxl')
266
-
267
- # # Normalize column names
268
- # df.columns = df.columns.str.strip().str.lower()
269
-
270
- # # Create mapping to expected columns
271
- # col_mapping = {
272
- # 'date': 'date',
273
- # 'description': 'description',
274
- # 'amount': 'amount',
275
- # 'debit': 'debit',
276
- # 'credit': 'credit',
277
- # 'closing balance': 'closing_balance',
278
- # 'closing': 'closing_balance',
279
- # 'balance': 'closing_balance',
280
- # 'category': 'category'
281
- # }
282
-
283
- # # Create output DataFrame with required columns
284
- # output_df = pd.DataFrame()
285
- # for col in ['date', 'description', 'amount', 'debit', 'credit', 'closing_balance', 'category']:
286
- # if col in df.columns:
287
- # output_df[col] = df[col]
288
- # elif any(alias in col_mapping and col_mapping[alias] == col for alias in df.columns):
289
- # # Find alias
290
- # for alias in df.columns:
291
- # if alias in col_mapping and col_mapping[alias] == col:
292
- # output_df[col] = df[alias]
293
- # break
294
- # else:
295
- # output_df[col] = ""
296
-
297
- # # Format numeric columns
298
- # for col in ['amount', 'debit', 'credit', 'closing_balance']:
299
- # output_df[col] = output_df[col].apply(format_number)
300
-
301
- # # Rename columns for display
302
- # output_df.columns = ["Date", "Description", "Amount", "Debit",
303
- # "Credit", "Closing Balance", "Category"]
304
- # return output_df
305
-
306
- # elif file_ext == '.pdf':
307
- # text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
308
- # parsed_data = parse_bank_statement(text, 'pdf')
309
- # df = pd.DataFrame(parsed_data["transactions"])
310
-
311
- # # Ensure all required columns exist
312
- # required_cols = ["date", "description", "amount", "debit",
313
- # "credit", "closing_balance", "category"]
314
- # for col in required_cols:
315
- # if col not in df.columns:
316
- # df[col] = ""
317
-
318
- # # Format columns properly
319
- # df.columns = ["Date", "Description", "Amount", "Debit",
320
- # "Credit", "Closing Balance", "Category"]
321
- # return df
322
-
323
- # else:
324
- # return empty_df()
325
-
326
- # except Exception as e:
327
- # print(f"Processing error: {str(e)}")
328
- # return empty_df()
329
-
330
- # def empty_df():
331
- # """Return empty DataFrame with correct columns"""
332
- # return pd.DataFrame(columns=["Date", "Description", "Amount", "Debit",
333
- # "Credit", "Closing Balance", "Category"])
334
-
335
- # # Gradio Interface
336
- # interface = gr.Interface(
337
- # fn=process_file,
338
- # inputs=[
339
- # gr.File(label="Upload Bank Statement (PDF/Excel)")
340
- # ],
341
- # outputs=gr.Dataframe(
342
- # label="Parsed Transactions",
343
- # headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
344
- # datatype=["date", "str", "number", "number", "number", "number", "str"]
345
- # ),
346
- # title="AI Bank Statement Parser",
347
- # description="Extract structured transaction data from PDF/Excel bank statements",
348
- # allow_flagging="never"
349
- # )
350
-
351
- # if __name__ == "__main__":
352
- # interface.launch()
353
  import os
354
  import re
355
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import re
3
  import json