Bhaskar2611 committed on
Commit 4b0182d · verified · 1 Parent(s): 1dff96b

Update app.py

Files changed (1)
  1. app.py +0 -298

app.py CHANGED
@@ -1,301 +1,3 @@
- # import os
- # import re
- # import json
- # import gradio as gr
- # import pandas as pd
- # import pdfplumber
- # import pytesseract
- # from pdf2image import convert_from_path
- # from huggingface_hub import InferenceClient
-
- # # Initialize with reliable free model
- # hf_token = os.getenv("HF_TOKEN")
- # client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
-
- # def extract_excel_data(file_path):
- #     """Extract text from Excel file"""
- #     df = pd.read_excel(file_path, engine='openpyxl')
- #     return df.to_string(index=False)
-
- # def extract_text_from_pdf(pdf_path, is_scanned=False):
- #     """Extract text from PDF with fallback OCR"""
- #     try:
- #         # Try native PDF extraction first
- #         with pdfplumber.open(pdf_path) as pdf:
- #             text = ""
- #             for page in pdf.pages:
- #                 # Extract tables first for structured data
- #                 tables = page.extract_tables()
- #                 for table in tables:
- #                     for row in table:
- #                         text += " | ".join(str(cell) for cell in row) + "\n"
- #                     text += "\n"
-
- #                 # Extract text for unstructured data
- #                 page_text = page.extract_text()
- #                 if page_text:
- #                     text += page_text + "\n\n"
- #             return text
- #     except Exception as e:
- #         print(f"Native PDF extraction failed: {str(e)}")
- #         # Fallback to OCR for scanned PDFs
- #         images = convert_from_path(pdf_path, dpi=200)
- #         text = ""
- #         for image in images:
- #             text += pytesseract.image_to_string(image) + "\n"
- #         return text
-
- # def parse_bank_statement(text):
- #     """Parse bank statement using LLM with fallback to rule-based parser"""
- #     # Clean text and remove non-essential lines
- #     cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
- #     cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
- #     cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
-
- #     # Keep only lines that look like transactions
- #     transaction_lines = []
- #     for line in cleaned_text.split('\n'):
- #         if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
- #             transaction_lines.append(line)
- #         elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
- #             transaction_lines.append(line)
-
- #     cleaned_text = "\n".join(transaction_lines)
- #     print(f"Cleaned text sample: {cleaned_text[:200]}...")
-
- #     # Try rule-based parsing first for structured data
- #     rule_based_data = rule_based_parser(cleaned_text)
- #     if rule_based_data["transactions"]:
- #         print("Using rule-based parser results")
- #         return rule_based_data
-
- #     # Fallback to LLM for unstructured data
- #     print("Falling back to LLM parsing")
- #     return llm_parser(cleaned_text)
-
- # def llm_parser(text):
- #     """LLM parser for unstructured text"""
- #     # Craft precise prompt with strict JSON formatting instructions
- #     prompt = f"""
- #     <|system|>
- #     You are a financial data parser. Extract transactions from bank statements and return ONLY valid JSON.
- #     </s>
- #     <|user|>
- #     Extract all transactions from this bank statement with these exact fields:
- #     - date (format: YYYY-MM-DD)
- #     - description
- #     - amount (format: 0.00)
- #     - debit (format: 0.00)
- #     - credit (format: 0.00)
- #     - closing_balance (format: 0.00 or -0.00 for negative)
- #     - category
-
- #     Statement text:
- #     {text[:3000]} [truncated if too long]
-
- #     Return JSON with this exact structure:
- #     {{
- #       "transactions": [
- #         {{
- #           "date": "2025-05-08",
- #           "description": "Company XYZ Payroll",
- #           "amount": "8315.40",
- #           "debit": "0.00",
- #           "credit": "8315.40",
- #           "closing_balance": "38315.40",
- #           "category": "Salary"
- #         }}
- #       ]
- #     }}
-
- #     RULES:
- #     1. Output ONLY the JSON object with no additional text
- #     2. Keep amounts as strings with 2 decimal places
- #     3. For missing values, use empty strings
- #     4. Convert negative amounts to format "-123.45"
- #     5. Map categories to: Salary, Groceries, Medical, Utilities, Entertainment, Dining, Misc
- #     </s>
- #     <|assistant|>
- #     """
-
- #     try:
- #         # Call LLM via Hugging Face Inference API
- #         response = client.text_generation(
- #             prompt,
- #             max_new_tokens=2000,
- #             temperature=0.01,
- #             stop=["</s>"] # Updated to 'stop' parameter
- #         )
- #         print(f"LLM Response: {response}")
-
- #         # Validate and clean JSON response
- #         response = response.strip()
- #         if not response.startswith('{'):
- #             # Find the first { and last } to extract JSON
- #             start_idx = response.find('{')
- #             end_idx = response.rfind('}')
- #             if start_idx != -1 and end_idx != -1:
- #                 response = response[start_idx:end_idx+1]
-
- #         # Parse JSON and validate structure
- #         data = json.loads(response)
- #         if "transactions" not in data:
- #             raise ValueError("Missing 'transactions' key in JSON")
-
- #         return data
- #     except Exception as e:
- #         print(f"LLM Error: {str(e)}")
- #         return {"transactions": []}
-
- # def rule_based_parser(text):
- #     """Enhanced fallback parser for structured tables"""
- #     lines = [line.strip() for line in text.split('\n') if line.strip()]
-
- #     # Find header line - more flexible detection
- #     header_index = None
- #     header_patterns = [
- #         r'Date\b', r'Description\b', r'Amount\b',
- #         r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
- #     ]
-
- #     for i, line in enumerate(lines):
- #         if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
- #             header_index = i
- #             break
-
- #     if header_index is None:
- #         return {"transactions": []}
-
- #     data_lines = lines[header_index + 1:]
- #     transactions = []
-
- #     for line in data_lines:
- #         # Handle both pipe-delimited and space-delimited formats
- #         if '|' in line:
- #             parts = [p.strip() for p in line.split('|') if p.strip()]
- #         else:
- #             # Space-delimited format - split while preserving multi-word descriptions
- #             parts = []
- #             current = ""
- #             in_description = False
- #             for char in line:
- #                 if char == ' ' and not in_description:
- #                     if current:
- #                         parts.append(current)
- #                         current = ""
- #                     # After date field, we're in description
- #                     if len(parts) == 1:
- #                         in_description = True
- #                 else:
- #                     current += char
- #             if current:
- #                 parts.append(current)
-
- #         if len(parts) < 7:
- #             continue
-
- #         try:
- #             transactions.append({
- #                 "date": parts[0],
- #                 "description": parts[1],
- #                 "amount": format_number(parts[2]),
- #                 "debit": format_number(parts[3]),
- #                 "credit": format_number(parts[4]),
- #                 "closing_balance": format_number(parts[5]),
- #                 "category": parts[6]
- #             })
- #         except Exception as e:
- #             print(f"Error parsing line: {str(e)}")
-
- #     return {"transactions": transactions}
-
- # def format_number(value):
- #     """Format numeric values consistently"""
- #     if not value:
- #         return "0.00"
-
- #     # Clean numeric values
- #     value = value.replace(',', '').replace('$', '').strip()
-
- #     # Handle negative numbers in parentheses
- #     if '(' in value and ')' in value:
- #         value = '-' + value.replace('(', '').replace(')', '')
-
- #     # Standardize decimal format
- #     if '.' not in value:
- #         value += '.00'
-
- #     # Ensure two decimal places
- #     try:
- #         return f"{float(value):.2f}"
- #     except:
- #         return value
-
- # def process_file(file, is_scanned):
- #     """Main processing function"""
- #     if not file:
- #         return pd.DataFrame(columns=[
- #             "Date", "Description", "Amount", "Debit",
- #             "Credit", "Closing Balance", "Category"
- #         ])
-
- #     file_path = file.name
- #     file_ext = os.path.splitext(file_path)[1].lower()
-
- #     try:
- #         if file_ext == '.xlsx':
- #             text = extract_excel_data(file_path)
- #         elif file_ext == '.pdf':
- #             text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
- #         else:
- #             return pd.DataFrame(columns=[
- #                 "Date", "Description", "Amount", "Debit",
- #                 "Credit", "Closing Balance", "Category"
- #             ])
-
- #         parsed_data = parse_bank_statement(text)
- #         df = pd.DataFrame(parsed_data["transactions"])
-
- #         # Ensure all required columns exist
- #         required_cols = ["date", "description", "amount", "debit",
- #                          "credit", "closing_balance", "category"]
- #         for col in required_cols:
- #             if col not in df.columns:
- #                 df[col] = ""
-
- #         # Format columns properly
- #         df.columns = ["Date", "Description", "Amount", "Debit",
- #                       "Credit", "Closing Balance", "Category"]
- #         return df
-
- #     except Exception as e:
- #         print(f"Processing error: {str(e)}")
- #         # Return empty DataFrame with correct columns on error
- #         return pd.DataFrame(columns=[
- #             "Date", "Description", "Amount", "Debit",
- #             "Credit", "Closing Balance", "Category"
- #         ])
-
- # # Gradio Interface
- # interface = gr.Interface(
- #     fn=process_file,
- #     inputs=[
- #         gr.File(label="Upload Bank Statement (PDF/Excel)"),
- #         gr.Checkbox(label="Is Scanned PDF? (Use OCR)")
- #     ],
- #     outputs=gr.Dataframe(
- #         label="Parsed Transactions",
- #         headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
- #         datatype=["date", "str", "number", "number", "number", "number", "str"]
- #     ),
- #     title="AI Bank Statement Parser",
- #     description="Extract structured transaction data from PDF/Excel bank statements",
- #     allow_flagging="never"
- # )
-
- # if __name__ == "__main__":
- #     interface.launch()
-
  import os
  import re
  import json