Bhaskar2611 committed on
Commit
c70b653
·
verified ·
1 Parent(s): fd970b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -37
app.py CHANGED
@@ -8,7 +8,7 @@ import pytesseract
8
  from pdf2image import convert_from_path
9
  from huggingface_hub import InferenceClient
10
 
11
- # Initialize with a reliable free model that supports text-generation
12
  hf_token = os.getenv("HF_TOKEN")
13
  client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
14
 
@@ -24,7 +24,17 @@ def extract_text_from_pdf(pdf_path, is_scanned=False):
24
  with pdfplumber.open(pdf_path) as pdf:
25
  text = ""
26
  for page in pdf.pages:
27
- text += page.extract_text() + "\n"
 
 
 
 
 
 
 
 
 
 
28
  return text
29
  except Exception as e:
30
  print(f"Native PDF extraction failed: {str(e)}")
@@ -37,9 +47,34 @@ def extract_text_from_pdf(pdf_path, is_scanned=False):
37
 
38
  def parse_bank_statement(text):
39
  """Parse bank statement using LLM with fallback to rule-based parser"""
 
40
  cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
41
- print(f"Original text sample: {cleaned_text[:200]}...")
 
 
 
 
 
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # Craft precise prompt with strict JSON formatting instructions
44
  prompt = f"""
45
  <|system|>
@@ -56,7 +91,7 @@ Extract all transactions from this bank statement with these exact fields:
56
  - category
57
 
58
  Statement text:
59
- {cleaned_text[:3000]} [truncated if too long]
60
 
61
  Return JSON with this exact structure:
62
  {{
@@ -69,15 +104,6 @@ Return JSON with this exact structure:
69
  "credit": "8315.40",
70
  "closing_balance": "38315.40",
71
  "category": "Salary"
72
- }},
73
- {{
74
- "date": "2025-05-19",
75
- "description": "Whole Foods",
76
- "amount": "142.21",
77
- "debit": "142.21",
78
- "credit": "0.00",
79
- "closing_balance": "38173.19",
80
- "category": "Groceries"
81
  }}
82
  ]
83
  }}
@@ -98,7 +124,7 @@ RULES:
98
  prompt,
99
  max_new_tokens=2000,
100
  temperature=0.01,
101
- stop_sequences=["</s>"]
102
  )
103
  print(f"LLM Response: {response}")
104
 
@@ -119,8 +145,7 @@ RULES:
119
  return data
120
  except Exception as e:
121
  print(f"LLM Error: {str(e)}")
122
- # Fallback to rule-based parser
123
- return rule_based_parser(cleaned_text)
124
 
125
  def rule_based_parser(text):
126
  """Enhanced fallback parser for structured tables"""
@@ -134,30 +159,37 @@ def rule_based_parser(text):
134
  ]
135
 
136
  for i, line in enumerate(lines):
137
- if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
138
  header_index = i
139
  break
140
 
141
  if header_index is None:
142
- # Try pipe-delimited format as fallback
143
- for i, line in enumerate(lines):
144
- if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
145
- header_index = i
146
- break
147
-
148
- if header_index is None or header_index + 1 >= len(lines):
149
  return {"transactions": []}
150
 
151
  data_lines = lines[header_index + 1:]
152
  transactions = []
153
 
154
  for line in data_lines:
155
- # Handle both pipe-delimited and space-aligned formats
156
  if '|' in line:
157
  parts = [p.strip() for p in line.split('|') if p.strip()]
158
  else:
159
- # Space-aligned format - split by 2+ spaces
160
- parts = re.split(r'\s{2,}', line)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  if len(parts) < 7:
163
  continue
@@ -194,16 +226,10 @@ def format_number(value):
194
  value += '.00'
195
 
196
  # Ensure two decimal places
197
- parts = value.split('.')
198
- if len(parts) == 2:
199
- integer = parts[0].lstrip('0') or '0'
200
- decimal = parts[1][:2].ljust(2, '0')
201
- value = f"{integer}.{decimal}"
202
-
203
- # Handle negative signs
204
- if value.startswith('-'):
205
- return f"-{value[1:].lstrip('0')}" if value[1:] != '0.00' else '0.00'
206
- return value
207
 
208
  def process_file(file, is_scanned):
209
  """Main processing function"""
 
8
  from pdf2image import convert_from_path
9
  from huggingface_hub import InferenceClient
10
 
11
+ # Initialize with reliable free model
12
  hf_token = os.getenv("HF_TOKEN")
13
  client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
14
 
 
24
  with pdfplumber.open(pdf_path) as pdf:
25
  text = ""
26
  for page in pdf.pages:
27
+ # Extract tables first for structured data
28
+ tables = page.extract_tables()
29
+ for table in tables:
30
+ for row in table:
31
+ text += " | ".join(str(cell) for cell in row) + "\n"
32
+ text += "\n"
33
+
34
+ # Extract text for unstructured data
35
+ page_text = page.extract_text()
36
+ if page_text:
37
+ text += page_text + "\n\n"
38
  return text
39
  except Exception as e:
40
  print(f"Native PDF extraction failed: {str(e)}")
 
47
 
48
  def parse_bank_statement(text):
49
  """Parse bank statement with the rule-based parser first, falling back to the LLM parser"""
50
+ # Clean text and remove non-essential lines
51
  cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
52
+ cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
53
+ cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
54
+
55
+ # Keep only lines that look like transactions
56
+ transaction_lines = []
57
+ for line in cleaned_text.split('\n'):
58
+ if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
59
+ transaction_lines.append(line)
60
+ elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
61
+ transaction_lines.append(line)
62
+
63
+ cleaned_text = "\n".join(transaction_lines)
64
+ print(f"Cleaned text sample: {cleaned_text[:200]}...")
65
 
66
+ # Try rule-based parsing first for structured data
67
+ rule_based_data = rule_based_parser(cleaned_text)
68
+ if rule_based_data["transactions"]:
69
+ print("Using rule-based parser results")
70
+ return rule_based_data
71
+
72
+ # Fallback to LLM for unstructured data
73
+ print("Falling back to LLM parsing")
74
+ return llm_parser(cleaned_text)
75
+
76
+ def llm_parser(text):
77
+ """LLM parser for unstructured text"""
78
  # Craft precise prompt with strict JSON formatting instructions
79
  prompt = f"""
80
  <|system|>
 
91
  - category
92
 
93
  Statement text:
94
+ {text[:3000]} [truncated if too long]
95
 
96
  Return JSON with this exact structure:
97
  {{
 
104
  "credit": "8315.40",
105
  "closing_balance": "38315.40",
106
  "category": "Salary"
 
 
 
 
 
 
 
 
 
107
  }}
108
  ]
109
  }}
 
124
  prompt,
125
  max_new_tokens=2000,
126
  temperature=0.01,
127
+ stop=["</s>"] # Updated to 'stop' parameter
128
  )
129
  print(f"LLM Response: {response}")
130
 
 
145
  return data
146
  except Exception as e:
147
  print(f"LLM Error: {str(e)}")
148
+ return {"transactions": []}
 
149
 
150
  def rule_based_parser(text):
151
  """Rule-based parser for structured tables (tried before the LLM parser)"""
 
159
  ]
160
 
161
  for i, line in enumerate(lines):
162
+ if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
163
  header_index = i
164
  break
165
 
166
  if header_index is None:
 
 
 
 
 
 
 
167
  return {"transactions": []}
168
 
169
  data_lines = lines[header_index + 1:]
170
  transactions = []
171
 
172
  for line in data_lines:
173
+ # Handle both pipe-delimited and space-delimited formats
174
  if '|' in line:
175
  parts = [p.strip() for p in line.split('|') if p.strip()]
176
  else:
177
+ # Space-delimited format - split while preserving multi-word descriptions
178
+ parts = []
179
+ current = ""
180
+ in_description = False
181
+ for char in line:
182
+ if char == ' ' and not in_description:
183
+ if current:
184
+ parts.append(current)
185
+ current = ""
186
+ # After date field, we're in description
187
+ if len(parts) == 1:
188
+ in_description = True
189
+ else:
190
+ current += char
191
+ if current:
192
+ parts.append(current)
193
 
194
  if len(parts) < 7:
195
  continue
 
226
  value += '.00'
227
 
228
  # Ensure two decimal places
229
+ try:
230
+ return f"{float(value):.2f}"
231
+ except:
232
+ return value
 
 
 
 
 
 
233
 
234
  def process_file(file, is_scanned):
235
  """Main processing function"""