Spaces:

Bhaskar2611
/

BankStatement_Parser

Sleeping

App Files Files Community

Bhaskar2611 committed on Jun 8

Commit

c70b653

verified ·

1 Parent(s): fd970b6

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -37

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import pytesseract
 from pdf2image import convert_from_path
 from huggingface_hub import InferenceClient
-# Initialize with a reliable free model that supports text-generation
 hf_token = os.getenv("HF_TOKEN")
 client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
@@ -24,7 +24,17 @@ def extract_text_from_pdf(pdf_path, is_scanned=False):
         with pdfplumber.open(pdf_path) as pdf:
             text = ""
             for page in pdf.pages:
-                text += page.extract_text() + "\n"
             return text
     except Exception as e:
         print(f"Native PDF extraction failed: {str(e)}")
@@ -37,9 +47,34 @@ def extract_text_from_pdf(pdf_path, is_scanned=False):
 def parse_bank_statement(text):
     """Parse bank statement using LLM with fallback to rule-based parser"""
     cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
-    print(f"Original text sample: {cleaned_text[:200]}...")
     # Craft precise prompt with strict JSON formatting instructions
     prompt = f"""
 <|system|>
@@ -56,7 +91,7 @@ Extract all transactions from this bank statement with these exact fields:
 - category
 Statement text:
-{cleaned_text[:3000]}  [truncated if too long]
 Return JSON with this exact structure:
 {{
@@ -69,15 +104,6 @@ Return JSON with this exact structure:
       "credit": "8315.40",
       "closing_balance": "38315.40",
       "category": "Salary"
-    }},
-    {{
-      "date": "2025-05-19",
-      "description": "Whole Foods",
-      "amount": "142.21",
-      "debit": "142.21",
-      "credit": "0.00",
-      "closing_balance": "38173.19",
-      "category": "Groceries"
     }}
   ]
 }}
@@ -98,7 +124,7 @@ RULES:
             prompt,
             max_new_tokens=2000,
             temperature=0.01,
-            stop_sequences=["</s>"]
         )
         print(f"LLM Response: {response}")
@@ -119,8 +145,7 @@ RULES:
         return data
     except Exception as e:
         print(f"LLM Error: {str(e)}")
-        # Fallback to rule-based parser
-        return rule_based_parser(cleaned_text)
 def rule_based_parser(text):
     """Enhanced fallback parser for structured tables"""
@@ -134,30 +159,37 @@ def rule_based_parser(text):
     ]
     for i, line in enumerate(lines):
-        if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
             header_index = i
             break
     if header_index is None:
-        # Try pipe-delimited format as fallback
-        for i, line in enumerate(lines):
-            if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
-                header_index = i
-                break
-    if header_index is None or header_index + 1 >= len(lines):
         return {"transactions": []}
     data_lines = lines[header_index + 1:]
     transactions = []
     for line in data_lines:
-        # Handle both pipe-delimited and space-aligned formats
         if '|' in line:
             parts = [p.strip() for p in line.split('|') if p.strip()]
         else:
-            # Space-aligned format - split by 2+ spaces
-            parts = re.split(r'\s{2,}', line)
         if len(parts) < 7:
             continue
@@ -194,16 +226,10 @@ def format_number(value):
         value += '.00'
     # Ensure two decimal places
-    parts = value.split('.')
-    if len(parts) == 2:
-        integer = parts[0].lstrip('0') or '0'
-        decimal = parts[1][:2].ljust(2, '0')
-        value = f"{integer}.{decimal}"
-    # Handle negative signs
-    if value.startswith('-'):
-        return f"-{value[1:].lstrip('0')}" if value[1:] != '0.00' else '0.00'
-    return value
 def process_file(file, is_scanned):
     """Main processing function"""

 from pdf2image import convert_from_path
 from huggingface_hub import InferenceClient
+# Initialize with reliable free model
 hf_token = os.getenv("HF_TOKEN")
 client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
         with pdfplumber.open(pdf_path) as pdf:
             text = ""
             for page in pdf.pages:
+                # Extract tables first for structured data
+                tables = page.extract_tables()
+                for table in tables:
+                    for row in table:
+                        text += " | ".join(str(cell) for cell in row) + "\n"
+                    text += "\n"
+                # Extract text for unstructured data
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n\n"
             return text
     except Exception as e:
         print(f"Native PDF extraction failed: {str(e)}")
 def parse_bank_statement(text):
     """Parse bank statement using LLM with fallback to rule-based parser"""
+    # Clean text and remove non-essential lines
     cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+    cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
+    cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
+    # Keep only lines that look like transactions
+    transaction_lines = []
+    for line in cleaned_text.split('\n'):
+        if re.match(r'^\d{4}-\d{2}-\d{2}', line):  # Date pattern
+            transaction_lines.append(line)
+        elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
+            transaction_lines.append(line)
+    cleaned_text = "\n".join(transaction_lines)
+    print(f"Cleaned text sample: {cleaned_text[:200]}...")
+    # Try rule-based parsing first for structured data
+    rule_based_data = rule_based_parser(cleaned_text)
+    if rule_based_data["transactions"]:
+        print("Using rule-based parser results")
+        return rule_based_data
+    # Fallback to LLM for unstructured data
+    print("Falling back to LLM parsing")
+    return llm_parser(cleaned_text)
+def llm_parser(text):
+    """LLM parser for unstructured text"""
     # Craft precise prompt with strict JSON formatting instructions
     prompt = f"""
 <|system|>
 - category
 Statement text:
+{text[:3000]}  [truncated if too long]
 Return JSON with this exact structure:
 {{
       "credit": "8315.40",
       "closing_balance": "38315.40",
       "category": "Salary"
     }}
   ]
 }}
             prompt,
             max_new_tokens=2000,
             temperature=0.01,
+            stop=["</s>"]  # Updated to 'stop' parameter
         )
         print(f"LLM Response: {response}")
         return data
     except Exception as e:
         print(f"LLM Error: {str(e)}")
+        return {"transactions": []}
 def rule_based_parser(text):
     """Enhanced fallback parser for structured tables"""
     ]
     for i, line in enumerate(lines):
+        if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
             header_index = i
             break
     if header_index is None:
         return {"transactions": []}
     data_lines = lines[header_index + 1:]
     transactions = []
     for line in data_lines:
+        # Handle both pipe-delimited and space-delimited formats
         if '|' in line:
             parts = [p.strip() for p in line.split('|') if p.strip()]
         else:
+            # Space-delimited format - split while preserving multi-word descriptions
+            parts = []
+            current = ""
+            in_description = False
+            for char in line:
+                if char == ' ' and not in_description:
+                    if current:
+                        parts.append(current)
+                        current = ""
+                    # After date field, we're in description
+                    if len(parts) == 1:
+                        in_description = True
+                else:
+                    current += char
+            if current:
+                parts.append(current)
         if len(parts) < 7:
             continue
         value += '.00'
     # Ensure two decimal places
+    try:
+        return f"{float(value):.2f}"
+    except:
+        return value
 def process_file(file, is_scanned):
     """Main processing function"""