Bhaskar2611 committed on
Commit 6255a6d · verified · 1 Parent(s): aca59c0

Update app.py

Files changed (1)
  1. app.py +94 -44
app.py CHANGED
@@ -8,9 +8,9 @@ import pytesseract
 from pdf2image import convert_from_path
 from huggingface_hub import InferenceClient
 
-# Initialize Hugging Face Inference Client with a free model
+# Initialize Hugging Face Inference Client with a better free model
 hf_token = os.getenv("HF_TOKEN")
-client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=hf_token)
+client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1", token=hf_token)
 
 def extract_excel_data(file_path):
     """Extract text from Excel file"""
@@ -40,25 +40,25 @@ def parse_bank_statement(text):
     cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
     print(f"Original text sample: {cleaned_text[:200]}...")
 
-    # Craft precise prompt for LLM with proper JSON escaping
+    # Craft precise prompt with strict JSON formatting instructions
     prompt = f"""
-You are a financial data parser. Extract transactions from bank statements.
+<|system|>
+You are a financial data parser. Extract transactions from bank statements and return ONLY valid JSON.
+</s>
+<|user|>
+Extract all transactions from this bank statement with these exact fields:
+- date (format: YYYY-MM-DD)
+- description
+- amount (format: 0.00)
+- debit (format: 0.00)
+- credit (format: 0.00)
+- closing_balance (format: 0.00 or -0.00 for negative)
+- category
 
-Given this bank statement text:
-{cleaned_text}
+Statement text:
+{cleaned_text[:3000]} [truncated if too long]
 
-Extract all transactions with these fields:
-- Date
-- Description
-- Amount
-- Debit
-- Credit
-- Closing Balance
-- Category
-
-Return JSON with "transactions" array containing these fields.
-
-Example format:
+Return JSON with this exact structure:
 {{
     "transactions": [
         {{
@@ -82,44 +82,69 @@ Example format:
     ]
 }}
 
-Rules:
-1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
-2. Convert negative balances to standard format (e.g., "-2421.72")
-3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
-4. Only return valid JSON with no additional text
+RULES:
+1. Output ONLY the JSON object with no additional text
+2. Keep amounts as strings with 2 decimal places
+3. For missing values, use empty strings
+4. Convert negative amounts to format "-123.45"
+5. Map categories to: Salary, Groceries, Medical, Utilities, Entertainment, Dining, Misc
+</s>
+<|assistant|>
 """
-
+
     try:
-        # Call LLM via Hugging Face Inference API
+        # Call LLM with strict parameters
        response = client.text_generation(
            prompt,
            max_new_tokens=2000,
-            temperature=0.1,
+            temperature=0.01,  # Lower temperature for more deterministic output
            stop_sequences=["</s>"]
        )
        print(f"LLM Response: {response}")
 
-        # Extract JSON from response (remove non-JSON prefixes/suffixes)
-        json_match = re.search(r'\{.*\}', response, re.DOTALL)
-        if json_match:
-            return json.loads(json_match.group())
-        return json.loads(response)
+        # Validate and clean JSON response
+        response = response.strip()
+        if not response.startswith('{'):
+            # Find the first { and last } to extract JSON
+            start_idx = response.find('{')
+            end_idx = response.rfind('}')
+            if start_idx != -1 and end_idx != -1:
+                response = response[start_idx:end_idx+1]
+
+        # Parse JSON and validate structure
+        data = json.loads(response)
+        if "transactions" not in data:
+            raise ValueError("Missing 'transactions' key in JSON")
+
+        return data
    except Exception as e:
        print(f"LLM Error: {str(e)}")
        # Fallback to rule-based parser
        return rule_based_parser(cleaned_text)
 
 def rule_based_parser(text):
-    """Fallback parser for structured tables with pipe delimiters"""
+    """Enhanced fallback parser for structured tables"""
    lines = [line.strip() for line in text.split('\n') if line.strip()]
 
-    # Find header line containing '| Date'
+    # Find header line - more flexible detection
    header_index = None
+    header_patterns = [
+        r'Date\b', r'Description\b', r'Amount\b',
+        r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
+    ]
+
    for i, line in enumerate(lines):
-        if re.search(r'\|Date|Date\|', line, re.IGNORECASE):
+        if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
            header_index = i
            break
 
+    if header_index is None:
+        # Try pipe-delimited format as fallback
+        for i, line in enumerate(lines):
+            if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
+                header_index = i
+                break
+
    if header_index is None or header_index + 1 >= len(lines):
        return {"transactions": []}
 
@@ -127,15 +152,17 @@ def rule_based_parser(text):
    transactions = []
 
    for line in data_lines:
-        if not '|' in line:
-            continue
-
-        parts = [p.strip() for p in line.split('|') if p.strip()]
+        # Handle both pipe-delimited and space-aligned formats
+        if '|' in line:
+            parts = [p.strip() for p in line.split('|') if p.strip()]
+        else:
+            # Space-aligned format - split by 2+ spaces
+            parts = re.split(r'\s{2,}', line)
+
        if len(parts) < 7:
            continue
 
        try:
-            # Handle numeric values consistently
            transactions.append({
                "date": parts[0],
                "description": parts[1],
@@ -152,9 +179,30 @@ def rule_based_parser(text):
 
 def format_number(value):
    """Format numeric values consistently"""
-    value = value.replace(',', '')
-    if re.match(r'^-?\d+(\.\d+)?$', value):
-        return f"{float(value):.2f}"
+    if not value:
+        return "0.00"
+
+    # Clean numeric values
+    value = value.replace(',', '').replace('$', '').strip()
+
+    # Handle negative numbers in parentheses
+    if '(' in value and ')' in value:
+        value = '-' + value.replace('(', '').replace(')', '')
+
+    # Standardize decimal format
+    if '.' not in value:
+        value += '.00'
+
+    # Ensure two decimal places
+    parts = value.split('.')
+    if len(parts) == 2:
+        integer = parts[0].lstrip('0') or '0'
+        decimal = parts[1][:2].ljust(2, '0')
+        value = f"{integer}.{decimal}"
+
+    # Handle negative signs
+    if value.startswith('-'):
+        return f"-{value[1:].lstrip('0')}" if value[1:] != '0.00' else '0.00'
    return value
 
 def process_file(file, is_scanned):
@@ -189,6 +237,7 @@ def process_file(file, is_scanned):
        if col not in df.columns:
            df[col] = ""
 
+    # Format columns properly
    df.columns = ["Date", "Description", "Amount", "Debit",
                  "Credit", "Closing Balance", "Category"]
    return df
@@ -210,10 +259,11 @@ interface = gr.Interface(
    ],
    outputs=gr.Dataframe(
        label="Parsed Transactions",
-        headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"]
+        headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
+        datatype=["date", "str", "number", "number", "number", "number", "str"]
    ),
    title="AI Bank Statement Parser",
-    description="Extract structured transaction data from PDF/Excel bank statements using LLM and hybrid parsing techniques.",
+    description="Extract structured transaction data from PDF/Excel bank statements",
    allow_flagging="never"
 )
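
For reference, a minimal standalone sketch of the JSON-recovery step this commit adds to parse_bank_statement: it applies the same slice-and-validate logic shown in the diff to a hypothetical noisy model response (the sample string below is invented for illustration, not output from the model).

```python
import json

# Hypothetical raw LLM output with chatter around the JSON payload (invented sample).
raw = 'Sure, here is the data:\n{"transactions": [{"date": "2024-01-05", "amount": "12.50"}]}\nDone.'

response = raw.strip()
if not response.startswith('{'):
    # Same recovery as in the commit: keep only the outermost {...} span.
    start_idx = response.find('{')
    end_idx = response.rfind('}')
    if start_idx != -1 and end_idx != -1:
        response = response[start_idx:end_idx + 1]

data = json.loads(response)
if "transactions" not in data:
    raise ValueError("Missing 'transactions' key in JSON")

print(data["transactions"][0]["amount"])  # prints: 12.50
```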