Spaces:

Bhaskar2611
/

BankStatement_Parser

Sleeping

App Files Files Community

Bhaskar2611 committed on Jun 8

Commit

4cfc47d

verified ·

1 Parent(s): d05b1b3

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -5

app.py CHANGED Viewed

@@ -81,10 +81,10 @@ def rule_based_parser(text):
     """Fallback parser for structured tables with pipe delimiters"""
     lines = [line.strip() for line in text.split('\n') if line.strip()]
-    # Find header line containing 'Date'
     header_index = None
     for i, line in enumerate(lines):
-        if re.search(r'\bDate\b', line):
             header_index = i
             break
@@ -120,7 +120,10 @@ def rule_based_parser(text):
 def process_file(file, is_scanned):
     """Main processing function"""
     if not file:
-        return "No file uploaded"
     file_path = file.name
     file_ext = os.path.splitext(file_path)[1].lower()
@@ -131,13 +134,32 @@ def process_file(file, is_scanned):
         elif file_ext == '.pdf':
             text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
         else:
-            return {"error": "Unsupported file format"}
         parsed_data = parse_bank_statement(text)
         df = pd.DataFrame(parsed_data["transactions"])
         return df
     except Exception as e:
-        return f"Error: {str(e)}"
 # Gradio Interface
 interface = gr.Interface(

     """Fallback parser for structured tables with pipe delimiters"""
     lines = [line.strip() for line in text.split('\n') if line.strip()]
+    # Find header line containing '|Date' (a pipe immediately followed by "Date", no space)
     header_index = None
     for i, line in enumerate(lines):
+        if re.search(r'\|Date', line):  # Improved pattern to match "|Date"
             header_index = i
             break
 def process_file(file, is_scanned):
     """Main processing function"""
     if not file:
+        return pd.DataFrame(columns=[
+            "Date", "Description", "Amount", "Debit",
+            "Credit", "Closing Balance", "Category"
+        ])
     file_path = file.name
     file_ext = os.path.splitext(file_path)[1].lower()
         elif file_ext == '.pdf':
             text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
         else:
+            return pd.DataFrame(columns=[
+                "Date", "Description", "Amount", "Debit",
+                "Credit", "Closing Balance", "Category"
+            ])
         parsed_data = parse_bank_statement(text)
         df = pd.DataFrame(parsed_data["transactions"])
+        # Ensure all required columns exist
+        required_cols = ["date", "description", "amount", "debit",
+                        "credit", "closing_balance", "category"]
+        for col in required_cols:
+            if col not in df.columns:
+                df[col] = ""
+        df.columns = ["Date", "Description", "Amount", "Debit",
+                     "Credit", "Closing Balance", "Category"]
         return df
     except Exception as e:
+        print(f"Processing error: {str(e)}")
+        # Return empty DataFrame with correct columns on error
+        return pd.DataFrame(columns=[
+            "Date", "Description", "Amount", "Debit",
+            "Credit", "Closing Balance", "Category"
+        ])
 # Gradio Interface
 interface = gr.Interface(