Update app.py
app.py
CHANGED
@@ -1,3 +1,301 @@
+# import os
+# import re
+# import json
+# import gradio as gr
+# import pandas as pd
+# import pdfplumber
+# import pytesseract
+# from pdf2image import convert_from_path
+# from huggingface_hub import InferenceClient
+
+# # Initialize with reliable free model
+# hf_token = os.getenv("HF_TOKEN")
+# client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.2", token=hf_token)
+
+# def extract_excel_data(file_path):
+# """Extract text from Excel file"""
+# df = pd.read_excel(file_path, engine='openpyxl')
+# return df.to_string(index=False)
+
+# def extract_text_from_pdf(pdf_path, is_scanned=False):
+# """Extract text from PDF with fallback OCR"""
+# try:
+# # Try native PDF extraction first
+# with pdfplumber.open(pdf_path) as pdf:
+# text = ""
+# for page in pdf.pages:
+# # Extract tables first for structured data
+# tables = page.extract_tables()
+# for table in tables:
+# for row in table:
+# text += " | ".join(str(cell) for cell in row) + "\n"
+# text += "\n"
+
+# # Extract text for unstructured data
+# page_text = page.extract_text()
+# if page_text:
+# text += page_text + "\n\n"
+# return text
+# except Exception as e:
+# print(f"Native PDF extraction failed: {str(e)}")
+# # Fallback to OCR for scanned PDFs
+# images = convert_from_path(pdf_path, dpi=200)
+# text = ""
+# for image in images:
+# text += pytesseract.image_to_string(image) + "\n"
+# return text
+
+# def parse_bank_statement(text):
+# """Parse bank statement using LLM with fallback to rule-based parser"""
+# # Clean text and remove non-essential lines
+# cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+# cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
+# cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
+
+# # Keep only lines that look like transactions
+# transaction_lines = []
+# for line in cleaned_text.split('\n'):
+# if re.match(r'^\d{4}-\d{2}-\d{2}', line): # Date pattern
+# transaction_lines.append(line)
+# elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
+# transaction_lines.append(line)
+
+# cleaned_text = "\n".join(transaction_lines)
+# print(f"Cleaned text sample: {cleaned_text[:200]}...")
+
+# # Try rule-based parsing first for structured data
+# rule_based_data = rule_based_parser(cleaned_text)
+# if rule_based_data["transactions"]:
+# print("Using rule-based parser results")
+# return rule_based_data
+
+# # Fallback to LLM for unstructured data
+# print("Falling back to LLM parsing")
+# return llm_parser(cleaned_text)
+
+# def llm_parser(text):
+# """LLM parser for unstructured text"""
+# # Craft precise prompt with strict JSON formatting instructions
+# prompt = f"""
+# <|system|>
+# You are a financial data parser. Extract transactions from bank statements and return ONLY valid JSON.
+# </s>
+# <|user|>
+# Extract all transactions from this bank statement with these exact fields:
+# - date (format: YYYY-MM-DD)
+# - description
+# - amount (format: 0.00)
+# - debit (format: 0.00)
+# - credit (format: 0.00)
+# - closing_balance (format: 0.00 or -0.00 for negative)
+# - category
+
+# Statement text:
+# {text[:3000]} [truncated if too long]
+
+# Return JSON with this exact structure:
+# {{
+# "transactions": [
+# {{
+# "date": "2025-05-08",
+# "description": "Company XYZ Payroll",
+# "amount": "8315.40",
+# "debit": "0.00",
+# "credit": "8315.40",
+# "closing_balance": "38315.40",
+# "category": "Salary"
+# }}
+# ]
+# }}
+
+# RULES:
+# 1. Output ONLY the JSON object with no additional text
+# 2. Keep amounts as strings with 2 decimal places
+# 3. For missing values, use empty strings
+# 4. Convert negative amounts to format "-123.45"
+# 5. Map categories to: Salary, Groceries, Medical, Utilities, Entertainment, Dining, Misc
+# </s>
+# <|assistant|>
+# """
+
+# try:
+# # Call LLM via Hugging Face Inference API
+# response = client.text_generation(
+# prompt,
+# max_new_tokens=2000,
+# temperature=0.01,
+# stop=["</s>"] # Updated to 'stop' parameter
+# )
+# print(f"LLM Response: {response}")
+
+# # Validate and clean JSON response
+# response = response.strip()
+# if not response.startswith('{'):
+# # Find the first { and last } to extract JSON
+# start_idx = response.find('{')
+# end_idx = response.rfind('}')
+# if start_idx != -1 and end_idx != -1:
+# response = response[start_idx:end_idx+1]
+
+# # Parse JSON and validate structure
+# data = json.loads(response)
+# if "transactions" not in data:
+# raise ValueError("Missing 'transactions' key in JSON")
+
+# return data
+# except Exception as e:
+# print(f"LLM Error: {str(e)}")
+# return {"transactions": []}
+
+# def rule_based_parser(text):
+# """Enhanced fallback parser for structured tables"""
+# lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+# # Find header line - more flexible detection
+# header_index = None
+# header_patterns = [
+# r'Date\b', r'Description\b', r'Amount\b',
+# r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
+# ]
+
+# for i, line in enumerate(lines):
+# if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
+# header_index = i
+# break
+
+# if header_index is None:
+# return {"transactions": []}
+
+# data_lines = lines[header_index + 1:]
+# transactions = []
+
+# for line in data_lines:
+# # Handle both pipe-delimited and space-delimited formats
+# if '|' in line:
+# parts = [p.strip() for p in line.split('|') if p.strip()]
+# else:
+# # Space-delimited format - split while preserving multi-word descriptions
+# parts = []
+# current = ""
+# in_description = False
+# for char in line:
+# if char == ' ' and not in_description:
+# if current:
+# parts.append(current)
+# current = ""
+# # After date field, we're in description
+# if len(parts) == 1:
+# in_description = True
+# else:
+# current += char
+# if current:
+# parts.append(current)
+
+# if len(parts) < 7:
+# continue
+
+# try:
+# transactions.append({
+# "date": parts[0],
+# "description": parts[1],
+# "amount": format_number(parts[2]),
+# "debit": format_number(parts[3]),
+# "credit": format_number(parts[4]),
+# "closing_balance": format_number(parts[5]),
+# "category": parts[6]
+# })
+# except Exception as e:
+# print(f"Error parsing line: {str(e)}")
+
+# return {"transactions": transactions}
+
+# def format_number(value):
+# """Format numeric values consistently"""
+# if not value:
+# return "0.00"
+
+# # Clean numeric values
+# value = value.replace(',', '').replace('$', '').strip()
+
+# # Handle negative numbers in parentheses
+# if '(' in value and ')' in value:
+# value = '-' + value.replace('(', '').replace(')', '')
+
+# # Standardize decimal format
+# if '.' not in value:
+# value += '.00'
+
+# # Ensure two decimal places
+# try:
+# return f"{float(value):.2f}"
+# except:
+# return value
+
+# def process_file(file, is_scanned):
+# """Main processing function"""
+# if not file:
+# return pd.DataFrame(columns=[
+# "Date", "Description", "Amount", "Debit",
+# "Credit", "Closing Balance", "Category"
+# ])
+
+# file_path = file.name
+# file_ext = os.path.splitext(file_path)[1].lower()
+
+# try:
+# if file_ext == '.xlsx':
+# text = extract_excel_data(file_path)
+# elif file_ext == '.pdf':
+# text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
+# else:
+# return pd.DataFrame(columns=[
+# "Date", "Description", "Amount", "Debit",
+# "Credit", "Closing Balance", "Category"
+# ])
+
+# parsed_data = parse_bank_statement(text)
+# df = pd.DataFrame(parsed_data["transactions"])
+
+# # Ensure all required columns exist
+# required_cols = ["date", "description", "amount", "debit",
+# "credit", "closing_balance", "category"]
+# for col in required_cols:
+# if col not in df.columns:
+# df[col] = ""
+
+# # Format columns properly
+# df.columns = ["Date", "Description", "Amount", "Debit",
+# "Credit", "Closing Balance", "Category"]
+# return df
+
+# except Exception as e:
+# print(f"Processing error: {str(e)}")
+# # Return empty DataFrame with correct columns on error
+# return pd.DataFrame(columns=[
+# "Date", "Description", "Amount", "Debit",
+# "Credit", "Closing Balance", "Category"
+# ])
+
+# # Gradio Interface
+# interface = gr.Interface(
+# fn=process_file,
+# inputs=[
+# gr.File(label="Upload Bank Statement (PDF/Excel)"),
+# gr.Checkbox(label="Is Scanned PDF? (Use OCR)")
+# ],
+# outputs=gr.Dataframe(
+# label="Parsed Transactions",
+# headers=["Date", "Description", "Amount", "Debit", "Credit", "Closing Balance", "Category"],
+# datatype=["date", "str", "number", "number", "number", "number", "str"]
+# ),
+# title="AI Bank Statement Parser",
+# description="Extract structured transaction data from PDF/Excel bank statements",
+# allow_flagging="never"
+# )
+
+# if __name__ == "__main__":
+# interface.launch()
+
 import os
 import re
 import json
@@ -45,22 +343,26 @@ def extract_text_from_pdf(pdf_path, is_scanned=False):
             text += pytesseract.image_to_string(image) + "\n"
         return text
 
-def parse_bank_statement(text):
+def parse_bank_statement(text, file_type):
     """Parse bank statement using LLM with fallback to rule-based parser"""
-    # Clean text and remove non-essential lines
+    # Clean text differently based on file type
     cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
-    cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
-    cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
-
-    # Keep only lines that look like transactions
-    transaction_lines = []
-    for line in cleaned_text.split('\n'):
-        if re.match(r'^\d{4}-\d{2}-\d{2}', line):  # Date pattern
-            transaction_lines.append(line)
-        elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
-            transaction_lines.append(line)
-
-    cleaned_text = "\n".join(transaction_lines)
+
+    if file_type == 'pdf':
+        # PDF-specific cleaning
+        cleaned_text = re.sub(r'Page \d+ of \d+', '', cleaned_text, flags=re.IGNORECASE)
+        cleaned_text = re.sub(r'CropBox.*?MediaBox', '', cleaned_text, flags=re.IGNORECASE)
+
+        # Keep only lines that look like transactions
+        transaction_lines = []
+        for line in cleaned_text.split('\n'):
+            if re.match(r'^\d{4}-\d{2}-\d{2}', line):  # Date pattern
+                transaction_lines.append(line)
+            elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
+                transaction_lines.append(line)
+
+        cleaned_text = "\n".join(transaction_lines)
+
     print(f"Cleaned text sample: {cleaned_text[:200]}...")
 
     # Try rule-based parsing first for structured data
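A quick sketch (not part of the commit) of what the new PDF-only pre-filter keeps and drops, assuming raw pdfplumber output shaped like the hypothetical sample below:

import re

raw = """Page 1 of 3
Date | Description | Amount | Debit | Credit | Closing Balance | Category
2025-05-08 | Company XYZ Payroll | 8315.40 | 0.00 | 8315.40 | 38315.40 | Salary
Thank you for banking with us."""

transaction_lines = []
for line in raw.split('\n'):
    if re.match(r'^\d{4}-\d{2}-\d{2}', line):  # dated transaction rows
        transaction_lines.append(line)
    elif '|' in line and any(x in line for x in ['Date', 'Amount', 'Balance']):
        transaction_lines.append(line)  # the header row

print("\n".join(transaction_lines))
# Keeps the header row and the 2025-05-08 row; the page marker and footer line are dropped.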
@@ -158,11 +460,26 @@ def rule_based_parser(text):
         r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
     ]
 
+    # First try: Look for a full header line
     for i, line in enumerate(lines):
-        if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
+        if all(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns[:3]):
             header_index = i
             break
 
+    # Second try: Look for any header indicators
+    if header_index is None:
+        for i, line in enumerate(lines):
+            if any(re.search(pattern, line, re.IGNORECASE) for pattern in header_patterns):
+                header_index = i
+                break
+
+    # Third try: Look for pipe-delimited headers
+    if header_index is None:
+        for i, line in enumerate(lines):
+            if '|' in line and any(p in line for p in ['Date', 'Amount', 'Balance']):
+                header_index = i
+                break
+
     if header_index is None:
         return {"transactions": []}
 
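An illustrative check (not from the commit) of why the stricter pass runs first, reusing the same header_patterns as above: the first pass demands Date, Description and Amount together, so a stray line that merely mentions one keyword no longer wins.

import re

header_patterns = [
    r'Date\b', r'Description\b', r'Amount\b',
    r'Debit\b', r'Credit\b', r'Closing\s*Balance\b', r'Category\b'
]

full_header = "Date | Description | Amount | Debit | Credit | Closing Balance | Category"
stray_line = "Amount due on your next statement"

# First pass: all of the first three patterns must match.
print(all(re.search(p, full_header, re.IGNORECASE) for p in header_patterns[:3]))  # True
print(all(re.search(p, stray_line, re.IGNORECASE) for p in header_patterns[:3]))   # False

# Second pass (only reached if no full header was found): any single match suffices,
# so the stray line would be accepted there.
print(any(re.search(p, stray_line, re.IGNORECASE) for p in header_patterns))        # True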
@@ -174,27 +491,18 @@ def rule_based_parser(text):
         if '|' in line:
             parts = [p.strip() for p in line.split('|') if p.strip()]
         else:
-            # Space-delimited format - split while preserving multi-word descriptions
-            parts = []
-            current = ""
-            in_description = False
-            for char in line:
-                if char == ' ' and not in_description:
-                    if current:
-                        parts.append(current)
-                        current = ""
-                        # After date field, we're in description
-                        if len(parts) == 1:
-                            in_description = True
-                else:
-                    current += char
-            if current:
-                parts.append(current)
+            # Space-delimited format - split by 2+ spaces
+            parts = re.split(r'\s{2,}', line)
 
+        # Skip lines that don't have enough parts
         if len(parts) < 7:
             continue
 
         try:
+            # Handle transaction date validation
+            if not re.match(r'\d{4}-\d{2}-\d{2}', parts[0]):
+                continue
+
             transactions.append({
                 "date": parts[0],
                 "description": parts[1],
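A rough look (illustration only, with an invented sample row) at what the new split on runs of two or more spaces does compared with the old character-by-character splitter:

import re

line = "2025-05-08  Company XYZ Payroll  8315.40  0.00  8315.40  38315.40  Salary"
parts = re.split(r'\s{2,}', line)
print(parts)
# ['2025-05-08', 'Company XYZ Payroll', '8315.40', '0.00', '8315.40', '38315.40', 'Salary']
# Single spaces inside the description survive, and the 7 parts pass the len(parts) < 7 check.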
@@ -211,70 +519,116 @@ def rule_based_parser(text):
 
 def format_number(value):
     """Format numeric values consistently"""
-    if not value:
+    if not value or str(value).lower() in ['nan', 'nat']:
         return "0.00"
 
-    # Clean numeric values
-    value = value.replace(',', '').replace('$', '').strip()
+    # If it's already a number, format directly
+    if isinstance(value, (int, float)):
+        return f"{value:.2f}"
+
+    # Clean string values
+    value = str(value).replace(',', '').replace('$', '').strip()
 
     # Handle negative numbers in parentheses
     if '(' in value and ')' in value:
         value = '-' + value.replace('(', '').replace(')', '')
 
+    # Handle empty values
+    if not value:
+        return "0.00"
+
     # Standardize decimal format
     if '.' not in value:
         value += '.00'
 
     # Ensure two decimal places
     try:
-        return f"{float(value):.2f}"
-    except:
-        return value
+        num_value = float(value)
+        return f"{num_value:.2f}"
+    except ValueError:
+        # If we can't convert to float, return original but clean it
+        return value.split('.')[0] + '.' + value.split('.')[1][:2].ljust(2, '0')
 
 def process_file(file, is_scanned):
     """Main processing function"""
     if not file:
-        return pd.DataFrame(columns=[
-            "Date", "Description", "Amount", "Debit",
-            "Credit", "Closing Balance", "Category"
-        ])
+        return empty_df()
 
     file_path = file.name
     file_ext = os.path.splitext(file_path)[1].lower()
 
     try:
         if file_ext == '.xlsx':
-            text = extract_excel_data(file_path)
+            # Directly process Excel files without text conversion
+            df = pd.read_excel(file_path, engine='openpyxl')
+
+            # Normalize column names
+            df.columns = df.columns.str.strip().str.lower()
+
+            # Create mapping to expected columns
+            col_mapping = {
+                'date': 'date',
+                'description': 'description',
+                'amount': 'amount',
+                'debit': 'debit',
+                'credit': 'credit',
+                'closing balance': 'closing_balance',
+                'closing': 'closing_balance',
+                'balance': 'closing_balance',
+                'category': 'category'
+            }
+
+            # Create output DataFrame with required columns
+            output_df = pd.DataFrame()
+            for col in ['date', 'description', 'amount', 'debit', 'credit', 'closing_balance', 'category']:
+                if col in df.columns:
+                    output_df[col] = df[col]
+                elif any(alias in col_mapping and col_mapping[alias] == col for alias in df.columns):
+                    # Find alias
+                    for alias in df.columns:
+                        if alias in col_mapping and col_mapping[alias] == col:
+                            output_df[col] = df[alias]
+                            break
+                else:
+                    output_df[col] = ""
+
+            # Format numeric columns
+            for col in ['amount', 'debit', 'credit', 'closing_balance']:
+                output_df[col] = output_df[col].apply(format_number)
+
+            # Rename columns for display
+            output_df.columns = ["Date", "Description", "Amount", "Debit",
+                                 "Credit", "Closing Balance", "Category"]
+            return output_df
+
         elif file_ext == '.pdf':
             text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
-        else:
-            return pd.DataFrame(columns=[
-                "Date", "Description", "Amount", "Debit",
-                "Credit", "Closing Balance", "Category"
-            ])
-
-        parsed_data = parse_bank_statement(text)
-        df = pd.DataFrame(parsed_data["transactions"])
+            parsed_data = parse_bank_statement(text, 'pdf')
+            df = pd.DataFrame(parsed_data["transactions"])
+
+            # Ensure all required columns exist
+            required_cols = ["date", "description", "amount", "debit",
+                             "credit", "closing_balance", "category"]
+            for col in required_cols:
+                if col not in df.columns:
+                    df[col] = ""
+
+            # Format columns properly
+            df.columns = ["Date", "Description", "Amount", "Debit",
+                          "Credit", "Closing Balance", "Category"]
+            return df
 
-        # Ensure all required columns exist
-        required_cols = ["date", "description", "amount", "debit",
-                         "credit", "closing_balance", "category"]
-        for col in required_cols:
-            if col not in df.columns:
-                df[col] = ""
-
-        # Format columns properly
-        df.columns = ["Date", "Description", "Amount", "Debit",
-                      "Credit", "Closing Balance", "Category"]
-        return df
+        else:
+            return empty_df()
 
     except Exception as e:
         print(f"Processing error: {str(e)}")
-        # Return empty DataFrame with correct columns on error
-        return pd.DataFrame(columns=[
-            "Date", "Description", "Amount", "Debit",
-            "Credit", "Closing Balance", "Category"
-        ])
+        return empty_df()
+
+def empty_df():
+    """Return empty DataFrame with correct columns"""
+    return pd.DataFrame(columns=["Date", "Description", "Amount", "Debit",
+                                 "Credit", "Closing Balance", "Category"])
 
 # Gradio Interface
 interface = gr.Interface(
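A rough usage check for the reworked helpers, assuming this Space's app.py can be imported as app (it pulls in gradio, pandas and the other dependencies, so it needs to run inside the Space environment):

from app import format_number, empty_df

print(format_number(None))          # "0.00"     - empty values
print(format_number(float("nan")))  # "0.00"     - new NaN/NaT guard
print(format_number(8315.4))        # "8315.40"  - numeric input formatted directly
print(format_number("(1,250)"))     # "-1250.00" - parentheses treated as negative
print(format_number("$2,000.5"))    # "2000.50"
print(empty_df().columns.tolist())  # the seven display columns, Date through Category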