Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import pdfplumber | |
| import pytesseract | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
def extract_excel_data(file_path):
    """Load an .xlsx workbook and return its contents rendered as plain text.

    The text form feeds the same downstream parser used for PDF input.
    """
    workbook = pd.read_excel(file_path, engine='openpyxl')
    return workbook.to_string()
def _ocr_pdf(pdf_path, dpi=200):
    """Rasterize every page of the PDF and OCR each image with Tesseract."""
    images = convert_from_path(pdf_path, dpi=dpi)
    return "".join(pytesseract.image_to_string(image) + "\n" for image in images)


def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Extract text from a PDF, falling back to OCR when needed.

    Args:
        pdf_path: Path to the PDF file.
        is_scanned: When True, skip native extraction and go straight to OCR
            (previously this flag was accepted but ignored).

    Returns:
        The extracted text, one trailing newline per page.
    """
    if is_scanned:
        # A scanned PDF has no text layer; native extraction is wasted work.
        return _ocr_pdf(pdf_path)
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            for page in pdf.pages:
                # extract_text() may return None for pages without a text
                # layer; the original code crashed on `None + "\n"` here.
                text += (page.extract_text() or "") + "\n"
        if text.strip():
            return text
        # The file opened fine but carries no text layer at all.
        print("No text layer found. Trying OCR fallback...")
    except Exception as e:
        # Fallback to OCR if the PDF cannot be parsed natively
        print(f"Native PDF extraction failed: {str(e)}")
        print("Trying OCR fallback...")
    return _ocr_pdf(pdf_path)
def parse_bank_statement(text):
    """Turn raw statement text into structured transaction records.

    Assembles an LLM extraction prompt, but in this demo the actual parsing
    is delegated to a deterministic simulator; swap in real LLM inference
    for production use.
    """
    # Strip non-printable control characters left behind by PDF/OCR extraction.
    cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    prompt = f"""
Extract the following details from the bank statement text:
- Transaction Date
- Description / Merchant
- Amount
- Debit / Credit
- Closing Balance
- Expense Type (if available)
Return the results in JSON format with keys:
["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
Example:
{{
    "transactions": [
        {{
            "date": "2025-06-01",
            "description": "Grocery Store",
            "amount": "150.00",
            "debit_credit": "Debit",
            "closing_balance": "1200.00",
            "expense_type": "Food"
        }}
    ]
}}
Bank Statement Text:
{cleaned_text}
"""
    # Simulate LLM response with deterministic parsing for demo.
    # Replace this with actual LLM inference in production.
    return simulate_llm_parsing(cleaned_text)
def simulate_llm_parsing(text):
    """Mock LLM response for demo purposes.

    Expects a line that is exactly 'Date' marking the end of the header,
    followed by fixed 7-line transaction records. Previously a missing
    'Date' line raised an unguarded ValueError from ``list.index``.

    Returns:
        dict with a "transactions" list of field dicts (may be empty).
    """
    lines = text.split('\n')
    try:
        start = lines.index('Date') + 1
    except ValueError:
        # No recognizable header: nothing we can parse deterministically.
        return {"transactions": []}

    data_lines = lines[start:]
    transactions = []
    # Each record spans 7 lines; index i+4 is intentionally skipped
    # (presumably a field we don't extract) -- TODO confirm the layout.
    for i in range(0, len(data_lines) - 6, 7):
        transactions.append({
            "date": data_lines[i].strip(),
            "description": data_lines[i + 1].strip(),
            "amount": data_lines[i + 2].strip(),
            "debit_credit": data_lines[i + 3].strip(),
            "closing_balance": data_lines[i + 5].strip(),
            "expense_type": data_lines[i + 6].strip(),
        })
    return {"transactions": transactions}
def process_file(file, is_scanned):
    """Route an uploaded file to the right extractor and return transactions.

    Args:
        file: Gradio file object with a ``.name`` path attribute, or None
            when the user submits without uploading (previously crashed
            with AttributeError on ``file.name``).
        is_scanned: Forwarded to the PDF extractor's OCR decision.

    Returns:
        A pandas DataFrame of parsed transactions, or an error string for
        unusable input (matching the existing error-string convention).
    """
    if file is None:
        return "No file uploaded. Please upload PDF or Excel."

    file_path = file.name
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.xlsx':
        text = extract_excel_data(file_path)
    elif file_ext == '.pdf':
        text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    else:
        return "Unsupported file format. Please upload PDF or Excel."

    parsed_data = parse_bank_statement(text)
    # Convert to DataFrame for display in the Gradio Dataframe output.
    return pd.DataFrame(parsed_data["transactions"])
# Gradio UI wiring: one function, a file upload plus a scanned-PDF toggle,
# rendered as a table of extracted transactions.
_inputs = [
    gr.File(label="Upload PDF/Excel"),
    gr.Checkbox(label="Is Scanned PDF?"),
]

interface = gr.Interface(
    fn=process_file,
    inputs=_inputs,
    outputs=gr.Dataframe(label="Extracted Transactions"),
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using hybrid parsing techniques.",
    allow_flagging="never",
)

if __name__ == "__main__":
    interface.launch()