import os import gradio as gr import pdfplumber import pytesseract from PIL import Image from pdf2image import convert_from_path import pandas as pd import numpy as np import re # For Excel files def extract_excel_data(file_path): df = pd.read_excel(file_path, engine='openpyxl') return df.to_string() # For PDF files with fallback OCR def extract_text_from_pdf(pdf_path, is_scanned=False): try: # First try native PDF extraction with pdfplumber.open(pdf_path) as pdf: text = "" for page in pdf.pages: text += page.extract_text() + "\n" return text except Exception as e: # Fallback to OCR if PDF is invalid print(f"Native PDF extraction failed: {str(e)}") print("Trying OCR fallback...") images = convert_from_path(pdf_path, dpi=200) text = "" for image in images: text += pytesseract.image_to_string(image) + "\n" return text # Prompt engineering for structured extraction def parse_bank_statement(text): # Clean up text from PDF/OCR artifacts cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) prompt = f""" Extract the following details from the bank statement text: - Transaction Date - Description / Merchant - Amount - Debit / Credit - Closing Balance - Expense Type (if available) Return the results in JSON format with keys: ["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"]. Example: {{ "transactions": [ {{ "date": "2025-06-01", "description": "Grocery Store", "amount": "150.00", "debit_credit": "Debit", "closing_balance": "1200.00", "expense_type": "Food" }} ] }} Bank Statement Text: {cleaned_text} """ # Simulate LLM response with deterministic parsing for demo # Replace this with actual LLM inference in production return simulate_llm_parsing(cleaned_text) def simulate_llm_parsing(text): """Mock LLM response for demo purposes""" # Simple regex-based parsing for demonstration transactions = [] lines = text.split('\n') # Skip header lines data_lines = lines[lines.index('Date') + 1:] for i in range(0, len(data_lines), 7): # Process in chunks of 7 if i+6 >= len(data_lines): break try: transactions.append({ "date": data_lines[i].strip(), "description": data_lines[i+1].strip(), "amount": data_lines[i+2].strip(), "debit_credit": data_lines[i+3].strip(), "closing_balance": data_lines[i+5].strip(), "expense_type": data_lines[i+6].strip() }) except Exception as e: print(f"Error parsing line {i}: {str(e)}") continue return {"transactions": transactions} # Main function def process_file(file, is_scanned): file_path = file.name file_ext = os.path.splitext(file_path)[1].lower() if file_ext == '.xlsx': text = extract_excel_data(file_path) elif file_ext == '.pdf': text = extract_text_from_pdf(file_path, is_scanned=is_scanned) else: return "Unsupported file format. Please upload PDF or Excel." parsed_data = parse_bank_statement(text) # Convert to DataFrame for display df = pd.DataFrame(parsed_data["transactions"]) return df # Gradio interface interface = gr.Interface( fn=process_file, inputs=[ gr.File(label="Upload PDF/Excel"), gr.Checkbox(label="Is Scanned PDF?") ], outputs=gr.Dataframe(label="Extracted Transactions"), title="Bank Statement Parser", description="Convert PDF/Excel bank statements into structured data using hybrid parsing techniques.", allow_flagging="never" ) if __name__ == "__main__": interface.launch()