import os
import gradio as gr
import pdfplumber
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import pandas as pd
import numpy as np
import re

# For Excel files
def extract_excel_data(file_path):
    df = pd.read_excel(file_path, engine='openpyxl')
    return df.to_string()

# For PDF files with fallback OCR
def extract_text_from_pdf(pdf_path, is_scanned=False):
    if is_scanned:
        # Known scanned PDF: skip native extraction and go straight to OCR
        images = convert_from_path(pdf_path, dpi=200)
        return "\n".join(pytesseract.image_to_string(image) for image in images)
    try:
        # First try native PDF extraction
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            for page in pdf.pages:
                # extract_text() returns None for image-only pages
                text += (page.extract_text() or "") + "\n"
            return text
    except Exception as e:
        # Fall back to OCR if native extraction fails (e.g. invalid/corrupt PDF)
        print(f"Native PDF extraction failed: {str(e)}")
        print("Trying OCR fallback...")
        images = convert_from_path(pdf_path, dpi=200)
        text = ""
        for image in images:
            text += pytesseract.image_to_string(image) + "\n"
        return text

# Prompt engineering for structured extraction
def parse_bank_statement(text):
    # Clean up control characters left over from PDF/OCR extraction
    cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
    prompt = f"""
    Extract the following details from the bank statement text:
    - Transaction Date
    - Description / Merchant
    - Amount
    - Debit / Credit
    - Closing Balance
    - Expense Type (if available)
    Return the results in JSON format with keys:
    ["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
    Example:
    {{
        "transactions": [
            {{
                "date": "2025-06-01",
                "description": "Grocery Store",
                "amount": "150.00",
                "debit_credit": "Debit",
                "closing_balance": "1200.00",
                "expense_type": "Food"
            }}
        ]
    }}
    Bank Statement Text:
    {cleaned_text}
    """
    # Simulate the LLM response with deterministic parsing for this demo.
    # Replace this with actual LLM inference in production; `prompt` above is
    # what that call would receive (see the sketch below).
    return simulate_llm_parsing(cleaned_text)
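
# Illustrative sketch only (not part of the original app): one way to do the
# "actual LLM inference in production" mentioned above. Assumes the OpenAI
# Python SDK (openai>=1.0) is installed and OPENAI_API_KEY is set; the model
# name is a placeholder, and any chat model that can return JSON would work.
def llm_parse_bank_statement(prompt):
    import json
    from openai import OpenAI  # imported lazily so the demo runs without the package

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model name
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},  # request a JSON object reply
    )
    return json.loads(response.choices[0].message.content)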

def simulate_llm_parsing(text):
    """Mock LLM response for demo purposes."""
    # Simple position-based parsing for demonstration
    transactions = []
    lines = text.split('\n')
    # Skip everything up to and including the header line; if no line starting
    # with "Date" is found, parse from the top of the document
    header_idx = next((i for i, line in enumerate(lines) if line.strip().startswith('Date')), -1)
    data_lines = lines[header_idx + 1:]
    for i in range(0, len(data_lines), 7):  # Each transaction spans 7 lines in this mock layout
        if i + 6 >= len(data_lines):
            break
        try:
            transactions.append({
                "date": data_lines[i].strip(),
                "description": data_lines[i + 1].strip(),
                "amount": data_lines[i + 2].strip(),
                "debit_credit": data_lines[i + 3].strip(),
                # data_lines[i + 4] is intentionally skipped (unused field in the mock layout)
                "closing_balance": data_lines[i + 5].strip(),
                "expense_type": data_lines[i + 6].strip()
            })
        except Exception as e:
            print(f"Error parsing chunk starting at line {i}: {str(e)}")
            continue
    return {"transactions": transactions}

# Main function
def process_file(file, is_scanned):
    file_path = file.name
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == '.xlsx':
        text = extract_excel_data(file_path)
    elif file_ext == '.pdf':
        text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    else:
        return "Unsupported file format. Please upload PDF or Excel."
    parsed_data = parse_bank_statement(text)
    # Convert to DataFrame for display
    df = pd.DataFrame(parsed_data["transactions"])
    return df

# Gradio interface
interface = gr.Interface(
    fn=process_file,
    inputs=[
        gr.File(label="Upload PDF/Excel"),
        gr.Checkbox(label="Is Scanned PDF?")
    ],
    outputs=gr.Dataframe(label="Extracted Transactions"),
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using hybrid parsing techniques.",
    allow_flagging="never"
)

if __name__ == "__main__":
    interface.launch()