Spaces:
Sleeping
Sleeping
| import os | |
| import gradio as gr | |
| import pdfplumber | |
| import pytesseract | |
| from PIL import Image | |
| from pdf2image import convert_from_path | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
def extract_excel_data(file_path):
    """Load an .xlsx workbook and return its contents rendered as plain text.

    The text form feeds the same downstream parser used for PDF input.
    """
    workbook = pd.read_excel(file_path, engine='openpyxl')
    return workbook.to_string()
def _ocr_pdf(pdf_path, dpi=200):
    """Rasterize every page of the PDF and OCR each image with Tesseract."""
    images = convert_from_path(pdf_path, dpi=dpi)
    return "".join(pytesseract.image_to_string(image) + "\n" for image in images)


def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Extract text from a PDF, falling back to OCR when needed.

    Args:
        pdf_path: Path to the PDF file.
        is_scanned: When True, skip native extraction and go straight to OCR
            (previously this flag was accepted but ignored).

    Returns:
        The extracted text, one trailing newline per page.
    """
    if is_scanned:
        # A scanned PDF has no text layer; native extraction is wasted work.
        return _ocr_pdf(pdf_path)
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            for page in pdf.pages:
                # extract_text() may return None for pages without a text
                # layer; the original code crashed on `None + "\n"` here.
                text += (page.extract_text() or "") + "\n"
        if text.strip():
            return text
        # The file opened fine but carries no text layer at all.
        print("No text layer found. Trying OCR fallback...")
    except Exception as e:
        # Fallback to OCR if the PDF cannot be parsed natively
        print(f"Native PDF extraction failed: {str(e)}")
        print("Trying OCR fallback...")
    return _ocr_pdf(pdf_path)
def parse_bank_statement(text):
    """Turn raw statement text into structured transaction records.

    Assembles an LLM extraction prompt, but in this demo the actual parsing
    is delegated to a deterministic simulator; swap in real LLM inference
    for production use.
    """
    # Strip non-printable control characters left behind by PDF/OCR extraction.
    cleaned_text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)

    prompt = f"""
Extract the following details from the bank statement text:
- Transaction Date
- Description / Merchant
- Amount
- Debit / Credit
- Closing Balance
- Expense Type (if available)
Return the results in JSON format with keys:
["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
Example:
{{
    "transactions": [
        {{
            "date": "2025-06-01",
            "description": "Grocery Store",
            "amount": "150.00",
            "debit_credit": "Debit",
            "closing_balance": "1200.00",
            "expense_type": "Food"
        }}
    ]
}}
Bank Statement Text:
{cleaned_text}
"""
    # Simulate LLM response with deterministic parsing for demo.
    # Replace this with actual LLM inference in production.
    return simulate_llm_parsing(cleaned_text)
def simulate_llm_parsing(text):
    """Mock LLM response for demo purposes.

    Expects a line that is exactly 'Date' marking the end of the header,
    followed by fixed 7-line transaction records. Previously a missing
    'Date' line raised an unguarded ValueError from ``list.index``.

    Returns:
        dict with a "transactions" list of field dicts (may be empty).
    """
    lines = text.split('\n')
    try:
        start = lines.index('Date') + 1
    except ValueError:
        # No recognizable header: nothing we can parse deterministically.
        return {"transactions": []}

    data_lines = lines[start:]
    transactions = []
    # Each record spans 7 lines; index i+4 is intentionally skipped
    # (presumably a field we don't extract) -- TODO confirm the layout.
    for i in range(0, len(data_lines) - 6, 7):
        transactions.append({
            "date": data_lines[i].strip(),
            "description": data_lines[i + 1].strip(),
            "amount": data_lines[i + 2].strip(),
            "debit_credit": data_lines[i + 3].strip(),
            "closing_balance": data_lines[i + 5].strip(),
            "expense_type": data_lines[i + 6].strip(),
        })
    return {"transactions": transactions}
def process_file(file, is_scanned):
    """Route an uploaded file to the right extractor and return transactions.

    Args:
        file: Gradio file object with a ``.name`` path attribute, or None
            when the user submits without uploading (previously crashed
            with AttributeError on ``file.name``).
        is_scanned: Forwarded to the PDF extractor's OCR decision.

    Returns:
        A pandas DataFrame of parsed transactions, or an error string for
        unusable input (matching the existing error-string convention).
    """
    if file is None:
        return "No file uploaded. Please upload PDF or Excel."

    file_path = file.name
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.xlsx':
        text = extract_excel_data(file_path)
    elif file_ext == '.pdf':
        text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    else:
        return "Unsupported file format. Please upload PDF or Excel."

    parsed_data = parse_bank_statement(text)
    # Convert to DataFrame for display in the Gradio Dataframe output.
    return pd.DataFrame(parsed_data["transactions"])
# Gradio UI wiring: one function, a file upload plus a scanned-PDF toggle,
# rendered as a table of extracted transactions.
_inputs = [
    gr.File(label="Upload PDF/Excel"),
    gr.Checkbox(label="Is Scanned PDF?"),
]

interface = gr.Interface(
    fn=process_file,
    inputs=_inputs,
    outputs=gr.Dataframe(label="Extracted Transactions"),
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using hybrid parsing techniques.",
    allow_flagging="never",
)

if __name__ == "__main__":
    interface.launch()