Spaces:

Bhaskar2611
/

BankStatement_Parser

Sleeping

App Files Files Community

BankStatement_Parser / app.py

Bhaskar2611

Create app.py

f330df4 verified 2 months ago

raw

history blame

2.85 kB

	import os
	import gradio as gr
	import pdfplumber
	import pytesseract
	from PIL import Image
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
	import pandas as pd
	import torch

	# Access token for the gated model repo; read from the HF_TOKEN
	# environment variable (configured as a secret on the Space).
	hf_token = os.environ.get("HF_TOKEN")

	# Checkpoint that backs the text-generation pipeline.
	model_name = "mistralai/Mistral-7B-Instruct-v0.3"

	# Tokenizer, model and pipeline are loaded in one guarded step so that any
	# failure surfaces as a single RuntimeError chained to the original cause.
	try:
	    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
	    model = AutoModelForCausalLM.from_pretrained(
	        model_name,
	        token=hf_token,  # needed to download from the gated repo
	        torch_dtype=torch.float16,
	    )
	    pipe = pipeline(
	        task="text-generation",
	        model=model,
	        tokenizer=tokenizer,
	        max_new_tokens=500,
	    )
	except Exception as load_error:
	    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from load_error

	# Text extraction from PDF
	def extract_text_from_pdf(pdf_path, is_scanned=False):
	    """Return the text of every page in the PDF at ``pdf_path``.

	    Args:
	        pdf_path: Path of the PDF file to read.
	        is_scanned: If true, render the pages to images with pdf2image and
	            OCR each image with pytesseract; otherwise read the embedded
	            text layer with pdfplumber.

	    Returns:
	        The per-page texts joined with newlines ("" when no page yields
	        any text).
	    """
	    if is_scanned:
	        # The module's import block never brings convert_from_path into
	        # scope, so it is imported here, where it is actually needed.
	        from pdf2image import convert_from_path

	        page_texts = [
	            pytesseract.image_to_string(image)
	            for image in convert_from_path(pdf_path)
	        ]
	    else:
	        with pdfplumber.open(pdf_path) as pdf:
	            # extract_text() can yield None for a page without a text
	            # layer (depending on the pdfplumber version); treat it as "".
	            page_texts = [page.extract_text() or "" for page in pdf.pages]

	    # A newline between pages keeps the last line of one page from fusing
	    # with the first line of the next.
	    return "\n".join(page_texts)

	# Prompt engineering for structured extraction
	def parse_bank_statement(text):
	    """Ask the model to extract transactions from bank statement text.

	    Args:
	        text: Plain text of the bank statement.

	    Returns:
	        The decoded JSON object produced by the model: a dict whose
	        "transactions" entry is a list of transaction records.

	    Raises:
	        ValueError: If the model output contains no JSON object with a
	            "transactions" list.
	    """
	    import json

	    prompt = f"""
	Extract the following details from the bank statement text:
	- Transaction Date
	- Description / Merchant
	- Amount
	- Debit / Credit
	- Closing Balance
	- Expense Type (if available)
	Return the results in JSON format with keys:
	["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
	Example:
	{{
	"transactions": [
	{{
	"date": "2025-06-01",
	"description": "Grocery Store",
	"amount": "150.00",
	"debit_credit": "Debit",
	"closing_balance": "1200.00",
	"expense_type": "Food"
	}}
	]
	}}
	Bank Statement Text:
	{text}
	"""
	    # return_full_text=False asks the pipeline for the continuation only;
	    # otherwise the prompt (including its example JSON) is echoed back and
	    # would be mistaken for the extraction result.
	    response = pipe(prompt, return_full_text=False)[0]["generated_text"]

	    # The model may wrap the JSON in prose or code fences, so try every
	    # "{" as a possible start and keep the first object that decodes and
	    # has the expected shape.
	    decoder = json.JSONDecoder()
	    start = response.find("{")
	    while start != -1:
	        try:
	            candidate, _ = decoder.raw_decode(response, start)
	        except json.JSONDecodeError:
	            candidate = None
	        if isinstance(candidate, dict) and isinstance(candidate.get("transactions"), list):
	            return candidate
	        start = response.find("{", start + 1)

	    raise ValueError("Model response did not contain a JSON object with a 'transactions' list.")

	# Main function
	def process_file(file, is_scanned):
	    """Turn an uploaded bank statement PDF into a table of transactions.

	    Args:
	        file: The upload handed over by the UI: either a path string or an
	            object exposing the path as ``.name``. ``None`` means nothing
	            was uploaded.
	        is_scanned: Whether the PDF consists of scanned images and needs OCR.

	    Returns:
	        A pandas DataFrame with one row per extracted transaction, or an
	        empty DataFrame with the expected columns when no file was given.
	    """
	    if file is None:
	        # Nothing uploaded: return an empty table instead of failing on
	        # ``None.name``.
	        return pd.DataFrame(
	            columns=[
	                "date",
	                "description",
	                "amount",
	                "debit_credit",
	                "closing_balance",
	                "expense_type",
	            ]
	        )

	    # Accept both a plain path string and a file-like wrapper with ``.name``.
	    file_path = getattr(file, "name", file)
	    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
	    parsed_data = parse_bank_statement(text)
	    return pd.DataFrame(parsed_data["transactions"])

	# Gradio interface
	# Only PDFs are offered for upload: the processing function opens the file
	# as a PDF, so other formats (e.g. Excel) cannot be handled.
	interface = gr.Interface(
	    fn=process_file,
	    inputs=[
	        gr.File(label="Upload PDF", file_types=[".pdf"]),
	        gr.Checkbox(label="Is Scanned PDF?"),
	    ],
	    outputs=gr.Dataframe(label="Extracted Transactions"),
	    title="Bank Statement Parser",
	    description="Convert PDF bank statements into structured data using Mistral-7B.",
	)

	interface.launch()