# Hugging Face Space (page status when captured: Sleeping)
import json
import os

import gradio as gr
import pandas as pd
import pdfplumber
import pytesseract
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# The access token is read from the environment; on a Space it is configured
# as a secret named HF_TOKEN.
hf_token = os.environ.get("HF_TOKEN")

# Checkpoint served by the text-generation pipeline. The repository is gated,
# so every download call below is authenticated with the token.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, token=hf_token)
    # Shared generation pipeline used by the parsing code; replies are capped
    # at 500 newly generated tokens.
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500)
except Exception as load_error:
    # Any failure while loading is reported as one actionable error, with the
    # original exception kept as the cause.
    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from load_error
# Text extraction from PDF
def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Return the text of every page of a PDF, one page after another.

    Args:
        pdf_path: Path of the PDF file to read.
        is_scanned: When true, each page is rendered to an image and run
            through Tesseract OCR; otherwise the embedded text layer is read
            with pdfplumber.

    Returns:
        The text of all pages joined with newlines. Pages that yield no text
        contribute an empty string.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if is_scanned:
                # Render the page through pdfplumber itself, so no separate
                # PDF-to-image package has to be imported. 300 DPI is used
                # because pdfplumber's default render resolution is low for OCR.
                page_image = page.to_image(resolution=300).original
                page_texts.append(pytesseract.image_to_string(page_image))
            else:
                # extract_text() may yield nothing for a page without a text
                # layer; fall back to "" so the join below cannot fail.
                page_texts.append(page.extract_text() or "")
    # A newline between pages keeps the last line of one page from merging
    # with the first line of the next.
    return "\n".join(page_texts)
# Prompt engineering for structured extraction
def _extract_transactions(generated_text):
    """Return the first JSON object in *generated_text* that has a "transactions" list.

    The model may wrap its answer in prose or code fences, so every "{" is
    tried as the start of a JSON document until one decodes to a suitable dict.

    Raises:
        ValueError: If no such object is found.
    """
    decoder = json.JSONDecoder()
    start = generated_text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(generated_text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and isinstance(candidate.get("transactions"), list):
            return candidate
        start = generated_text.find("{", start + 1)
    raise ValueError("Model response did not contain a JSON object with a 'transactions' list.")


def parse_bank_statement(text):
    """Ask the language model to extract transactions from statement text.

    Args:
        text: Plain text of a bank statement.

    Returns:
        A dict with a "transactions" key holding a list of transaction
        records, which is the shape ``process_file`` indexes into.

    Raises:
        ValueError: If the model reply contains no usable JSON object.
    """
    # Nothing to extract: skip the model call rather than inviting it to
    # invent transactions from an empty statement.
    if not text or not text.strip():
        return {"transactions": []}

    prompt = f"""
Extract the following details from the bank statement text:
- Transaction Date
- Description / Merchant
- Amount
- Debit / Credit
- Closing Balance
- Expense Type (if available)
Return the results in JSON format with keys:
["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
Example:
{{
"transactions": [
{{
"date": "2025-06-01",
"description": "Grocery Store",
"amount": "150.00",
"debit_credit": "Debit",
"closing_balance": "1200.00",
"expense_type": "Food"
}}
]
}}
Bank Statement Text:
{text}
"""
    # return_full_text=False keeps the prompt (and its example JSON) out of
    # the reply, so the example transaction is never parsed as real data.
    response = pipe(prompt, return_full_text=False)[0]["generated_text"]
    return _extract_transactions(response)


# Main function
def process_file(file, is_scanned):
    """Turn an uploaded bank statement PDF into a table of transactions.

    Args:
        file: The upload handed over by Gradio, either a path string or an
            object exposing the path as ``.name``.
        is_scanned: Whether the PDF needs OCR instead of text-layer extraction.

    Returns:
        A pandas DataFrame with one row per extracted transaction.

    Raises:
        ValueError: If no file was uploaded or the model reply is unusable.
    """
    if file is None:
        raise ValueError("No file was uploaded.")
    # Accept both a plain path string and a file-like object with a .name path.
    file_path = file if isinstance(file, str) else file.name
    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    parsed_data = parse_bank_statement(text)
    return pd.DataFrame(parsed_data["transactions"])
# Web UI: a file upload plus an OCR toggle feed process_file, and the
# resulting table is shown as a dataframe.
interface = gr.Interface(
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using Mistral-7B.",
    fn=process_file,
    inputs=[gr.File(label="Upload PDF/Excel"), gr.Checkbox(label="Is Scanned PDF?")],
    outputs=gr.Dataframe(label="Extracted Transactions"),
)

# Start serving the app.
interface.launch()