# Page header captured with this file: Spaces status "Sleeping";
# file size 2,852 bytes; revision f330df4. (Line-number gutter removed.)
import json
import os

import gradio as gr
import pandas as pd
import pdfplumber
import pytesseract
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Hugging Face access token, read from the environment (on a Space, set
# HF_TOKEN in the Space secrets). It is None when the variable is unset.
hf_token = os.getenv("HF_TOKEN")

# The model repository is gated, so the token is passed to both the tokenizer
# and the model download.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        token=hf_token,
    )
    # No device_map is given above, so the weights are loaded on the CPU.
    # Passing an explicit device lets the pipeline move the model to the first
    # CUDA device when one exists; -1 keeps it on the CPU as before.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=500,
        device=0 if torch.cuda.is_available() else -1,
    )
except Exception as e:
    # Module-level boundary: re-raise with a hint about the most likely cause
    # while keeping the original exception chained for diagnosis.
    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from e
# Text extraction from PDF
def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.
        is_scanned: When true, each page is rendered to an image and passed
            to ``pytesseract.image_to_string``; otherwise the page's embedded
            text is read with ``page.extract_text()``.

    Returns:
        The per-page texts joined with newlines, in page order. A page that
        yields no text contributes an empty string.
    """
    # Rendering resolution for OCR, in DPI.
    # NOTE(review): 300 is a commonly recommended value for Tesseract; tune it
    # if OCR quality or speed is a problem.
    ocr_resolution = 300

    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if is_scanned:
                # The original called convert_from_path (pdf2image), which
                # the file never imports. pdfplumber can rasterize the page
                # itself; ``.original`` is the rendered PIL image.
                page_image = page.to_image(resolution=ocr_resolution).original
                page_texts.append(pytesseract.image_to_string(page_image))
            else:
                # extract_text() may return None for a page with no text
                # layer; treat that as an empty page instead of failing.
                page_texts.append(page.extract_text() or "")

    # A newline between pages keeps the last row of one page from fusing
    # with the first row of the next.
    return "\n".join(page_texts)
# Prompt engineering for structured extraction
def parse_bank_statement(text):
    """Ask the language model to extract transactions from statement text.

    Args:
        text: Plain text of a bank statement.

    Returns:
        The model's reply as a string. The prompt asks for a JSON object with
        a ``"transactions"`` list, but the reply is returned unparsed and
        unvalidated. When ``text`` is empty or whitespace-only, an empty
        string is returned without calling the model.
    """
    if not text or not text.strip():
        # Nothing to extract: the only data the model would see is the
        # worked example inside the prompt.
        return ""

    prompt = f"""
Extract the following details from the bank statement text:
- Transaction Date
- Description / Merchant
- Amount
- Debit / Credit
- Closing Balance
- Expense Type (if available)
Return the results in JSON format with keys:
["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
Example:
{{
"transactions": [
{{
"date": "2025-06-01",
"description": "Grocery Store",
"amount": "150.00",
"debit_credit": "Debit",
"closing_balance": "1200.00",
"expense_type": "Food"
}}
]
}}
Bank Statement Text:
{text}
"""
    # By default the text-generation pipeline returns the prompt followed by
    # the completion. Request only the completion so the example JSON above
    # is not part of the returned reply.
    # NOTE(review): the prompt is sent as plain text, not wrapped in the
    # model's instruction/chat template - confirm whether that is intended.
    response = pipe(prompt, return_full_text=False)[0]["generated_text"]
    return response
# Column order used for an empty result; matches the keys requested in the
# extraction prompt.
_TRANSACTION_COLUMNS = (
    "date",
    "description",
    "amount",
    "debit_credit",
    "closing_balance",
    "expense_type",
)


def extract_transactions(response, statement_text=""):
    """Pull the list of transaction dicts out of a model reply.

    Args:
        response: Either the raw reply string (possibly with prose around the
            JSON) or an already-decoded dict.
        statement_text: The statement text that was sent to the model. If the
            reply echoes it, everything up to and including the last echo is
            discarded so JSON from the echoed prompt is not mistaken for the
            answer.

    Returns:
        The dict entries of the first JSON object found whose
        ``"transactions"`` value is a list; an empty list when there is none.
    """

    def rows_of(candidate):
        # Accept only {"transactions": [...]}; drop non-dict entries.
        if not isinstance(candidate, dict):
            return None
        rows = candidate.get("transactions")
        if not isinstance(rows, list):
            return None
        return [row for row in rows if isinstance(row, dict)]

    if not isinstance(response, str):
        return rows_of(response) or []

    if statement_text and statement_text in response:
        response = response.rpartition(statement_text)[2]

    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        rows = rows_of(candidate)
        if rows is not None:
            return rows
        # Not the object we want; retry from the next opening brace.
        start = response.find("{", start + 1)
    return []


# Main function
def process_file(file, is_scanned):
    """Turn an uploaded bank-statement PDF into a table of transactions.

    Args:
        file: The uploaded file, given either as an object with a ``name``
            attribute holding its path or as the path string itself.
        is_scanned: Whether to OCR the pages instead of reading their text.

    Returns:
        A ``pandas.DataFrame`` with one row per extracted transaction. When
        nothing could be extracted, an empty frame with the expected columns.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No file was uploaded.")

    # Works for both an upload object (uses .name) and a plain path string.
    file_path = getattr(file, "name", file)

    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    parsed_data = parse_bank_statement(text)

    # parse_bank_statement returns the raw reply text, so indexing it with
    # "transactions" would fail; decode the JSON part first.
    transactions = extract_transactions(parsed_data, text)
    if not transactions:
        return pd.DataFrame(columns=list(_TRANSACTION_COLUMNS))
    return pd.DataFrame(transactions)
# Gradio interface: a file upload and a "scanned" checkbox as inputs, and a
# table of extracted transactions as output.
# NOTE(review): the labels mention Excel, but process_file only passes the
# upload to extract_text_from_pdf - confirm whether Excel is really supported.
interface = gr.Interface(
    fn=process_file,
    inputs=[
        gr.File(label="Upload PDF/Excel"),
        gr.Checkbox(label="Is Scanned PDF?"),
    ],
    outputs=gr.Dataframe(label="Extracted Transactions"),
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using Mistral-7B.",
)

# Start the web server only when this file is run as a script, so importing
# the module (for example from a test) does not launch it.
if __name__ == "__main__":
    interface.launch()