Bhaskar2611's picture
Create app.py
f330df4 verified
raw
history blame
2.85 kB
import json
import os

import gradio as gr
import pandas as pd
import pdfplumber
import pytesseract
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Access token for the Hugging Face Hub, read from the environment
# (configure it as a secret of the Space).
hf_token = os.environ.get("HF_TOKEN")

# Checkpoint used for extraction; the token is forwarded because the
# repository is gated.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, token=hf_token, torch_dtype=torch.float16
    )
    # Shared text-generation pipeline used by the parsing step below.
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=500,
    )
except Exception as err:
    # Any load failure is re-raised with a hint about the usual cause;
    # the original exception stays attached as the cause.
    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from err
# Text extraction from PDF
def extract_text_from_pdf(pdf_path, is_scanned=False, ocr_resolution=300):
    """Return the text of every page of a PDF, one page per line group.

    Args:
        pdf_path: Path of the PDF file to read.
        is_scanned: When true, each page is rendered to an image and passed
            through Tesseract OCR; otherwise the embedded text layer is read.
        ocr_resolution: Rendering resolution (DPI) used for OCR. Only
            relevant when ``is_scanned`` is true.

    Returns:
        The page texts joined with newlines. Pages that yield no text
        contribute an empty string, so the result may be empty.
    """
    with pdfplumber.open(pdf_path) as pdf:
        if is_scanned:
            # Render through pdfplumber (already a dependency of this file)
            # rather than the never-imported ``convert_from_path``.
            # ``PageImage.original`` is the rendered PIL image.
            page_texts = [
                pytesseract.image_to_string(
                    page.to_image(resolution=ocr_resolution).original
                )
                for page in pdf.pages
            ]
        else:
            # extract_text() may return None for a page without a text
            # layer; normalise to "" so joining cannot fail.
            page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Separate pages explicitly so the last row of one page is not fused
    # with the first row of the next.
    return "\n".join(page_texts)
# Prompt engineering for structured extraction
def parse_bank_statement(text):
    """Ask the language model to extract transactions from statement text.

    Args:
        text: Plain text of a bank statement.

    Returns:
        The model's completion as a raw string. The prompt asks for a JSON
        object with a ``"transactions"`` list, but the output is not parsed
        or validated here; callers must do that themselves.
    """
    # The literal is kept flush-left on purpose: indenting it would inject
    # leading spaces into every line of the prompt sent to the model.
    prompt = f"""
Extract the following details from the bank statement text:
- Transaction Date
- Description / Merchant
- Amount
- Debit / Credit
- Closing Balance
- Expense Type (if available)
Return the results in JSON format with keys:
["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
Example:
{{
"transactions": [
{{
"date": "2025-06-01",
"description": "Grocery Store",
"amount": "150.00",
"debit_credit": "Debit",
"closing_balance": "1200.00",
"expense_type": "Food"
}}
]
}}
Bank Statement Text:
{text}
"""
    # Without return_full_text=False the pipeline prepends the prompt to the
    # generated text, so the result would start with the instructions and
    # the example JSON instead of the model's answer.
    outputs = pipe(prompt, return_full_text=False)
    return outputs[0]["generated_text"]
# Main function
def _extract_transactions(raw):
    """Return the first ``"transactions"`` list found in *raw*, or None."""
    # If the prompt was echoed back in front of the answer, skip past its
    # final section so the example JSON is not mistaken for a real result.
    tail = raw.rpartition("Bank Statement Text:")[2]
    decoder = json.JSONDecoder()
    start = tail.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(tail, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and isinstance(
            candidate.get("transactions"), list
        ):
            return candidate["transactions"]
        # Not a usable object at this brace; try the next one.
        start = tail.find("{", start + 1)
    return None


def process_file(file, is_scanned):
    """Turn an uploaded PDF bank statement into a table of transactions.

    Args:
        file: The upload handed over by Gradio: either a filesystem path
            or an object exposing the path as ``.name``. ``None`` when
            nothing was uploaded.
        is_scanned: Whether the PDF needs OCR instead of text-layer
            extraction.

    Returns:
        A ``pandas.DataFrame`` with one row per extracted transaction.

    Raises:
        gr.Error: If no file was uploaded, no text could be extracted, or
            the model output contains no JSON object with a
            ``"transactions"`` list.
    """
    if file is None:
        raise gr.Error("Please upload a PDF bank statement.")

    # Accept both a plain path and a file-like wrapper carrying `.name`.
    file_path = file if isinstance(file, (str, os.PathLike)) else file.name

    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    if not text.strip():
        # Prompting the model with an empty statement would only invite
        # made-up transactions, so stop here with an actionable message.
        raise gr.Error(
            "No text could be extracted from the PDF. "
            "If it is a scanned document, tick 'Is Scanned PDF?'."
        )

    parsed_data = parse_bank_statement(text)
    # The parser returns raw model text; an already-decoded mapping is
    # tolerated as well.
    if isinstance(parsed_data, dict):
        transactions = parsed_data.get("transactions")
    else:
        transactions = _extract_transactions(parsed_data)

    if transactions is None:
        raise gr.Error(
            "The model did not return a JSON object with a 'transactions' list."
        )
    return pd.DataFrame(transactions)
# Gradio interface
interface = gr.Interface(
    fn=process_file,
    inputs=[
        # Only PDF input is implemented by the extraction code, so the
        # picker is restricted to PDFs and no longer advertises Excel.
        gr.File(label="Upload PDF", file_types=[".pdf"]),
        gr.Checkbox(label="Is Scanned PDF?"),
    ],
    outputs=gr.Dataframe(label="Extracted Transactions"),
    title="Bank Statement Parser",
    description="Convert PDF bank statements into structured data using Mistral-7B.",
)
# Start the web UI when the module is executed.
interface.launch()