"""Gradio app that turns bank-statement PDFs into a table of transactions.

Text is pulled from the PDF (directly, or via OCR for scanned documents),
sent to Mistral-7B-Instruct with an extraction prompt, and the JSON found in
the model's answer is shown as a dataframe.
"""

import json
import os

import gradio as gr
import pandas as pd
import pdfplumber
import pytesseract
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hugging Face access token, read from the environment (set it in Space Secrets).
hf_token = os.getenv("HF_TOKEN")

# Gated checkpoint; loading it requires an authenticated token.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Keys requested from the model for every transaction; also used as the
# column order of the resulting dataframe.
TRANSACTION_KEYS = [
    "date",
    "description",
    "amount",
    "debit_credit",
    "closing_balance",
    "expense_type",
]

# Resolution (dots per inch) used when rasterizing pages for OCR.
OCR_RESOLUTION_DPI = 300

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        token=hf_token,  # Pass token for gated repo access
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=500,
    )
except Exception as e:
    # Top-level boundary: re-raise with an actionable message, keeping the cause.
    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from e


def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to read.
        is_scanned: When true, each page is rendered to an image and run
            through Tesseract OCR; otherwise the embedded text layer is used.

    Returns:
        The concatenated page texts. Pages that yield no text contribute an
        empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        if is_scanned:
            # Render through pdfplumber, which is already imported, instead of
            # calling the never-imported ``convert_from_path``.
            page_texts = [
                pytesseract.image_to_string(
                    page.to_image(resolution=OCR_RESOLUTION_DPI).original
                )
                for page in pdf.pages
            ]
        else:
            # extract_text() may return None for a page without a text layer.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Newline separator keeps the last line of one page from fusing with the
    # first line of the next.
    return "\n".join(page_texts)


def _extract_transactions_object(generated):
    """Find the first JSON object with a ``"transactions"`` list in *generated*.

    The model may wrap its JSON in extra prose, so every ``{`` is tried as a
    possible start of the object.

    Raises:
        ValueError: If no such object can be decoded.
    """
    decoder = json.JSONDecoder()
    start = generated.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(generated, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and isinstance(
            candidate.get("transactions"), list
        ):
            return candidate
        start = generated.find("{", start + 1)
    raise ValueError("The model response did not contain a valid transactions JSON object.")


def parse_bank_statement(text):
    """Ask the model to extract transactions from statement text.

    Args:
        text: Raw text of the bank statement.

    Returns:
        A dict with a ``"transactions"`` key holding a list of transaction
        records, decoded from the model's answer.

    Raises:
        ValueError: If the answer contains no decodable transactions object.
    """
    prompt = f"""
Extract the following details from the bank statement text:
- Transaction Date
- Description / Merchant
- Amount
- Debit / Credit
- Closing Balance
- Expense Type (if available)

Return the results in JSON format with keys: ["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].

Example:
{{
    "transactions": [
        {{
            "date": "2025-06-01",
            "description": "Grocery Store",
            "amount": "150.00",
            "debit_credit": "Debit",
            "closing_balance": "1200.00",
            "expense_type": "Food"
        }}
    ]
}}

Bank Statement Text:
{text}
"""
    # return_full_text=False: without it the prompt (and its example JSON)
    # would be part of the text searched for the answer.
    response = pipe(prompt, return_full_text=False)[0]["generated_text"]
    return _extract_transactions_object(response)


def process_file(file, is_scanned):
    """Gradio callback: convert an uploaded statement into a dataframe.

    Args:
        file: The uploaded file, either a path string or an object exposing
            the path as ``.name``.
        is_scanned: Whether the PDF needs OCR.

    Returns:
        A ``pandas.DataFrame`` with one row per extracted transaction and the
        columns listed in ``TRANSACTION_KEYS``.

    Raises:
        gr.Error: If no file was uploaded or the model answer is unusable.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file.")
    # NOTE(review): the UI label mentions Excel, but only PDFs are handled here.
    file_path = file if isinstance(file, str) else file.name
    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    try:
        parsed_data = parse_bank_statement(text)
    except ValueError as err:
        # Surface the problem in the UI instead of an opaque traceback.
        raise gr.Error(str(err)) from err
    return pd.DataFrame(parsed_data["transactions"], columns=TRANSACTION_KEYS)


# Gradio interface
interface = gr.Interface(
    fn=process_file,
    inputs=[
        gr.File(label="Upload PDF/Excel"),
        gr.Checkbox(label="Is Scanned PDF?"),
    ],
    outputs=gr.Dataframe(label="Extracted Transactions"),
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using Mistral-7B.",
)

interface.launch()