File size: 2,852 Bytes
f330df4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import json
import os

import gradio as gr
import pandas as pd
import pdfplumber
import pytesseract
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Access token for the gated model repository; read from the HF_TOKEN
# environment variable (set it in the Space secrets).
hf_token = os.getenv("HF_TOKEN")

# Checkpoint used for extraction: Mistral-7B-Instruct.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Build the tokenizer, the model and the generation pipeline in one guarded
# step so that any failure surfaces as a single, actionable error.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,  # required because the repository is gated
        torch_dtype=torch.float16,  # load the weights in half precision
    )
    pipe = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=500,
    )
except Exception as load_error:
    raise RuntimeError(
        "Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN."
    ) from load_error

# Text extraction from PDF
def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Return the text of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file to read.
        is_scanned: When true, each page is rendered to an image and passed
            through Tesseract OCR; otherwise the page's embedded text layer
            is read with pdfplumber.

    Returns:
        The text of all pages, in page order, joined with newlines. A page
        that yields no text contributes an empty string.
    """
    # Rendering resolution (DPI) for the OCR path.
    # NOTE(review): 300 is a common OCR setting, not a measured optimum.
    ocr_resolution = 300

    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if is_scanned:
                # Render through pdfplumber (already a dependency) instead of
                # the never-imported pdf2image helper; `.original` is the
                # plain PIL image of the page.
                page_image = page.to_image(resolution=ocr_resolution).original
                page_texts.append(pytesseract.image_to_string(page_image))
            else:
                # extract_text() may return None for a page without a text
                # layer; treat that as empty instead of failing.
                page_texts.append(page.extract_text() or "")

    # Join with a newline so the last line of one page is not fused with the
    # first line of the next.
    return "\n".join(page_texts)

# Prompt engineering for structured extraction
def parse_bank_statement(text):
    """Ask the language model to extract transactions from statement text.

    Args:
        text: Raw text of a bank statement.

    Returns:
        The model's generated answer as a string. The prompt requests a JSON
        object with a "transactions" list, but the output is not validated
        here, so callers must parse it defensively.
    """
    prompt = f"""
    Extract the following details from the bank statement text:
    - Transaction Date
    - Description / Merchant
    - Amount
    - Debit / Credit
    - Closing Balance
    - Expense Type (if available)
    Return the results in JSON format with keys: 
    ["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
    Example:
    {{
      "transactions": [
        {{
          "date": "2025-06-01",
          "description": "Grocery Store",
          "amount": "150.00",
          "debit_credit": "Debit",
          "closing_balance": "1200.00",
          "expense_type": "Food"
        }}
      ]
    }}
    Bank Statement Text:
    {text}
    """
    # By default the text-generation pipeline returns the prompt followed by
    # the completion; request only the completion so the instructions and the
    # example JSON above are not part of the returned answer.
    outputs = pipe(prompt, return_full_text=False)
    return outputs[0]["generated_text"]

# Column order of the resulting table; mirrors the keys requested in the prompt.
_TRANSACTION_COLUMNS = [
    "date",
    "description",
    "amount",
    "debit_credit",
    "closing_balance",
    "expense_type",
]


def _extract_transactions(response, statement_text):
    """Return the "transactions" records found in raw model output, or []."""
    # If the prompt was echoed back, only look at what follows the statement
    # text so the example JSON inside the prompt is never taken as a result.
    if statement_text and statement_text in response:
        response = response.rpartition(statement_text)[2]

    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            records = candidate.get("transactions")
            if isinstance(records, list):
                # Keep only well-formed records (JSON objects).
                return [record for record in records if isinstance(record, dict)]
        # Not a usable object at this brace; try the next one.
        start = response.find("{", start + 1)
    return []


# Main function
def process_file(file, is_scanned):
    """Turn an uploaded bank-statement PDF into a table of transactions.

    Only PDF input is handled here: the file is always passed to
    `extract_text_from_pdf`.

    Args:
        file: The upload as handed over by Gradio, either a path string or
            an object exposing the path as `.name`.
        is_scanned: Whether the PDF needs OCR; forwarded to
            `extract_text_from_pdf`.

    Returns:
        A pandas DataFrame with one row per extracted transaction and the
        columns listed in `_TRANSACTION_COLUMNS`. The frame is empty when no
        text could be extracted or the model output held no usable JSON.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No file was uploaded.")

    # Accept both a plain path and a file-like wrapper with a `.name` path.
    file_path = file if isinstance(file, str) else file.name

    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    if not text or not text.strip():
        # Nothing to analyse; skip the (expensive) model call.
        return pd.DataFrame(columns=_TRANSACTION_COLUMNS)

    response = parse_bank_statement(text)
    transactions = _extract_transactions(response, text)
    return pd.DataFrame(transactions, columns=_TRANSACTION_COLUMNS)

# Gradio interface
# Two inputs (the uploaded file and the "scanned" flag) are passed to
# process_file; the DataFrame it returns is rendered as a table.
interface = gr.Interface(
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using Mistral-7B.",
    fn=process_file,
    inputs=[gr.File(label="Upload PDF/Excel"), gr.Checkbox(label="Is Scanned PDF?")],
    outputs=gr.Dataframe(label="Extracted Transactions"),
)

# Start the web app.
interface.launch()