Spaces:

Bhaskar2611
/

BankStatement_Parser

Sleeping

App Files Files Community

Bhaskar2611 committed on Jun 8

Commit

f330df4

verified ·

1 Parent(s): c056cf5

Create app.py

Browse files

Files changed (1) hide show

app.py +91 -0

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import os
+import gradio as gr
+import pdfplumber
+import pytesseract
+from PIL import Image
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import pandas as pd
+import torch
+# Load Hugging Face token from environment
+hf_token = os.getenv("HF_TOKEN")  # Set this in Space Secrets [[2]]
+# Load Mistral-7B-Instruct with authentication
+model_name = "mistralai/Mistral-7B-Instruct-v0.3"
+try:
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        token=hf_token  # Pass token for gated repo access [[6]]
+    )
+    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=500)
+except Exception as e:
+    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from e
+# Text extraction from PDF
+def extract_text_from_pdf(pdf_path, is_scanned=False):
+    text = ""
+    if is_scanned:
+        images = convert_from_path(pdf_path)  # Requires pdf2image
+        for image in images:
+            text += pytesseract.image_to_string(image)
+    else:
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text()
+    return text
+# Prompt engineering for structured extraction
+def parse_bank_statement(text):
+    prompt = f"""
+    Extract the following details from the bank statement text:
+    - Transaction Date
+    - Description / Merchant
+    - Amount
+    - Debit / Credit
+    - Closing Balance
+    - Expense Type (if available)
+    Return the results in JSON format with keys:
+    ["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
+    Example:
+    {{
+      "transactions": [
+        {{
+          "date": "2025-06-01",
+          "description": "Grocery Store",
+          "amount": "150.00",
+          "debit_credit": "Debit",
+          "closing_balance": "1200.00",
+          "expense_type": "Food"
+        }}
+      ]
+    }}
+    Bank Statement Text:
+    {text}
+    """
+    response = pipe(prompt)[0]["generated_text"]
+    return response  # In production, parse JSON programmatically
+# Main function
+def process_file(file, is_scanned):
+    file_path = file.name
+    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
+    parsed_data = parse_bank_statement(text)
+    df = pd.DataFrame(parsed_data["transactions"])
+    return df
+# Gradio interface
+interface = gr.Interface(
+    fn=process_file,
+    inputs=[
+        gr.File(label="Upload PDF/Excel"),
+        gr.Checkbox(label="Is Scanned PDF?")
+    ],
+    outputs=gr.Dataframe(label="Extracted Transactions"),
+    title="Bank Statement Parser",
+    description="Convert PDF/Excel bank statements into structured data using Mistral-7B."
+)
+interface.launch()