Bhaskar2611 committed on
Commit
f330df4
·
verified ·
1 Parent(s): c056cf5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import pdfplumber
4
+ import pytesseract
5
+ from PIL import Image
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
7
+ import pandas as pd
8
+ import torch
9
+
10
# Hugging Face access token, read from the environment
# (set HF_TOKEN in the Space secrets).
hf_token = os.getenv("HF_TOKEN")

# Gated instruction-tuned model that performs the extraction.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Choose the device explicitly instead of relying on the pipeline default,
# which is the CPU in many transformers releases: first GPU when CUDA is
# visible, otherwise CPU (-1).
device = 0 if torch.cuda.is_available() else -1

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        token=hf_token,  # the repository is gated, so the token is required
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=500,
        device=device,
    )
except Exception as e:
    # Re-raise with an actionable hint; the original error stays attached
    # as the cause.
    raise RuntimeError("Failed to load model. Ensure you have access to the gated repository and a valid HF_TOKEN.") from e
26
+
27
# Text extraction from PDF
def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Return the text of a PDF, page by page, joined with newlines.

    Args:
        pdf_path: Path of the PDF file to read.
        is_scanned: When true, every page is rendered to an image and read
            with Tesseract OCR; otherwise the page's text is extracted
            directly with pdfplumber.

    Returns:
        The text of all pages as one string. A page that yields no text
        contributes an empty string.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if is_scanned:
                # Render through pdfplumber (already imported) rather than
                # ``convert_from_path``, which was never imported and raised
                # NameError. 300 DPI instead of pdfplumber's 72 DPI default,
                # which is presumably too coarse for reliable OCR.
                page_image = page.to_image(resolution=300).original
                page_texts.append(pytesseract.image_to_string(page_image))
            else:
                # extract_text() may return None for a page without text
                # (seen in older pdfplumber releases); treat that as empty.
                page_texts.append(page.extract_text() or "")
    # A newline between pages keeps the last line of one page from fusing
    # with the first line of the next.
    return "\n".join(page_texts)
39
+
40
# Prompt engineering for structured extraction
def parse_bank_statement(text):
    """Ask the model to extract the transactions found in statement text.

    Args:
        text: Plain text of the bank statement.

    Returns:
        The model's completion as a string, without the echoed prompt. It is
        expected (not guaranteed) to contain a JSON object with a
        "transactions" list.
    """
    prompt = f"""
Extract the following details from the bank statement text:
- Transaction Date
- Description / Merchant
- Amount
- Debit / Credit
- Closing Balance
- Expense Type (if available)
Return the results in JSON format with keys:
["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].
Example:
{{
"transactions": [
{{
"date": "2025-06-01",
"description": "Grocery Store",
"amount": "150.00",
"debit_credit": "Debit",
"closing_balance": "1200.00",
"expense_type": "Food"
}}
]
}}
Bank Statement Text:
{text}
"""
    # return_full_text=False drops the echoed prompt, so the example JSON
    # embedded in the prompt can never be mistaken for the model's answer.
    response = pipe(prompt, return_full_text=False)[0]["generated_text"]
    return response


def _extract_transactions(response):
    """Return the "transactions" list of the first matching JSON object.

    Scans *response* for a JSON object that has a "transactions" key holding
    a list, tolerating any prose the model writes around it.

    Raises:
        ValueError: If no such JSON object is present.
    """
    import json

    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response[start:])
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and isinstance(candidate.get("transactions"), list):
            return candidate["transactions"]
        start = response.find("{", start + 1)
    raise ValueError("Model response did not contain a JSON object with a 'transactions' list.")


# Main function
def process_file(file, is_scanned):
    """Turn an uploaded statement PDF into a table of transactions.

    Args:
        file: The upload as handed over by Gradio: either a path string or
            an object exposing the path as ``.name`` (which one depends on
            the Gradio version and File component settings).
        is_scanned: Whether the PDF needs OCR rather than text extraction.

    Returns:
        A pandas DataFrame with one row per extracted transaction.

    Raises:
        gr.Error: If no file was uploaded or the model output holds no
            usable transaction JSON.
    """
    if file is None:
        raise gr.Error("Please upload a PDF file.")
    file_path = file if isinstance(file, str) else file.name
    text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    parsed_data = parse_bank_statement(text)
    try:
        # parse_bank_statement returns raw text; indexing it directly with
        # ["transactions"] would raise TypeError.
        transactions = _extract_transactions(parsed_data)
    except ValueError as err:
        raise gr.Error(str(err)) from err
    return pd.DataFrame(transactions)
78
+
79
# Gradio interface
# Inputs are passed to process_file in this order: the uploaded file, then
# the "scanned" flag.
input_components = [
    gr.File(label="Upload PDF/Excel"),
    gr.Checkbox(label="Is Scanned PDF?"),
]
# The DataFrame returned by process_file is shown in this table.
output_component = gr.Dataframe(label="Extracted Transactions")

interface = gr.Interface(
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using Mistral-7B.",
    fn=process_file,
    inputs=input_components,
    outputs=output_component,
)

# Start the web app.
interface.launch()