Spaces:

Bhaskar2611
/

BankStatement_Parser

Sleeping

App Files Files Community

Bhaskar2611 committed on Jun 8

Commit

aca59c0

verified ·

1 Parent(s): 15c9ede

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -9

app.py CHANGED Viewed

@@ -8,9 +8,9 @@ import pytesseract
 from pdf2image import convert_from_path
 from huggingface_hub import InferenceClient
-# Initialize Hugging Face Inference Client
 hf_token = os.getenv("HF_TOKEN")
-client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=hf_token)
 def extract_excel_data(file_path):
     """Extract text from Excel file"""
@@ -86,11 +86,23 @@ Rules:
 1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
 2. Convert negative balances to standard format (e.g., "-2421.72")
 3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
 """
     try:
         # Call LLM via Hugging Face Inference API
-        response = client.text_generation(prompt, max_new_tokens=1000, temperature=0.1)
         return json.loads(response)
     except Exception as e:
         print(f"LLM Error: {str(e)}")
@@ -104,7 +116,7 @@ def rule_based_parser(text):
     # Find header line containing '| Date'
     header_index = None
     for i, line in enumerate(lines):
-        if re.search(r'\|Date', line):  # Improved pattern to match "|Date"
             header_index = i
             break
@@ -115,7 +127,7 @@ def rule_based_parser(text):
     transactions = []
     for line in data_lines:
-        if not line.startswith('|'):
             continue
         parts = [p.strip() for p in line.split('|') if p.strip()]
@@ -123,13 +135,14 @@ def rule_based_parser(text):
             continue
         try:
             transactions.append({
                 "date": parts[0],
                 "description": parts[1],
-                "amount": parts[2],
-                "debit": parts[3],
-                "credit": parts[4],
-                "closing_balance": parts[5],
                 "category": parts[6]
             })
         except Exception as e:
@@ -137,6 +150,13 @@ def rule_based_parser(text):
     return {"transactions": transactions}
 def process_file(file, is_scanned):
     """Main processing function"""
     if not file:

 from pdf2image import convert_from_path
 from huggingface_hub import InferenceClient
+# Initialize Hugging Face Inference Client with a free model
 hf_token = os.getenv("HF_TOKEN")
+client = InferenceClient(model="HuggingFaceH4/zephyr-7b-beta", token=hf_token)
 def extract_excel_data(file_path):
     """Extract text from Excel file"""
 1. Ensure numeric fields have valid numbers (e.g., "0.00" instead of "-")
 2. Convert negative balances to standard format (e.g., "-2421.72")
 3. Map category names consistently (e.g., "Groceries", "Medical", "Utilities")
+4. Only return valid JSON with no additional text
 """
     try:
         # Call LLM via Hugging Face Inference API
+        response = client.text_generation(
+            prompt,
+            max_new_tokens=2000,
+            temperature=0.1,
+            stop_sequences=["</s>"]
+        )
+        print(f"LLM Response: {response}")
+        # Extract JSON from response (remove non-JSON prefixes/suffixes)
+        json_match = re.search(r'\{.*\}', response, re.DOTALL)
+        if json_match:
+            return json.loads(json_match.group())
         return json.loads(response)
     except Exception as e:
         print(f"LLM Error: {str(e)}")
     # Find header line containing '| Date'
     header_index = None
     for i, line in enumerate(lines):
+        if re.search(r'\|Date|Date\|', line, re.IGNORECASE):
             header_index = i
             break
     transactions = []
     for line in data_lines:
+        if not '|' in line:
             continue
         parts = [p.strip() for p in line.split('|') if p.strip()]
             continue
         try:
+            # Handle numeric values consistently
             transactions.append({
                 "date": parts[0],
                 "description": parts[1],
+                "amount": format_number(parts[2]),
+                "debit": format_number(parts[3]),
+                "credit": format_number(parts[4]),
+                "closing_balance": format_number(parts[5]),
                 "category": parts[6]
             })
         except Exception as e:
     return {"transactions": transactions}
def format_number(value):
    """Format a numeric string as a plain two-decimal number.

    Surrounding whitespace and thousands separators (",") are removed
    before the text is inspected. An empty cell or a lone "-" placeholder
    is reported as "0.00" so numeric fields always carry a valid number,
    which is the convention the LLM prompt in this file asks for
    ('"0.00" instead of "-"').

    Args:
        value: Raw cell text, e.g. "1,234.5", "-2421.72" or "-".

    Returns:
        The number rendered with exactly two decimals (e.g. "1234.50"),
        or the cleaned text unchanged when it is not a plain decimal
        number (e.g. "N/A").
    """
    cleaned = value.strip().replace(',', '')

    # Empty cells and dash placeholders mean "no amount" rather than text.
    if cleaned in ('', '-'):
        return "0.00"

    # fullmatch (not match + "$") so a trailing newline cannot slip through;
    # the pattern accepts an optional sign and forms like "12", "12.", ".5".
    if re.fullmatch(r'[+-]?(\d+(\.\d*)?|\.\d+)', cleaned):
        return f"{float(cleaned):.2f}"

    return cleaned
 def process_file(file, is_scanned):
     """Main processing function"""
     if not file: