import os
import gradio as gr
import pdfplumber
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import pandas as pd
import numpy as np
import re

# Excel workbooks
def extract_excel_data(file_path):
    """Load an Excel workbook and return its contents as plain text.

    The file at ``file_path`` is read with pandas using the ``openpyxl``
    engine, and the resulting DataFrame is rendered with
    ``DataFrame.to_string``.
    """
    return pd.read_excel(file_path, engine='openpyxl').to_string()

# For PDF files with fallback OCR
def _ocr_pdf(pdf_path, dpi=200):
    """Render each PDF page to an image and return the OCR text of all pages."""
    images = convert_from_path(pdf_path, dpi=dpi)
    return "".join(pytesseract.image_to_string(image) + "\n" for image in images)


def extract_text_from_pdf(pdf_path, is_scanned=False):
    """Return the text of a PDF, one newline-terminated chunk per page.

    Args:
        pdf_path: Path of the PDF file to read.
        is_scanned: When true, skip native extraction and OCR the document
            directly.

    Returns:
        The extracted text. Native (pdfplumber) extraction is used when it
        yields any non-whitespace text; otherwise the pages are rendered
        and passed through Tesseract.

    Raises:
        Whatever the OCR path raises; it is the last resort and is not
        wrapped.
    """
    if is_scanned:
        return _ocr_pdf(pdf_path)

    try:
        # First try native PDF extraction
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may yield None for a page without a text layer;
            # treat that as an empty page rather than as a failure.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    except Exception as e:
        # Deliberate best-effort: any native failure falls back to OCR
        print(f"Native PDF extraction failed: {str(e)}")
        print("Trying OCR fallback...")
        return _ocr_pdf(pdf_path)

    text = "".join(page_text + "\n" for page_text in page_texts)
    if text.strip():
        return text

    # Nothing but whitespace came back: most likely an image-only PDF
    print("No embedded text found in PDF.")
    print("Trying OCR fallback...")
    return _ocr_pdf(pdf_path)

# Prompt engineering for structured extraction
def parse_bank_statement(text):
    """Convert raw statement text into a ``{"transactions": [...]}`` mapping.

    Control characters left behind by PDF/OCR extraction are stripped (tab,
    newline and carriage return are kept), an extraction prompt is built,
    and the cleaned text is handed to ``simulate_llm_parsing``.

    NOTE: the prompt is assembled but not sent anywhere yet; it is kept as
    the template for a future real LLM call.
    """
    control_chars = r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]'
    cleaned_text = re.sub(control_chars, '', text)

    prompt = f"""
    Extract the following details from the bank statement text:
    - Transaction Date
    - Description / Merchant
    - Amount
    - Debit / Credit
    - Closing Balance
    - Expense Type (if available)

    Return the results in JSON format with keys: 
    ["date", "description", "amount", "debit_credit", "closing_balance", "expense_type"].

    Example:
    {{
      "transactions": [
        {{
          "date": "2025-06-01",
          "description": "Grocery Store",
          "amount": "150.00",
          "debit_credit": "Debit",
          "closing_balance": "1200.00",
          "expense_type": "Food"
        }}
      ]
    }}

    Bank Statement Text:
    {cleaned_text}
    """

    # Stand-in for real LLM inference: deterministic parsing for the demo
    return simulate_llm_parsing(cleaned_text)

def simulate_llm_parsing(text):
    """Mock LLM response for demo purposes.

    Splits ``text`` into lines, skips everything up to and including the
    ``Date`` header line, and reads the remaining lines in groups of seven.
    Within each group the lines map, in order, to date, description, amount,
    debit/credit, (one unused line), closing balance and expense type.
    A trailing incomplete group is ignored.

    Args:
        text: Statement text with one field per line.

    Returns:
        ``{"transactions": [...]}`` where each transaction is a dict of
        stripped strings. The list is empty when no ``Date`` header line
        exists or no complete group follows it.
    """
    lines = text.split('\n')

    # Locate the header. The exact match is tried first so inputs that
    # already worked resolve to the same line; the whitespace-insensitive
    # pass covers 'Date\r' / 'Date ' as produced by OCR or CRLF text.
    try:
        header_idx = lines.index('Date')
    except ValueError:
        header_idx = next(
            (idx for idx, line in enumerate(lines) if line.strip() == 'Date'),
            None,
        )
        if header_idx is None:
            return {"transactions": []}

    data_lines = lines[header_idx + 1:]
    chunk_size = 7

    # Only start a group where all seven lines are present
    transactions = [
        {
            "date": data_lines[i].strip(),
            "description": data_lines[i + 1].strip(),
            "amount": data_lines[i + 2].strip(),
            "debit_credit": data_lines[i + 3].strip(),
            # data_lines[i + 4] is intentionally not mapped to any field
            "closing_balance": data_lines[i + 5].strip(),
            "expense_type": data_lines[i + 6].strip(),
        }
        for i in range(0, len(data_lines) - (chunk_size - 1), chunk_size)
    ]

    return {"transactions": transactions}

# Main function
def _error_frame(message):
    """Wrap a user-facing message in a one-cell DataFrame."""
    return pd.DataFrame({"error": [message]})


def process_file(file, is_scanned):
    """Extract transactions from an uploaded statement as a DataFrame.

    Args:
        file: The upload; either an object exposing the path as ``.name``
            or the path string itself. ``None`` means nothing was uploaded.
        is_scanned: Forwarded to ``extract_text_from_pdf`` for PDF input.

    Returns:
        A DataFrame built from the parsed ``"transactions"`` list. For a
        missing upload or an unsupported extension, a one-cell DataFrame
        with an ``error`` column holding the message, so the return value
        is always a DataFrame. A bare string is not returned because a
        Dataframe output component would read it as a CSV file path.
    """
    if file is None:
        return _error_frame("No file uploaded. Please upload PDF or Excel.")

    # Accept both file-like upload objects and plain path strings
    file_path = getattr(file, "name", file)
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.xlsx':
        text = extract_excel_data(file_path)
    elif file_ext == '.pdf':
        text = extract_text_from_pdf(file_path, is_scanned=is_scanned)
    else:
        return _error_frame("Unsupported file format. Please upload PDF or Excel.")

    parsed_data = parse_bank_statement(text)

    # Convert to DataFrame for display
    return pd.DataFrame(parsed_data["transactions"])

# Gradio UI: a file upload and a "scanned" checkbox feed process_file,
# whose result is shown in a table.
interface = gr.Interface(
    title="Bank Statement Parser",
    description="Convert PDF/Excel bank statements into structured data using hybrid parsing techniques.",
    fn=process_file,
    inputs=[
        gr.File(label="Upload PDF/Excel"),
        gr.Checkbox(label="Is Scanned PDF?"),
    ],
    outputs=gr.Dataframe(label="Extracted Transactions"),
    allow_flagging="never",
)

# Launch the app only when executed as a script, not when imported
if __name__ == "__main__":
    interface.launch()