# Import necessary libraries
import streamlit as st
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import PyPDF2
from docx import Document
import pandas as pd
import os

# Fail fast when faiss cannot be imported: show an install hint in the UI and
# halt this script run instead of failing later.
# NOTE(review): presumably faiss is needed by the RAG retriever loaded in
# generate_financial_statements -- confirm. The imported name itself is not
# used anywhere else in this file.
try:
    import faiss
except ImportError:
    st.error("The `faiss` library is required but not installed. Please install it using `pip install faiss-cpu` (or `faiss-gpu` for GPU support).")
    st.stop()

# Page header shown above the uploader
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")

# File upload widget, restricted to .pdf/.docx; the main logic below only runs
# once this is not None (i.e. a file has been provided)
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])

# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (here, the Streamlit upload).

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages. Pages that yield no text contribute an
        empty string.
    """
    # PdfReader / .pages replace PdfFileReader / getNumPages / getPage, which
    # PyPDF2 3.x removed; they ship in the same releases that provide the
    # snake_case page.extract_text() this function already relied on.
    reader = PyPDF2.PdfReader(file)
    # `or ""` guards against a page producing no text (e.g. image-only pages)
    # so the join never sees a non-string value.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Function to extract text from DOCX
def extract_text_from_docx(file):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file: A path or file-like object accepted by ``docx.Document``.

    Returns:
        Every paragraph's text in document order, each followed by a
        newline (so a non-empty result ends with a trailing newline).
    """
    docx_document = Document(file)
    return "".join(f"{para.text}\n" for para in docx_document.paragraphs)

# Function to process extracted text using RAG model
def generate_financial_statements(text):
    """Generate financial-statement text from *text* with a pretrained RAG model.

    Args:
        text: Raw document text. It is tokenized with truncation to at most
            512 tokens, so anything beyond that is not seen by the model.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed, or ``""`` when *text* is empty or whitespace-only.
    """
    # Nothing to generate from (e.g. a scanned PDF with no text layer):
    # return early instead of paying for the model load and generation.
    if not text or not text.strip():
        return ""

    # NOTE(review): tokenizer, retriever and model are re-created on every
    # call; consider caching them (e.g. Streamlit's resource cache) if the
    # installed Streamlit version supports it.
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-base",
        index_name="exact",
        # SECURITY NOTE(review): this allows code fetched from the remote
        # repository to execute locally -- confirm the source is trusted.
        trust_remote_code=True,
    )
    model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-base", retriever=retriever)

    # Tokenize the input, truncating to the 512-token limit
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Generate up to 1000 tokens of output
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)

    # Only the first returned sequence is decoded and returned
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Function to parse generated text into financial statements
def parse_financial_statements(generated_text):
    """Bucket the lines of *generated_text* into financial statements.

    Placeholder keyword routing: the text is split on ``"\\n"`` and each line
    is appended, unmodified, to the first statement whose marker it contains.
    Matching is case-sensitive; lines with no marker are dropped.

    Args:
        generated_text: Text produced by the generation step.

    Returns:
        A dict mapping each statement name to its list of matching lines.
        "Journal General" is always present but no rule currently fills it.
    """
    statements = {
        name: []
        for name in (
            "Ledger",
            "Journal General",
            "Income Statement",
            "Balance Sheet",
            "Cash Flow Statement",
        )
    }

    # Ordered (markers, destination) rules; the first rule that matches wins,
    # so a line carrying several markers lands in the earliest statement.
    routing_rules = (
        (("Transaction:",), "Ledger"),
        (("Revenue:", "Expense:"), "Income Statement"),
        (("Asset:", "Liability:", "Equity:"), "Balance Sheet"),
        (("Cash Inflow:", "Cash Outflow:"), "Cash Flow Statement"),
    )

    for entry in generated_text.split("\n"):
        for markers, destination in routing_rules:
            if any(marker in entry for marker in markers):
                statements[destination].append(entry)
                break

    return statements

# Main logic
if uploaded_file is not None:
    # Pick the extractor matching the MIME type reported for the upload
    extractors_by_mime = {
        "application/pdf": extract_text_from_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }
    extractor = extractors_by_mime.get(uploaded_file.type)
    if extractor is None:
        # Unknown type: report it and halt this script run
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()
    text = extractor(uploaded_file)

    # Show the raw extracted text
    st.subheader("Extracted Text")
    st.write(text)

    # Run generation, then bucket the output into statements
    st.subheader("Generated Financial Statements")
    generated_text = generate_financial_statements(text)
    statements = parse_financial_statements(generated_text)

    # Render every statement, with a fallback message for empty ones
    for title, rows in statements.items():
        st.write(f"### {title}")
        st.write(rows if rows else "No data available for this statement.")

    # Offer a single-column CSV download for each non-empty statement
    for title, rows in statements.items():
        if not rows:
            continue
        csv_payload = pd.DataFrame(rows, columns=[title]).to_csv(index=False)
        st.download_button(
            label=f"Download {title} as CSV",
            data=csv_payload,
            file_name=f"{title.lower().replace(' ', '_')}.csv",
            mime="text/csv",
        )

# Dependencies
# Static help text shown in the sidebar: required packages, then how to run
dependency_notes = """
- Streamlit
- Hugging Face Transformers
- PyPDF2
- python-docx
- pandas
- faiss-cpu (or faiss-gpu)
- datasets
"""

deployment_notes = """
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
"""

# Render each section as a subheader followed by its body, in order
sidebar = st.sidebar
for heading, body in (
    ("Dependencies", dependency_notes),
    ("Deployment Instructions", deployment_notes),
):
    sidebar.subheader(heading)
    sidebar.write(body)