# Streamlit app: upload a PDF or DOCX file, extract its text, run it through
# a Hugging Face RAG model, and bucket the generated lines into financial
# statements that can be downloaded as CSV files.

# Import necessary libraries
import os

import pandas as pd
import PyPDF2
import streamlit as st
from docx import Document
from transformers import RagRetriever, RagSequenceForGeneration, RagTokenizer

# Check if faiss is installed
try:
    import faiss
except ImportError:
    st.error("The `faiss` library is required but not installed. Please install it using `pip install faiss-cpu` (or `faiss-gpu` for GPU support).")
    st.stop()

RAG_MODEL_NAME = "facebook/rag-sequence-base"
PDF_MIME_TYPE = "application/pdf"
DOCX_MIME_TYPE = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)

# Set up Streamlit app
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")

# File upload
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])


def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: A binary file-like object (or path) accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages joined without a separator. Pages for which
        no text is extracted contribute an empty string.
    """
    # PdfFileReader / getNumPages / getPage were removed in PyPDF2 3.0;
    # PdfReader and .pages are the supported replacements.
    pdf_reader = PyPDF2.PdfReader(file)
    # Guard against a falsy result (e.g. image-only pages) so the join
    # never sees None.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def extract_text_from_docx(file):
    """Return the text of a DOCX document, one paragraph per line.

    Args:
        file: A path or binary file-like object accepted by
            ``docx.Document``.

    Returns:
        Every paragraph's text, each followed by a newline.
    """
    doc = Document(file)
    return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)


@st.cache_resource
def _load_rag_components():
    """Load the RAG tokenizer and model once and reuse them across reruns.

    Streamlit re-executes the whole script on every widget interaction
    (including clicks on the download buttons), so without caching the
    model would be rebuilt each time.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = RagTokenizer.from_pretrained(RAG_MODEL_NAME)
    # NOTE(review): trust_remote_code=True permits execution of code shipped
    # with the remote dataset/index, and index_name="exact" requests the
    # full (non-dummy) retrieval index. Confirm both are intended before
    # deploying this app.
    retriever = RagRetriever.from_pretrained(
        RAG_MODEL_NAME,
        index_name="exact",
        trust_remote_code=True,  # Allow execution of remote code
    )
    model = RagSequenceForGeneration.from_pretrained(
        RAG_MODEL_NAME, retriever=retriever
    )
    return tokenizer, model


def generate_financial_statements(text):
    """Generate text from the extracted document text using the RAG model.

    Args:
        text: The document text. It is truncated to 512 tokens before
            being passed to the model.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    tokenizer, model = _load_rag_components()

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Generate financial statements
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)

    # Decode generated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def parse_financial_statements(generated_text):
    """Bucket lines of generated text into financial statement categories.

    Placeholder logic for parsing generated text into structured data;
    customize it based on your specific requirements.

    Args:
        generated_text: Newline-separated text produced by the model.

    Returns:
        A dict mapping each statement name to the list of lines assigned to
        it. A line goes to the first category (in the order below) for
        which it contains one of the markers; lines with no marker are
        dropped. "Journal General" has no markers yet and stays empty.
    """
    statements = {
        "Ledger": [],
        "Journal General": [],
        "Income Statement": [],
        "Balance Sheet": [],
        "Cash Flow Statement": [],
    }

    # Example parsing logic (replace with actual logic). Order matters:
    # the first matching category wins.
    markers_by_statement = (
        ("Ledger", ("Transaction:",)),
        ("Income Statement", ("Revenue:", "Expense:")),
        ("Balance Sheet", ("Asset:", "Liability:", "Equity:")),
        ("Cash Flow Statement", ("Cash Inflow:", "Cash Outflow:")),
    )
    for line in generated_text.split("\n"):
        for statement_type, markers in markers_by_statement:
            if any(marker in line for marker in markers):
                statements[statement_type].append(line)
                break

    return statements


# Main logic
if uploaded_file is not None:
    # Extract text from uploaded file
    if uploaded_file.type == PDF_MIME_TYPE:
        text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == DOCX_MIME_TYPE:
        text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()

    # Display extracted text
    st.subheader("Extracted Text")
    st.write(text)

    # Generate financial statements
    st.subheader("Generated Financial Statements")
    if not text.strip():
        # Nothing to feed the model (e.g. a scanned, image-only PDF). Warn
        # instead of stopping so the sidebar below is still rendered.
        st.warning("No text could be extracted from the uploaded file.")
    else:
        generated_text = generate_financial_statements(text)
        statements = parse_financial_statements(generated_text)

        # Display financial statements
        for statement_type, data in statements.items():
            st.write(f"### {statement_type}")
            if data:
                st.write(data)
            else:
                st.write("No data available for this statement.")

        # Allow users to download statements as CSV
        for statement_type, data in statements.items():
            if data:
                df = pd.DataFrame(data, columns=[statement_type])
                csv = df.to_csv(index=False)
                st.download_button(
                    label=f"Download {statement_type} as CSV",
                    data=csv,
                    file_name=f"{statement_type.lower().replace(' ', '_')}.csv",
                    mime="text/csv",
                )

# Dependencies
st.sidebar.subheader("Dependencies")
st.sidebar.write("""
- Streamlit
- Hugging Face Transformers
- PyPDF2
- python-docx
- pandas
- faiss-cpu (or faiss-gpu)
- datasets
""")

# Deployment instructions
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write("""
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
""")