# Streamlit app (exported from a Hugging Face Space; "Spaces: Running" status residue removed)
# Import necessary libraries
import streamlit as st
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import PyPDF2
from docx import Document
import pandas as pd
import os
# Check if faiss is installed -- RagRetriever needs it for its dense vector index.
# Fail fast with an actionable message instead of a traceback deep inside transformers.
try:
    import faiss
except ImportError:
    st.error("The `faiss` library is required but not installed. Please install it using `pip install faiss-cpu` (or `faiss-gpu` for GPU support).")
    st.stop()
# Set up Streamlit app: page title, short usage hint, and the upload widget.
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")
# File upload widget; returns None until the user provides a file,
# and only accepts the two extensions listed here.
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])
# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF.

    Parameters
    ----------
    file : file-like object
        The uploaded PDF (as given by ``st.file_uploader``).

    Returns
    -------
    str
        All extractable page text, concatenated in page order.

    Notes
    -----
    Uses the current PyPDF2 API (``PdfReader`` / ``reader.pages``).
    The original ``PdfFileReader`` / ``getNumPages`` / ``getPage`` calls were
    deprecated in PyPDF2 2.x and removed in 3.0, where they raise errors.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no extractable
        # text (e.g. scanned images); guard so the concatenation never crashes.
        text += page.extract_text() or ""
    return text
# Function to extract text from DOCX
def extract_text_from_docx(file):
    """Return all paragraph text of a DOCX file, one paragraph per line.

    Each paragraph is followed by a trailing newline, matching the
    line-oriented format expected by ``parse_financial_statements``.
    """
    document = Document(file)
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
# Cache the heavyweight RAG components so they are downloaded/loaded once per
# server process instead of on every uploaded file (the original reloaded the
# tokenizer, retriever, and multi-GB model on each call).
@st.cache_resource
def _load_rag_components():
    """Load and return ``(tokenizer, model)`` for facebook/rag-sequence-base."""
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-base",
        index_name="exact",
        # NOTE(review): trust_remote_code executes code shipped with the model
        # repo -- acceptable only because this is an official facebook/ repo.
        trust_remote_code=True
    )
    model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-base", retriever=retriever)
    return tokenizer, model

# Function to process extracted text using RAG model
def generate_financial_statements(text):
    """Run the RAG model over *text* and return the generated text.

    Parameters
    ----------
    text : str
        Raw text extracted from the uploaded document.

    Returns
    -------
    str
        The model's decoded output (special tokens stripped).
    """
    tokenizer, model = _load_rag_components()
    # Tokenize input text, truncated to the model's 512-token input limit.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    # Generate financial statements (output capped at 1000 tokens).
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)
    # Decode generated text back to a plain string.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text
# Function to parse generated text into financial statements
def parse_financial_statements(generated_text):
    """Bucket each line of *generated_text* into a financial-statement type.

    A line goes into the first category whose marker substring it contains;
    lines matching no marker are discarded. "Journal General" is kept as a
    placeholder bucket that the current marker set never populates.

    Returns a dict mapping statement name to a list of matching lines.
    """
    # Ordered marker table: earlier entries win when a line matches several,
    # mirroring the original if/elif precedence.
    categories = (
        ("Ledger", ("Transaction:",)),
        ("Income Statement", ("Revenue:", "Expense:")),
        ("Balance Sheet", ("Asset:", "Liability:", "Equity:")),
        ("Cash Flow Statement", ("Cash Inflow:", "Cash Outflow:")),
    )
    # Insertion order matters: callers iterate this dict to render sections.
    statements = {
        name: []
        for name in (
            "Ledger",
            "Journal General",
            "Income Statement",
            "Balance Sheet",
            "Cash Flow Statement",
        )
    }
    for line in generated_text.split("\n"):
        for name, markers in categories:
            if any(marker in line for marker in markers):
                statements[name].append(line)
                break
    return statements
# Main logic: runs on every Streamlit rerun once a file has been uploaded.
if uploaded_file is not None:
    # Dispatch the uploaded file's MIME type to the matching text extractor.
    _extractors = {
        "application/pdf": extract_text_from_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }
    extractor = _extractors.get(uploaded_file.type)
    if extractor is None:
        # Guard clause: reject anything that is not PDF/DOCX and halt the rerun.
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()
    text = extractor(uploaded_file)
    # Show the raw extracted text so the user can sanity-check the input.
    st.subheader("Extracted Text")
    st.write(text)
    # Run the model and bucket its output into statement sections.
    st.subheader("Generated Financial Statements")
    statements = parse_financial_statements(generate_financial_statements(text))
    # Render every section, with a placeholder message for empty ones.
    for statement_type, rows in statements.items():
        st.write(f"### {statement_type}")
        st.write(rows if rows else "No data available for this statement.")
    # Offer each non-empty section as a single-column CSV download.
    for statement_type, rows in statements.items():
        if not rows:
            continue
        frame = pd.DataFrame(rows, columns=[statement_type])
        st.download_button(
            label=f"Download {statement_type} as CSV",
            data=frame.to_csv(index=False),
            file_name=f"{statement_type.lower().replace(' ', '_')}.csv",
            mime="text/csv"
        )
# Sidebar: static reference info shown alongside the app.
# Dependencies the app needs at runtime (faiss is checked at import time above).
st.sidebar.subheader("Dependencies")
st.sidebar.write("""
- Streamlit
- Hugging Face Transformers
- PyPDF2
- python-docx
- pandas
- faiss-cpu (or faiss-gpu)
- datasets
""")
# Deployment instructions for running the app locally.
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write("""
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
""")