# Import necessary libraries
import streamlit as st
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import PyPDF2
from docx import Document
import pandas as pd
import os

# Check if faiss is installed
try:
    import faiss
except ImportError:
    st.error("The `faiss` library is required but not installed. Please install it using `pip install faiss-cpu` (or `faiss-gpu` for GPU support).")
    st.stop()
# Set up Streamlit app
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")

# File upload
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])
# Function to extract text from PDF
def extract_text_from_pdf(file):
    # PyPDF2 3.x removed PdfFileReader/getNumPages/getPage; use PdfReader and .pages
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text
# Function to extract text from DOCX
def extract_text_from_docx(file):
    doc = Document(file)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text
# Function to process extracted text using RAG model
def generate_financial_statements(text):
    # Load RAG model and tokenizer
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-base",
        index_name="exact",
        trust_remote_code=True  # Allow execution of remote code
    )
    model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-base", retriever=retriever)

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Generate financial statements
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)

    # Decode generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text
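
# Note: the model, retriever, and tokenizer above are reloaded on every Streamlit
# rerun, which is slow. A possible optimization (an assumption, not part of the
# original design) is to move the loading into a helper cached with
# st.cache_resource, e.g.:
#
#   @st.cache_resource
#   def load_rag_components():
#       tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
#       retriever = RagRetriever.from_pretrained(
#           "facebook/rag-sequence-base", index_name="exact", trust_remote_code=True
#       )
#       model = RagSequenceForGeneration.from_pretrained(
#           "facebook/rag-sequence-base", retriever=retriever
#       )
#       return tokenizer, model
#
# generate_financial_statements could then call load_rag_components() instead of
# loading the components itself.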
# Function to parse generated text into financial statements
def parse_financial_statements(generated_text):
    # Placeholder logic for parsing generated text into structured data
    # You can customize this based on your specific requirements
    statements = {
        "Ledger": [],
        "General Journal": [],
        "Income Statement": [],
        "Balance Sheet": [],
        "Cash Flow Statement": []
    }

    # Example parsing logic (replace with actual logic)
    lines = generated_text.split("\n")
    for line in lines:
        if "Transaction:" in line:
            statements["Ledger"].append(line)
        elif "Revenue:" in line or "Expense:" in line:
            statements["Income Statement"].append(line)
        elif "Asset:" in line or "Liability:" in line or "Equity:" in line:
            statements["Balance Sheet"].append(line)
        elif "Cash Inflow:" in line or "Cash Outflow:" in line:
            statements["Cash Flow Statement"].append(line)
    return statements
# Main logic
if uploaded_file is not None:
    # Extract text from uploaded file
    if uploaded_file.type == "application/pdf":
        text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()

    # Display extracted text
    st.subheader("Extracted Text")
    st.write(text)

    # Generate financial statements
    st.subheader("Generated Financial Statements")
    generated_text = generate_financial_statements(text)
    statements = parse_financial_statements(generated_text)

    # Display financial statements
    for statement_type, data in statements.items():
        st.write(f"### {statement_type}")
        if data:
            st.write(data)
        else:
            st.write("No data available for this statement.")

    # Allow users to download statements as CSV
    for statement_type, data in statements.items():
        if data:
            df = pd.DataFrame(data, columns=[statement_type])
            csv = df.to_csv(index=False)
            st.download_button(
                label=f"Download {statement_type} as CSV",
                data=csv,
                file_name=f"{statement_type.lower().replace(' ', '_')}.csv",
                mime="text/csv"
            )
# Dependencies
st.sidebar.subheader("Dependencies")
st.sidebar.write("""
- Streamlit
- Hugging Face Transformers
- PyPDF2
- python-docx
- pandas
- faiss-cpu (or faiss-gpu)
- datasets
""")

# Deployment instructions
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write("""
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
""")