# app.py — Financial Statement Generator (Streamlit)
# Extracts text from an uploaded PDF/DOCX file and generates placeholder
# financial statements with a Hugging Face RAG model.
# Import necessary libraries
import streamlit as st
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import PyPDF2
from docx import Document
import pandas as pd
import os
# Fail fast with a clear, user-facing message when the FAISS vector-search
# backend (required by RagRetriever's "exact" index) is not installed.
try:
    import faiss  # noqa: F401 -- imported only to verify availability
except ImportError:
    st.error("The `faiss` library is required but not installed. Please install it using `pip install faiss-cpu` (or `faiss-gpu` for GPU support).")
    st.stop()
# Set up Streamlit app: page title and a one-line usage hint.
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")
# File upload widget. `uploaded_file` is None until the user picks a file;
# the main logic further down is gated on that.
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])
# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of *file* (a PDF stream).

    Uses the PyPDF2 3.x API (`PdfReader` / `.pages` / `extract_text`).
    The original mixed the legacy 1.x names (`PdfFileReader`, `getNumPages`,
    `getPage`) — removed in PyPDF2 3.0 — with the 3.x `extract_text()`,
    so it could not run against any single PyPDF2 version.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no text layer;
        # coalesce to "" so concatenation never raises TypeError.
        text += page.extract_text() or ""
    return text
# Function to extract text from DOCX
def extract_text_from_docx(file):
    """Return all paragraph text of a DOCX document, one paragraph per line.

    Each paragraph contributes its text plus a trailing newline (including
    the last one), matching python-docx paragraph order.
    """
    document = Document(file)
    pieces = [paragraph.text + "\n" for paragraph in document.paragraphs]
    return "".join(pieces)
# Function to process extracted text using RAG model
def generate_financial_statements(text):
# Load RAG model and tokenizer
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
retriever = RagRetriever.from_pretrained(
"facebook/rag-sequence-base",
index_name="exact",
trust_remote_code=True # Allow execution of remote code
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-base", retriever=retriever)
# Tokenize input text
inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
# Generate financial statements
outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)
# Decode generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
return generated_text
# Function to parse generated text into financial statements
def parse_financial_statements(generated_text):
    """Split generated text into statement categories by keyword matching.

    Placeholder routing logic: each line goes to the first category whose
    marker keyword it contains; unmatched lines are dropped. "Journal
    General" is reserved but never populated by this placeholder.
    """
    statements = {
        "Ledger": [],
        "Journal General": [],
        "Income Statement": [],
        "Balance Sheet": [],
        "Cash Flow Statement": [],
    }
    # First-match-wins routing table (same precedence as the elif chain
    # it replaces): (marker keywords, target category).
    routes = [
        (("Transaction:",), "Ledger"),
        (("Revenue:", "Expense:"), "Income Statement"),
        (("Asset:", "Liability:", "Equity:"), "Balance Sheet"),
        (("Cash Inflow:", "Cash Outflow:"), "Cash Flow Statement"),
    ]
    for line in generated_text.split("\n"):
        for keywords, category in routes:
            if any(keyword in line for keyword in keywords):
                statements[category].append(line)
                break
    return statements
# Main logic
if uploaded_file is not None:
# Extract text from uploaded file
if uploaded_file.type == "application/pdf":
text = extract_text_from_pdf(uploaded_file)
elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
text = extract_text_from_docx(uploaded_file)
else:
st.error("Unsupported file format. Please upload a PDF or DOCX file.")
st.stop()
# Display extracted text
st.subheader("Extracted Text")
st.write(text)
# Generate financial statements
st.subheader("Generated Financial Statements")
generated_text = generate_financial_statements(text)
statements = parse_financial_statements(generated_text)
# Display financial statements
for statement_type, data in statements.items():
st.write(f"### {statement_type}")
if data:
st.write(data)
else:
st.write("No data available for this statement.")
# Allow users to download statements as CSV
for statement_type, data in statements.items():
if data:
df = pd.DataFrame(data, columns=[statement_type])
csv = df.to_csv(index=False)
st.download_button(
label=f"Download {statement_type} as CSV",
data=csv,
file_name=f"{statement_type.lower().replace(' ', '_')}.csv",
mime="text/csv"
)
# Dependencies: informational sidebar panel listing the libraries this app
# needs at runtime (mirrors the imports at the top of the file).
st.sidebar.subheader("Dependencies")
st.sidebar.write("""
- Streamlit
- Hugging Face Transformers
- PyPDF2
- python-docx
- pandas
- faiss-cpu (or faiss-gpu)
- datasets
""")
# Deployment instructions: standard local-run workflow for a Streamlit app.
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write("""
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
""")