# Import necessary libraries
import streamlit as st
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import PyPDF2
from docx import Document
import pandas as pd
import os

# Check if faiss is installed
try:
    import faiss
except ImportError:
    st.error("The `faiss` library is required but not installed. Please install it using `pip install faiss-cpu` (or `faiss-gpu` for GPU support).")
    st.stop()
# Set up Streamlit app
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")

# File upload
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])
# Function to extract text from PDF
def extract_text_from_pdf(file):
    # PyPDF2 3.x removed PdfFileReader/getNumPages/getPage; use PdfReader and .pages
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text
# Function to extract text from DOCX
def extract_text_from_docx(file):
    doc = Document(file)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text
# Function to process extracted text using RAG model
def generate_financial_statements(text):
    # Load RAG model and tokenizer
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-base",
        index_name="exact",
        trust_remote_code=True  # Allow execution of remote code
    )
    model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-base", retriever=retriever)

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Generate financial statements
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)

    # Decode generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text
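
# Note: the model, retriever, and tokenizer above are reloaded on every Streamlit
# rerun, which is slow. A possible optimization (an assumption, not part of the
# original design) is to move the loading into a helper cached with
# st.cache_resource, e.g.:
#
#   @st.cache_resource
#   def load_rag_components():
#       tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
#       retriever = RagRetriever.from_pretrained(
#           "facebook/rag-sequence-base", index_name="exact", trust_remote_code=True
#       )
#       model = RagSequenceForGeneration.from_pretrained(
#           "facebook/rag-sequence-base", retriever=retriever
#       )
#       return tokenizer, model
#
# generate_financial_statements could then call load_rag_components() instead of
# loading the components itself.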
# Function to parse generated text into financial statements
def parse_financial_statements(generated_text):
    # Placeholder logic for parsing generated text into structured data
    # You can customize this based on your specific requirements
    statements = {
        "Ledger": [],
        "General Journal": [],
        "Income Statement": [],
        "Balance Sheet": [],
        "Cash Flow Statement": []
    }

    # Example parsing logic (replace with actual logic)
    lines = generated_text.split("\n")
    for line in lines:
        if "Transaction:" in line:
            statements["Ledger"].append(line)
        elif "Revenue:" in line or "Expense:" in line:
            statements["Income Statement"].append(line)
        elif "Asset:" in line or "Liability:" in line or "Equity:" in line:
            statements["Balance Sheet"].append(line)
        elif "Cash Inflow:" in line or "Cash Outflow:" in line:
            statements["Cash Flow Statement"].append(line)
    return statements
# Main logic
if uploaded_file is not None:
    # Extract text from uploaded file
    if uploaded_file.type == "application/pdf":
        text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()

    # Display extracted text
    st.subheader("Extracted Text")
    st.write(text)

    # Generate financial statements
    st.subheader("Generated Financial Statements")
    generated_text = generate_financial_statements(text)
    statements = parse_financial_statements(generated_text)

    # Display financial statements
    for statement_type, data in statements.items():
        st.write(f"### {statement_type}")
        if data:
            st.write(data)
        else:
            st.write("No data available for this statement.")

    # Allow users to download statements as CSV
    for statement_type, data in statements.items():
        if data:
            df = pd.DataFrame(data, columns=[statement_type])
            csv = df.to_csv(index=False)
            st.download_button(
                label=f"Download {statement_type} as CSV",
                data=csv,
                file_name=f"{statement_type.lower().replace(' ', '_')}.csv",
                mime="text/csv"
            )
# Dependencies
st.sidebar.subheader("Dependencies")
st.sidebar.write("""
- Streamlit
- Hugging Face Transformers
- PyPDF2
- python-docx
- pandas
- faiss-cpu (or faiss-gpu)
- datasets
""")

# Deployment instructions
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write("""
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
""")