Spaces:
Sleeping
Sleeping
File size: 4,901 Bytes
1896b1a 1d3b6af 1896b1a 2320a8a 1896b1a 87713d1 1896b1a 1d3b6af 1896b1a 1d3b6af 1896b1a 2320a8a 1896b1a 2320a8a 1896b1a 2cb0c11 1896b1a 1d3b6af 1896b1a 1d3b6af 1896b1a 00ef1a7 1896b1a 87713d1 2cb0c11 1896b1a 87713d1 1896b1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
# Import necessary libraries
import streamlit as st
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import PyPDF2
from docx import Document
import pandas as pd
import os
# Check if faiss is installed
try:
    # Imported purely as an availability probe; the name is not used below.
    import faiss  # noqa: F401
except ImportError:
    # Surface an actionable message in the UI and halt the script run.
    st.error(
        "The `faiss` library is required but not installed. "
        "Please install it using `pip install faiss-cpu` "
        "(or `faiss-gpu` for GPU support)."
    )
    st.stop()
# Set up Streamlit app
# Page header: the app title followed by a one-line usage hint.
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")

# Upload widget restricted to the two supported extensions. The result is
# compared against None further down before any processing happens.
uploaded_file = st.file_uploader(
    label="Upload a PDF or DOCX file",
    type=["pdf", "docx"],
)
# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Whatever the PyPDF2 reader accepts as its source; this script
            passes the object returned by the Streamlit uploader.

    Returns:
        str: The extracted text of all pages joined with no separator. A page
        that yields no text contributes an empty string.
    """
    # PyPDF2 3.x removed PdfFileReader / getNumPages / getPage. Prefer the
    # current PdfReader class and fall back to the legacy name only when
    # running against an old install that lacks it.
    reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    pdf_reader = reader_cls(file)

    # `or ""` keeps the join safe should a page's extraction return None.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Function to extract text from DOCX
def extract_text_from_docx(file):
    """Return the paragraph text of a DOCX document, one paragraph per line.

    Args:
        file: The source handed to ``Document``; this script passes the
            object returned by the Streamlit uploader.

    Returns:
        str: Each entry of ``doc.paragraphs`` followed by a newline, in
        document order. An empty document yields an empty string.
    """
    paragraphs = Document(file).paragraphs
    return "".join(f"{para.text}\n" for para in paragraphs)
# Function to process extracted text using RAG model
@st.cache_resource
def _load_rag_components():
    """Load the RAG tokenizer and model once and reuse them across reruns.

    Streamlit re-executes the whole script on every interaction, so without
    caching each call would reload the tokenizer, retriever and model.

    Returns:
        tuple: ``(tokenizer, model)`` for the ``facebook/rag-sequence-base``
        checkpoint, with the retriever already attached to the model.
    """
    checkpoint = "facebook/rag-sequence-base"
    tokenizer = RagTokenizer.from_pretrained(checkpoint)
    # NOTE(review): trust_remote_code=True permits code fetched from the Hub
    # to run in this process -- confirm that is acceptable for this app.
    # NOTE(review): index_name="exact" without use_dummy_dataset=True
    # presumably pulls the full retrieval index -- confirm this is intended.
    retriever = RagRetriever.from_pretrained(
        checkpoint,
        index_name="exact",
        trust_remote_code=True,  # Allow execution of remote code
    )
    model = RagSequenceForGeneration.from_pretrained(
        checkpoint, retriever=retriever
    )
    return tokenizer, model


def generate_financial_statements(text):
    """Run the extracted document text through the RAG model.

    Args:
        text: Text extracted from the uploaded document.

    Returns:
        str: The decoded first generated sequence, with special tokens
        removed. Returns an empty string when ``text`` is empty or only
        whitespace, without loading the model.
    """
    # Nothing to generate from; skip the expensive model load entirely.
    if not text or not text.strip():
        return ""

    tokenizer, model = _load_rag_components()

    # Input is truncated to 512 tokens; longer documents are cut off.
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Function to parse generated text into financial statements
def parse_financial_statements(generated_text):
    """Bucket the lines of generated text into named financial statements.

    This is placeholder routing based on literal markers found in each line.
    A line goes to the first rule it matches; lines matching no rule are
    dropped. No rule feeds "Journal General", so it always comes back empty.

    Args:
        generated_text: Newline-separated text to classify.

    Returns:
        dict: Maps each statement name ("Ledger", "Journal General",
        "Income Statement", "Balance Sheet", "Cash Flow Statement") to the
        list of matching lines, in input order.
    """
    # Rules are evaluated top to bottom; the first hit wins.
    routing = (
        (("Transaction:",), "Ledger"),
        (("Revenue:", "Expense:"), "Income Statement"),
        (("Asset:", "Liability:", "Equity:"), "Balance Sheet"),
        (("Cash Inflow:", "Cash Outflow:"), "Cash Flow Statement"),
    )
    section_names = (
        "Ledger",
        "Journal General",
        "Income Statement",
        "Balance Sheet",
        "Cash Flow Statement",
    )
    statements = {name: [] for name in section_names}

    for entry in generated_text.split("\n"):
        for markers, section in routing:
            if any(marker in entry for marker in markers):
                statements[section].append(entry)
                break
    return statements
# Main logic
if uploaded_file is not None:
    # Pick the extractor from the MIME type reported for the upload.
    extractors = {
        "application/pdf": extract_text_from_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }
    extractor = extractors.get(uploaded_file.type)
    if extractor is None:
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()
    text = extractor(uploaded_file)

    # Show the raw extraction so the user can see what the model receives.
    st.subheader("Extracted Text")
    st.write(text)

    # Generate, then bucket the generated lines into statements.
    st.subheader("Generated Financial Statements")
    generated_text = generate_financial_statements(text)
    statements = parse_financial_statements(generated_text)

    # First pass: render every statement, with a notice for empty ones.
    for title, rows in statements.items():
        st.write(f"### {title}")
        st.write(rows if rows else "No data available for this statement.")

    # Second pass: one single-column CSV download per non-empty statement.
    for title, rows in statements.items():
        if not rows:
            continue
        frame = pd.DataFrame(rows, columns=[title])
        st.download_button(
            label=f"Download {title} as CSV",
            data=frame.to_csv(index=False),
            file_name=f"{title.lower().replace(' ', '_')}.csv",
            mime="text/csv",
        )
# Dependencies
# Sidebar panel listing the packages this app relies on (markdown bullets).
st.sidebar.subheader("Dependencies")
st.sidebar.write(
    "\n"
    "- Streamlit\n"
    "- Hugging Face Transformers\n"
    "- PyPDF2\n"
    "- python-docx\n"
    "- pandas\n"
    "- faiss-cpu (or faiss-gpu)\n"
    "- datasets\n"
)
# Deployment instructions
# Sidebar panel with the steps to install and launch the app locally.
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write(
    "\n"
    "1. Install dependencies: `pip install -r requirements.txt`\n"
    "2. Run the app: `streamlit run app.py`\n"
    "3. Access the app in your browser at `http://localhost:8501`\n"
)