# Import necessary libraries
import streamlit as st
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import PyPDF2
from docx import Document
import pandas as pd
import os

# Check if faiss is installed
try:
    import faiss
except ImportError:
    st.error("The `faiss` library is required but not installed. Please install it using `pip install faiss-cpu` (or `faiss-gpu` for GPU support).")
    st.stop()

# Set up Streamlit app
st.title("Financial Statement Generator")
st.write("Upload a PDF or DOCX file to generate financial statements.")

# File upload
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])

# Function to extract text from PDF
def extract_text_from_pdf(file):
    # Use the current PyPDF2 API (PdfReader / .pages); PdfFileReader and
    # getNumPages()/getPage() were removed in PyPDF2 3.x.
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without extractable text
        text += page.extract_text() or ""
    return text

# Function to extract text from DOCX
def extract_text_from_docx(file):
    doc = Document(file)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

# Function to process extracted text using RAG model
def generate_financial_statements(text):
    # Load RAG model and tokenizer
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-base",
        index_name="exact",
        trust_remote_code=True  # Allow execution of remote code
    )
    model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-base", retriever=retriever)

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Generate financial statements
    outputs = model.generate(input_ids=inputs["input_ids"], max_length=1000)

    # Decode generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text
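
# Optional: a minimal sketch of loading the RAG components once per session with
# Streamlit's st.cache_resource instead of reloading them on every call to
# generate_financial_statements above. The checkpoint name and retriever arguments
# mirror the inline loader; note that index_name="exact" pulls the full wiki_dpr
# index, and RagRetriever's use_dummy_dataset=True option can be used for
# lightweight testing. This helper is illustrative and not wired into the app.
@st.cache_resource
def load_rag_components():
    cached_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
    cached_retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-base",
        index_name="exact",
        trust_remote_code=True,
    )
    cached_model = RagSequenceForGeneration.from_pretrained(
        "facebook/rag-sequence-base", retriever=cached_retriever
    )
    return cached_tokenizer, cached_retriever, cached_model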

# Function to parse generated text into financial statements
def parse_financial_statements(generated_text):
    # Placeholder logic for parsing generated text into structured data
    # You can customize this based on your specific requirements
    statements = {
        "Ledger": [],
        "Journal General": [],
        "Income Statement": [],
        "Balance Sheet": [],
        "Cash Flow Statement": []
    }

    # Example parsing logic (replace with actual logic)
    lines = generated_text.split("\n")
    for line in lines:
        if "Transaction:" in line:
            statements["Ledger"].append(line)
        elif "Revenue:" in line or "Expense:" in line:
            statements["Income Statement"].append(line)
        elif "Asset:" in line or "Liability:" in line or "Equity:" in line:
            statements["Balance Sheet"].append(line)
        elif "Cash Inflow:" in line or "Cash Outflow:" in line:
            statements["Cash Flow Statement"].append(line)

    return statements
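
# Illustrative assumption about the "Label: value" line format the parser above
# keys on; the base facebook/rag-sequence-base checkpoint is not guaranteed to
# emit these labels:
#   Transaction: 2024-01-05 Office supplies -150.00
#   Revenue: Consulting fees 12000.00
#   Asset: Cash 8500.00
#   Cash Inflow: Customer payment 3000.00
# Lines matching none of the prefixes are dropped, and the "General Journal"
# bucket is left empty by this placeholder logic.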

# Main logic
if uploaded_file is not None:
    # Extract text from uploaded file
    if uploaded_file.type == "application/pdf":
        text = extract_text_from_pdf(uploaded_file)
    elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        text = extract_text_from_docx(uploaded_file)
    else:
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()

    # Display extracted text
    st.subheader("Extracted Text")
    st.write(text)

    # Generate financial statements
    st.subheader("Generated Financial Statements")
    generated_text = generate_financial_statements(text)
    statements = parse_financial_statements(generated_text)

    # Display financial statements
    for statement_type, data in statements.items():
        st.write(f"### {statement_type}")
        if data:
            st.write(data)
        else:
            st.write("No data available for this statement.")

    # Allow users to download statements as CSV
    for statement_type, data in statements.items():
        if data:
            df = pd.DataFrame(data, columns=[statement_type])
            csv = df.to_csv(index=False)
            st.download_button(
                label=f"Download {statement_type} as CSV",
                data=csv,
                file_name=f"{statement_type.lower().replace(' ', '_')}.csv",
                mime="text/csv"
            )

# Dependencies
st.sidebar.subheader("Dependencies")
st.sidebar.write("""
- Streamlit
- Hugging Face Transformers
- PyPDF2
- python-docx
- pandas
- faiss-cpu (or faiss-gpu)
- datasets
""")

# Deployment instructions
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write("""
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
""")