mfraz committed on
Commit
1896b1a
·
verified ·
1 Parent(s): 00ef1a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -62
app.py CHANGED
@@ -1,72 +1,132 @@
 
1
  import streamlit as st
2
- import pandas as pd
3
- from transformers import T5ForConditionalGeneration, T5Tokenizer
4
  from docx import Document
 
 
 
 
 
 
5
 
6
- # Load the generator model (FLAN-T5)
7
- generator_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
8
- generator_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
9
 
10
- # Function to read DOCS files
11
- def read_docs(file):
 
 
 
 
 
 
 
 
 
12
  doc = Document(file)
13
- text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
 
 
14
  return text
15
 
16
- # Function to process user input and generate financial statements
17
- def generate_financial_statements(file, file_type):
18
- # Read the file based on its type
19
- if file_type == "csv":
20
- df = pd.read_csv(file)
21
- context = df.to_string()
22
- elif file_type == "xlsx":
23
- df = pd.read_excel(file)
24
- context = df.to_string()
25
- elif file_type == "docx":
26
- context = read_docs(file)
27
- else:
28
- st.error("Unsupported file type. Please upload a CSV, Excel, or DOCS file.")
29
- return None
30
-
31
- # Define financial statement queries
32
- queries = [
33
- "Generate a journal from the following financial data:",
34
- "Generate a general ledger from the following financial data:",
35
- "Generate an income statement from the following financial data:",
36
- "Generate a balance sheet from the following financial data:",
37
- "Generate a cash flow statement from the following financial data:"
38
- ]
39
-
40
- # Generate financial statements using the generator model
41
- financial_statements = {}
42
- for query in queries:
43
- # Combine query and context
44
- input_text = f"{query}\n{context}"
45
-
46
- # Generate response using the generator model
47
- input_ids = generator_tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).input_ids
48
- output = generator_model.generate(input_ids, max_length=512)
49
- response = generator_tokenizer.decode(output[0], skip_special_tokens=True)
50
-
51
- # Store the result
52
- financial_statements[query] = response
53
-
54
- return financial_statements
55
-
56
- # Streamlit UI
57
- st.title("Financial Statement Generator")
58
- st.write("Upload your financial data (CSV, Excel, or DOCS) to generate journal, general ledger, income statement, balance sheet, and cash flow statement.")
59
 
60
- # File upload
61
- uploaded_file = st.file_uploader("Upload your file", type=["csv", "xlsx", "docx"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  if uploaded_file is not None:
63
- file_type = uploaded_file.name.split(".")[-1].lower() # Ensure lowercase file type
64
- financial_statements = generate_financial_statements(uploaded_file, file_type)
65
-
66
- # Display results if financial_statements is not None
67
- if financial_statements is not None:
68
- for statement_type, statement in financial_statements.items():
69
- st.subheader(statement_type)
70
- st.write(statement)
71
  else:
72
- st.error("Failed to generate financial statements. Please check the file type and content.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
  import streamlit as st
3
+ from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
4
+ import PyPDF2
5
  from docx import Document
6
+ import pandas as pd
7
+ import os
8
+
9
+ # Set up Streamlit app
10
+ st.title("Financial Statement Generator")
11
+ st.write("Upload a PDF or DOCX file to generate financial statements.")
12
 
13
+ # File upload
14
+ uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])
 
15
 
16
+ # Function to extract text from PDF
17
def extract_text_from_pdf(file):
    """Return the text of every page in a PDF.

    Args:
        file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (e.g. the object returned by
            ``st.file_uploader``).

    Returns:
        The text of all pages joined with newlines. A page that yields no
        text contributes an empty string.
    """
    # PdfFileReader / getNumPages / getPage were removed in PyPDF2 3.0;
    # PdfReader and its ``pages`` sequence are the supported replacements.
    pdf_reader = PyPDF2.PdfReader(file)
    # Guard against a page producing no text so the join never sees None.
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Separate pages with a newline so the last line of one page is not glued
    # to the first line of the next (downstream parsing is line-based).
    return "\n".join(page_texts)
24
+
25
+ # Function to extract text from DOCX
26
def extract_text_from_docx(file):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file: A path or binary file-like object accepted by ``Document``.

    Returns:
        Every paragraph's text, each followed by a newline.
    """
    document = Document(file)
    return "".join(f"{para.text}\n" for para in document.paragraphs)
32
 
33
+ # Function to process extracted text using RAG model
34
def _load_rag_components():
    """Load the RAG tokenizer and the generation model with its retriever."""
    tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-base")
    # use_dummy_dataset=True keeps the retriever from downloading the full
    # wiki_dpr index, which is far too large for an interactive app.
    retriever = RagRetriever.from_pretrained(
        "facebook/rag-sequence-base",
        index_name="exact",
        use_dummy_dataset=True,
    )
    model = RagSequenceForGeneration.from_pretrained(
        "facebook/rag-sequence-base", retriever=retriever
    )
    return tokenizer, model


def generate_financial_statements(text):
    """Generate free-form statement text from extracted document text.

    Args:
        text: Plain text extracted from the uploaded document. Only the first
            512 tokens are used; the rest is truncated.

    Returns:
        The decoded model output as a string, or ``""`` when ``text`` is
        empty or whitespace-only.
    """
    if not text or not text.strip():
        return ""

    # Streamlit re-executes the script on every interaction; cache_resource
    # keeps the loaded model alive across reruns instead of reloading it.
    tokenizer, model = st.cache_resource(_load_rag_components)()

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)

    # Generate financial statements
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=1000,
    )

    # Decode generated text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
49
+
50
+ # Function to parse generated text into financial statements
51
def parse_financial_statements(generated_text):
    """Bucket the lines of generated text into financial statements.

    Each non-blank line is assigned to the first statement whose marker it
    contains, checked in this order: Ledger, Income Statement, Balance Sheet,
    Cash Flow Statement. Lines without a marker are dropped.

    Args:
        generated_text: Model output to parse. ``None`` or an empty string
            yields empty statements.

    Returns:
        A dict mapping each statement name to the list of matching lines,
        with surrounding whitespace removed from every line.
    """
    # Ordered (statement, markers) pairs; the first match wins.
    # NOTE: no marker maps to "Journal General" yet, so it always stays empty.
    marker_table = (
        ("Ledger", ("Transaction:",)),
        ("Income Statement", ("Revenue:", "Expense:")),
        ("Balance Sheet", ("Asset:", "Liability:", "Equity:")),
        ("Cash Flow Statement", ("Cash Inflow:", "Cash Outflow:")),
    )
    statements = {
        "Ledger": [],
        "Journal General": [],
        "Income Statement": [],
        "Balance Sheet": [],
        "Cash Flow Statement": [],
    }

    # splitlines() also handles "\r\n", so no stray "\r" ends up in the output.
    for raw_line in (generated_text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        for statement_type, markers in marker_table:
            if any(marker in line for marker in markers):
                statements[statement_type].append(line)
                break

    return statements
75
+
76
# Main logic: runs only once the user has uploaded a file.
if uploaded_file is not None:
    # Extract text, dispatching on the MIME type reported for the upload.
    try:
        if uploaded_file.type == "application/pdf":
            text = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            text = extract_text_from_docx(uploaded_file)
        else:
            text = None
    except Exception as exc:
        # Top-level boundary: show corrupt/unreadable files as a message
        # rather than a raw traceback.
        st.error(f"Could not read the uploaded file: {exc}")
        st.stop()

    if text is None:
        st.error("Unsupported file format. Please upload a PDF or DOCX file.")
        st.stop()

    # Nothing to feed the model (e.g. a scanned PDF with no text layer).
    if not text.strip():
        st.warning("No text could be extracted from the uploaded file.")
        st.stop()

    # Display extracted text
    st.subheader("Extracted Text")
    st.write(text)

    # Generate financial statements
    st.subheader("Generated Financial Statements")
    try:
        # Model loading and generation are slow; show progress while waiting.
        with st.spinner("Generating financial statements..."):
            generated_text = generate_financial_statements(text)
    except Exception as exc:
        st.error(f"Failed to generate financial statements: {exc}")
        st.stop()
    statements = parse_financial_statements(generated_text)

    # Display financial statements
    for statement_type, data in statements.items():
        st.write(f"### {statement_type}")
        if data:
            st.write(data)
        else:
            st.write("No data available for this statement.")

    # Allow users to download each non-empty statement as CSV
    for statement_type, data in statements.items():
        if data:
            df = pd.DataFrame(data, columns=[statement_type])
            csv = df.to_csv(index=False)
            st.download_button(
                label=f"Download {statement_type} as CSV",
                data=csv,
                file_name=f"{statement_type.lower().replace(' ', '_')}.csv",
                mime="text/csv",
            )
115
+
116
# Dependencies (kept in sync with the install command below)
st.sidebar.subheader("Dependencies")
st.sidebar.write("""
- Streamlit
- Hugging Face Transformers
- PyTorch
- datasets and faiss-cpu (required by the RAG retriever)
- PyPDF2
- python-docx
- pandas
""")

# Deployment instructions
# The RAG classes need torch, datasets and faiss-cpu at runtime, so they are
# part of the install command.
st.sidebar.subheader("Deployment Instructions")
st.sidebar.write("""
1. Install dependencies: `pip install streamlit transformers torch datasets faiss-cpu PyPDF2 python-docx pandas`
2. Run the app: `streamlit run app.py`
3. Access the app in your browser at `http://localhost:8501`
""")