Update app.py
app.py
CHANGED
@@ -3,8 +3,9 @@ import streamlit as st
 import PyPDF2
 import matplotlib.pyplot as plt
 from io import BytesIO
-from
-from
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 import dotenv
 import re
 import requests
@@ -16,13 +17,13 @@ dotenv.load_dotenv()
 API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
 headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}

+# Initialize SentenceTransformer model
+embed_model = SentenceTransformer('all-MiniLM-L6-v2')
+
 def query_huggingface_api(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
     return response.json()

-# Configure embedding model
-embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
-
 def write_to_file(content, filename="./files/test.pdf"):
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "wb") as f:
@@ -40,14 +41,15 @@ def extract_financial_data(document_text):
     for i, line in enumerate(lines):
         if any(keyword in line.lower() for keyword in ["revenue", "total revenue", "sales"]):
             for j in range(i + 1, i + 6):
-
-
-
-
-
-
-
-
+                if j < len(lines):
+                    matches = revenue_pattern.findall(lines[j])
+                    if matches:
+                        for match in matches:
+                            try:
+                                value = float(match.replace("$", "").replace(",", ""))
+                                financial_data["Revenue"].append(value)
+                            except ValueError:
+                                continue

         if "Q1" in line or "Q2" in line or "Q3" in line or "Q4" in line or re.search(r'FY\s*\d{4}', line):
             financial_data["Date"].append(line.strip())
@@ -58,38 +60,27 @@ def extract_financial_data(document_text):

     return financial_data

-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    1. Revenue and profit trends
-    2. Key financial metrics
-    3. Major financial events and decisions
-    4. Comparison with previous periods
-    5. Future outlook or forecasts
-    6. Any notable financial risks or opportunities
-
-    Provide a clear, concise, and professional response.
-    """
-    })
-    return llm_response.get("generated_text", "No response from model.")
+def generate_summary(document_text, query):
+    prompt = f"""
+    You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
+    Analyze the following document and respond to the query:
+    {document_text}
+
+    Query: {query}
+
+    If the query is too general, respond with:
+    Please cover the following aspects:
+    1. Revenue and profit trends
+    2. Key financial metrics
+    3. Major financial events and decisions
+    4. Comparison with previous periods
+    5. Future outlook or forecasts
+    6. Any notable financial risks or opportunities
+
+    Provide a clear, concise, and professional response.
+    """
+    response = query_huggingface_api({"inputs": prompt})
+    return response[0]["generated_text"] if response and isinstance(response, list) else "No response from model."

 def generate_comparison_graph(data):
     if not data["Date"] or not data["Revenue"]:
@@ -107,12 +98,29 @@ def generate_comparison_graph(data):
     plt.tight_layout()
     st.pyplot(fig)

+def search_similar_sections(document_text, query, top_k=3):
+    # Split the document into sections (you may need to adjust this based on your document structure)
+    sections = document_text.split('\n\n')
+
+    # Compute embeddings for the query and all sections
+    query_embedding = embed_model.encode([query])[0]
+    section_embeddings = embed_model.encode(sections)
+
+    # Compute cosine similarities
+    similarities = cosine_similarity([query_embedding], section_embeddings)[0]
+
+    # Get indices of top-k similar sections
+    top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+    # Return top-k similar sections
+    return [sections[i] for i in top_indices]
+
 # Streamlit app
 def main():
     st.title("Fortune 500 Financial Document Analyzer")
     st.write("Upload a financial document, ask questions, and get detailed analysis!")

-    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf"])
+    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])

     if uploaded_file is not None:
         if uploaded_file.type == "application/pdf":
@@ -130,18 +138,20 @@ def main():
         # Extract financial data
         financial_data = extract_financial_data(document_text)

-        # Ingest documents for summarization and query-driven analysis
-        documents = ingest_documents()
-        index = load_data(documents)
-
         # Add a provision for user query input
         query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")

         if query:
-            summary = generate_summary(
+            summary = generate_summary(document_text, query)
             st.write("## Financial Analysis Result")
             st.write(summary)

+            st.write("## Relevant Document Sections")
+            similar_sections = search_similar_sections(document_text, query)
+            for i, section in enumerate(similar_sections, 1):
+                st.write(f"### Section {i}")
+                st.write(section)
+
         # Display revenue comparison graph
         if financial_data["Revenue"] and financial_data["Date"]:
             st.write("## Revenue Comparison")
@@ -150,4 +160,4 @@ def main():
             st.write("No revenue data found for comparison.")

 if __name__ == "__main__":
-    main()
+    main()
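A note on the revenue-extraction hunk above: it calls revenue_pattern.findall(...), but the pattern itself is defined outside the changed lines and does not appear in this diff. Below is a plausible stand-in pattern with the same parse-to-float logic, shown standalone; the exact regex in app.py may differ.

import re

# Hypothetical pattern for dollar amounts such as "$1,234.56"; the real
# revenue_pattern in app.py is defined outside the hunks shown above.
revenue_pattern = re.compile(r'\$\s?\d[\d,]*(?:\.\d+)?')

line = "Total revenue: $12,345.67 versus $11,000.00 in the prior year"
values = []
for match in revenue_pattern.findall(line):
    try:
        # Same normalization as the diff: strip "$" and "," before converting
        values.append(float(match.replace("$", "").replace(",", "").strip()))
    except ValueError:
        continue
print(values)  # [12345.67, 11000.0]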
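The new generate_summary assumes query_huggingface_api returns a list whose first element carries a generated_text field. A hedged sketch of that call with basic error handling added: the endpoint URL and Authorization header come from app.py, while the timeout, the raise_for_status() check, and the sample prompt are illustrative additions.

import os
import requests

API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}

def query_huggingface_api(payload, timeout=60):
    # Post the payload and surface HTTP errors instead of silently returning an error body
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

result = query_huggingface_api({"inputs": "Summarize: revenue grew 8% in Q1."})
# Text-generation endpoints typically return [{"generated_text": "..."}]
if isinstance(result, list) and result and "generated_text" in result[0]:
    print(result[0]["generated_text"])
else:
    print("No response from model.")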
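For reference, a minimal standalone sketch of the retrieval step that the new search_similar_sections performs: embed the query and each section with the same all-MiniLM-L6-v2 model and rank sections by cosine similarity. The sample sections and query are placeholders, not data from the app.

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

# Placeholder "sections" standing in for document_text.split('\n\n')
sections = [
    "Total revenue for Q1 FY2024 was $1.2 billion, up 8% year over year.",
    "The board approved a new share buyback program.",
    "Operating expenses rose on higher R&D spending.",
]
query = "What are the revenue trends?"

query_embedding = model.encode([query])        # shape (1, embedding_dim)
section_embeddings = model.encode(sections)    # shape (3, embedding_dim)

# Rank sections by cosine similarity to the query, highest first
similarities = cosine_similarity(query_embedding, section_embeddings)[0]
top_indices = np.argsort(similarities)[-2:][::-1]
for i in top_indices:
    print(f"{similarities[i]:.3f}  {sections[i]}")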