rolwinpinto committed on
Commit
953c4c1
·
verified ·
1 Parent(s): 320e108

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -52
app.py CHANGED
@@ -3,8 +3,9 @@ import streamlit as st
3
  import PyPDF2
4
  import matplotlib.pyplot as plt
5
  from io import BytesIO
6
- from llama_index import VectorStoreIndex, SimpleDirectoryReader
7
- from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 
8
  import dotenv
9
  import re
10
  import requests
@@ -16,13 +17,13 @@ dotenv.load_dotenv()
16
  API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
17
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
18
 
 
 
 
19
  def query_huggingface_api(payload):
20
  response = requests.post(API_URL, headers=headers, json=payload)
21
  return response.json()
22
 
23
- # Configure embedding model
24
- embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
25
-
26
  def write_to_file(content, filename="./files/test.pdf"):
27
  os.makedirs(os.path.dirname(filename), exist_ok=True)
28
  with open(filename, "wb") as f:
@@ -40,14 +41,15 @@ def extract_financial_data(document_text):
40
  for i, line in enumerate(lines):
41
  if any(keyword in line.lower() for keyword in ["revenue", "total revenue", "sales"]):
42
  for j in range(i + 1, i + 6):
43
- matches = revenue_pattern.findall(lines[j])
44
- if matches:
45
- for match in matches:
46
- try:
47
- value = float(match.replace("$", "").replace(",", ""))
48
- financial_data["Revenue"].append(value)
49
- except ValueError:
50
- continue
 
51
 
52
  if "Q1" in line or "Q2" in line or "Q3" in line or "Q4" in line or re.search(r'FY\s*\d{4}', line):
53
  financial_data["Date"].append(line.strip())
@@ -58,38 +60,27 @@ def extract_financial_data(document_text):
58
 
59
  return financial_data
60
 
61
- def ingest_documents():
62
- reader = SimpleDirectoryReader("./files/")
63
- documents = reader.load_data()
64
- return documents
65
-
66
- def load_data(documents):
67
- index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
68
- return index
69
-
70
- def generate_summary(index, document_text, query):
71
- query_engine = index.as_query_engine()
72
- llm_response = query_huggingface_api({
73
- "inputs": f"""
74
- You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
75
- Analyze the following document and respond to the query:
76
- {document_text}
77
-
78
- Query: {query}
79
-
80
- If the query is too general, respond with:
81
- Please cover the following aspects:
82
- 1. Revenue and profit trends
83
- 2. Key financial metrics
84
- 3. Major financial events and decisions
85
- 4. Comparison with previous periods
86
- 5. Future outlook or forecasts
87
- 6. Any notable financial risks or opportunities
88
-
89
- Provide a clear, concise, and professional response.
90
- """
91
- })
92
- return llm_response.get("generated_text", "No response from model.")
93
 
94
  def generate_comparison_graph(data):
95
  if not data["Date"] or not data["Revenue"]:
@@ -107,12 +98,29 @@ def generate_comparison_graph(data):
107
  plt.tight_layout()
108
  st.pyplot(fig)
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  # Streamlit app
111
  def main():
112
  st.title("Fortune 500 Financial Document Analyzer")
113
  st.write("Upload a financial document, ask questions, and get detailed analysis!")
114
 
115
- uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf"])
116
 
117
  if uploaded_file is not None:
118
  if uploaded_file.type == "application/pdf":
@@ -130,18 +138,20 @@ def main():
130
  # Extract financial data
131
  financial_data = extract_financial_data(document_text)
132
 
133
- # Ingest documents for summarization and query-driven analysis
134
- documents = ingest_documents()
135
- index = load_data(documents)
136
-
137
  # Add a provision for user query input
138
  query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
139
 
140
  if query:
141
- summary = generate_summary(index, document_text, query)
142
  st.write("## Financial Analysis Result")
143
  st.write(summary)
144
 
 
 
 
 
 
 
145
  # Display revenue comparison graph
146
  if financial_data["Revenue"] and financial_data["Date"]:
147
  st.write("## Revenue Comparison")
@@ -150,4 +160,4 @@ def main():
150
  st.write("No revenue data found for comparison.")
151
 
152
  if __name__ == "__main__":
153
- main()
 
3
  import PyPDF2
4
  import matplotlib.pyplot as plt
5
  from io import BytesIO
6
+ from sentence_transformers import SentenceTransformer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ import numpy as np
9
  import dotenv
10
  import re
11
  import requests
 
17
  API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
18
  headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
19
 
20
+ # Initialize SentenceTransformer model
21
+ embed_model = SentenceTransformer('all-MiniLM-L6-v2')
22
+
23
  def query_huggingface_api(payload):
24
  response = requests.post(API_URL, headers=headers, json=payload)
25
  return response.json()
26
 
 
 
 
27
  def write_to_file(content, filename="./files/test.pdf"):
28
  os.makedirs(os.path.dirname(filename), exist_ok=True)
29
  with open(filename, "wb") as f:
 
41
  for i, line in enumerate(lines):
42
  if any(keyword in line.lower() for keyword in ["revenue", "total revenue", "sales"]):
43
  for j in range(i + 1, i + 6):
44
+ if j < len(lines):
45
+ matches = revenue_pattern.findall(lines[j])
46
+ if matches:
47
+ for match in matches:
48
+ try:
49
+ value = float(match.replace("$", "").replace(",", ""))
50
+ financial_data["Revenue"].append(value)
51
+ except ValueError:
52
+ continue
53
 
54
  if "Q1" in line or "Q2" in line or "Q3" in line or "Q4" in line or re.search(r'FY\s*\d{4}', line):
55
  financial_data["Date"].append(line.strip())
 
60
 
61
  return financial_data
62
 
63
def generate_summary(document_text, query):
    """Ask the hosted model to analyse ``document_text`` with respect to ``query``.

    Args:
        document_text: Full text of the uploaded document, embedded in the prompt.
        query: The user's question about the document.

    Returns:
        The ``generated_text`` of the first item in the model response when one
        is present; a message carrying the API's ``error`` field when the
        response is a dict that has one; otherwise "No response from model.".
    """
    prompt = f"""
    You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
    Analyze the following document and respond to the query:
    {document_text}

    Query: {query}

    If the query is too general, respond with:
    Please cover the following aspects:
    1. Revenue and profit trends
    2. Key financial metrics
    3. Major financial events and decisions
    4. Comparison with previous periods
    5. Future outlook or forecasts
    6. Any notable financial risks or opportunities

    Provide a clear, concise, and professional response.
    """
    response = query_huggingface_api({"inputs": prompt})

    # A successful response is expected to be a non-empty list whose first
    # element is a dict holding "generated_text"; validate each step so an
    # unexpected shape falls through instead of raising.
    if isinstance(response, list) and response:
        first_item = response[0]
        if isinstance(first_item, dict):
            generated = first_item.get("generated_text")
            if isinstance(generated, str):
                return generated

    # Presumably failures come back as a dict with an "error" field; surface
    # it so the user sees why no analysis was produced.
    if isinstance(response, dict) and response.get("error"):
        return f"Model request failed: {response['error']}"

    return "No response from model."
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  def generate_comparison_graph(data):
86
  if not data["Date"] or not data["Revenue"]:
 
98
  plt.tight_layout()
99
  st.pyplot(fig)
100
 
101
def search_similar_sections(document_text, query, top_k=3):
    """Return up to ``top_k`` sections of the document most similar to ``query``.

    The document is split on blank lines (``'\\n\\n'``); each non-blank section
    and the query are embedded with ``embed_model`` and ranked by cosine
    similarity, most similar first.

    Args:
        document_text: Text to search. ``None`` or empty yields no sections.
        query: Text to compare each section against.
        top_k: Maximum number of sections to return. Values <= 0 return ``[]``.

    Returns:
        A list of section strings, most similar first. Empty when there is
        nothing to rank.
    """
    # A slice of [-0:] would select *every* section, so handle this up front.
    if top_k <= 0:
        return []

    # Split into sections (adjust the delimiter to the document structure if
    # needed) and drop blank ones so whitespace is never reported as relevant.
    sections = [
        section
        for section in (document_text or "").split('\n\n')
        if section.strip()
    ]
    if not sections:
        return []

    # Compute embeddings for the query and all sections.
    query_embedding = embed_model.encode([query])[0]
    section_embeddings = embed_model.encode(sections)

    # Compute cosine similarities between the query and every section.
    similarities = cosine_similarity([query_embedding], section_embeddings)[0]

    # argsort is ascending: reverse for best-first, then keep the first top_k.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return [sections[i] for i in top_indices]
117
+
118
  # Streamlit app
119
  def main():
120
  st.title("Fortune 500 Financial Document Analyzer")
121
  st.write("Upload a financial document, ask questions, and get detailed analysis!")
122
 
123
+ uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])
124
 
125
  if uploaded_file is not None:
126
  if uploaded_file.type == "application/pdf":
 
138
  # Extract financial data
139
  financial_data = extract_financial_data(document_text)
140
 
 
 
 
 
141
  # Add a provision for user query input
142
  query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
143
 
144
  if query:
145
+ summary = generate_summary(document_text, query)
146
  st.write("## Financial Analysis Result")
147
  st.write(summary)
148
 
149
+ st.write("## Relevant Document Sections")
150
+ similar_sections = search_similar_sections(document_text, query)
151
+ for i, section in enumerate(similar_sections, 1):
152
+ st.write(f"### Section {i}")
153
+ st.write(section)
154
+
155
  # Display revenue comparison graph
156
  if financial_data["Revenue"] and financial_data["Date"]:
157
  st.write("## Revenue Comparison")
 
160
  st.write("No revenue data found for comparison.")
161
 
162
  if __name__ == "__main__":
163
+ main()