Spaces:

rolwinpinto
/

finanalyst

Sleeping

App Files Files Community

rolwinpinto committed on Aug 14, 2024

Commit

953c4c1

verified ·

1 Parent(s): 320e108

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -52

app.py CHANGED Viewed

@@ -3,8 +3,9 @@ import streamlit as st
 import PyPDF2
 import matplotlib.pyplot as plt
 from io import BytesIO
-from llama_index import VectorStoreIndex, SimpleDirectoryReader
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 import dotenv
 import re
 import requests
@@ -16,13 +17,13 @@ dotenv.load_dotenv()
 API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
 headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
 def query_huggingface_api(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
     return response.json()
-# Configure embedding model
-embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
 def write_to_file(content, filename="./files/test.pdf"):
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "wb") as f:
@@ -40,14 +41,15 @@ def extract_financial_data(document_text):
     for i, line in enumerate(lines):
         if any(keyword in line.lower() for keyword in ["revenue", "total revenue", "sales"]):
             for j in range(i + 1, i + 6):
-                matches = revenue_pattern.findall(lines[j])
-                if matches:
-                    for match in matches:
-                        try:
-                            value = float(match.replace("$", "").replace(",", ""))
-                            financial_data["Revenue"].append(value)
-                        except ValueError:
-                            continue
         if "Q1" in line or "Q2" in line or "Q3" in line or "Q4" in line or re.search(r'FY\s*\d{4}', line):
             financial_data["Date"].append(line.strip())
@@ -58,38 +60,27 @@ def extract_financial_data(document_text):
     return financial_data
-def ingest_documents():
-    reader = SimpleDirectoryReader("./files/")
-    documents = reader.load_data()
-    return documents
-def load_data(documents):
-    index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
-    return index
-def generate_summary(index, document_text, query):
-    query_engine = index.as_query_engine()
-    llm_response = query_huggingface_api({
-        "inputs": f"""
-        You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
-        Analyze the following document and respond to the query:
-        {document_text}
-        Query: {query}
-        If the query is too general, respond with:
-        Please cover the following aspects:
-        1. Revenue and profit trends
-        2. Key financial metrics
-        3. Major financial events and decisions
-        4. Comparison with previous periods
-        5. Future outlook or forecasts
-        6. Any notable financial risks or opportunities
-        Provide a clear, concise, and professional response.
-        """
-    })
-    return llm_response.get("generated_text", "No response from model.")
 def generate_comparison_graph(data):
     if not data["Date"] or not data["Revenue"]:
@@ -107,12 +98,29 @@ def generate_comparison_graph(data):
     plt.tight_layout()
     st.pyplot(fig)
 # Streamlit app
 def main():
     st.title("Fortune 500 Financial Document Analyzer")
     st.write("Upload a financial document, ask questions, and get detailed analysis!")
-    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf"])
     if uploaded_file is not None:
         if uploaded_file.type == "application/pdf":
@@ -130,18 +138,20 @@ def main():
         # Extract financial data
         financial_data = extract_financial_data(document_text)
-        # Ingest documents for summarization and query-driven analysis
-        documents = ingest_documents()
-        index = load_data(documents)
         # Add a provision for user query input
         query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
         if query:
-            summary = generate_summary(index, document_text, query)
             st.write("## Financial Analysis Result")
             st.write(summary)
         # Display revenue comparison graph
         if financial_data["Revenue"] and financial_data["Date"]:
             st.write("## Revenue Comparison")
@@ -150,4 +160,4 @@ def main():
             st.write("No revenue data found for comparison.")
 if __name__ == "__main__":
-    main()

 import PyPDF2
 import matplotlib.pyplot as plt
 from io import BytesIO
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 import dotenv
 import re
 import requests
 API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
 headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
+# Initialize SentenceTransformer model
+embed_model = SentenceTransformer('all-MiniLM-L6-v2')
 def query_huggingface_api(payload, timeout=60):
     """Send ``payload`` to the Hugging Face Inference API and return the decoded JSON.

     Args:
         payload: JSON-serialisable request body, e.g. ``{"inputs": "..."}``.
         timeout: Seconds to wait for the server before giving up.

     Returns:
         The decoded JSON body of the response. If the request fails at the
         transport level (connection error, timeout) or the body is not valid
         JSON, a dict of the form ``{"error": "<message>"}`` is returned
         instead of raising, so callers can fall back gracefully.
     """
     try:
         # Without an explicit timeout, requests waits indefinitely on a
         # stalled connection, which would freeze the whole app.
         response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
         return response.json()
     except (requests.RequestException, ValueError) as exc:
         # ValueError covers JSON decoding failures on older requests versions.
         return {"error": str(exc)}
 def write_to_file(content, filename="./files/test.pdf"):
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "wb") as f:
     for i, line in enumerate(lines):
         if any(keyword in line.lower() for keyword in ["revenue", "total revenue", "sales"]):
             for j in range(i + 1, i + 6):
+                if j < len(lines):
+                    matches = revenue_pattern.findall(lines[j])
+                    if matches:
+                        for match in matches:
+                            try:
+                                value = float(match.replace("$", "").replace(",", ""))
+                                financial_data["Revenue"].append(value)
+                            except ValueError:
+                                continue
         if "Q1" in line or "Q2" in line or "Q3" in line or "Q4" in line or re.search(r'FY\s*\d{4}', line):
             financial_data["Date"].append(line.strip())
     return financial_data
+def generate_summary(document_text, query):
+    prompt = f"""
+    You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
+    Analyze the following document and respond to the query:
+    {document_text}
+    Query: {query}
+    If the query is too general, respond with:
+    Please cover the following aspects:
+    1. Revenue and profit trends
+    2. Key financial metrics
+    3. Major financial events and decisions
+    4. Comparison with previous periods
+    5. Future outlook or forecasts
+    6. Any notable financial risks or opportunities
+    Provide a clear, concise, and professional response.
+    """
+    response = query_huggingface_api({"inputs": prompt})
+    return response[0]["generated_text"] if response and isinstance(response, list) else "No response from model."
 def generate_comparison_graph(data):
     if not data["Date"] or not data["Revenue"]:
     plt.tight_layout()
     st.pyplot(fig)
+def search_similar_sections(document_text, query, top_k=3):
+    # Split the document into sections (you may need to adjust this based on your document structure)
+    sections = document_text.split('\n\n')
+    # Compute embeddings for the query and all sections
+    query_embedding = embed_model.encode([query])[0]
+    section_embeddings = embed_model.encode(sections)
+    # Compute cosine similarities
+    similarities = cosine_similarity([query_embedding], section_embeddings)[0]
+    # Get indices of top-k similar sections
+    top_indices = np.argsort(similarities)[-top_k:][::-1]
+    # Return top-k similar sections
+    return [sections[i] for i in top_indices]
 # Streamlit app
 def main():
     st.title("Fortune 500 Financial Document Analyzer")
     st.write("Upload a financial document, ask questions, and get detailed analysis!")
+    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])
     if uploaded_file is not None:
         if uploaded_file.type == "application/pdf":
         # Extract financial data
         financial_data = extract_financial_data(document_text)
         # Add a provision for user query input
         query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
         if query:
+            summary = generate_summary(document_text, query)
             st.write("## Financial Analysis Result")
             st.write(summary)
+            st.write("## Relevant Document Sections")
+            similar_sections = search_similar_sections(document_text, query)
+            for i, section in enumerate(similar_sections, 1):
+                st.write(f"### Section {i}")
+                st.write(section)
         # Display revenue comparison graph
         if financial_data["Revenue"] and financial_data["Date"]:
             st.write("## Revenue Comparison")
             st.write("No revenue data found for comparison.")
 if __name__ == "__main__":
+    main()