File size: 6,216 Bytes
86b7caa
 
ea26600
86b7caa
 
1d241e5
 
953c4c1
 
dd35c43
925ce67
0fb0810
dd35c43
 
 
ea26600
0fb0810
 
 
 
e53d8c9
 
953c4c1
0fb0810
 
 
 
ea26600
 
 
 
 
925ce67
 
 
 
 
 
 
 
 
 
 
0fb0810
953c4c1
 
 
 
 
 
 
 
 
925ce67
 
 
 
 
 
 
 
 
 
953c4c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea26600
 
925ce67
 
 
 
 
 
ea26600
 
 
925ce67
 
 
 
ea26600
 
953c4c1
 
 
 
e53d8c9
 
 
953c4c1
e53d8c9
 
953c4c1
 
 
 
 
 
 
 
 
 
ea26600
 
925ce67
 
ea26600
953c4c1
ea26600
 
 
 
 
 
 
 
 
 
 
 
 
 
925ce67
 
 
 
 
 
 
953c4c1
925ce67
 
 
953c4c1
 
 
 
 
 
925ce67
 
 
 
 
 
86b7caa
 
953c4c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import os
import streamlit as st
import PyPDF2
import matplotlib.pyplot as plt
from io import BytesIO
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import dotenv
import re
import requests

# Load environment variables through python-dotenv. This runs before the
# os.getenv() lookup below, presumably so a key kept in a local .env file is
# visible when the request headers are built.
dotenv.load_dotenv()

# Configure Hugging Face API: inference endpoint for the
# sarvamai/sarvam-2b-v0.5 model and the bearer-token header sent with requests.
# NOTE(review): when HUGGINGFACE_API_KEY is unset, os.getenv() returns None and
# the header value becomes the literal string "Bearer None" — confirm the
# deployment always provides the key.
API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}

# Configure embedding model. It is instantiated once, at import time, and is
# used by search_similar_sections to embed the query and document sections.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

def query_huggingface_api(payload, timeout=60):
    """POST ``payload`` to the Hugging Face inference endpoint and return its JSON.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": prompt}``.
        timeout: Seconds to wait for the server (passed straight to
            ``requests.post``; a ``(connect, read)`` tuple is also accepted).
            Without a timeout ``requests`` waits indefinitely, which would
            hang the app if the endpoint stalls.

    Returns:
        The decoded JSON body of the response, whatever shape the API sent.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout elapses (``requests.exceptions.Timeout``).
        ValueError: If the response body is not valid JSON.
    """
    response = requests.post(
        API_URL,
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()

def write_to_file(content, filename="./files/test.pdf"):
    """Write ``content`` (bytes) to ``filename``, creating parent directories.

    Args:
        content: Bytes-like object to write; the file is opened in binary mode.
        filename: Destination path. Any missing parent directories are created.
            A bare file name with no directory part is written to the current
            working directory.
    """
    parent_dir = os.path.dirname(filename)
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when the path actually has a directory component.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(content)

def extract_financial_data(document_text):
    """Heuristically pull revenue figures and period labels out of raw text.

    For every line mentioning "revenue" or "sales" (case-insensitive), the
    following five lines are scanned for numeric tokens, which are recorded as
    revenue values. Every line containing ``Q1``..``Q4`` or ``FY <year>`` is
    recorded (stripped) as a period label. Both lists are then truncated to
    the shorter length so they can be plotted against each other.

    Each line is scanned for numbers at most once: when several keyword lines
    sit close together their look-ahead windows overlap, and re-scanning would
    record the same figures repeatedly.

    Args:
        document_text: Full text of the document, lines separated by ``\\n``.

    Returns:
        dict with keys ``"Revenue"`` (list of float) and ``"Date"`` (list of
        str), both of equal length.
    """
    financial_data = {
        "Revenue": [],
        "Date": [],
    }

    lines = document_text.split("\n")
    revenue_pattern = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
    # Same test as checking for "Q1".."Q4" substrings or an FY-year marker.
    period_pattern = re.compile(r'Q[1-4]|FY\s*\d{4}')
    # "total revenue" is already covered by the "revenue" substring check.
    keywords = ("revenue", "sales")
    lookahead = 5
    scanned_lines = set()

    for i, line in enumerate(lines):
        lowered = line.lower()
        if any(keyword in lowered for keyword in keywords):
            window_end = min(i + 1 + lookahead, len(lines))
            for j in range(i + 1, window_end):
                if j in scanned_lines:
                    # Already harvested through an earlier keyword's window.
                    continue
                scanned_lines.add(j)
                for match in revenue_pattern.findall(lines[j]):
                    try:
                        value = float(match.replace("$", "").replace(",", ""))
                    except ValueError:
                        continue
                    financial_data["Revenue"].append(value)

        if period_pattern.search(line):
            financial_data["Date"].append(line.strip())

    # Keep the two series aligned for plotting.
    min_length = min(len(financial_data["Revenue"]), len(financial_data["Date"]))
    financial_data["Revenue"] = financial_data["Revenue"][:min_length]
    financial_data["Date"] = financial_data["Date"][:min_length]

    return financial_data

def generate_summary(document_text, query):
    """Ask the hosted model to analyse ``document_text`` with respect to ``query``.

    Builds a financial-analyst prompt embedding the whole document and the
    query, sends it through ``query_huggingface_api``, and extracts the
    generated text from the reply.

    Args:
        document_text: Full text of the uploaded document.
        query: The user's question about the document.

    Returns:
        The ``"generated_text"`` field of the first item in the reply, or the
        string ``"No response from model."`` when the reply does not have that
        shape (empty list, error dict, item without the field, ...).
    """
    prompt = f"""
    You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
    Analyze the following document and respond to the query:
    {document_text}
    
    Query: {query}
    
    If the query is too general, respond with:
    Please cover the following aspects:
    1. Revenue and profit trends
    2. Key financial metrics
    3. Major financial events and decisions
    4. Comparison with previous periods
    5. Future outlook or forecasts
    6. Any notable financial risks or opportunities
    
    Provide a clear, concise, and professional response.
    """
    response = query_huggingface_api({"inputs": prompt})

    # Validate the full shape before indexing, so an unexpected payload falls
    # back to the placeholder message instead of raising KeyError/TypeError.
    if isinstance(response, list) and response:
        first_item = response[0]
        if isinstance(first_item, dict) and "generated_text" in first_item:
            return first_item["generated_text"]
    return "No response from model."

def generate_comparison_graph(data):
    """Render a revenue-over-period line chart into the Streamlit page.

    Args:
        data: Mapping with ``"Date"`` (x-axis labels) and ``"Revenue"``
            (y-axis values) sequences. If either is empty, a notice is
            written to the page and no chart is drawn.
    """
    if not data["Date"] or not data["Revenue"]:
        st.write("Insufficient data for generating the revenue comparison graph.")
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(data["Date"], data["Revenue"], marker="o", linestyle="-", color="b", label="Revenue")
        ax.set_title("Revenue Comparison")
        ax.set_xlabel("Date")
        ax.set_ylabel("Revenue (in millions)")
        ax.grid(True)
        ax.legend()
        # Style the labels on this Axes directly rather than through pyplot's
        # global "current axes", which is shared process-wide.
        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_horizontalalignment("right")
        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates registered until it is closed;
        # without this, figures accumulate each time the chart is redrawn.
        plt.close(fig)

def search_similar_sections(document_text, query, top_k=3):
    """Return the ``top_k`` document sections most similar to ``query``.

    The document is split on blank lines (``"\\n\\n"``); you may need to
    adjust this based on your document structure. Whitespace-only sections
    are discarded, the remaining ones are embedded with ``embed_model`` and
    ranked by cosine similarity to the embedded query.

    Args:
        document_text: Full document text.
        query: Text to compare the sections against.
        top_k: Maximum number of sections to return. Values ``<= 0`` yield an
            empty list.

    Returns:
        Up to ``top_k`` section strings, most similar first. Empty when the
        document has no non-blank sections.
    """
    # Guard explicitly: slicing with [-0:] would select *every* section.
    if top_k <= 0:
        return []

    # Blank sections carry no content, so skip them before embedding.
    sections = [section for section in document_text.split('\n\n') if section.strip()]
    if not sections:
        return []

    # Compute embeddings for the query and all sections
    query_embedding = embed_model.get_text_embedding(query)
    section_embeddings = [embed_model.get_text_embedding(section) for section in sections]

    # Compute cosine similarities (one row: query vs. every section)
    similarities = cosine_similarity([query_embedding], section_embeddings)[0]

    # Indices sorted by descending similarity, limited to the best top_k
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return [sections[i] for i in top_indices]

# Streamlit app
def main():
    """Streamlit entry point: upload a document, query it, and chart revenue.

    Flow: read the uploaded PDF or text file into a string, save the raw
    upload to disk via ``write_to_file``, extract revenue/period data, answer
    the user's query (model summary plus the most similar sections), and
    finally draw the revenue comparison chart when data is available.
    """
    st.title("Fortune 500 Financial Document Analyzer")
    st.write("Upload a financial document, ask questions, and get detailed analysis!")

    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    # Fetch the upload's bytes once and reuse them below.
    raw_bytes = uploaded_file.getvalue()

    if uploaded_file.type == "application/pdf":
        pdf_reader = PyPDF2.PdfReader(BytesIO(raw_bytes))
        # Join instead of repeated +=; `or ""` guards against a page for
        # which text extraction yields nothing.
        document_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    else:
        # Tolerate text files that are not valid UTF-8 rather than crashing
        # with UnicodeDecodeError; undecodable bytes become U+FFFD.
        document_text = raw_bytes.decode("utf-8", errors="replace")

    # NOTE(review): the upload is saved under write_to_file's default path
    # ("./files/test.pdf") even for .txt uploads — confirm this is intended.
    write_to_file(raw_bytes)

    st.write("Analyzing financial document...")

    # Extract financial data
    financial_data = extract_financial_data(document_text)

    # Add a provision for user query input
    query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")

    if query:
        summary = generate_summary(document_text, query)
        st.write("## Financial Analysis Result")
        st.write(summary)

        st.write("## Relevant Document Sections")
        similar_sections = search_similar_sections(document_text, query)
        for i, section in enumerate(similar_sections, 1):
            st.write(f"### Section {i}")
            st.write(section)

    # Display revenue comparison graph
    if financial_data["Revenue"] and financial_data["Date"]:
        st.write("## Revenue Comparison")
        generate_comparison_graph(financial_data)
    else:
        st.write("No revenue data found for comparison.")

# Run the app only when this file is executed as the main module, so that
# importing it does not start the UI flow.
if __name__ == "__main__":
    main()