# NOTE: The lines originally here were file-viewer page residue, not program
# text (Space status: "Sleeping"; file size: 6,216 bytes; what appears to be a
# per-line revision gutter containing the ids 86b7caa, ea26600, 1d241e5,
# 953c4c1, dd35c43, 925ce67, 0fb0810, e53d8c9; and a 1-167 line-number column).
import os
import streamlit as st
import PyPDF2
import matplotlib.pyplot as plt
from io import BytesIO
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import dotenv
import re
import requests
# Load variables from a local .env file (if python-dotenv finds one) into the
# process environment. This has to run before the os.getenv() call below.
dotenv.load_dotenv()
# Hugging Face hosted-inference endpoint that query_huggingface_api() posts to.
API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
# Bearer-token header sent by query_huggingface_api() with each request.
# NOTE(review): when HUGGINGFACE_API_KEY is unset, os.getenv() returns None and
# the header value becomes the literal string "Bearer None" -- confirm the
# deployment always provides the key.
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
# Embedding model used by search_similar_sections() for both the query and the
# document sections. It is constructed once, at import time.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
def query_huggingface_api(payload, timeout=60):
    """POST *payload* to the Hugging Face inference endpoint and return its JSON.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": prompt}``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would freeze the
            Streamlit script on a stalled connection.

    Returns:
        The decoded JSON body of the response. If the body is not valid JSON
        (for example an HTML error page from a gateway), a dict of the form
        ``{"error": "..."}`` is returned instead so callers that check the
        shape of the result can degrade gracefully.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout elapses.
    """
    response = requests.post(
        API_URL, headers=headers, json=payload, timeout=timeout
    )
    try:
        return response.json()
    except ValueError:
        # requests signals an undecodable body with a ValueError subclass.
        return {"error": f"Non-JSON response (HTTP {response.status_code})"}
def write_to_file(content, filename="./files/test.pdf"):
    """Write binary *content* to *filename*, creating parent directories.

    Args:
        content: Bytes to write.
        filename: Destination path. Any missing parent directories are
            created first. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(filename)
    # dirname() is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(content)
def extract_financial_data(document_text):
    """Heuristically collect revenue figures and period labels from raw text.

    The text is scanned line by line:

    * When a line mentions a revenue keyword (case-insensitive), every
      number-like token on the next five lines is recorded as a revenue value.
    * When a line contains "Q1".."Q4" or an "FY <4 digits>" marker, the
      stripped line is recorded as a period label.

    Both lists are then truncated to the shorter of the two lengths so they
    can be paired index by index.

    Args:
        document_text: Full document text with "\\n" line separators.

    Returns:
        A dict with keys "Revenue" (list of floats) and "Date" (list of
        strings), both of equal length.
    """
    number_re = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
    revenue_keywords = ["revenue", "total revenue", "sales"]
    quarter_tags = ("Q1", "Q2", "Q3", "Q4")

    rows = document_text.split("\n")
    revenues = []
    periods = []

    for idx, row in enumerate(rows):
        lowered = row.lower()
        if any(word in lowered for word in revenue_keywords):
            # Slicing clamps at the end of the list, so this covers "up to
            # five following lines" without an explicit bounds check.
            for following in rows[idx + 1:idx + 6]:
                for token in number_re.findall(following):
                    try:
                        amount = float(token.replace("$", "").replace(",", ""))
                    except ValueError:
                        continue
                    revenues.append(amount)

        if any(tag in row for tag in quarter_tags) or re.search(r'FY\s*\d{4}', row):
            periods.append(row.strip())

    paired = min(len(revenues), len(periods))
    return {"Revenue": revenues[:paired], "Date": periods[:paired]}
def generate_summary(document_text, query):
    """Ask the hosted model to analyse *document_text* with respect to *query*.

    Args:
        document_text: Full text of the financial document; it is embedded
            verbatim in the prompt.
        query: The user's question about the document.

    Returns:
        The ``generated_text`` field of the first item in the model response,
        or the string "No response from model." when the response is empty,
        is not a list, or its first item does not carry that field (for
        example when the API returns an error object).
    """
    # The literal is kept flush-left on purpose: any indentation inside the
    # triple-quoted string would become part of the prompt sent to the model.
    prompt = f"""
You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
Analyze the following document and respond to the query:
{document_text}
Query: {query}
If the query is too general, respond with:
Please cover the following aspects:
1. Revenue and profit trends
2. Key financial metrics
3. Major financial events and decisions
4. Comparison with previous periods
5. Future outlook or forecasts
6. Any notable financial risks or opportunities
Provide a clear, concise, and professional response.
"""
    response = query_huggingface_api({"inputs": prompt})
    if isinstance(response, list) and response:
        first_item = response[0]
        # Guard the lookup so an unexpected item shape yields the fallback
        # message instead of a KeyError/TypeError in the UI.
        if isinstance(first_item, dict) and "generated_text" in first_item:
            return first_item["generated_text"]
    return "No response from model."
def generate_comparison_graph(data):
    """Render a revenue line chart in the Streamlit page.

    Args:
        data: Dict with a "Date" list (x-axis labels) and a "Revenue" list
            (y values). If either list is empty a short notice is written to
            the page and no chart is drawn.
    """
    if not data["Date"] or not data["Revenue"]:
        st.write("Insufficient data for generating the revenue comparison graph.")
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(
            data["Date"],
            data["Revenue"],
            marker="o",
            linestyle="-",
            color="b",
            label="Revenue",
        )
        ax.set_title("Revenue Comparison")
        ax.set_xlabel("Date")
        ax.set_ylabel("Revenue (in millions)")
        ax.grid(True)
        ax.legend()
        # Slanted, right-aligned labels keep long period strings readable.
        plt.xticks(rotation=45, ha="right")
        plt.tight_layout()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates registered until it is closed.
        # The script reruns on each interaction, so close the figure to stop
        # them accumulating in memory.
        plt.close(fig)
def search_similar_sections(document_text, query, top_k=3):
    """Return the document sections most similar to *query*.

    The document is split on blank lines ("\\n\\n"); each non-blank section
    and the query are embedded with the module-level ``embed_model`` and
    ranked by cosine similarity.

    Args:
        document_text: Full document text.
        query: Free-text query to compare the sections against.
        top_k: Maximum number of sections to return.

    Returns:
        Up to ``top_k`` section strings, most similar first. An empty list is
        returned when ``top_k`` is not positive or the document has no
        non-blank sections.
    """
    # A slice of [-0:] would select *every* index, so reject non-positive
    # values explicitly.
    if top_k <= 0:
        return []

    # Whitespace-only sections carry no content; skip them so they are neither
    # embedded nor returned as "relevant".
    sections = [
        section for section in document_text.split('\n\n') if section.strip()
    ]
    if not sections:
        return []

    query_embedding = embed_model.get_text_embedding(query)
    section_embeddings = [
        embed_model.get_text_embedding(section) for section in sections
    ]

    similarities = cosine_similarity([query_embedding], section_embeddings)[0]

    # argsort is ascending: take the last top_k indices and reverse them to
    # get the best match first.
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [sections[i] for i in top_indices]
# Streamlit app
def _read_uploaded_text(uploaded_file):
    """Return the text content of an uploaded PDF or plain-text file."""
    raw_bytes = uploaded_file.getvalue()
    if uploaded_file.type == "application/pdf":
        pdf_reader = PyPDF2.PdfReader(BytesIO(raw_bytes))
        # `or ""` guards against pages for which extract_text() yields nothing
        # usable, which would otherwise break the concatenation.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    # errors="replace" keeps a non-UTF-8 text upload from crashing the app
    # with UnicodeDecodeError; undecodable bytes become U+FFFD.
    return raw_bytes.decode("utf-8", errors="replace")


def main():
    """Streamlit entry point: upload a document, query it, and chart revenue.

    Flow: read the uploaded PDF/TXT into text, save the raw upload via
    write_to_file(), extract revenue data, answer the user's query with
    generate_summary(), list the most similar sections, and finally draw the
    revenue chart when data is available.
    """
    st.title("Fortune 500 Financial Document Analyzer")
    st.write("Upload a financial document, ask questions, and get detailed analysis!")

    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])
    if uploaded_file is None:
        return

    document_text = _read_uploaded_text(uploaded_file)
    write_to_file(uploaded_file.getvalue())

    st.write("Analyzing financial document...")

    # Extract financial data
    financial_data = extract_financial_data(document_text)

    # Add a provision for user query input
    query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
    if query:
        summary = generate_summary(document_text, query)
        st.write("## Financial Analysis Result")
        st.write(summary)

        st.write("## Relevant Document Sections")
        similar_sections = search_similar_sections(document_text, query)
        for i, section in enumerate(similar_sections, 1):
            st.write(f"### Section {i}")
            st.write(section)

    # Display revenue comparison graph
    # NOTE(review): the chart is rendered whether or not a query was entered;
    # confirm this matches the intended nesting.
    if financial_data["Revenue"] and financial_data["Date"]:
        st.write("## Revenue Comparison")
        generate_comparison_graph(financial_data)
    else:
        st.write("No revenue data found for comparison.")
# Run the Streamlit app only when this file is executed as a script.
if __name__ == "__main__":
    main()