# finanalyst/app.py — Streamlit financial-document analyzer (Hugging Face Space).
import os
import streamlit as st
import PyPDF2
import matplotlib.pyplot as plt
from io import BytesIO
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import dotenv
import re
import requests
# Load environment variables (expects HUGGINGFACE_API_KEY in a .env file or the shell env).
dotenv.load_dotenv()
# Configure Hugging Face API: hosted inference endpoint for the sarvam-2b text-generation model.
API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}
# Configure embedding model: small English BGE model used for section-similarity search.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
def query_huggingface_api(payload):
    """POST *payload* to the Hugging Face inference API and return the parsed JSON.

    Args:
        payload: JSON-serializable dict, e.g. ``{"inputs": prompt}``.

    Returns:
        The decoded JSON response (a list of generations on success, or an
        error dict from the API).
    """
    # Without a timeout a stalled endpoint hangs the Streamlit app forever;
    # 60 s leaves headroom for model cold starts.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    return response.json()
def write_to_file(content, filename="./files/test.pdf"):
    """Persist raw bytes to *filename*, creating parent directories as needed.

    Args:
        content: Bytes to write.
        filename: Destination path; parent directories are created if missing.
    """
    parent = os.path.dirname(filename)
    # Original called makedirs unconditionally, which raises FileNotFoundError
    # for a bare filename (dirname == ""); only create when there is a parent.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(content)
def extract_financial_data(document_text):
    """Heuristically pull revenue figures and period labels out of raw text.

    Each line containing a revenue-related keyword triggers a scan of the next
    five lines for dollar amounts; lines mentioning a quarter (Q1-Q4) or a
    fiscal year ("FY 2023") are collected as period labels. Both lists are
    truncated to equal length so they can be plotted against each other.

    Returns:
        dict with parallel lists under "Revenue" (floats) and "Date" (strings).
    """
    amount_re = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
    keywords = ("revenue", "total revenue", "sales")
    quarters = ("Q1", "Q2", "Q3", "Q4")
    revenues = []
    dates = []
    lines = document_text.split("\n")
    for idx, line in enumerate(lines):
        lowered = line.lower()
        if any(word in lowered for word in keywords):
            # Figures usually appear within a few lines of the keyword line.
            for follower in lines[idx + 1: idx + 6]:
                for token in amount_re.findall(follower):
                    try:
                        revenues.append(float(token.replace("$", "").replace(",", "")))
                    except ValueError:
                        continue
        if any(q in line for q in quarters) or re.search(r'FY\s*\d{4}', line):
            dates.append(line.strip())
    # Keep the two series aligned for downstream plotting.
    paired = min(len(revenues), len(dates))
    return {"Revenue": revenues[:paired], "Date": dates[:paired]}
def generate_summary(document_text, query):
    """Ask the hosted LLM to analyze *document_text* with respect to *query*.

    Args:
        document_text: Full extracted text of the financial document.
        query: User's analysis question.

    Returns:
        The model's generated text, or a fallback message when the API
        returns nothing usable (errors, empty list, missing key).
    """
    prompt = f"""
You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
Analyze the following document and respond to the query:
{document_text}
Query: {query}
If the query is too general, respond with:
Please cover the following aspects:
1. Revenue and profit trends
2. Key financial metrics
3. Major financial events and decisions
4. Comparison with previous periods
5. Future outlook or forecasts
6. Any notable financial risks or opportunities
Provide a clear, concise, and professional response.
"""
    response = query_huggingface_api({"inputs": prompt})
    # The API can return an error dict, an empty list, or list items without
    # "generated_text"; the original indexed blindly and could raise KeyError.
    if isinstance(response, list) and response:
        return response[0].get("generated_text", "No response from model.")
    return "No response from model."
def generate_comparison_graph(data):
    """Render a revenue-over-time line chart into the Streamlit page.

    Args:
        data: dict with parallel "Date" and "Revenue" lists (as produced by
            ``extract_financial_data``). Writes a notice and returns early
            when either list is empty.
    """
    dates = data["Date"]
    revenues = data["Revenue"]
    if not dates or not revenues:
        st.write("Insufficient data for generating the revenue comparison graph.")
        return
    figure, axes = plt.subplots(figsize=(10, 6))
    axes.plot(dates, revenues, marker="o", linestyle="-", color="b", label="Revenue")
    axes.set_title("Revenue Comparison")
    axes.set_xlabel("Date")
    axes.set_ylabel("Revenue (in millions)")
    axes.grid(True)
    axes.legend()
    # Period labels tend to be long; slant them so they stay readable.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    st.pyplot(figure)
def search_similar_sections(document_text, query, top_k=3):
    """Return the *top_k* document sections most relevant to *query*.

    The document is split on blank lines into sections; relevance is the
    cosine similarity between the query's embedding and each section's
    embedding (both from the module-level ``embed_model``).

    Returns:
        List of section strings, best match first.
    """
    sections = document_text.split('\n\n')
    docs = [Document(text=chunk) for chunk in sections]
    query_vec = embed_model.get_text_embedding(query)
    section_vecs = [embed_model.get_text_embedding(doc.text) for doc in docs]
    scores = cosine_similarity([query_vec], section_vecs)[0]
    # argsort is ascending: take the last top_k indices and reverse for
    # best-first ordering.
    ranked = np.argsort(scores)[-top_k:][::-1]
    return [sections[idx] for idx in ranked]
# Streamlit app
def main():
    """Streamlit entry point: upload a document, query it, and visualize revenue."""
    st.title("Fortune 500 Financial Document Analyzer")
    st.write("Upload a financial document, ask questions, and get detailed analysis!")
    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])
    if uploaded_file is None:
        return
    raw_bytes = uploaded_file.getvalue()
    if uploaded_file.type == "application/pdf":
        reader = PyPDF2.PdfReader(BytesIO(raw_bytes))
        document_text = "".join(page.extract_text() for page in reader.pages)
    else:
        document_text = raw_bytes.decode("utf-8")
    write_to_file(raw_bytes)
    st.write("Analyzing financial document...")
    # Heuristic extraction of revenue figures / period labels for the chart.
    financial_data = extract_financial_data(document_text)
    query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
    if query:
        summary = generate_summary(document_text, query)
        st.write("## Financial Analysis Result")
        st.write(summary)
        st.write("## Relevant Document Sections")
        matches = search_similar_sections(document_text, query)
        for number, section in enumerate(matches, 1):
            st.write(f"### Section {number}")
            st.write(section)
    # Revenue chart is shown whether or not a query was entered.
    if financial_data["Revenue"] and financial_data["Date"]:
        st.write("## Revenue Comparison")
        generate_comparison_graph(financial_data)
    else:
        st.write("No revenue data found for comparison.")
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()