# Fortune 500 financial document analyzer — Streamlit app (Hugging Face Spaces).
| import os | |
| import streamlit as st | |
| import PyPDF2 | |
| import matplotlib.pyplot as plt | |
| from io import BytesIO | |
| from llama_index.embeddings import HuggingFaceEmbedding | |
| from llama_index.schema import Document | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| import dotenv | |
| import re | |
| import requests | |
# Load environment variables from a local .env file
# (expects HUGGINGFACE_API_KEY to be defined there or in the environment).
dotenv.load_dotenv()

# Configure Hugging Face API
# NOTE(review): if HUGGINGFACE_API_KEY is unset, os.getenv returns None and the
# header becomes "Bearer None" — requests will then fail with an auth error.
API_URL = "https://api-inference.huggingface.co/models/sarvamai/sarvam-2b-v0.5"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_API_KEY')}"}

# Configure embedding model (used for semantic section search below).
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
def query_huggingface_api(payload):
    """POST *payload* to the Hugging Face inference API and return the parsed JSON.

    Args:
        payload: JSON-serializable request body, e.g. ``{"inputs": prompt}``.

    Returns:
        The decoded JSON response (a list on success, a dict on API error —
        callers are expected to check the shape).
    """
    # Without a timeout, a stalled inference endpoint would hang the
    # Streamlit app forever; 60s is generous for a hosted LLM call.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
    return response.json()
def write_to_file(content, filename="./files/test.pdf"):
    """Persist raw *content* bytes to *filename*, creating parent dirs as needed.

    Args:
        content: bytes to write.
        filename: destination path; defaults to ./files/test.pdf.
    """
    parent = os.path.dirname(filename)
    os.makedirs(parent, exist_ok=True)
    with open(filename, "wb") as out:
        out.write(content)
def extract_financial_data(document_text):
    """Heuristically pull revenue figures and period labels out of raw text.

    Each line is checked for revenue-related keywords; when one is found, the
    following five lines are scanned for numeric amounts. Independently, any
    line mentioning a quarter (Q1-Q4) or a fiscal year ("FY 2023") is kept as
    a period label. Both series are truncated to equal length so they can be
    plotted against each other.

    Args:
        document_text: full plain-text contents of the document.

    Returns:
        dict with keys "Revenue" (list[float]) and "Date" (list[str]).
    """
    amounts = []
    periods = []
    lines = document_text.split("\n")
    number_re = re.compile(r'\$?\d+(?:,\d{3})*(?:\.\d+)?')
    keywords = ("revenue", "total revenue", "sales")
    for idx, line in enumerate(lines):
        lowered = line.lower()
        if any(kw in lowered for kw in keywords):
            # Look ahead up to five lines for dollar/number tokens.
            for follow in lines[idx + 1:idx + 6]:
                for token in number_re.findall(follow):
                    try:
                        amounts.append(float(token.replace("$", "").replace(",", "")))
                    except ValueError:
                        continue
        if any(q in line for q in ("Q1", "Q2", "Q3", "Q4")) or re.search(r'FY\s*\d{4}', line):
            periods.append(line.strip())
    # Keep the two series the same length for plotting.
    paired = min(len(amounts), len(periods))
    return {"Revenue": amounts[:paired], "Date": periods[:paired]}
def generate_summary(document_text, query):
    """Generate an LLM-written financial analysis of the document for *query*.

    Builds an analyst prompt around the full document text, sends it to the
    Hugging Face inference API, and returns the generated text. Falls back to
    a fixed message when the response is empty or not the expected list shape.
    """
    prompt = f"""
    You are a financial analyst. Your task is to provide a comprehensive analysis of the financial document.
    Analyze the following document and respond to the query:
    {document_text}
    Query: {query}
    If the query is too general, respond with:
    Please cover the following aspects:
    1. Revenue and profit trends
    2. Key financial metrics
    3. Major financial events and decisions
    4. Comparison with previous periods
    5. Future outlook or forecasts
    6. Any notable financial risks or opportunities
    Provide a clear, concise, and professional response.
    """
    result = query_huggingface_api({"inputs": prompt})
    if result and isinstance(result, list):
        return result[0]["generated_text"]
    return "No response from model."
def generate_comparison_graph(data):
    """Render a revenue-over-time line chart into the Streamlit page.

    Args:
        data: dict with parallel "Date" and "Revenue" lists (the shape
              produced by extract_financial_data). A message is shown and
              nothing is drawn when either list is empty.
    """
    if not (data["Date"] and data["Revenue"]):
        st.write("Insufficient data for generating the revenue comparison graph.")
        return
    figure, axis = plt.subplots(figsize=(10, 6))
    axis.plot(data["Date"], data["Revenue"],
              marker="o", linestyle="-", color="b", label="Revenue")
    axis.set_title("Revenue Comparison")
    axis.set_xlabel("Date")
    axis.set_ylabel("Revenue (in millions)")
    axis.grid(True)
    axis.legend()
    # Angle the date labels so long period strings stay readable.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    st.pyplot(figure)
def search_similar_sections(document_text, query, top_k=3):
    """Return the *top_k* document sections most semantically similar to *query*.

    Sections are blank-line-separated chunks of the text. The query and every
    section are embedded with the module-level HuggingFace model, and cosine
    similarity ranks the sections best-first.

    Args:
        document_text: full plain-text contents of the document.
        query: free-text search query.
        top_k: number of sections to return (default 3).

    Returns:
        list[str] of the highest-scoring sections, most similar first.
    """
    sections = document_text.split('\n\n')
    docs = [Document(text=chunk) for chunk in sections]
    query_vec = embed_model.get_text_embedding(query)
    section_vecs = [embed_model.get_text_embedding(d.text) for d in docs]
    scores = cosine_similarity([query_vec], section_vecs)[0]
    # argsort is ascending; take the last top_k indices and reverse
    # so the best match comes first.
    best = np.argsort(scores)[-top_k:][::-1]
    return [sections[i] for i in best]
# Streamlit app
def main():
    """Streamlit entry point: upload a document, query it, plot revenue.

    NOTE(review): source formatting was mangled; the nesting of the revenue
    graph section (inside the upload branch, sibling of the query branch)
    is reconstructed — confirm against the original layout.
    """
    st.title("Fortune 500 Financial Document Analyzer")
    st.write("Upload a financial document, ask questions, and get detailed analysis!")
    uploaded_file = st.file_uploader("Choose a financial document file", type=["pdf", "txt"])
    if uploaded_file is None:
        return
    raw = uploaded_file.getvalue()
    # Extract plain text from either a PDF or a UTF-8 text upload.
    if uploaded_file.type == "application/pdf":
        reader = PyPDF2.PdfReader(BytesIO(raw))
        document_text = "".join(page.extract_text() for page in reader.pages)
    else:
        document_text = raw.decode("utf-8")
    write_to_file(raw)
    st.write("Analyzing financial document...")
    # Extract financial data
    financial_data = extract_financial_data(document_text)
    # Add a provision for user query input
    query = st.text_input("Enter your financial analysis query (e.g., 'What are the revenue trends?')", "")
    if query:
        st.write("## Financial Analysis Result")
        st.write(generate_summary(document_text, query))
        st.write("## Relevant Document Sections")
        for idx, chunk in enumerate(search_similar_sections(document_text, query), 1):
            st.write(f"### Section {idx}")
            st.write(chunk)
    # Display revenue comparison graph
    if financial_data["Revenue"] and financial_data["Date"]:
        st.write("## Revenue Comparison")
        generate_comparison_graph(financial_data)
    else:
        st.write("No revenue data found for comparison.")


if __name__ == "__main__":
    main()