Sentinel-AI-Beta-Test

Sleeping

App Files Files Community

Shreyas094 committed on Jul 1, 2024

Commit

b432dd9

verified ·

1 Parent(s): 781f919

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -214

app.py CHANGED Viewed

@@ -1,218 +1,72 @@
-import requests
-from bs4 import BeautifulSoup
-import gradio as gr
-from huggingface_hub import InferenceClient
-import random
-import urllib.parse
-from datetime import datetime, timedelta
-import re
 import os
-import PyPDF2
-# List of user agents to rotate through
-_useragent_list = [
-   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
-   "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
-]
-API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
-headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
-def query_llama(payload):
-   """Send a query to the Llama model via Hugging Face API"""
-   try:
-       print(f"Payload: {payload}")  # Debug: Print payload
-       response = requests.post(API_URL, headers=headers, json=payload)
-       response.raise_for_status()
-       return response.json()
-   except requests.exceptions.RequestException as e:
-       print(f"Error querying Llama model: {e}")
-       return None
-def google_search(term, num_results=1, lang="en", timeout=30, safe="active", ssl_verify=None, days_back=90):
-   """Perform a Google search and return results"""
-   print(f"Searching for term: {term}")
-   # Calculate the date range
-   end_date = datetime.now()
-   start_date = end_date - timedelta(days=days_back)
-   # Format dates as strings
-   start_date_str = start_date.strftime("%Y-%m-%d")
-   end_date_str = end_date.strftime("%Y-%m-%d")
-   # Add the date range to the search term
-   search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
-   escaped_term = urllib.parse.quote_plus(search_term)
-   start = 0
-   all_results = []
-   max_attempts = num_results * 2  # Allow for some failed attempts
-   with requests.Session() as session:
-       attempts = 0
-       while len(all_results) < num_results and attempts < max_attempts:
-           try:
-               # Choose a random user agent
-               user_agent = random.choice(_useragent_list)
-               headers = {'User-Agent': user_agent}
-               resp = session.get(
-                   url="https://www.google.com/search",
-                   headers=headers,
-                   params={
-                       "q": search_term,
-                       "num": num_results - len(all_results),
-                       "hl": lang,
-                       "start": start,
-                       "safe": safe,
-                   },
-                   timeout=timeout,
-                   verify=ssl_verify,
-               )
-               resp.raise_for_status()
-               soup = BeautifulSoup(resp.text, "html.parser")
-               result_block = soup.find_all("div", attrs={"class": "g"})
-               if not result_block:
-                   print("No more results found.")
-                   break
-               for result in result_block:
-                   if len(all_results) >= num_results:
-                       break
-                   link = result.find("a", href=True)
-                   if link:
-                       link = link["href"]
-                       print(f"Found link: {link}")
-                       try:
-                           webpage = session.get(link, headers=headers, timeout=timeout)
-                           webpage.raise_for_status()
-                           visible_text = extract_text_from_webpage(webpage.text)
-                           all_results.append({"link": link, "text": visible_text})
-                       except requests.exceptions.HTTPError as e:
-                           if e.response.status_code == 403:
-                               print(f"403 Forbidden error for {link}, skipping...")
-                           else:
-                               print(f"HTTP error {e.response.status_code} for {link}, skipping...")
-                       except requests.exceptions.RequestException as e:
-                           print(f"Error fetching or processing {link}: {e}")
-                   else:
-                       print("No link found in result.")
-               start += len(result_block)
-               attempts += 1
-           except requests.exceptions.RequestException as e:
-               print(f"Error fetching search results: {e}")
-               attempts += 1
-   print(f"Total results fetched: {len(all_results)}")
-   return all_results
-def extract_text_from_webpage(html_content):
-   """Extract visible text from HTML content"""
-   soup = BeautifulSoup(html_content, 'html.parser')
-   # Remove script and style elements
-   for script in soup(["script", "style"]):
-       script.decompose()
-   # Get text
-   text = soup.get_text()
-   # Break into lines and remove leading and trailing space on each
-   lines = (line.strip() for line in text.splitlines())
-   # Break multi-headlines into a line each
-   chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-   # Drop blank lines
-   text = '\n'.join(chunk for chunk in chunks if chunk)
-   return text
-def filter_relevant_content(text):
-   """Filter out irrelevant content"""
-   # List of keywords related to financial reports
-   keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
-   # Split the text into sentences
-   sentences = re.split(r'(?<=[.!?])\s+', text)
-   # Filter sentences containing at least one keyword
-   relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
-   # Join the relevant sentences back into a single string
-   filtered_text = ' '.join(relevant_sentences)
-   return filtered_text
-def chunk_text(text, max_chunk_size=1000, overlap=100):
-   # List of keywords that might indicate new sections
-   section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook", "forecast", "quarter", "year"]
-   # Split text into sentences
-   sentences = re.split(r'(?<=[.!?])\s+', text)
-   chunks = []
-   current_chunk = ""
-   for sentence in sentences:
-       if len(current_chunk) + len(sentence) > max_chunk_size:
-           # If adding this sentence exceeds max_chunk_size, start a new chunk
-           chunks.append(current_chunk.strip())
-           current_chunk = sentence + " "
-       elif any(keyword in sentence.lower() for keyword in section_keywords):
-           # If sentence contains a section keyword, start a new chunk
-           if current_chunk:
-               chunks.append(current_chunk.strip())
-           current_chunk = sentence + " "
-       else:
-           current_chunk += sentence + " "
-   # Add the last chunk if it's not empty
-   if current_chunk:
-       chunks.append(current_chunk.strip())
-   # Add overlap
-   overlapped_chunks = []
-   for i, chunk in enumerate(chunks):
-       if i > 0:
-           chunk = chunks[i-1][-overlap:] + chunk
-       if i < len(chunks) - 1:
-           chunk = chunk + chunks[i+1][:overlap]
-       overlapped_chunks.append(chunk)
-   return overlapped_chunks
-def summarize_text(text, context_instructions):
-   chunks = chunk_text(text, max_chunk_size=3000, overlap=200)
-   summaries = []
-   for chunk in chunks:
-       prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
-{chunk}
-{context_instructions}"""
-       summary = query_llama({"inputs": prompt, "parameters": {"max_length": 1000}})
-       if summary and isinstance(summary, list) and 'generated_text' in summary[0]:
-           summaries.append(summary[0]['generated_text'])
-   # Combine summaries
-   combined_summary = "\n\n".join(summaries)
-   # Final summarization of combined summaries
-   final_prompt = f"""As a financial analyst, provide a coherent and comprehensive summary of the following financial information:
-{combined_summary}
-Focus on the most important financial implications and analysis."""
-   final_summary = query_llama({"inputs": final_prompt, "parameters": {"max_length": 3000}})
-   if final_summary and isinstance(final_summary, list) and 'generated_text' in final_summary[0]:
-       return final_summary[0]['generated_text']
-   else:
-       return "Unable to generate summary due to an error."
-def summarize_financial_news(query, read_pdf=False, pdf=None):
-   """Search for financial news, extract relevant content
-, and summarize"""
-   all_filtered_text = ""
-   if read_pdf and pdf is not None:
-       pdf_text = extract_text_from_pdf(pdf)
-       all_filtered_text += pdf_text + "\n\n"
-   else:
-       search_results = google_search(query, num_results=1)
-       for result in search_results:
-           if result['text']:
-               filtered_text = filter_relevant_content(result['text'])
-               all_filtered_text += filtered_text + "\n\n"
-   if not all_filtered_text:
-       return "No relevant financial information found."
-   context_instructions = "Provide a detailed, coherent summary focusing on financial implications and analysis."
-   return summarize_text(all_filtered_text, context_instructions)
-def extract_text_from_pdf(pdf):
-   """Extract text from each page of the PDF"""
-   reader = PyPDF2.PdfFileReader(pdf)
-   text = ""
-   for page_num in range(reader.getNumPages()):
-       page = reader.getPage(page_num)
-       text += page.extract_text() + "\n"
-   return text
-# Gradio Interface
-def interface_function(query, read_pdf, pdf):
-   return summarize_financial_news(query, read_pdf, pdf)
 iface = gr.Interface(
-   fn=interface_function,
-   inputs=[
-       gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
-       gr.Checkbox(label="Read PDF"),
-       gr.File(label="Upload PDF", type="file")
-   ],
-   outputs="text",
-   title="Financial News Summarizer",
-   description="Enter a company name or financial topic to get a summary of recent financial news. Optionally, upload a PDF to summarize its content."
 )
 iface.launch()

 import os
+import gradio as gr
+from PyPDF2 import PdfReader
+import requests
+from dotenv import load_dotenv
+# Load environment variables (python-dotenv; presumably from a local .env file - confirm)
+load_dotenv()
+# Hugging Face API token, read from the HUGGINGFACE_TOKEN environment variable.
+# os.getenv returns None when the variable is unset; summarize_text interpolates
+# this value into its "Authorization: Bearer ..." header.
+HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+def summarize_text(text, instructions):
+    """Summarize ``text`` with the hosted Llama-2 chat model.
+
+    Builds a single prompt from ``instructions`` followed by ``text``, posts
+    it to the Hugging Face Inference API and returns the generated string.
+
+    Args:
+        text: The passage to summarize.
+        instructions: Prompt text placed before the passage.
+
+    Returns:
+        The ``generated_text`` field of the first result in the response.
+
+    Raises:
+        requests.exceptions.RequestException: On network failure, timeout,
+            or a non-2xx HTTP status.
+        RuntimeError: If the response body is not the expected
+            ``[{"generated_text": ...}]`` list (for example an
+            ``{"error": ...}`` object).
+    """
+    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
+    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
+    payload = {
+        "inputs": f"{instructions}\n\nText to summarize:\n{text}",
+        "parameters": {
+            "max_length": 500,
+            # Text-generation endpoints return prompt + completion by default;
+            # ask for the completion only so the prompt is not fed back into
+            # the next summarization pass.
+            "return_full_text": False,
+        },
+    }
+    # requests never times out on its own; bound the wait so a stalled
+    # endpoint cannot hang the UI indefinitely.
+    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
+    # Surface HTTP errors (401, 429, 503 while the model loads, ...) directly
+    # instead of failing later while indexing the error payload.
+    response.raise_for_status()
+    result = response.json()
+    if (
+        not isinstance(result, list)
+        or not result
+        or not isinstance(result[0], dict)
+        or "generated_text" not in result[0]
+    ):
+        raise RuntimeError(f"Unexpected response from inference API: {result!r}")
+    return result[0]["generated_text"]
+def process_pdf(pdf_file, chunk_instructions, final_instructions):
+    """Summarize a PDF in two passes.
+
+    The text of every page is extracted and split on blank lines into
+    chunks. Each chunk is summarized with ``chunk_instructions`` (agent 1),
+    and the joined chunk summaries are summarized once more with
+    ``final_instructions`` (agent 2).
+
+    Args:
+        pdf_file: Whatever ``PdfReader`` accepts as its source (the caller in
+            this file passes a filesystem path).
+        chunk_instructions: Prompt used for each per-chunk summary.
+        final_instructions: Prompt used for the final summary.
+
+    Returns:
+        The final summary string returned by ``summarize_text``.
+
+    Raises:
+        ValueError: If no text could be extracted from the PDF.
+    """
+    # Read PDF
+    reader = PdfReader(pdf_file)
+    # `or ""` guards against a page that yields no text (None or empty).
+    text = "\n\n".join((page.extract_text() or "") for page in reader.pages)
+    # Chunk the text on blank lines (page boundaries plus any blank lines
+    # inside a page). Whitespace-only pieces are dropped so no API call is
+    # spent summarizing nothing.
+    chunks = [chunk for chunk in text.split("\n\n") if chunk.strip()]
+    if not chunks:
+        raise ValueError("No extractable text found in the PDF.")
+    # Agent 1: Summarize each chunk
+    agent1_summaries = [summarize_text(chunk, chunk_instructions) for chunk in chunks]
+    # Concatenate Agent 1 summaries
+    concatenated_summary = "\n\n".join(agent1_summaries)
+    # Agent 2: Final summarization
+    return summarize_text(concatenated_summary, final_instructions)
+def pdf_summarizer(pdf_file, chunk_instructions, final_instructions):
+    """Gradio callback: summarize the uploaded PDF and return display text.
+
+    Args:
+        pdf_file: The upload from the ``gr.File`` input. May be ``None`` (no
+            upload), a filesystem path, or a temp-file wrapper exposing the
+            path as ``.name``.
+        chunk_instructions: Prompt used for each per-chunk summary.
+        final_instructions: Prompt used for the final summary.
+
+    Returns:
+        The summary, or a human-readable message when nothing was uploaded
+        or processing failed. This function never raises, so the UI always
+        receives a string.
+    """
+    if pdf_file is None:
+        return "Please upload a PDF file."
+    try:
+        # Accept both upload shapes: a plain path, or an object with `.name`.
+        if isinstance(pdf_file, (str, os.PathLike)):
+            pdf_path = pdf_file
+        else:
+            pdf_path = pdf_file.name
+        return process_pdf(pdf_path, chunk_instructions, final_instructions)
+    except Exception as e:
+        # Top-level UI boundary: report the failure as text instead of crashing.
+        return f"An error occurred: {str(e)}"
+# Gradio interface: one PDF upload and two free-text prompts feed
+# pdf_summarizer; the string it returns is shown in the "Summary" textbox.
 iface = gr.Interface(
+    fn=pdf_summarizer,
+    # Inputs are listed in the same order as pdf_summarizer's parameters:
+    # (pdf_file, chunk_instructions, final_instructions).
+    inputs=[
+        gr.File(label="Upload PDF"),
+        gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
+        gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
+    ],
+    outputs=gr.Textbox(label="Summary"),
+    title="PDF Earnings Summary Generator",
+    description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
 )
 iface.launch()