Shreyas094 committed
Commit b432dd9 · verified · 1 Parent(s): 781f919

Update app.py

Files changed (1)
  1. app.py +68 -214
app.py CHANGED
@@ -1,218 +1,72 @@
- import requests
- from bs4 import BeautifulSoup
- import gradio as gr
- from huggingface_hub import InferenceClient
- import random
- import urllib.parse
- from datetime import datetime, timedelta
- import re
  import os
- import PyPDF2
- # List of user agents to rotate through
- _useragent_list = [
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
-     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
- ]
- API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
- headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
- def query_llama(payload):
-     """Send a query to the Llama model via Hugging Face API"""
-     try:
-         print(f"Payload: {payload}")  # Debug: Print payload
-         response = requests.post(API_URL, headers=headers, json=payload)
-         response.raise_for_status()
-         return response.json()
-     except requests.exceptions.RequestException as e:
-         print(f"Error querying Llama model: {e}")
-         return None
- def google_search(term, num_results=1, lang="en", timeout=30, safe="active", ssl_verify=None, days_back=90):
-     """Perform a Google search and return results"""
-     print(f"Searching for term: {term}")
-     # Calculate the date range
-     end_date = datetime.now()
-     start_date = end_date - timedelta(days=days_back)
-     # Format dates as strings
-     start_date_str = start_date.strftime("%Y-%m-%d")
-     end_date_str = end_date.strftime("%Y-%m-%d")
-     # Add the date range to the search term
-     search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
-     escaped_term = urllib.parse.quote_plus(search_term)
-     start = 0
-     all_results = []
-     max_attempts = num_results * 2  # Allow for some failed attempts
-     with requests.Session() as session:
-         attempts = 0
-         while len(all_results) < num_results and attempts < max_attempts:
-             try:
-                 # Choose a random user agent
-                 user_agent = random.choice(_useragent_list)
-                 headers = {'User-Agent': user_agent}
-                 resp = session.get(
-                     url="https://www.google.com/search",
-                     headers=headers,
-                     params={
-                         "q": search_term,
-                         "num": num_results - len(all_results),
-                         "hl": lang,
-                         "start": start,
-                         "safe": safe,
-                     },
-                     timeout=timeout,
-                     verify=ssl_verify,
-                 )
-                 resp.raise_for_status()
-                 soup = BeautifulSoup(resp.text, "html.parser")
-                 result_block = soup.find_all("div", attrs={"class": "g"})
-                 if not result_block:
-                     print("No more results found.")
-                     break
-                 for result in result_block:
-                     if len(all_results) >= num_results:
-                         break
-                     link = result.find("a", href=True)
-                     if link:
-                         link = link["href"]
-                         print(f"Found link: {link}")
-                         try:
-                             webpage = session.get(link, headers=headers, timeout=timeout)
-                             webpage.raise_for_status()
-                             visible_text = extract_text_from_webpage(webpage.text)
-                             all_results.append({"link": link, "text": visible_text})
-                         except requests.exceptions.HTTPError as e:
-                             if e.response.status_code == 403:
-                                 print(f"403 Forbidden error for {link}, skipping...")
-                             else:
-                                 print(f"HTTP error {e.response.status_code} for {link}, skipping...")
-                         except requests.exceptions.RequestException as e:
-                             print(f"Error fetching or processing {link}: {e}")
-                     else:
-                         print("No link found in result.")
-                 start += len(result_block)
-                 attempts += 1
-             except requests.exceptions.RequestException as e:
-                 print(f"Error fetching search results: {e}")
-                 attempts += 1
-     print(f"Total results fetched: {len(all_results)}")
-     return all_results
- def extract_text_from_webpage(html_content):
-     """Extract visible text from HTML content"""
-     soup = BeautifulSoup(html_content, 'html.parser')
-     # Remove script and style elements
-     for script in soup(["script", "style"]):
-         script.decompose()
-     # Get text
-     text = soup.get_text()
-     # Break into lines and remove leading and trailing space on each
-     lines = (line.strip() for line in text.splitlines())
-     # Break multi-headlines into a line each
-     chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-     # Drop blank lines
-     text = '\n'.join(chunk for chunk in chunks if chunk)
-     return text
- def filter_relevant_content(text):
-     """Filter out irrelevant content"""
-     # List of keywords related to financial reports
-     keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
-     # Split the text into sentences
-     sentences = re.split(r'(?<=[.!?])\s+', text)
-     # Filter sentences containing at least one keyword
-     relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
-     # Join the relevant sentences back into a single string
-     filtered_text = ' '.join(relevant_sentences)
-     return filtered_text
- def chunk_text(text, max_chunk_size=1000, overlap=100):
-     # List of keywords that might indicate new sections
-     section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook", "forecast", "quarter", "year"]
-     # Split text into sentences
-     sentences = re.split(r'(?<=[.!?])\s+', text)
-     chunks = []
-     current_chunk = ""
-     for sentence in sentences:
-         if len(current_chunk) + len(sentence) > max_chunk_size:
-             # If adding this sentence exceeds max_chunk_size, start a new chunk
-             chunks.append(current_chunk.strip())
-             current_chunk = sentence + " "
-         elif any(keyword in sentence.lower() for keyword in section_keywords):
-             # If sentence contains a section keyword, start a new chunk
-             if current_chunk:
-                 chunks.append(current_chunk.strip())
-             current_chunk = sentence + " "
-         else:
-             current_chunk += sentence + " "
-     # Add the last chunk if it's not empty
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-     # Add overlap
-     overlapped_chunks = []
-     for i, chunk in enumerate(chunks):
-         if i > 0:
-             chunk = chunks[i-1][-overlap:] + chunk
-         if i < len(chunks) - 1:
-             chunk = chunk + chunks[i+1][:overlap]
-         overlapped_chunks.append(chunk)
-     return overlapped_chunks
- def summarize_text(text, context_instructions):
-     chunks = chunk_text(text, max_chunk_size=3000, overlap=200)
-     summaries = []
-     for chunk in chunks:
-         prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
- {chunk}
- {context_instructions}"""
-         summary = query_llama({"inputs": prompt, "parameters": {"max_length": 1000}})
-         if summary and isinstance(summary, list) and 'generated_text' in summary[0]:
-             summaries.append(summary[0]['generated_text'])
-     # Combine summaries
-     combined_summary = "\n\n".join(summaries)
-     # Final summarization of combined summaries
-     final_prompt = f"""As a financial analyst, provide a coherent and comprehensive summary of the following financial information:
- {combined_summary}
- Focus on the most important financial implications and analysis."""
-     final_summary = query_llama({"inputs": final_prompt, "parameters": {"max_length": 3000}})
-     if final_summary and isinstance(final_summary, list) and 'generated_text' in final_summary[0]:
-         return final_summary[0]['generated_text']
-     else:
-         return "Unable to generate summary due to an error."
- def summarize_financial_news(query, read_pdf=False, pdf=None):
- """Search for financial news, extract relevant content
181
- , and summarize"""
-     all_filtered_text = ""
-     if read_pdf and pdf is not None:
-         pdf_text = extract_text_from_pdf(pdf)
-         all_filtered_text += pdf_text + "\n\n"
-     else:
-         search_results = google_search(query, num_results=1)
-         for result in search_results:
-             if result['text']:
-                 filtered_text = filter_relevant_content(result['text'])
-                 all_filtered_text += filtered_text + "\n\n"
-     if not all_filtered_text:
-         return "No relevant financial information found."
-     context_instructions = "Provide a detailed, coherent summary focusing on financial implications and analysis."
-     return summarize_text(all_filtered_text, context_instructions)
- def extract_text_from_pdf(pdf):
-     """Extract text from each page of the PDF"""
-     reader = PyPDF2.PdfFileReader(pdf)
-     text = ""
-     for page_num in range(reader.getNumPages()):
-         page = reader.getPage(page_num)
-         text += page.extract_text() + "\n"
-     return text
- # Gradio Interface
- def interface_function(query, read_pdf, pdf):
-     return summarize_financial_news(query, read_pdf, pdf)
  iface = gr.Interface(
-     fn=interface_function,
-     inputs=[
-         gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
-         gr.Checkbox(label="Read PDF"),
-         gr.File(label="Upload PDF", type="file")
-     ],
-     outputs="text",
-     title="Financial News Summarizer",
-     description="Enter a company name or financial topic to get a summary of recent financial news. Optionally, upload a PDF to summarize its content."
  )
 
  iface.launch()
  import os
+ import gradio as gr
+ from PyPDF2 import PdfReader
+ import requests
+ from dotenv import load_dotenv
+
+ # Load environment variables
+ load_dotenv()
+
+ # Get the Hugging Face API token
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+ def summarize_text(text, instructions):
+     API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
+     headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
+
+     payload = {
+         "inputs": f"{instructions}\n\nText to summarize:\n{text}",
+         "parameters": {"max_length": 500}
+     }
+
+     response = requests.post(API_URL, headers=headers, json=payload)
+     return response.json()[0]["generated_text"]
+
+ def process_pdf(pdf_file, chunk_instructions, final_instructions):
+     # Read PDF
+     reader = PdfReader(pdf_file)
+     text = ""
+     for page in reader.pages:
+         text += page.extract_text() + "\n\n"
+
+     # Chunk the text (simple splitting by pages for this example)
+     chunks = text.split("\n\n")
+
+     # Agent 1: Summarize each chunk
+     agent1_summaries = []
+     for chunk in chunks:
+         summary = summarize_text(chunk, chunk_instructions)
+         agent1_summaries.append(summary)
+
+     # Concatenate Agent 1 summaries
+     concatenated_summary = "\n\n".join(agent1_summaries)
+
+     # Agent 2: Final summarization
+     final_summary = summarize_text(concatenated_summary, final_instructions)
+
+     return final_summary
+
+ def pdf_summarizer(pdf_file, chunk_instructions, final_instructions):
+     if pdf_file is None:
+         return "Please upload a PDF file."
+
+     try:
+         summary = process_pdf(pdf_file.name, chunk_instructions, final_instructions)
+         return summary
+     except Exception as e:
+         return f"An error occurred: {str(e)}"
+
+ # Gradio interface
  iface = gr.Interface(
+     fn=pdf_summarizer,
+     inputs=[
+         gr.File(label="Upload PDF"),
+         gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
+         gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
+     ],
+     outputs=gr.Textbox(label="Summary"),
+     title="PDF Earnings Summary Generator",
+     description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
  )
+
  iface.launch()
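
For reference, the new two-stage flow can be exercised outside the Gradio UI by calling `process_pdf` directly. A minimal sketch, assuming the functions above are in scope (importing app.py as-is would also trigger `iface.launch()` at module level), a local `earnings.pdf`, and a valid `HUGGINGFACE_TOKEN` in the environment:

```python
# Illustrative only: drives the two-stage summarization without the UI.
# "earnings.pdf" is a placeholder path; process_pdf comes from app.py above.
summary = process_pdf(
    "earnings.pdf",
    chunk_instructions="Summarize the key financial figures in this section.",
    final_instructions="Combine the section summaries into one concise earnings overview.",
)
print(summary)
```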
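One caveat with the new `summarize_text`: the serverless Inference API can return a JSON object with an `error` field (for example, while the model is still loading) instead of a list, in which case `response.json()[0]` raises. A defensive variant is sketched below; `summarize_text_safe` is a hypothetical helper, not part of this commit:

```python
import os
import requests

def summarize_text_safe(text, instructions, timeout=60):
    """Hypothetical variant of summarize_text with basic error handling."""
    api_url = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
    headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}
    payload = {
        "inputs": f"{instructions}\n\nText to summarize:\n{text}",
        "parameters": {"max_length": 500},
    }
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()  # surface HTTP-level failures
    data = response.json()
    # While the model is loading, the API responds with {"error": ..., "estimated_time": ...}.
    if isinstance(data, dict) and "error" in data:
        raise RuntimeError(f"Inference API error: {data['error']}")
    return data[0]["generated_text"]
```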