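"""Financial News Summarizer.

Searches Google for recent financial news on a company or topic, extracts and
filters the relevant text, and summarizes it with Meta-Llama-3-8B-Instruct via
the Hugging Face Inference API. A Gradio interface exposes the pipeline as a
simple web app.
"""
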
import requests
from bs4 import BeautifulSoup
import gradio as gr
import random
import urllib.parse
from datetime import datetime, timedelta
import re
import os

# List of user agents to rotate through
_useragent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36",
]

API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
headers = {"Authorization": f"Bearer {os.getenv('HUGGINGFACE_TOKEN')}"}

def query_llama(payload):
    """Send a query to the Llama model via Hugging Face API"""
    try:
        print(f"Payload: {payload}")  # Debug: Print payload
        response = requests.post(API_URL, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error querying Llama model: {e}")
        return None
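
# Expected response shape, as assumed by the callers below:
# a list like [{"generated_text": "..."}] on success, or None on request failure.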

def google_search(term, num_results=1, lang="en", timeout=5, safe="active", ssl_verify=None, days_back=90):
    """Perform a Google search and return results"""
    print(f"Searching for term: {term}")
    
    # Calculate the date range
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    
    # Format dates as strings
    start_date_str = start_date.strftime("%Y-%m-%d")
    end_date_str = end_date.strftime("%Y-%m-%d")
    
    # Add the date range to the search term
    search_term = f"{term} financial earnings report after:{start_date_str} before:{end_date_str}"
    
    escaped_term = urllib.parse.quote_plus(search_term)
    start = 0
    all_results = []
    max_attempts = num_results * 2  # Allow for some failed attempts

    with requests.Session() as session:
        attempts = 0
        while len(all_results) < num_results and attempts < max_attempts:
            try:
                # Choose a random user agent (named request_headers to avoid
                # shadowing the module-level Hugging Face API headers)
                user_agent = random.choice(_useragent_list)
                request_headers = {"User-Agent": user_agent}

                resp = session.get(
                    url="https://www.google.com/search",
                    headers=request_headers,
                    params={
                        "q": search_term,
                        "num": num_results - len(all_results),
                        "hl": lang,
                        "start": start,
                        "safe": safe,
                    },
                    timeout=timeout,
                    verify=ssl_verify,
                )
                resp.raise_for_status()

                soup = BeautifulSoup(resp.text, "html.parser")
                # "g" is the CSS class Google currently uses for organic result
                # blocks; this selector may break if Google changes its markup
                result_block = soup.find_all("div", attrs={"class": "g"})
                
                if not result_block:
                    print("No more results found.")
                    break

                for result in result_block:
                    if len(all_results) >= num_results:
                        break
                    link = result.find("a", href=True)
                    if link:
                        link = link["href"]
                        print(f"Found link: {link}")
                        try:
                            webpage = session.get(link, headers=request_headers, timeout=timeout)
                            webpage.raise_for_status()
                            visible_text = extract_text_from_webpage(webpage.text)
                            all_results.append({"link": link, "text": visible_text})
                        except requests.exceptions.HTTPError as e:
                            if e.response.status_code == 403:
                                print(f"403 Forbidden error for {link}, skipping...")
                            else:
                                print(f"HTTP error {e.response.status_code} for {link}, skipping...")
                        except requests.exceptions.RequestException as e:
                            print(f"Error fetching or processing {link}: {e}")
                    else:
                        print("No link found in result.")
                
                start += len(result_block)
                attempts += 1
            except requests.exceptions.RequestException as e:
                print(f"Error fetching search results: {e}")
                attempts += 1

    print(f"Total results fetched: {len(all_results)}")
    return all_results
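
# Illustrative usage (hypothetical values; results depend on live Google markup):
#   results = google_search("NVIDIA", num_results=2, days_back=30)
#   for r in results:
#       print(r["link"], len(r["text"]))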

def extract_text_from_webpage(html_content):
    """Extract visible text from HTML content"""
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    
    # Get text
    text = soup.get_text()
    
    # Break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    
    # Split lines that contain multiple phrases (separated by double spaces)
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    
    # Drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    
    return text

def filter_relevant_content(text):
    """Filter out irrelevant content"""
    # List of keywords related to financial reports
    keywords = ['revenue', 'profit', 'earnings', 'financial', 'quarter', 'fiscal', 'growth', 'income', 'loss', 'dividend']
    
    # Split the text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    
    # Filter sentences containing at least one keyword
    relevant_sentences = [sentence for sentence in sentences if any(keyword in sentence.lower() for keyword in keywords)]
    
    # Join the relevant sentences back into a single string
    filtered_text = ' '.join(relevant_sentences)
    
    return filtered_text
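
# Example: filter_relevant_content("The CEO gave a speech. Revenue grew 12%.")
# keeps only the second sentence, because it contains the keyword "revenue".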

def chunk_text(text, max_chunk_size=1000, overlap=100):
    # List of keywords that might indicate new sections
    section_keywords = ["revenue", "income", "profit", "loss", "expenses", "outlook", "forecast", "quarter", "year"]
    
    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) > max_chunk_size:
            # Adding this sentence would exceed max_chunk_size, so close the
            # current chunk (if any) and start a new one
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
        elif any(keyword in sentence.lower() for keyword in section_keywords):
            # If sentence contains a section keyword, start a new chunk
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
        else:
            current_chunk += sentence + " "
    
    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    # Add overlap
    overlapped_chunks = []
    for i, chunk in enumerate(chunks):
        if i > 0:
            chunk = chunks[i-1][-overlap:] + chunk
        if i < len(chunks) - 1:
            chunk = chunk + chunks[i+1][:overlap]
        overlapped_chunks.append(chunk)
    
    return overlapped_chunks
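
# Illustrative behaviour (hypothetical input): each returned chunk contains whole
# sentences up to roughly max_chunk_size characters, and is padded with up to
# `overlap` characters from its neighbouring chunks so boundary sentences keep context.
#   chunks = chunk_text(report_text, max_chunk_size=1000, overlap=100)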

def summarize_financial_news(query):
    """Search for financial news, extract relevant content, and summarize"""
    search_results = google_search(query, num_results=3)
    
    all_filtered_text = ""
    for result in search_results:
        if result['text']:
            filtered_text = filter_relevant_content(result['text'])
            all_filtered_text += filtered_text + "\n\n"
    
    if not all_filtered_text:
        return "No relevant financial information found."
    
    # Chunk the filtered text
    chunks = chunk_text(all_filtered_text, max_chunk_size=3000, overlap=200)
    
    summaries = []
    for chunk in chunks:
        prompt = f"""You are a financial analyst. Summarize the following text from a financial perspective:
{chunk}
Provide a detailed, coherent summary focusing on financial implications and analysis."""

        summary = query_llama({"inputs": prompt, "parameters": {"max_length": 1000}})
        if summary and isinstance(summary, list) and 'generated_text' in summary[0]:
            summaries.append(summary[0]['generated_text'])
    
    # Combine summaries
    combined_summary = "\n\n".join(summaries)
    
    # Final summarization of combined summaries
    final_prompt = f"""As a financial analyst, provide a coherent and comprehensive summary of the following financial information:
{combined_summary}
Focus on the most important financial implications and analysis."""

    final_summary = query_llama({"inputs": final_prompt, "parameters": {"max_length": 3000}})
    
    if final_summary and isinstance(final_summary, list) and 'generated_text' in final_summary[0]:
        return final_summary[0]['generated_text']
    else:
        return "Unable to generate summary due to an error."

# Gradio Interface
iface = gr.Interface(
    fn=summarize_financial_news,
    inputs=gr.Textbox(lines=2, placeholder="Enter a company name or financial topic..."),
    outputs="text",
    title="Financial News Summarizer",
    description="Enter a company name or financial topic to get a summary of recent financial news."
)

iface.launch()