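"""News sentiment scanner.

Scrapes Google News results for a given company, extracts article text
with a three-tier fallback (newspaper3k, then requests + BeautifulSoup,
then headless Selenium), filters sentences that mention the company via
spaCy named-entity recognition, and scores sentiment by averaging VADER
with a RoBERTa transformer model.
"""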
import requests
import re
import spacy
import nltk
from bs4 import BeautifulSoup
from newspaper import Article
from transformers import pipeline
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from nltk.sentiment import SentimentIntensityAnalyzer
import time
import subprocess
import sys

# Download NLTK resources (no-op if already present)
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# Load spaCy Named Entity Recognition model
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading 'en_core_web_sm' model...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

# Load transformer sentiment classifier (a RoBERTa model fine-tuned for English sentiment)
bert_sentiment = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

def get_valid_news_urls(company_name):
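    """Scrape the Google News tab for links mentioning the company.

    Returns up to 10 non-Google URLs. Google may throttle or block
    scripted requests, in which case an empty list is returned.
    """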
    search_url = f'https://www.google.com/search?q={company_name}+news&tbm=nws'
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(search_url, headers=headers, timeout=10)  # timeout prevents indefinite hangs
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"⚠️ Google News request failed: {e}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    links = set()
    for g in soup.find_all('a', href=True):
        url_match = re.search(r'(https?://\S+)', g['href'])
        if url_match:
            url = url_match.group(1).split('&')[0]
            if "google.com" not in url:  # Ignore Google-related URLs
                links.add(url)
    
    return list(links)[:10]  # Limit to top 10 results

def extract_article_content(url):
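    """Extract article text, trying progressively heavier methods.

    Order: newspaper3k, then plain requests + BeautifulSoup, then a
    headless Chrome session via Selenium for JavaScript-rendered pages.
    Returns None if all three fail.
    """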
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text.strip():
            return article.text
    except Exception as e:
        print(f"⚠️ Newspaper3k failed: {e}")
    
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = '\n'.join(p.text for p in paragraphs if p.text)
        if text.strip():
            return text
    except Exception as e:
        print(f"⚠️ BeautifulSoup failed: {e}")
    
    try:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        try:
            driver.get(url)
            time.sleep(3)  # Allow time for JavaScript to render content
            page_content = driver.page_source
        finally:
            driver.quit()  # Always release the browser, even if rendering fails
        
        soup = BeautifulSoup(page_content, 'html.parser')
        paragraphs = soup.find_all('p')
        text = '\n'.join(p.text for p in paragraphs if p.text)
        if text.strip():
            return text
    except Exception as e:
        print(f"⚠️ Selenium failed: {e}")
    
    return None

def filter_relevant_sentences(text, company_name):
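    """Keep only sentences whose named entities mention the company.

    Falls back to the full text when no sentence matches, so downstream
    sentiment analysis always has input.
    """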
    doc = nlp(text)
    relevant_sentences = []

    # Use spaCy's sentence segmentation rather than a naive '. ' split,
    # and reuse the already-parsed doc instead of re-running nlp per sentence.
    for sent in doc.sents:
        for ent in sent.ents:
            if company_name.lower() in ent.text.lower():
                relevant_sentences.append(sent.text.strip())
                break

    # Fall back to the full text so sentiment analysis always has input
    return ' '.join(relevant_sentences) if relevant_sentences else text

def analyze_sentiment(text):
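    """Score sentiment by averaging VADER and transformer results.

    VADER's compound score (-1..1) is averaged with the transformer's
    label mapped to a signed confidence. Averages above 0.2 are Positive,
    below -0.2 Negative, otherwise Neutral.
    """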
    if not text.strip():
        return "Neutral", 0.0  
    
    vader_scores = sia.polarity_scores(text)
    vader_compound = vader_scores['compound']
    
    try:
        bert_result = bert_sentiment(text[:512])[0]  # Truncate to 512 characters; the model itself caps input at 512 tokens
        bert_label = bert_result['label']
        bert_score = bert_result['score']
        bert_value = bert_score if bert_label == "POSITIVE" else -bert_score
    except Exception as e:
        print(f"⚠️ BERT sentiment analysis failed: {e}")
        bert_value = 0.0  
    
    final_sentiment = (vader_compound + bert_value) / 2
    
    if final_sentiment > 0.2:
        return "Positive", final_sentiment
    elif final_sentiment < -0.2:
        return "Negative", final_sentiment
    else:
        return "Neutral", final_sentiment

def main():
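    """Prompt for a company name, then fetch, filter, and score each article."""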
    company_name = input("Enter company name: ").strip()
    print(f"\n🔎 Searching news for: {company_name}\n")
    urls = get_valid_news_urls(company_name)
    
    if not urls:
        print("❌ No valid news URLs found.")
        return
    
    # URLs are already unique: get_valid_news_urls collects them into a set
    for i, url in enumerate(urls, 1):
        print(f"\n🔗 Article {i}: {url}\n")
        content = extract_article_content(url)
        
        if content:
            filtered_text = filter_relevant_sentences(content, company_name)
            sentiment, score = analyze_sentiment(filtered_text)
            
            print(f"📰 Extracted Content:\n{filtered_text[:500]}...")  
            print(f"📊 Sentiment: {sentiment} (Score: {score:.2f})")
        else:
            print("⚠️ Failed to extract content....")

if __name__ == "__main__":
    main()