# news-summarise-tts / sentiV_v2.py
# Shakespeared101 - fixed conflicts (6208c00)
import re
import subprocess
import sys
import time

import nltk
import requests
import spacy
from bs4 import BeautifulSoup
from newspaper import Article
from nltk.sentiment import SentimentIntensityAnalyzer
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from transformers import pipeline
from webdriver_manager.chrome import ChromeDriverManager
# Fetch the VADER lexicon and build the rule-based sentiment analyzer.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Load the spaCy English pipeline used for named-entity recognition,
# installing the model on first use if loading fails with OSError.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading 'en_core_web_sm' model...")
    # Run the download with the interpreter executing this script so the model
    # is installed into the same environment (a bare "python" may resolve to a
    # different install, or not exist at all), and fail loudly if it errors.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Load the transformer-based sentiment classifier.
bert_sentiment = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")
def get_valid_news_urls(company_name, max_results=10, timeout=10):
    """Search Google News for *company_name* and return external article URLs.

    Args:
        company_name: Free-text company name; it is URL-encoded by ``requests``
            so spaces and characters such as ``&`` do not corrupt the query.
        max_results: Maximum number of URLs to return (default 10).
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        A list of unique URLs in the order they appear on the results page,
        or an empty list when the request fails.
    """
    try:
        response = requests.get(
            'https://www.google.com/search',
            params={'q': f'{company_name} news', 'tbm': 'nws'},
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"⚠️ Google News request failed: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # A dict is used as an ordered set: it de-duplicates while keeping page
    # order, so the slice below really is the first results, not an arbitrary
    # subset as with an unordered set.
    links = {}
    for anchor in soup.find_all('a', href=True):
        url_match = re.search(r'(https?://\S+)', anchor['href'])
        if not url_match:
            continue
        # Drop anything after the first '&' (trailing query parameters that
        # follow the embedded target URL in the href).
        url = url_match.group(1).split('&')[0]
        if "google.com" not in url:  # Ignore Google-related URLs
            links.setdefault(url, None)

    return list(links)[:max_results]
def _paragraph_text(html):
    """Return the text of every non-empty ``<p>`` element in *html*, one per line."""
    soup = BeautifulSoup(html, 'html.parser')
    return '\n'.join(p.text for p in soup.find_all('p') if p.text)


def extract_article_content(url):
    """Extract the article text at *url*, trying three strategies in order.

    1. newspaper's ``Article`` download/parse.
    2. A plain HTTP GET followed by joining all ``<p>`` elements.
    3. A headless Chrome session (for JavaScript-rendered pages), again
       joining all ``<p>`` elements.

    Each strategy is best-effort: any exception is reported and the next
    strategy is tried.

    Returns:
        The first non-blank text obtained, or ``None`` if every strategy
        fails or yields only whitespace.
    """
    # Strategy 1: newspaper.
    try:
        article = Article(url)
        article.download()
        article.parse()
        if article.text.strip():
            return article.text
    except Exception as e:
        print(f"⚠️ Newspaper3k failed: {e}")

    # Strategy 2: raw HTML via requests. The timeout keeps an unresponsive
    # host from blocking the whole run.
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()
        text = _paragraph_text(response.text)
        if text.strip():
            return text
    except Exception as e:
        print(f"⚠️ BeautifulSoup failed: {e}")

    # Strategy 3: render the page in headless Chrome.
    try:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        try:
            driver.get(url)
            time.sleep(3)  # Allow time for JavaScript to load content
            page_content = driver.page_source
        finally:
            # Always shut the browser down, even if loading the page failed;
            # otherwise each failure leaves a Chrome process behind.
            driver.quit()
        text = _paragraph_text(page_content)
        if text.strip():
            return text
    except Exception as e:
        print(f"⚠️ Selenium failed: {e}")

    return None
def filter_relevant_sentences(text, company_name):
    """Keep only the sentences of *text* that mention *company_name* as an entity.

    The text is split on the literal separator ``'. '``; each piece is run
    through the spaCy pipeline and kept when any recognized entity contains
    *company_name* (case-insensitive substring match).

    Returns:
        The kept sentences re-joined with ``'. '``, or the original *text*
        unchanged when no sentence matches.
    """
    # Lower-case the name once instead of on every entity comparison.
    needle = company_name.lower()

    # Only the per-sentence parses are needed; the full text is deliberately
    # not parsed as a whole, since that result was never used.
    relevant_sentences = [
        sent
        for sent in text.split('. ')
        if any(needle in ent.text.lower() for ent in nlp(sent).ents)
    ]
    return '. '.join(relevant_sentences) if relevant_sentences else text
def analyze_sentiment(text):
    """Classify *text* by averaging VADER and transformer sentiment scores.

    The VADER ``compound`` score is averaged with a signed classifier score:
    the classifier's ``score`` when its label is ``POSITIVE``, otherwise the
    negated ``score``. If the classifier raises, its contribution is 0.0.

    Args:
        text: Text to analyze. ``None``, empty, or whitespace-only input is
            treated as neutral.

    Returns:
        A ``(label, score)`` tuple where *label* is ``"Positive"`` when the
        averaged score is above 0.2, ``"Negative"`` when below -0.2, and
        ``"Neutral"`` otherwise.
    """
    if not text or not text.strip():
        return "Neutral", 0.0

    vader_compound = sia.polarity_scores(text)['compound']

    try:
        # Let the tokenizer truncate to 512 *tokens*. Slicing the string to
        # 512 characters kept only a short prefix, so the classifier judged
        # far less text than VADER did.
        bert_result = bert_sentiment(text, truncation=True, max_length=512)[0]
        bert_score = bert_result['score']
        bert_value = bert_score if bert_result['label'] == "POSITIVE" else -bert_score
    except Exception as e:
        print(f"⚠️ BERT sentiment analysis failed: {e}")
        bert_value = 0.0

    final_sentiment = (vader_compound + bert_value) / 2

    if final_sentiment > 0.2:
        return "Positive", final_sentiment
    if final_sentiment < -0.2:
        return "Negative", final_sentiment
    return "Neutral", final_sentiment
def main():
    """Prompt for a company name and print sentiment for each news article found.

    For every URL returned by the news search, the article text is extracted,
    reduced to the sentences mentioning the company, scored, and the first
    500 characters of that text are printed with the sentiment label and score.
    """
    company_name = input("Enter company name: ")
    print(f"\n🔎 Searching news for: {company_name}\n")

    urls = get_valid_news_urls(company_name)
    if not urls:
        print("❌ No valid news URLs found.")
        return

    visited = set()
    for index, url in enumerate(urls, start=1):
        # Skip any URL that has already been processed in this run.
        if url in visited:
            continue
        visited.add(url)

        print(f"\n🔗 Article {index}: {url}\n")
        content = extract_article_content(url)
        if not content:
            print("⚠️ Failed to extract content....")
            continue

        filtered_text = filter_relevant_sentences(content, company_name)
        sentiment, score = analyze_sentiment(filtered_text)
        print(f"📰 Extracted Content:\n{filtered_text[:500]}...")
        print(f"📊 Sentiment: {sentiment} (Score: {score:.2f})")


if __name__ == "__main__":
    main()