"""Fetch recent Google News results for a company and extract the article text.

Dependencies: requests, beautifulsoup4, newspaper3k, selenium, webdriver-manager.
"""

import re

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def get_valid_news_urls(company_name):
    """Scrape the Google News search results page for external article links."""
    search_url = f'https://www.google.com/search?q={company_name}+news&tbm=nws'
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(search_url, headers=headers)
    if response.status_code != 200:
        print("⚠️ Google News request failed!")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for g in soup.find_all('a', href=True):
        # Pull the real destination URL out of Google's redirect href
        # and drop any trailing tracking parameters.
        url_match = re.search(r'(https?://\S+)', g['href'])
        if url_match:
            url = url_match.group(1).split('&')[0]
            if "google.com" not in url:
                links.append(url)

    return links[:10]  # Limit to the top 10 results


def extract_article_content(url):
    """Extract article text, falling back from newspaper3k to BeautifulSoup to Selenium."""
    # First attempt: newspaper3k handles most news sites directly.
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"⚠️ Newspaper3k failed: {e}")

    # Second attempt: plain requests + BeautifulSoup paragraph scrape.
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        if response.status_code != 200:
            raise Exception("Request failed")
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        return '\n'.join(p.text for p in paragraphs if p.text)
    except Exception as e:
        print(f"⚠️ BeautifulSoup failed: {e}")

    # Final attempt: headless Chrome via Selenium for JavaScript-rendered pages.
    try:
        options = Options()
        options.add_argument("--headless")  # Run in headless mode
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        driver.get(url)
        page_content = driver.page_source
        driver.quit()
        soup = BeautifulSoup(page_content, 'html.parser')
        paragraphs = soup.find_all('p')
        return '\n'.join(p.text for p in paragraphs if p.text)
    except Exception as e:
        print(f"⚠️ Selenium failed: {e}")

    return None


def main():
    company_name = input("Enter company name: ")
    print(f"\n🔎 Searching news for: {company_name}\n")

    urls = get_valid_news_urls(company_name)
    for i, url in enumerate(urls, 1):
        print(f"\n🔗 Article {i}: {url}\n")
        content = extract_article_content(url)
        if content:
            # Preview only the first 500 characters instead of dumping the full text.
            print("📰 Extracted Content:\n", content[:500], "...")
        else:
            print("⚠️ Failed to extract content.")


if __name__ == "__main__":
    main()