import re
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
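
# Note: these imports correspond to the PyPI packages `requests`,
# `beautifulsoup4`, `newspaper3k`, `selenium`, and `webdriver-manager`.
# Selenium additionally needs a local Chrome/Chromium install to drive.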

def get_valid_news_urls(company_name):
    """Scrape the Google News results page and return up to 10 article URLs."""
    search_url = f'https://www.google.com/search?q={quote_plus(company_name)}+news&tbm=nws'
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(search_url, headers=headers, timeout=10)
    if response.status_code != 200:
        print("⚠️ Google News request failed!")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for g in soup.find_all('a', href=True):
        # Result hrefs embed the target URL followed by tracking parameters;
        # pull out the URL and drop everything after the first '&'.
        url_match = re.search(r'(https?://\S+)', g['href'])
        if url_match:
            url = url_match.group(1).split('&')[0]
            if "google.com" not in url:
                links.append(url)
    return links[:10]  # Limit to the top 10 results
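
# Alternative sketch (not called by main below): Google News also exposes a
# public RSS feed at https://news.google.com/rss/search?q=<query>, which is
# usually more stable to parse than the search-results HTML above.
def get_news_urls_via_rss(company_name):
    """Return up to 10 article links from the Google News RSS feed."""
    import xml.etree.ElementTree as ET
    feed_url = f'https://news.google.com/rss/search?q={quote_plus(company_name)}'
    response = requests.get(feed_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    if response.status_code != 200:
        return []
    root = ET.fromstring(response.content)
    return [item.findtext('link') for item in root.iter('item')][:10]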

def extract_article_content(url):
    """Extract article text, falling back from newspaper3k to BeautifulSoup to Selenium."""
    # Attempt 1: newspaper3k's built-in article downloader and parser.
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"⚠️ Newspaper3k failed: {e}")

    # Attempt 2: plain HTTP fetch plus <p>-tag scraping.
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        return '\n'.join(p.text for p in paragraphs if p.text)
    except Exception as e:
        print(f"⚠️ BeautifulSoup failed: {e}")

    # Attempt 3: headless Chrome, for pages that render content with JavaScript.
    try:
        options = Options()
        options.add_argument("--headless")  # Run in headless mode
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        try:
            driver.get(url)
            page_content = driver.page_source
        finally:
            driver.quit()  # Always release the browser, even if the fetch fails
        soup = BeautifulSoup(page_content, 'html.parser')
        paragraphs = soup.find_all('p')
        return '\n'.join(p.text for p in paragraphs if p.text)
    except Exception as e:
        print(f"⚠️ Selenium failed: {e}")
    return None
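
# Hypothetical variant (not used by main below): extract_article_content starts
# and quits a fresh Chrome instance per URL, which is slow across many articles.
# A minimal sketch of reusing one already-running driver instead:
def extract_with_shared_driver(driver, url):
    """Fetch `url` with an existing Selenium driver and return its <p> text."""
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return '\n'.join(p.text for p in soup.find_all('p') if p.text)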

def main():
    company_name = input("Enter company name: ")
    print(f"\n🔎 Searching news for: {company_name}\n")
    urls = get_valid_news_urls(company_name)
    for i, url in enumerate(urls, 1):
        print(f"\n🔗 Article {i}: {url}\n")
        content = extract_article_content(url)
        if content:
            # Print only a preview so long articles don't flood the console.
            print("📰 Extracted Content:\n", content[:500], "...")
        else:
            print("⚠️ Failed to extract content.")


if __name__ == "__main__":
    main()