|
import requests |
|
import re |
|
from bs4 import BeautifulSoup |
|
from newspaper import Article |
|
from selenium import webdriver |
|
from selenium.webdriver.chrome.service import Service |
|
from selenium.webdriver.chrome.options import Options |
|
from webdriver_manager.chrome import ChromeDriverManager |
|
|
|
|
|
def get_valid_news_urls(company_name):
    """Scrape Google News search results and return up to 10 external article URLs.

    Args:
        company_name: Company to search for; used as the news query term.

    Returns:
        A list of at most 10 absolute, non-Google URLs, or an empty list
        when the search request fails for any reason.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Let requests URL-encode the query: the original interpolated the raw
        # name into the URL, which broke for names with spaces or special chars.
        response = requests.get(
            'https://www.google.com/search',
            params={'q': f'{company_name} news', 'tbm': 'nws'},
            headers=headers,
            timeout=10,  # don't hang forever on a stalled connection
        )
    except requests.RequestException:
        # Network-level failures (DNS, timeout, refused) mirror the
        # bad-status path instead of crashing the caller.
        print("⚠️ Google News request failed!")
        return []

    if response.status_code != 200:
        print("⚠️ Google News request failed!")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for anchor in soup.find_all('a', href=True):
        url_match = re.search(r'(https?://\S+)', anchor['href'])
        if url_match:
            # Google wraps results as /url?q=<target>&sa=...; drop the
            # tracking params after the first '&'.
            url = url_match.group(1).split('&')[0]
            if "google.com" not in url:
                links.append(url)

    return links[:10]
|
|
|
def _paragraphs_to_text(html):
    """Join the text of every non-empty <p> tag in *html* with newlines."""
    soup = BeautifulSoup(html, 'html.parser')
    return '\n'.join(p.text for p in soup.find_all('p') if p.text)


def extract_article_content(url):
    """Extract an article's body text, trying progressively heavier strategies.

    Order: newspaper3k -> plain requests + BeautifulSoup -> headless
    Selenium Chrome (for JS-rendered pages).

    Args:
        url: The article URL to fetch.

    Returns:
        The extracted text, or None if every strategy failed.
    """
    # Strategy 1: newspaper3k's purpose-built article extraction.
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    except Exception as e:
        print(f"⚠️ Newspaper3k failed: {e}")

    # Strategy 2: plain HTTP fetch, scrape all <p> tags.
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=10,  # avoid hanging on slow/dead hosts
        )
        if response.status_code != 200:
            raise Exception("Request failed")
        return _paragraphs_to_text(response.text)
    except Exception as e:
        print(f"⚠️ BeautifulSoup failed: {e}")

    # Strategy 3: headless browser, for pages that require JavaScript.
    try:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
        try:
            driver.get(url)
            page_content = driver.page_source
        finally:
            # Always shut the browser down: the original leaked the Chrome
            # process whenever get()/page_source raised before quit().
            driver.quit()
        return _paragraphs_to_text(page_content)
    except Exception as e:
        print(f"⚠️ Selenium failed: {e}")

    return None
|
|
|
def main():
    """Prompt for a company name, find news URLs, and print article previews."""
    company_name = input("Enter company name: ").strip()
    print(f"\n🔎 Searching news for: {company_name}\n")
    urls = get_valid_news_urls(company_name)

    if not urls:
        # Nothing to iterate — tell the user instead of silently exiting.
        print("⚠️ No article URLs found.")
        return

    for i, url in enumerate(urls, 1):
        print(f"\n🔗 Article {i}: {url}\n")
        content = extract_article_content(url)
        if content:
            # Truncate the preview so the trailing "..." is meaningful —
            # the original printed the ENTIRE text (content[:]) followed
            # by an ellipsis that implied truncation.
            print("📰 Extracted Content:\n", content[:1000], "...")
        else:
            print("⚠️ Failed to extract content....")


if __name__ == "__main__":
    main()
|
|