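"""
URLReader: fetch a web page and extract its title, text, and images, using
either newspaper4k or BeautifulSoup as the extraction backend.
"""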
from bs4 import BeautifulSoup
from newspaper import article, ArticleException, ArticleBinaryDataException
import requests

# TODO: move this to a config file
MAX_URL_SIZE = 2000000  # ~2 MB
class URLReader:
    def __init__(self, url: str, newspaper: bool = True):
        self.url = url
        self.text = None       # str
        self.title = None      # str
        self.images = None     # list of image URLs
        self.top_image = None  # image URL
        self.is_extracted = False
        self.newspaper = newspaper  # True to use newspaper4k, False to use BeautifulSoup

        url_size = self.get_size()
        if url_size is None or url_size > MAX_URL_SIZE:
            return
        self.is_extracted = True
        if self.newspaper:
            self.extract_content_newspaper()
        else:
            self.extract_content_bs()
    def extract_content_newspaper(self):
        """
        Use newspaper4k to extract content from self.url.

        Stores the extracted title, text, and images on the instance.
        """
        try:
            # Quick reachability check before handing the URL to newspaper4k
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()  # Raise an exception for unsuccessful requests
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None

        try:
            news = article(url=self.url, fetch_images=True)
        except (ArticleException, ArticleBinaryDataException) as e:
            print(f"\t\t└── Error downloading article: {e}")
            return None

        self.title = news.title
        self.text = news.text
        self.images = list(set(news.images))  # Remove duplicates
        self.top_image = news.top_image
    def extract_content_bs(self):
        """
        Use BeautifulSoup to extract content from self.url.

        Stores the extracted title, text, and images on the instance.
        """
        try:
            response = requests.get(self.url, timeout=5)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        response.encoding = response.apparent_encoding

        try:
            soup = BeautifulSoup(response.content, "html.parser")
        except Exception:
            print(f"Error parsing HTML content from {self.url}")
            return None

        self.title = soup.title.string.strip() if soup.title and soup.title.string else None

        # Collect image URLs, skipping <img> tags without a src attribute
        self.images = [img['src'] for img in soup.find_all('img') if img.get('src')]
        self.top_image = self.images[0] if self.images else None

        # Exclude text within specific elements
        for element in soup(["img", "figcaption", "table", "script", "style"]):
            element.extract()
        # text = soup.get_text(separator="\n")
        paragraphs = soup.find_all('p')
        self.text = ' '.join(p.get_text() for p in paragraphs)
    def get_size(self):
        """
        Retrieve the size of self.url's content using a HEAD request.

        Returns:
            The size of the content in bytes, or None if the size cannot be
            determined (e.g., due to network errors or a missing
            Content-Length header).
        """
        try:
            response = requests.head(self.url, allow_redirects=True, timeout=5)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            content_length = response.headers.get('Content-Length')
            if content_length is not None:
                return int(content_length)
            print("\t\t└── Content-Length header not found")
            return None
        except requests.exceptions.RequestException as e:
            print(f"\t\t└── Error getting URL size: {e}")
            return None