Spaces:

weanalyze
/

analyze_url

Running

analyze_url / utils /extractor.py

Upload with huggingface_hub

4ed95aa about 2 years ago

983 Bytes

	import requests
	from selectolax.parser import HTMLParser
	import re
	from string import punctuation


	def preprocess_text(text):
	    """Normalize a string for downstream text analysis.

	    The text is lowercased, every character listed in
	    ``string.punctuation`` is deleted (not replaced by a space), and any
	    run of whitespace -- spaces, tabs, newlines -- is collapsed to a
	    single space with leading/trailing whitespace removed.

	    Args:
	        text: The raw string to normalize.

	    Returns:
	        The normalized string.
	    """
	    lowered = text.lower()
	    # A deletion-only translation table removes exactly the characters in
	    # string.punctuation, one pass over the string.
	    drop_punctuation = str.maketrans("", "", punctuation)
	    stripped = lowered.translate(drop_punctuation)
	    # split() with no arguments splits on any whitespace run and discards
	    # empty pieces, so re-joining yields single-space separation.
	    words = stripped.split()
	    return " ".join(words)

	def get_html(url, timeout=30):
	    """Fetch a web page with an HTTP GET and return its body as text.

	    Args:
	        url: Address of the page to request.
	        timeout: Seconds to wait for the server before giving up. Passed
	            straight through to ``requests.get``. Without a timeout a
	            stalled server would block the caller indefinitely.

	    Returns:
	        The response body decoded to ``str`` (``resp.text``). The HTTP
	        status code is not checked, so the body of an error response is
	        returned as-is.

	    Raises:
	        requests.exceptions.RequestException: Propagated from
	            ``requests.get`` (connection failure, timeout, invalid URL).
	    """
	    # Request the web page; the timeout bounds how long we wait on the server.
	    resp = requests.get(url, timeout=timeout)
	    # The response text is the page's HTML.
	    return resp.text

	def get_text(html):
	    """Extract normalized visible text from an HTML document.

	    The markup is parsed with ``HTMLParser``; every ``<script>`` and
	    ``<style>`` element is removed from the tree, the remaining text under
	    ``<body>`` is collected, and the result is passed through
	    ``preprocess_text``.

	    Args:
	        html: The HTML markup to parse.

	    Returns:
	        The preprocessed body text, or ``None`` when the parsed document
	        has no ``<body>``.
	    """
	    document = HTMLParser(html)
	    if tree_body_missing := document.body is None:
	        return None

	    # Script and style elements carry code/CSS, not page content, so drop
	    # them before collecting text. Scripts are removed first, then styles.
	    for selector in ('script', 'style'):
	        for node in document.css(selector):
	            node.decompose()

	    # NOTE(review): separator='' joins text nodes with nothing in between,
	    # so text from adjacent elements may run together -- confirm intended.
	    body_text = document.body.text(separator='')
	    return preprocess_text(body_text)

	def get_html_text(url):
	    """Download the page at ``url`` and return its extracted text.

	    Convenience wrapper that feeds the result of ``get_html(url)`` into
	    ``get_text``.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        Whatever ``get_text`` returns for the downloaded markup.
	    """
	    return get_text(get_html(url))