import requests
from selectolax.parser import HTMLParser
import re
from string import punctuation
def preprocess_text(text):
    """Normalize *text* for downstream processing.

    The text is lowercased, every ASCII punctuation character listed in
    ``string.punctuation`` is deleted (not replaced by a space), and any
    run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space with leading/trailing whitespace removed.
    """
    lowered = text.lower()
    # A deletion-only translation table drops each punctuation character.
    drop_punctuation = str.maketrans("", "", punctuation)
    stripped = lowered.translate(drop_punctuation)
    # split() with no arguments splits on any whitespace run and discards
    # empty pieces, so re-joining yields single-space separated words.
    words = stripped.split()
    return " ".join(words)
def get_html(url, timeout=10):
    """Fetch *url* with an HTTP GET request and return the body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``. Pass ``None`` to wait
            indefinitely (the behavior before this parameter existed).

    Returns:
        The decoded response body (``resp.text``). The HTTP status code is
        not checked, so the body of an error response is returned as-is.

    Raises:
        requests.exceptions.RequestException: If the request fails, e.g.
            on a connection error or when *timeout* is exceeded.
    """
    # Without an explicit timeout, requests may block forever on a server
    # that accepts the connection but never answers.
    resp = requests.get(url, timeout=timeout)
    # The response text is the HTML of the page.
    return resp.text
def get_text(html):
    """Extract normalized visible text from an HTML document.

    Parses *html*, removes every ``<script>`` and ``<style>`` element,
    takes the text content of ``<body>`` and runs it through
    ``preprocess_text``.

    Returns:
        The preprocessed body text, or ``None`` when the parsed document
        has no body.
    """
    document = HTMLParser(html)
    if tree_body_missing := document.body is None:
        return None

    # Script and style contents are not page text, so drop those nodes
    # before collecting the text.
    for selector in ('script', 'style'):
        for node in document.css(selector):
            node.decompose()

    # NOTE(review): an empty separator joins text from neighbouring nodes
    # with nothing in between, so words in adjacent elements may run
    # together - confirm this is intended.
    raw_text = document.body.text(separator='')
    return preprocess_text(raw_text)
def get_html_text(url):
    """Download the page at *url* and return its preprocessed text.

    Chains ``get_html`` (fetch the page) and ``get_text`` (extract and
    normalize the body text); the result is whatever ``get_text`` returns
    for the downloaded HTML.
    """
    return get_text(get_html(url))