"""Download a web page and reduce its body to normalized plain text."""

import re
from string import punctuation
from typing import Optional

import requests
from selectolax.parser import HTMLParser

# Default number of seconds handed to ``requests.get`` as ``timeout``.
# Without a timeout a stalled server would block the caller indefinitely.
DEFAULT_TIMEOUT = 10.0

# Matches any single ASCII punctuation character from ``string.punctuation``.
# Compiled once at import time rather than rebuilt on every call.
_PUNCTUATION_RE = re.compile(f"[{re.escape(punctuation)}]")

# Elements whose content is code or styling rather than readable text.
_NON_TEXT_TAGS = ("script", "style")


def preprocess_text(text: str) -> str:
    """Return *text* lowercased, stripped of punctuation, and space-normalized.

    Punctuation characters are deleted, not replaced by a space, so
    ``"a,b"`` becomes ``"ab"``. Every run of whitespace (spaces, tabs,
    newlines) collapses to a single space, and leading/trailing whitespace
    is dropped.
    """
    lowered = text.lower()
    without_punctuation = _PUNCTUATION_RE.sub("", lowered)
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so joining with one space normalizes all spacing.
    return " ".join(without_punctuation.split())


def get_html(url: str, *, timeout: Optional[float] = DEFAULT_TIMEOUT) -> str:
    """Fetch *url* with an HTTP GET and return the response body as text.

    The HTTP status code is not checked, so the body of an error page
    (e.g. a 404) is returned like any other response.

    Args:
        url: Address of the page to download.
        timeout: Value passed to ``requests.get`` as ``timeout``; ``None``
            disables the timeout and waits indefinitely.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (connection failure, timeout, invalid URL, ...).
    """
    resp = requests.get(url, timeout=timeout)
    return resp.text


def get_text(html: str) -> Optional[str]:
    """Extract preprocessed text from the ``<body>`` of an HTML document.

    ``<script>`` and ``<style>`` elements are removed before extraction so
    their source does not end up in the result.

    Returns:
        The body text passed through :func:`preprocess_text`, or ``None``
        when the parsed document has no body.
    """
    tree = HTMLParser(html)
    if tree.body is None:
        return None

    for tag_name in _NON_TEXT_TAGS:
        for node in tree.css(tag_name):
            node.decompose()

    # NOTE(review): with an empty separator, text from adjacent elements is
    # joined directly, so markup with no whitespace between tags
    # (e.g. "<p>a</p><p>b</p>") yields "ab". Kept as-is to preserve output.
    body_text = tree.body.text(separator='')
    return preprocess_text(body_text)


def get_html_text(
    url: str, *, timeout: Optional[float] = DEFAULT_TIMEOUT
) -> Optional[str]:
    """Download *url* and return its preprocessed body text.

    Args:
        url: Address of the page to download.
        timeout: Forwarded to :func:`get_html`.

    Returns:
        The result of :func:`get_text` on the downloaded HTML, which is
        ``None`` when the document has no body.
    """
    html = get_html(url, timeout=timeout)
    return get_text(html)