Spaces:
Running
Running
import requests | |
from selectolax.parser import HTMLParser | |
import re | |
from string import punctuation | |
def preprocess_text(text):
    """Normalize text: lowercase, strip ASCII punctuation, collapse whitespace.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with every character listed in
        ``string.punctuation`` removed and each run of whitespace (spaces,
        tabs, newlines) reduced to a single space, with no leading or
        trailing whitespace.
    """
    # A translation table that maps every punctuation character to "deleted".
    drop_punctuation = str.maketrans("", "", punctuation)
    cleaned = text.lower().translate(drop_punctuation)
    # split() with no argument splits on any whitespace run and ignores
    # leading/trailing whitespace, so re-joining yields single spaces only.
    words = cleaned.split()
    return " ".join(words)
def get_html(url, timeout=30):
    """Fetch a web page and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout unless one is passed, so
            without it a stalled server would block the caller forever.

    Returns:
        The response body as ``str`` (``Response.text``). The HTTP status
        code is not checked here, so the body of an error response is
        returned like any other page.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (for example a connection failure or a timeout).
    """
    resp = requests.get(url, timeout=timeout)
    # The response text; for a web page this is the HTML source.
    return resp.text
def get_text(html, separator=' '):
    """Extract the normalized text of an HTML document's body.

    ``<script>`` and ``<style>`` elements are removed before extraction so
    their contents do not end up in the result. The remaining body text is
    then normalized with ``preprocess_text``.

    Args:
        html: HTML source to parse.
        separator: String placed between the text of adjacent nodes.
            Defaults to a single space so that words in neighbouring
            elements with no whitespace between them (for example
            ``<li>one</li><li>two</li>``) are not fused into one token;
            ``preprocess_text`` collapses any resulting runs of spaces.
            Pass ``''`` to concatenate node texts directly.

    Returns:
        The normalized text, or ``None`` if the parsed document has no body.
    """
    tree = HTMLParser(html)
    if tree.body is None:
        return None

    # Drop non-content elements before reading the text.
    for tag_name in ('script', 'style'):
        for node in tree.css(tag_name):
            node.decompose()

    raw_text = tree.body.text(separator=separator)
    return preprocess_text(raw_text)
def get_html_text(url):
    """Download the page at ``url`` and return its extracted text.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``get_text`` returns for the HTML obtained from
        ``get_html(url)``.
    """
    return get_text(get_html(url))