Spaces:
Runtime error
Runtime error
Commit
·
4f7d130
1
Parent(s):
4161807
remove get child pages
Browse files
- scrape_website.py +0 -32
scrape_website.py
CHANGED
|
@@ -3,38 +3,6 @@ from bs4 import BeautifulSoup
|
|
| 3 |
|
| 4 |
|
| 5 |
def process_webpage(url: str):
|
| 6 |
-
# A set to keep track of visited pages
|
| 7 |
-
visited_pages = set()
|
| 8 |
-
|
| 9 |
-
text_list = []
|
| 10 |
-
|
| 11 |
-
# A function to recursively get all child pages
|
| 12 |
-
def get_child_pages(url):
|
| 13 |
-
# Make a GET request to the page and get the HTML content
|
| 14 |
-
response = requests.get(url)
|
| 15 |
-
html_content = response.content
|
| 16 |
-
|
| 17 |
-
# Parse the HTML content using BeautifulSoup
|
| 18 |
-
soup = BeautifulSoup(html_content, "html.parser")
|
| 19 |
-
|
| 20 |
-
# Get all the text content from the relevant HTML tags
|
| 21 |
-
text_content = ""
|
| 22 |
-
for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
|
| 23 |
-
for element in soup.find_all(tag):
|
| 24 |
-
text_content += element.get_text() + " "
|
| 25 |
-
|
| 26 |
-
# Add the page to the set of visited pages
|
| 27 |
-
text_content = f"page {url} contains: " + text_content
|
| 28 |
-
visited_pages.add(url)
|
| 29 |
-
|
| 30 |
-
# Find all the child links and recursively get their text content
|
| 31 |
-
for link in soup.find_all("a"):
|
| 32 |
-
href = link.get("href")
|
| 33 |
-
if href and href not in visited_pages and url in href:
|
| 34 |
-
get_child_pages(href)
|
| 35 |
-
|
| 36 |
-
text_list.append(text_content)
|
| 37 |
-
|
| 38 |
# Make a GET request to the page and get the HTML content
|
| 39 |
response = requests.get(url)
|
| 40 |
html_content = response.content
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def process_webpage(url: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
# Make a GET request to the page and get the HTML content
|
| 7 |
response = requests.get(url)
|
| 8 |
html_content = response.content
|