import os
from urllib.parse import unquote, urlparse

import requests
from bs4 import BeautifulSoup


def fetch_url_content(url):
    try:
        # Time out so one stalled server cannot hang the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # ensures we notice bad responses
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def clean_text(text):
    # Remove edit links and other unwanted parts
    # ("[Bearbeiten | Quelltext bearbeiten]" is German Wikipedia's
    # "edit | edit source" link text).
    return text.replace("[Bearbeiten | Quelltext bearbeiten]", "").strip()


def extract_content(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    # The article body of a MediaWiki page lives in div.mw-parser-output.
    content = soup.find("div", {"class": "mw-parser-output"})
    if content:
        return " ".join(
            clean_text(text.get_text())
            for text in content.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])
        )
    return ""


def save_content(content, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as file:
        file.write(content)


if __name__ == "__main__":
    with open("links.txt", "r", encoding="utf-8") as file:
        urls = file.readlines()

    for url in urls:
        url = url.strip()
        if not url:
            continue  # skip blank lines in links.txt
        print(f"Crawling {url}")
        html_content = fetch_url_content(url)
        if html_content:
            article_content = extract_content(html_content)
            # Extract the last part of the URL path as the article name.
            article_name = unquote(urlparse(url).path.split("/")[-1])
            save_path = f"data/{article_name}.txt"
            save_content(article_content, save_path)
            print(f"Saved content to {save_path}")
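
# Example run (a sketch; the URL below is illustrative, not from the source):
# given a links.txt with one Wikipedia URL per line, e.g.
#     https://de.wikipedia.org/wiki/Berlin
# the script fetches the page, extracts the headings and paragraphs from
# div.mw-parser-output, and writes them to
#     data/Berlin.txt
# where "Berlin" is the URL-decoded last segment of the URL path.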