import os import time import PyPDF2 import docx import readability from langdetect import detect from newspaper import fulltext, Article from selenium import webdriver def web_crawler_newspaper(url: str) -> tuple[list[str], str]: """Run the web crawler.""" raw_html, lang = _get_raw_html(url) try: text = fulltext(raw_html, language=lang) except: article = Article(url) article.download() article.parse() text = article.text contents = [text.strip() for text in text.splitlines() if text.strip()] return contents, lang def _get_raw_html(url): chrome_options = webdriver.ChromeOptions() chrome_options.add_argument('--headless') chrome_options.add_argument('--disable-gpu') chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36') with webdriver.Chrome(options=chrome_options) as driver: driver.get(url) print("Please wait for 5 seconds until the webpage finishes loading.") time.sleep(5) html = driver.page_source doc = readability.Document(html) html = doc.summary() lang = detect(html) return html, lang[0:2] def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]: """Extract text content from a PDF file.""" with open(file_path, 'rb') as f: pdf_reader = PyPDF2.PdfReader(f) contents = [] for page in pdf_reader.pages: page_text = page.extract_text().strip() raw_text = [text.strip() for text in page_text.splitlines() if text.strip()] new_text = '' for text in raw_text: new_text += text if text[-1] in ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」', '』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']: contents.append(new_text) new_text = '' if new_text: contents.append(new_text) lang = detect('\n'.join(contents)) return contents, lang[0:2] def extract_text_from_txt(file_path: str) -> tuple[list[str], str]: """Extract text content from a TXT file.""" with open(file_path, 'r', encoding='utf-8') as f: contents = [text.strip() for text in f.readlines() if text.strip()] lang = detect('\n'.join(contents)) return contents, lang[0:2] def extract_text_from_docx(file_path: str) -> tuple[list[str], str]: """Extract text content from a DOCX file.""" document = docx.Document(file_path) contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()] lang = detect('\n'.join(contents)) return contents, lang[0:2]