Spaces:
Runtime error
Runtime error
| import os | |
| import time | |
| import PyPDF2 | |
| import docx | |
| import readability | |
| from langdetect import detect | |
| from newspaper import fulltext, Article | |
| from selenium import webdriver | |
def web_crawler_newspaper(url: str) -> tuple[list[str], str]:
    """Fetch a web page and return its article text as non-empty lines.

    The page is first rendered and reduced by ``_get_raw_html``; the result
    is handed to newspaper's ``fulltext`` together with the detected
    language. If that extraction fails, the URL is downloaded and parsed
    again through newspaper's ``Article`` as a fallback.

    Args:
        url: Address of the page to crawl.

    Returns:
        A ``(contents, lang)`` tuple: the stripped, non-empty lines of the
        extracted text, and the language code returned by ``_get_raw_html``.
    """
    raw_html, lang = _get_raw_html(url)
    try:
        text = fulltext(raw_html, language=lang)
    except Exception:
        # Deliberate best-effort fallback: any extraction failure switches to
        # newspaper's own download/parse pipeline. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
    # Distinct loop name so the comprehension does not shadow ``text``.
    contents = [line.strip() for line in text.splitlines() if line.strip()]
    return contents, lang
def _get_raw_html(url):
    """Load ``url`` in headless Chrome and return ``(summary_html, lang)``.

    The page source is captured after a fixed five-second wait, reduced with
    ``readability.Document(...).summary()``, and passed to ``detect``. The
    returned language is the first two characters of the detected code.
    """
    options = webdriver.ChromeOptions()
    for flag in (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    ):
        options.add_argument(flag)

    with webdriver.Chrome(options=options) as browser:
        browser.get(url)
        print("Please wait for 5 seconds until the webpage finishes loading.")
        # Fixed pause so the page can finish loading before the source is read.
        time.sleep(5)
        page_source = browser.page_source

    summary_html = readability.Document(page_source).summary()
    detected = detect(summary_html)
    return summary_html, detected[0:2]
# Characters that close a sentence, quotation or bracket. A PDF line ending in
# one of these finishes the paragraph currently being assembled. Covers ASCII,
# full-width and CJK punctuation.
_SENTENCE_ENDINGS = (
    '.', '!', '?', '。', '！', '？', '…', ';', '；', ':', '：',
    '’', '”', '）', '〉', '》', '」', '』', '】', '〕', '〗', '〙', '〛', '〞', '〟',
    '»', '"', "'", ')', ']', '}',
)


def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a PDF file.

    Each page's text is split into lines. Consecutive lines are glued
    together (without a separator) until a line ends with one of
    ``_SENTENCE_ENDINGS``; the glued text is then emitted as one entry.
    Whatever is left at the end of a page is emitted as its own entry, so
    paragraphs are never merged across pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(contents, lang)`` tuple: the reassembled text entries and the
        first two characters of the language code returned by ``detect``.

    NOTE(review): ``detect`` is called on the joined text even when nothing
    was extracted; langdetect is expected to raise in that case — confirm
    and handle at the caller if empty or scanned PDFs are possible.
    """
    contents = []
    # Keep the file open while iterating: the reader is handed the open
    # handle and pages are extracted inside this block.
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            # Guard against pages with no extractable text.
            page_text = page.extract_text() or ''
            pending = []
            for raw_line in page_text.splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                pending.append(line)
                if line.endswith(_SENTENCE_ENDINGS):
                    contents.append(''.join(pending))
                    pending = []
            # Flush an unfinished paragraph at the page boundary.
            if pending:
                contents.append(''.join(pending))
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]
def extract_text_from_txt(file_path: str) -> tuple[list[str], str]:
    """Read a UTF-8 text file and return its non-blank lines with a language code.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped. The language is the first two characters of
    what ``detect`` returns for the remaining lines joined by newlines.
    """
    contents = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                contents.append(stripped)
    detected = detect('\n'.join(contents))
    return contents, detected[:2]
def extract_text_from_docx(file_path: str) -> tuple[list[str], str]:
    """Return the non-blank paragraphs of a DOCX file with a language code.

    Paragraph texts are stripped of surrounding whitespace and empty ones are
    skipped. The language is the first two characters of what ``detect``
    returns for the kept paragraphs joined by newlines.
    """
    paragraphs = docx.Document(file_path).paragraphs
    stripped_texts = (paragraph.text.strip() for paragraph in paragraphs)
    contents = [entry for entry in stripped_texts if entry]
    detected = detect('\n'.join(contents))
    return contents, detected[:2]