Spaces:
Runtime error
Runtime error
import os | |
import time | |
import PyPDF2 | |
import docx | |
import readability | |
from langdetect import detect | |
from newspaper import fulltext, Article | |
from selenium import webdriver | |
def web_crawler_newspaper(url: str) -> tuple[list[str], str]:
    """Fetch a web page and return its article text as a list of lines.

    The page HTML and a language code are obtained from ``_get_raw_html``.
    newspaper's ``fulltext`` is tried first on that HTML; if it raises, the
    function falls back to letting ``Article`` download and parse the URL
    itself.

    Args:
        url: Address of the page to crawl.

    Returns:
        A ``(contents, lang)`` tuple: ``contents`` holds the stripped,
        non-blank lines of the extracted text, and ``lang`` is the language
        code reported by ``_get_raw_html``.
    """
    raw_html, lang = _get_raw_html(url)
    try:
        text = fulltext(raw_html, language=lang)
    except Exception:
        # Best-effort fallback when fulltext cannot handle the HTML.
        # ``Exception`` (rather than a bare ``except``) keeps Ctrl-C and
        # interpreter shutdown from being silently swallowed here.
        article = Article(url)
        article.download()
        article.parse()
        text = article.text

    contents = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            contents.append(stripped)
    return contents, lang
def _get_raw_html(url):
    """Render ``url`` in headless Chrome and return its readable HTML.

    The page is opened through selenium, given a fixed five-second pause to
    finish loading, and its source is then reduced with
    ``readability.Document(...).summary()``.

    Returns a ``(html, lang)`` tuple where ``html`` is the readability
    summary and ``lang`` is the first two characters of the language code
    that ``detect`` reports for that summary.
    """
    browser_flags = (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    )
    options = webdriver.ChromeOptions()
    for flag in browser_flags:
        options.add_argument(flag)

    with webdriver.Chrome(options=options) as browser:
        browser.get(url)
        print("Please wait for 5 seconds until the webpage finishes loading.")
        # Fixed pause so the page has time to load before its source is read.
        time.sleep(5)
        page_source = browser.page_source

    summary_html = readability.Document(page_source).summary()
    return summary_html, detect(summary_html)[0:2]
def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]:
    """Read a PDF and return its text as merged paragraphs plus a language code.

    Each page's extracted text is split into stripped, non-blank lines.
    Consecutive lines are concatenated (with no separator) until a line ends
    in one of the closing marks below, at which point the accumulated text is
    emitted as one entry. Text still pending at the end of a page is emitted
    as its own entry.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(contents, lang)`` tuple: the list of merged text entries and the
        first two characters of the language code ``detect`` reports for them.
    """
    # Last-character values that mark the end of a paragraph.
    # NOTE(review): several entries (e.g. 'οΌ', 'β¦', 'Β»') are longer than one
    # character and therefore can never equal the single character `line[-1]`;
    # they look like mis-encoded full-width/CJK punctuation — confirm the
    # intended characters against the original file encoding.
    closing_marks = (
        '.', '!', '?', 'γ', 'οΌ', 'οΌ', 'β¦', ';', 'οΌ', ':', 'οΌ', 'β', 'β', 'οΌ', 'γ', 'γ', 'γ',
        'γ', 'γ', 'γ', 'γ', 'γ', 'γ', 'γ', 'Β»', '"', "'", ')', ']', '}',
    )

    paragraphs = []
    with open(file_path, 'rb') as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            pending = ''
            for raw_line in page.extract_text().strip().splitlines():
                line = raw_line.strip()
                if not line:
                    continue
                pending += line
                if line[-1] in closing_marks:
                    paragraphs.append(pending)
                    pending = ''
            # Flush whatever did not end on a closing mark before the next page.
            if pending:
                paragraphs.append(pending)

    language = detect('\n'.join(paragraphs))
    return paragraphs, language[0:2]
def extract_text_from_txt(file_path: str) -> tuple[list[str], str]:
    """Read a UTF-8 text file and return its non-blank lines plus a language code.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``(contents, lang)`` tuple: the stripped, non-blank lines of the
        file and the first two characters of the language code ``detect``
        reports for those lines joined by newlines.
    """
    lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                lines.append(stripped)

    language = detect('\n'.join(lines))
    return lines, language[0:2]
def extract_text_from_docx(file_path: str) -> tuple[list[str], str]:
    """Read a DOCX file and return its non-empty paragraphs plus a language code.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        A ``(contents, lang)`` tuple: the stripped text of every paragraph
        that is not blank, and the first two characters of the language code
        ``detect`` reports for those paragraphs joined by newlines.
    """
    paragraphs = []
    for paragraph in docx.Document(file_path).paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            paragraphs.append(stripped)

    language = detect('\n'.join(paragraphs))
    return paragraphs, language[0:2]