Spaces:
Runtime error
Runtime error
File size: 3,019 Bytes
14e11d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import os
import time
import PyPDF2
import docx
import readability
from langdetect import detect
from newspaper import fulltext, Article
from selenium import webdriver
def web_crawler_newspaper(url: str) -> tuple[list[str], str]:
    """Fetch a web page and return its article text as non-empty lines.

    The rendered HTML and a language code are obtained from
    ``_get_raw_html``. The text is extracted from that HTML with
    ``newspaper.fulltext``; if that call raises, the page is downloaded and
    parsed again through ``newspaper.Article`` and its ``text`` is used.

    Args:
        url: Address of the page to crawl.

    Returns:
        A ``(contents, lang)`` tuple: the stripped, non-empty lines of the
        extracted text, and the language code returned by ``_get_raw_html``.
    """
    raw_html, lang = _get_raw_html(url)
    try:
        text = fulltext(raw_html, language=lang)
    except Exception:
        # Best-effort fallback: any extraction failure triggers a second
        # attempt via Article. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
    stripped_lines = (line.strip() for line in text.splitlines())
    contents = [line for line in stripped_lines if line]
    return contents, lang
def _get_raw_html(url):
    """Load *url* in headless Chrome and return ``(summary_html, lang)``.

    The page is opened with Selenium, given a fixed five seconds to finish
    loading, and its source is reduced with ``readability.Document.summary``.
    The language is detected by ``langdetect.detect`` on that summary HTML
    and truncated to its first two characters.
    """
    options = webdriver.ChromeOptions()
    for flag in (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    ):
        options.add_argument(flag)

    with webdriver.Chrome(options=options) as browser:
        browser.get(url)
        print("Please wait for 5 seconds until the webpage finishes loading.")
        time.sleep(5)
        page_source = browser.page_source

    # Only the captured page source is needed from here on.
    summary_html = readability.Document(page_source).summary()
    language = detect(summary_html)
    return summary_html, language[:2]
# A line ending in one of these marks closes the paragraph being accumulated.
# NOTE(review): the non-ASCII entries were reconstructed from mis-encoded
# (mojibake) literals; confirm the exact set against the upstream revision.
_PDF_LINE_END_MARKS = (
    '.', '!', '?', '。', '！', '？', '…', ';', '；', ':', '：', '”', '’', '）',
    '》', '」', '』', '】', '〕', '〗', '〙', '〛', '〉', '〞', '〟', '»',
    '"', "'", ')', ']', '}',
)


def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a PDF file.

    Each page's text is split into stripped, non-empty lines. Consecutive
    lines are concatenated (without a separator) until a line ends with one
    of ``_PDF_LINE_END_MARKS``; the joined text is then emitted as one entry.
    Text left over at the end of a page is emitted as its own entry, so
    entries never span pages.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A ``(contents, lang)`` tuple: the list of joined text entries, and
        the first two characters of the language code that
        ``langdetect.detect`` reports for those entries joined by newlines.
    """
    contents = []
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        for page in pdf_reader.pages:
            pending = []  # lines of the paragraph currently being built
            for line in page.extract_text().splitlines():
                line = line.strip()
                if not line:
                    continue
                pending.append(line)
                # str.endswith accepts a tuple of alternatives.
                if line.endswith(_PDF_LINE_END_MARKS):
                    contents.append(''.join(pending))
                    pending = []
            if pending:
                # Flush the unterminated tail before moving to the next page.
                contents.append(''.join(pending))
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]
def extract_text_from_txt(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a TXT file.

    Args:
        file_path: Path of a UTF-8 encoded text file, with or without a
            leading byte-order mark.

    Returns:
        A ``(contents, lang)`` tuple: the stripped, non-empty lines of the
        file, and the first two characters of the language code that
        ``langdetect.detect`` reports for those lines joined by newlines.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise survive str.strip() as U+FEFF on the first line.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        contents = [line for line in map(str.strip, f) if line]
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]
def extract_text_from_docx(file_path: str) -> tuple[list[str], str]:
    """Read a DOCX file and return its paragraph texts plus a language code.

    Returns:
        A ``(contents, lang)`` tuple: the stripped text of every paragraph
        that is non-empty after stripping, and the first two characters of
        the language code that ``langdetect.detect`` reports for those
        texts joined by newlines.
    """
    contents = []
    for para in docx.Document(file_path).paragraphs:
        stripped = para.text.strip()
        if stripped:
            contents.append(stripped)
    language = detect('\n'.join(contents))
    return contents, language[:2]
|