import time

import PyPDF2
import docx
import readability
from langdetect import detect
from newspaper import fulltext, Article
from selenium import webdriver

def web_crawler_newspaper(url: str) -> tuple[list[str], str]:
    """Crawl a web page and return its main text as a list of non-empty lines, plus the detected language code."""
    raw_html, lang = _get_raw_html(url)
    try:
        text = fulltext(raw_html, language=lang)
    except Exception:
        # Fall back to newspaper's full Article pipeline if fulltext() cannot parse the cleaned HTML.
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
    contents = [line.strip() for line in text.splitlines() if line.strip()]
    return contents, lang

def _get_raw_html(url: str) -> tuple[str, str]:
    """Fetch a page with headless Chrome, reduce it to its main content, and detect its language."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36')
    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(url)
        print("Waiting 5 seconds for the page to finish loading...")
        time.sleep(5)
        html = driver.page_source
    # Strip navigation, ads, and other boilerplate, keeping only the main article markup.
    doc = readability.Document(html)
    html = doc.summary()
    lang = detect(html)
    return html, lang[0:2]

def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a PDF file, merging wrapped lines back into paragraphs."""
    # Characters that typically close a sentence or a quoted/bracketed span, in both Western
    # and CJK punctuation; a line ending with one of these finishes the current paragraph.
    sentence_end_chars = ['.', '!', '?', '。', '！', '？', '…', ';', '；', ':', '：', '”', '’', '）', '】', '》', '」',
                          '』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        contents = []
        for page in pdf_reader.pages:
            page_text = page.extract_text().strip()
            raw_text = [line.strip() for line in page_text.splitlines() if line.strip()]
            new_text = ''
            for text in raw_text:
                new_text += text
                if text[-1] in sentence_end_chars:
                    contents.append(new_text)
                    new_text = ''
            if new_text:
                contents.append(new_text)
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]

def extract_text_from_txt(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a TXT file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = [line.strip() for line in f if line.strip()]
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]

def extract_text_from_docx(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a DOCX file."""
    document = docx.Document(file_path)
    contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]
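

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: the URL and file paths below are
    # hypothetical placeholders, and the crawler additionally requires a local Chrome install.
    demo_url = 'https://example.com'  # hypothetical URL
    contents, lang = web_crawler_newspaper(demo_url)
    print(f'Crawled {len(contents)} paragraphs, detected language: {lang}')

    # The file-based extractors follow the same (contents, lang) contract:
    # contents, lang = extract_text_from_pdf('sample.pdf')    # hypothetical local PDF
    # contents, lang = extract_text_from_txt('sample.txt')    # hypothetical local TXT
    # contents, lang = extract_text_from_docx('sample.docx')  # hypothetical local DOCX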