import time

import PyPDF2
import docx
import readability
from langdetect import detect
from newspaper import fulltext, Article
from selenium import webdriver

def web_crawler_newspaper(url: str) -> tuple[list[str], str]:
    """Crawl a web page and return its main text as a list of non-empty lines, plus the detected language code."""
    raw_html, lang = _get_raw_html(url)
    try:
        text = fulltext(raw_html, language=lang)
    except Exception:
        # Fall back to newspaper's full Article pipeline if fulltext() cannot parse the cleaned HTML.
        article = Article(url)
        article.download()
        article.parse()
        text = article.text
    contents = [line.strip() for line in text.splitlines() if line.strip()]
    return contents, lang

def _get_raw_html(url: str) -> tuple[str, str]:
    """Fetch a page with headless Chrome, reduce it to its main content, and detect its language."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36')
    with webdriver.Chrome(options=chrome_options) as driver:
        driver.get(url)
        print("Waiting 5 seconds for the page to finish loading...")
        time.sleep(5)
        html = driver.page_source
    # Strip navigation, ads, and other boilerplate, keeping only the main article markup.
    doc = readability.Document(html)
    html = doc.summary()
    lang = detect(html)
    return html, lang[0:2]

def extract_text_from_pdf(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a PDF file, merging wrapped lines back into paragraphs."""
    # Characters that typically close a sentence or a quoted/bracketed span, in both Western
    # and CJK punctuation; a line ending with one of these finishes the current paragraph.
    sentence_end_chars = ['.', '!', '?', '。', '！', '？', '…', ';', '；', ':', '：', '”', '’', '）', '】', '》', '」',
                          '』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        contents = []
        for page in pdf_reader.pages:
            page_text = page.extract_text().strip()
            raw_text = [line.strip() for line in page_text.splitlines() if line.strip()]
            new_text = ''
            for text in raw_text:
                new_text += text
                if text[-1] in sentence_end_chars:
                    contents.append(new_text)
                    new_text = ''
            if new_text:
                contents.append(new_text)
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]

def extract_text_from_txt(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a TXT file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        contents = [line.strip() for line in f if line.strip()]
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]

def extract_text_from_docx(file_path: str) -> tuple[list[str], str]:
    """Extract text content from a DOCX file."""
    document = docx.Document(file_path)
    contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]
    lang = detect('\n'.join(contents))
    return contents, lang[0:2]
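

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: the URL and file paths below are
    # hypothetical placeholders, and the crawler additionally requires a local Chrome install.
    demo_url = 'https://example.com'  # hypothetical URL
    contents, lang = web_crawler_newspaper(demo_url)
    print(f'Crawled {len(contents)} paragraphs, detected language: {lang}')

    # The file-based extractors follow the same (contents, lang) contract:
    # contents, lang = extract_text_from_pdf('sample.pdf')    # hypothetical local PDF
    # contents, lang = extract_text_from_txt('sample.txt')    # hypothetical local TXT
    # contents, lang = extract_text_from_docx('sample.docx')  # hypothetical local DOCX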