urlcrawl

Sleeping

urlcrawl / app.py

Update app.py

b1a3ea2 verified over 1 year ago

1.19 kB

	import gradio as gr
	import re
	import requests
	from bs4 import BeautifulSoup

	def extract_pdf_links_and_title(url, *, timeout=10, max_links=100):
	    """Fetch a web page and collect the PDF links it contains.

	    Args:
	        url: Address of the page to download.
	        timeout: Seconds to wait for the server before requests gives up.
	        max_links: Maximum number of links to return.

	    Returns:
	        A ``(pdf_links, page_title)`` tuple. ``pdf_links`` is a list of at
	        most ``max_links`` absolute URLs whose href contains ``.pdf``
	        (case-insensitive); ``page_title`` is the text of the page's
	        ``<title>`` element, or ``"No title found"`` when there is none.

	    Raises:
	        requests.exceptions.RequestException: If the download fails or
	            times out.
	    """
	    # Local import keeps this function self-contained with respect to the
	    # file-level import list.
	    from urllib.parse import urljoin

	    # Without an explicit timeout requests can block forever on a server
	    # that accepts the connection but never answers.
	    response = requests.get(url, timeout=timeout)
	    soup = BeautifulSoup(response.text, 'html.parser')

	    # Extract the page title.
	    page_title = soup.title.text if soup.title else "No title found"

	    # Resolve every href against the final (post-redirect) page URL so that
	    # relative links remain usable when shown outside the crawled page.
	    base_url = response.url or url
	    pdf_links = [
	        urljoin(base_url, anchor['href'])
	        for anchor in soup.find_all('a', href=True)
	        if re.search(r'\.pdf', anchor['href'], re.IGNORECASE)
	    ]

	    # Return the PDF links together with the page title.
	    return pdf_links[:max_links], page_title

	def generate_html(pdf_links_and_title):
	    """Render a page title and its PDF links as an HTML fragment.

	    Args:
	        pdf_links_and_title: Two-item sequence ``(pdf_links, page_title)``
	            where ``pdf_links`` is an iterable of link strings and
	            ``page_title`` is the heading text.

	    Returns:
	        A string made of an ``<h1>`` heading followed by one download
	        anchor and ``<br/>`` per link. The title and the links are
	        HTML-escaped because they originate from a crawled page and must
	        not be able to inject markup.
	    """
	    # Imported under its own name: a module alias ``html`` would be easy to
	    # shadow with a local variable holding the generated markup.
	    from html import escape

	    pdf_links = pdf_links_and_title[0]  # list of PDF links
	    page_title = pdf_links_and_title[1]  # page title

	    # Start with the heading, then append one anchor per link.
	    parts = [f"<h1>{escape(str(page_title))}</h1>"]
	    for link in pdf_links:
	        # quote=True also escapes quotes so the value cannot break out of
	        # the href attribute.
	        safe_link = escape(str(link), quote=True)
	        parts.append(
	            f'<a href="{safe_link}" target="_blank" download>{safe_link}</a><br/>'
	        )

	    return "".join(parts)

	title = "네이버 증권 리서치 링크- https://finance.naver.com/research/company_list.naver"


	def _crawl_for_ui(url):
	    """Adapt the crawler's result to the two Gradio output components.

	    Returns a ``(text, html)`` pair: the PDF links one per line for the
	    text box, and the fragment built by ``generate_html`` for the HTML
	    pane. Passing the crawler's raw tuple straight to the interface would
	    show a Python list repr in the text box and only the bare page title
	    in the HTML pane.
	    """
	    links_and_title = extract_pdf_links_and_title(url)
	    return "\n".join(links_and_title[0]), generate_html(links_and_title)


	# One text input (the URL to crawl); a text output with the raw links and
	# an HTML output with the page title and clickable download links.
	iface = gr.Interface(fn=_crawl_for_ui,
	                     inputs="text",
	                     outputs=["text", "html"],
	                     title=title)

	iface.launch()