urlcrawl / app.py
springwater's picture
Update app.py
b1a3ea2 verified
raw
history blame
1.19 kB
import re
from html import escape
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
def extract_pdf_links_and_title(url, timeout=10):
    """Fetch *url* and collect the PDF links found on the page.

    Args:
        url: Address of the page to crawl.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(pdf_links, page_title)`` tuple. ``pdf_links`` holds at most 100
        link targets whose ``href`` contains ``.pdf`` (case-insensitive),
        resolved against *url* so relative links stay usable. ``page_title``
        is the text of the page's ``<title>`` element, or
        ``"No title found"`` when the page has none.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without an explicit timeout, requests may block forever on a dead host.
    response = requests.get(url, timeout=timeout)
    # Do not scrape an error page as if it were the requested document.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the page title.
    page_title = soup.title.text if soup.title else "No title found"

    # Relative hrefs are meaningless outside the source page, so resolve each
    # one against the page URL; absolute hrefs pass through unchanged.
    pdf_links = [
        urljoin(url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if re.search(r'\.pdf', anchor['href'], re.IGNORECASE)
    ]

    # Return the PDF links (capped at 100) together with the page title.
    return pdf_links[:100], page_title
def generate_html(pdf_links_and_title):
    """Render a page title and its PDF links as an HTML fragment.

    Args:
        pdf_links_and_title: A two-item sequence ``(pdf_links, page_title)``
            where ``pdf_links`` is an iterable of link targets and
            ``page_title`` is the heading text.

    Returns:
        A string consisting of an ``<h1>`` heading with the title followed by
        one download anchor (and a ``<br/>``) per link.
    """
    pdf_links = pdf_links_and_title[0]   # list of PDF links
    page_title = pdf_links_and_title[1]  # page title

    # Both the title and the links come from a scraped page, so escape them
    # before embedding; otherwise the page could inject arbitrary markup.
    fragments = [f"<h1>{escape(str(page_title))}</h1>"]
    for link in pdf_links:
        safe_link = escape(str(link), quote=True)
        fragments.append(
            f'<a href="{safe_link}" target="_blank" download>{safe_link}</a><br/>'
        )
    return "".join(fragments)
# Heading passed to gr.Interface; the string embeds the Naver Finance
# research listing URL.
title = "๋„ค์ด๋ฒ„ ์ฆ๊ถŒ ๋ฆฌ์„œ์น˜ ๋งํฌ- https://finance.naver.com/research/company_list.naver"


def _crawl_for_interface(url):
    """Adapt the crawler's result to the interface's two output components.

    Returns the page title for the "text" output and the rendered list of
    PDF links for the "html" output. Passing the crawler's raw return value
    straight through would show a Python list as text and render the bare
    title as HTML, leaving generate_html unused.
    """
    pdf_links, page_title = extract_pdf_links_and_title(url)
    return page_title, generate_html((pdf_links, page_title))


iface = gr.Interface(fn=_crawl_for_interface,
                     inputs="text",
                     outputs=["text", "html"],
                     title=title)
iface.launch()