urlcrawl / app.py
springwater's picture
Update app.py
51af61d verified
raw
history blame
1.24 kB
import gradio as gr
import re
import requests
from bs4 import BeautifulSoup
def extract_pdf_links(url, max_links=100, timeout=10):
    """Fetch *url* and return the PDF links found in its ``<a href>`` tags.

    Args:
        url: Address of the HTML page to scan.
        max_links: Maximum number of links to return (defaults to 100).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of at most ``max_links`` absolute URLs, in document order,
        whose href contains ``.pdf`` (matched case-insensitively).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled server would block the UI callback forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    # Resolve against the final URL (after redirects) so relative hrefs
    # become usable links once they are rendered outside the source page.
    base_url = response.url
    pdf_links = [
        urljoin(base_url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if '.pdf' in anchor['href'].lower()
    ]
    return pdf_links[:max_links]
def filter_links_by_keyword(pdf_links, keyword):
    """Return the links that contain *keyword* as a substring.

    The match is a plain, case-sensitive ``in`` test and the original order
    of ``pdf_links`` is kept.
    """
    matches = []
    for candidate in pdf_links:
        if keyword in candidate:
            matches.append(candidate)
    return matches
def generate_html(pdf_links):
    """Render *pdf_links* as a sequence of HTML download anchors.

    Each link becomes ``<a href="..." target="_blank" download>...</a><br/>``.
    The links come from scraped pages, so they are HTML-escaped before being
    placed in the attribute and in the visible text; otherwise a crafted href
    could break out of the attribute and inject markup.

    Args:
        pdf_links: Iterable of link strings.

    Returns:
        The concatenated HTML string, or ``""`` when there are no links.
    """
    # Imported by name because a module called ``html`` would be easy to
    # shadow with a local variable of the same name.
    from html import escape

    anchors = []
    for link in pdf_links:
        safe_link = escape(link, quote=True)
        anchors.append(
            f'<a href="{safe_link}" target="_blank" download>{safe_link}</a><br/>'
        )
    # A single join avoids repeated string reallocation from "+=" in a loop.
    return "".join(anchors)
def main(url, keyword):
    """Build the HTML list of PDF links found at *url*.

    When *keyword* is non-empty, only links containing it are kept; an empty
    keyword leaves the extracted list unfiltered.
    """
    found = extract_pdf_links(url)
    selected = filter_links_by_keyword(found, keyword) if keyword else found
    return generate_html(selected)
# Title shown at the top of the UI. The Korean text was restored from
# mis-decoded UTF-8 (mojibake) in the original literal.
title = "네이버 증권 리서치 링크 - https://finance.naver.com/research/company_list.naver"

# The original call passed ``inputs=`` three times, which Python rejects with
# "SyntaxError: keyword argument repeated". The two text inputs are declared
# once here, with the label strings attached to their textboxes.
iface = gr.Interface(
    main,
    inputs=[
        gr.Textbox(label="URL"),
        gr.Textbox(label="키워드(선택)"),
    ],
    # ``main`` returns anchor markup, so render it as HTML rather than showing
    # the raw tags as plain text.
    outputs="html",
    title=title,
)
iface.launch()