urlcrawl

Running

File size: 1,115 Bytes

738953f
43a0009
 
 
 
a280e58
43a0009
 
 
 
9eb2a18
cdbedd5
a280e58
 
0750144
a280e58
 
9eb2a18
 
cdbedd5
 
 
0750144
43a0009
 
a280e58
cdbedd5
0750144
a280e58
cdbedd5
0750144
43a0009

import html
import re

import gradio as gr
import requests
from bs4 import BeautifulSoup

# Matches only PDF reports stored under the research upload path.
# Raw string with escaped dots so that "." is matched literally.
_PDF_HREF_PATTERN = re.compile(
    r"^https://ssl\.pstatic\.net/imgstock/upload/research/company/.*\.pdf$"
)

# Upper bound (seconds) for the listing-page request.
_REQUEST_TIMEOUT_SECONDS = 10


def fetch_pdf_links_and_titles():
    """Scrape the Naver Finance company research list and render its PDF links.

    Downloads the listing page, collects every anchor whose ``href`` points
    at a PDF under the research upload path, drops repeated URLs (keeping the
    first occurrence), and renders one ``<div><a ...>...</a></div>`` per PDF.

    Returns:
        str: Concatenated HTML for all unique PDF links, in page order;
        an empty string when the page contains no matching link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://finance.naver.com/research/company_list.naver"
    # Without a timeout a stalled connection would block the UI callback.
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    seen_urls = set()
    fragments = []
    for link in soup.find_all('a', href=_PDF_HREF_PATTERN):
        full_url = link['href']
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)

        filename = full_url.rsplit('/', 1)[-1]
        # An anchor with no text would render as an invisible link, so fall
        # back to the file name as the visible label.
        title = link.text.strip() or filename
        # Everything interpolated below comes from a scraped page; escape it
        # so stray quotes or angle brackets cannot break or inject markup.
        fragments.append(
            f"<div><a href='{html.escape(full_url, quote=True)}' "
            f"download='{html.escape(filename, quote=True)}'>"
            f"{html.escape(title)}</a></div>"
        )
    return "".join(fragments)

# Gradio UI: one button plus an HTML component declared right after it;
# clicking the button fills the HTML component with the scraped PDF links.
with gr.Blocks() as app:
    btn_fetch = gr.Button(value="PDF 링크 및 정보 조회")
    output_html = gr.HTML()
    # The handler takes no inputs; its return value is written to `output_html`.
    btn_fetch.click(fetch_pdf_links_and_titles, inputs=None, outputs=output_html)

# Start the Gradio server when the script is executed.
app.launch()