import gradio as gr
import re
import requests
from bs4 import BeautifulSoup

def extract_pdf_links_and_title(url):
    # Fetch the page; a timeout keeps the request from hanging indefinitely
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the page title
    page_title = soup.title.text if soup.title else "No title found"

    # Collect anchors whose href points at a PDF file
    pdf_links = []
    for link in soup.find_all('a', href=True):
        if re.search(r'\.pdf', link['href']):
            pdf_links.append(link['href'])

    # Return up to the first 100 PDF links along with the page title
    return pdf_links[:100], page_title

def generate_html(pdf_links_and_title):
    pdf_links, page_title = pdf_links_and_title  # (list of PDF links, page title)

    html = f"<h1>{page_title}</h1>"  # add the page title as a heading
    for link in pdf_links:
        html += f'<a href="{link}" target="_blank" download>{link}</a><br/>'

    return html
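
# A hypothetical helper for quick manual checks outside the Gradio UI (it is
# never called by the app itself); it assumes the Naver listing URL used in
# the interface title below is reachable, and simply prints the page title
# and how many PDF links the extractor found.
def _smoke_test(url="https://finance.naver.com/research/company_list.naver"):
    links, page_title = extract_pdf_links_and_title(url)
    print(page_title, len(links))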

title = "Naver Finance research report links - https://finance.naver.com/research/company_list.naver"

def extract_and_render(url):
    # Run the extractor, then render the result through generate_html so the
    # "html" output shows clickable download links rather than the bare title
    pdf_links, page_title = extract_pdf_links_and_title(url)
    return pdf_links, generate_html((pdf_links, page_title))

iface = gr.Interface(fn=extract_and_render,
                     inputs="text",
                     outputs=["text", "html"],
                     title=title)

iface.launch()