File size: 2,703 Bytes
923dd57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# app.py

import gradio as gr
import requests
from bs4 import BeautifulSoup
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
import torch
import re

# Load the KoBART summarization tokenizer and model once, at import time.
# Both come from the same checkpoint, so the id is defined in a single place.
_CHECKPOINT = "gogamza/kobart-summarization"
tokenizer = PreTrainedTokenizerFast.from_pretrained(_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(_CHECKPOINT)

# Summarization function
_REQUEST_TIMEOUT = 10  # seconds; keeps a stalled server from hanging the UI
_REQUEST_HEADERS = {
    # Identify as a browser-like client; some sites refuse the default
    # python-requests User-Agent.
    "User-Agent": "Mozilla/5.0 (compatible; NewsSummarizer/1.0)",
}
_MIN_TEXT_CHARS = 30  # below this the extracted body is treated as a failure


def _extract_article_text(soup):
    """Return the whitespace-normalized article text found in parsed HTML.

    Lookup order: the first ``<article>`` element, then
    ``<div id="articleBody">``, then ``<div class="news_body">``, then all
    ``<p>`` elements longer than 40 characters, and finally the whole page.
    """
    # Remove non-content tags so script/style source never leaks into the text
    # (this matters most for the whole-page fallback).
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    container = (
        soup.find("article")
        or soup.find("div", id="articleBody")
        or soup.find("div", class_="news_body")
    )
    if container is not None:
        text = container.get_text()
    else:
        paragraphs = (p.get_text().strip() for p in soup.find_all("p"))
        text = " ".join(p for p in paragraphs if len(p) > 40)
        if len(text) < _MIN_TEXT_CHARS:
            text = soup.get_text()

    # A single \s+ pass collapses newlines, tabs and repeated spaces alike.
    return re.sub(r"\s+", " ", text).strip()


def summarize_news(url, min_len, max_len):
    """Fetch a news article from *url* and return a KoBART summary of it.

    Args:
        url: Address of the article page; must start with http:// or https://.
        min_len: Minimum summary length in tokens (converted with ``int``).
        max_len: Maximum summary length in tokens (converted with ``int``).

    Returns:
        The summary text, or a user-facing message when the URL is invalid,
        the article body could not be extracted, or any step raised.
        Exceptions are never propagated, so the UI callback always receives a
        string.
    """
    url = (url or "").strip()
    if not url.lower().startswith(("http://", "https://")):
        return "올바른 뉴스 URL을 입력해주세요. (http:// 또는 https://로 시작해야 합니다.)"

    try:
        res = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
        # Do not summarize 4xx/5xx error pages as if they were articles.
        res.raise_for_status()

        # Parse the raw bytes instead of res.text: when the server sends no
        # charset, requests assumes ISO-8859-1 and garbles Korean text. Honour
        # a declared charset; otherwise let BeautifulSoup detect the encoding.
        has_charset = "charset" in res.headers.get("Content-Type", "").lower()
        soup = BeautifulSoup(
            res.content,
            "html.parser",
            from_encoding=res.encoding if has_charset else None,
        )

        text = _extract_article_text(soup)
        if len(text) < _MIN_TEXT_CHARS:
            return "본문이 너무 짧거나 추출에 실패했습니다. 다른 뉴스 URL을 시도해보세요."

        # The two sliders are independent, so min can exceed max; clamp it to
        # keep the generation bounds consistent.
        max_tokens = int(max_len)
        min_tokens = min(int(min_len), max_tokens)

        input_ids = tokenizer.encode(
            text, return_tensors="pt", max_length=1024, truncation=True
        )
        summary_ids = model.generate(
            input_ids,
            max_length=max_tokens,
            min_length=min_tokens,
            num_beams=4,
            early_stopping=True,
            length_penalty=1.2,
            no_repeat_ngram_size=3,
            repetition_penalty=1.5,
        )
        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    except Exception as e:
        # UI boundary: report the failure as text instead of crashing the app.
        return f"오류 발생: {e}"

# Gradio UI: a URL box with a submit button, two length sliders, and a result box.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 📰 뉴스 요약기 (KoBART 기반)")
    gr.Markdown("뉴스 기사 URL을 입력하면 AI가 요약해줍니다.")

    # First row: article address plus the trigger button.
    with gr.Row():
        url_input = gr.Textbox(
            label="뉴스 URL",
            placeholder="https://news.naver.com/article/...",
            lines=1,
        )
        submit_btn = gr.Button(value="요약하기")

    # Second row: lower and upper bounds passed to the summarizer.
    with gr.Row():
        min_len = gr.Slider(
            minimum=20,
            maximum=200,
            value=50,
            step=10,
            label="최소 길이",
        )
        max_len = gr.Slider(
            minimum=50,
            maximum=400,
            value=150,
            step=10,
            label="최대 길이",
        )

    output = gr.Textbox(label="요약 결과", lines=10)

    # Clicking the button runs the summarizer and writes its string result
    # into the output box.
    submit_btn.click(
        fn=summarize_news,
        inputs=[url_input, min_len, max_len],
        outputs=output,
    )

# On Hugging Face Spaces the app is started this way.
if __name__ == "__main__":
    demo.launch()