# N_B_analysis-5 / app.py
# NOTE: the following metadata was Hugging Face Space page chrome captured by
# the scrape, preserved here as comments so the file remains valid Python:
#   uploader: Kims12 — "Update app.py" — commit 0fedf9f (verified) — ~42 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse # iframe ๊ฒฝ๋กœ ๋ณด์ •์„ ์œ„ํ•œ ๋ชจ๋“ˆ
import re
import logging
import tempfile
import pandas as pd
import mecab # python?mecab?ko ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์‚ฌ์šฉ
import os
import time
import hmac
import hashlib
import base64
from concurrent.futures import ThreadPoolExecutor, as_completed
# --- Parallelism settings ---
# Tune these to match the Naver API call limits.
# Values that are too high may run into API rate limiting.
MAX_WORKERS_RELATED_KEYWORDS = 5  # worker count for fetch_related_keywords
MAX_WORKERS_BLOG_COUNT = 10  # worker count for fetch_blog_count
# ๋””๋ฒ„๊น…(๋กœ๊ทธ)์šฉ ํ•จ์ˆ˜
def debug_log(message: str) -> None:
    """Print *message* to stdout with a timestamped ``[DEBUG]`` prefix."""
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')
    print(f"[{stamp}] [DEBUG] {message}")
# --- ๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํ•‘ ---
def scrape_naver_blog(url: str) -> str:
    """Scrape a Naver blog post and return its title and body as one string.

    Naver blogs usually render the real post inside an ``iframe#mainFrame``;
    this function follows that iframe and then tries several editor-specific
    CSS selectors for the title and body. Every failure path returns a
    human-readable (Korean) error string instead of raising.

    :param url: full URL of the Naver blog post.
    :return: formatted string containing the extracted title and body,
        or an error message.
    """
    debug_log("scrape_naver_blog ํ•จ์ˆ˜ ์‹œ์ž‘")
    debug_log(f"์š”์ฒญ๋ฐ›์€ URL: {url}")
    # Desktop Chrome User-Agent — Naver may serve different markup to bots.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/96.0.4664.110 Safari/537.36"
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        debug_log("HTTP GET ์š”์ฒญ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")
        if response.status_code != 200:
            debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
            return f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {response.status_code}"
        soup = BeautifulSoup(response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")
        iframe = soup.select_one("iframe#mainFrame")
        if not iframe:
            debug_log("iframe#mainFrame ํƒœ๊ทธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
            # Some blogs have no mainFrame iframe — try extracting the body
            # directly from the outer document (SmartEditor ONE layout).
            content_div_direct = soup.select_one('.se-main-container')
            if content_div_direct:
                title_div_direct = soup.select_one('.se-module.se-module-text.se-title-text')
                title = title_div_direct.get_text(strip=True) if title_div_direct else "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
                content = content_div_direct.get_text("\n", strip=True)
                debug_log("iframe ์—†์ด ๋ณธ๋ฌธ ์ง์ ‘ ์ถ”์ถœ ์™„๋ฃŒ")
                return f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
            return "๋ณธ๋ฌธ iframe์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. (๋ณธ๋ฌธ ์ง์ ‘ ์ถ”์ถœ ์‹คํŒจ)"
        iframe_src = iframe.get("src")
        if not iframe_src:
            debug_log("iframe src๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
            return "๋ณธ๋ฌธ iframe์˜ src๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        # Normalize the iframe src: it may be protocol-relative ("//..."),
        # host-relative ("/...") or already absolute.
        if iframe_src.startswith("//"):
            parsed_iframe_url = "https:" + iframe_src
        elif iframe_src.startswith("/"):
            parsed_main_url = urllib.parse.urlparse(url)
            parsed_iframe_url = urllib.parse.urlunparse(
                (parsed_main_url.scheme, parsed_main_url.netloc, iframe_src, None, None, None)
            )
        else:
            parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
        debug_log(f"iframe ํŽ˜์ด์ง€ ์š”์ฒญ URL: {parsed_iframe_url}")
        iframe_response = requests.get(parsed_iframe_url, headers=headers, timeout=10)
        debug_log("HTTP GET ์š”์ฒญ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
        if iframe_response.status_code != 200:
            debug_log(f"iframe ์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}")
            return f"iframe์—์„œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}"
        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
        debug_log("HTML ํŒŒ์‹ฑ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
        # Title extraction: editor-specific selectors first, generic headings last.
        title_selectors = [
            '.se-module.se-module-text.se-title-text',  # common SmartEditor ONE layout
            '.title_text',                              # legacy editor / other layouts
            'div[class*="title"] h3',
            'h1', 'h2', 'h3'                            # generic heading tags
        ]
        title = "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        for selector in title_selectors:
            title_div = iframe_soup.select_one(selector)
            if title_div:
                title = title_div.get_text(strip=True)
                break
        debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")
        # Body extraction: same strategy — editor-specific first, then semantic tags.
        content_selectors = [
            '.se-main-container',  # SmartEditor ONE
            'div#content',         # legacy editor
            'div.post_ct',         # some blog skins
            'article', 'main'      # semantic fallbacks
        ]
        content = "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
        for selector in content_selectors:
            content_div = iframe_soup.select_one(selector)
            if content_div:
                # Drop <script>/<style> so only visible text remains.
                for s in content_div(['script', 'style']):
                    s.decompose()
                content = content_div.get_text("\n", strip=True)
                break
        debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")
        result = f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
        debug_log("์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ ํ•ฉ์นจ ์™„๋ฃŒ")
        return result
    except requests.exceptions.Timeout:
        debug_log(f"์š”์ฒญ ์‹œ๊ฐ„ ์ดˆ๊ณผ: {url}")
        return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์‹œ๊ฐ„ ์ดˆ๊ณผ๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {url}"
    except Exception as e:
        debug_log(f"์Šคํฌ๋ž˜ํ•‘ ์—๋Ÿฌ ๋ฐœ์ƒ: {str(e)}")
        return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
# --- ํ˜•ํƒœ์†Œ ๋ถ„์„ (์ฐธ์กฐ์ฝ”๋“œ-1) ---
def analyze_text(text: str):
    """Morphologically analyze *text* with MeCab and count noun-like tokens.

    Returns a tuple ``(df, xlsx_path)`` where ``df`` has the word and
    frequency columns sorted by descending frequency, and ``xlsx_path`` is a
    temporary Excel export of that frame ("" when there is nothing to write
    or the export fails).
    """
    logging.basicConfig(level=logging.INFO)  # INFO level to avoid log spam
    logger = logging.getLogger(__name__)
    # Keep only Hangul syllables, ASCII letters, digits and whitespace.
    cleaned = re.sub(r'[^๊ฐ€-ํžฃa-zA-Z0-9\s]', '', text)
    if not cleaned.strip():
        logger.info("์œ ํšจํ•œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Œ (ํ•„ํ„ฐ๋ง ํ›„).")
        return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"]), ""
    try:
        tagged = mecab.MeCab().pos(cleaned)
    except Exception as e:
        logger.error(f"MeCab ํ˜•ํƒœ์†Œ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜: {e}")
        return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"]), ""
    counts = {}
    for token, tag in tagged:
        # Keep common/proper nouns (NN*) plus foreign words (SL) and
        # Chinese characters (SH); single-character tokens are dropped.
        noun_like = tag.startswith("NN") or tag in ["SL", "SH"]
        if token and token.strip() and noun_like and len(token) > 1:
            counts[token] = counts.get(token, 0) + 1
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    df = pd.DataFrame(ranked, columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"])
    logger.info(f"ํ˜•ํƒœ์†Œ ๋ถ„์„ DataFrame ์ƒ์„ฑ๋จ, shape: {df.shape}")
    temp_file_path = ""
    if not df.empty:
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx", mode='w+b') as temp_file:
                df.to_excel(temp_file.name, index=False, engine='openpyxl')
                temp_file_path = temp_file.name
            logger.info(f"Excel ํŒŒ์ผ ์ƒ์„ฑ๋จ: {temp_file_path}")
        except Exception as e:
            logger.error(f"Excel ํŒŒ์ผ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜: {e}")
            temp_file_path = ""  # reset the path on failure
    return df, temp_file_path
# --- ๋„ค์ด๋ฒ„ ๊ฒ€์ƒ‰ ๋ฐ ๊ด‘๊ณ  API ๊ด€๋ จ (์ฐธ์กฐ์ฝ”๋“œ-2) ---
def generate_signature(timestamp, method, uri, secret_key):
    """Build the base64 HMAC-SHA256 signature the Naver Search Ads API expects.

    The signing payload is ``"<timestamp>.<METHOD>.<uri>"``.
    """
    payload = f"{timestamp}.{method}.{uri}"
    mac = hmac.new(secret_key.encode("utf-8"), payload.encode("utf-8"), hashlib.sha256)
    return base64.b64encode(mac.digest()).decode()
def get_header(method, uri, api_key, secret_key, customer_id):
    """Assemble the authenticated request headers for the Naver Search Ads API.

    The timestamp is the current time in milliseconds; the matching
    signature is produced by ``generate_signature``.
    """
    ts = str(round(time.time() * 1000))
    sig = generate_signature(ts, method, uri, secret_key)
    headers = {
        "Content-Type": "application/json; charset=UTF-8",
        "X-Timestamp": ts,
        "X-API-KEY": api_key,
        "X-Customer": str(customer_id),
        "X-Signature": sig,
    }
    return headers
# API ํ‚ค ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ํ™•์ธ ํ•จ์ˆ˜
def get_env_variable(var_name):
value = os.environ.get(var_name)
if value is None:
debug_log(f"ํ™˜๊ฒฝ ๋ณ€์ˆ˜ '{var_name}'๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. API ํ˜ธ์ถœ์ด ์‹คํŒจํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
# ํ•„์š”์‹œ ์—ฌ๊ธฐ์„œ raise Exception ๋˜๋Š” ๊ธฐ๋ณธ๊ฐ’ ๋ฐ˜ํ™˜
return value
def fetch_related_keywords(keyword):
    """Query the Naver Search Ads keyword tool for keywords related to *keyword*.

    Returns a DataFrame with the keyword, PC/mobile/total monthly search
    volume columns (renamed to Korean labels), capped at 100 rows. Returns
    an empty DataFrame when API credentials are missing, the API returns no
    results, or any request/parse error occurs.
    """
    debug_log(f"fetch_related_keywords ํ˜ธ์ถœ ์‹œ์ž‘, ํ‚ค์›Œ๋“œ: {keyword}")
    API_KEY = get_env_variable("NAVER_API_KEY")
    SECRET_KEY = get_env_variable("NAVER_SECRET_KEY")
    CUSTOMER_ID = get_env_variable("NAVER_CUSTOMER_ID")
    if not all([API_KEY, SECRET_KEY, CUSTOMER_ID]):
        debug_log(f"๋„ค์ด๋ฒ„ ๊ด‘๊ณ  API ํ‚ค ์ •๋ณด ๋ถ€์กฑ์œผ๋กœ '{keyword}' ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ๋ฅผ ๊ฑด๋„ˆ<0xEB><0xB5>๋‹ˆ๋‹ค.")
        return pd.DataFrame()
    BASE_URL = "https://api.naver.com"
    uri = "/keywordstool"
    method = "GET"
    try:
        headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
        params = {
            "hintKeywords": keyword,  # passed as a single keyword string
            "showDetail": "1"
        }
        # hintKeywords can also be a list; a single string is assumed here.
        # If the API only accepts lists, change to [keyword].
        response = requests.get(BASE_URL + uri, params=params, headers=headers, timeout=10)
        response.raise_for_status()  # raise on HTTP error status
        data = response.json()
        if "keywordList" not in data or not data["keywordList"]:
            debug_log(f"'{keyword}'์— ๋Œ€ํ•œ ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ๊ฒฐ๊ณผ ์—†์Œ.")
            return pd.DataFrame()  # empty result
        df = pd.DataFrame(data["keywordList"])
        # Guard against the volume columns being absent from the response.
        df["monthlyPcQcCnt"] = df.get("monthlyPcQcCnt", 0)
        df["monthlyMobileQcCnt"] = df.get("monthlyMobileQcCnt", 0)
        def parse_count(x):
            # The Naver API reports "< 10" for volumes under 10; map that
            # (and NaN) to the representative value 5.
            if pd.isna(x) or str(x).lower() == '< 10':
                return 5
            try:
                return int(str(x).replace(",", ""))
            except ValueError:
                return 0
        df["PC์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["monthlyPcQcCnt"].apply(parse_count)
        df["๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["monthlyMobileQcCnt"].apply(parse_count)
        df["ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["PC์›”๊ฒ€์ƒ‰๋Ÿ‰"] + df["๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰"]
        df.rename(columns={"relKeyword": "์ •๋ณดํ‚ค์›Œ๋“œ"}, inplace=True)
        # Keep only the required columns, tolerating missing ones.
        required_cols = ["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"]
        result_df = pd.DataFrame(columns=required_cols)
        for col in required_cols:
            if col in df.columns:
                result_df[col] = df[col]
            else:  # column missing from the API response — fill with defaults
                if col == "์ •๋ณดํ‚ค์›Œ๋“œ":  # the keyword column is mandatory
                    debug_log(f"API ์‘๋‹ต์— 'relKeyword'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. '{keyword}' ์ฒ˜๋ฆฌ ์ค‘๋‹จ.")
                    return pd.DataFrame()
                result_df[col] = 0
        debug_log(f"fetch_related_keywords '{keyword}' ์™„๋ฃŒ, ๊ฒฐ๊ณผ {len(result_df)}๊ฐœ")
        return result_df.head(100)  # cap at 100 rows
    except requests.exceptions.HTTPError as http_err:
        debug_log(f"HTTP ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_related_keywords for '{keyword}'): {http_err} - ์‘๋‹ต: {response.text if 'response' in locals() else 'N/A'}")
    except requests.exceptions.RequestException as req_err:
        debug_log(f"์š”์ฒญ ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_related_keywords for '{keyword}'): {req_err}")
    except Exception as e:
        debug_log(f"์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_related_keywords for '{keyword}'): {e}")
    return pd.DataFrame()  # empty DataFrame on any error
def fetch_blog_count(keyword):
    """Return the total number of Naver blog documents matching *keyword*.

    Uses the Naver Open API blog search endpoint; returns 0 when the API
    credentials are missing or any request/parse error occurs.
    """
    debug_log(f"fetch_blog_count ํ˜ธ์ถœ, ํ‚ค์›Œ๋“œ: {keyword}")
    client_id = get_env_variable("NAVER_SEARCH_CLIENT_ID")
    client_secret = get_env_variable("NAVER_SEARCH_CLIENT_SECRET")
    if not (client_id and client_secret):
        debug_log(f"๋„ค์ด๋ฒ„ ๊ฒ€์ƒ‰ API ํ‚ค ์ •๋ณด ๋ถ€์กฑ์œผ๋กœ '{keyword}' ๋ธ”๋กœ๊ทธ ์ˆ˜ ์กฐํšŒ๋ฅผ ๊ฑด๋„ˆ<0xEB><0xB5>๋‹ˆ๋‹ค.")
        return 0
    endpoint = "https://openapi.naver.com/v1/search/blog.json"
    auth_headers = {
        "X-Naver-Client-Id": client_id,
        "X-Naver-Client-Secret": client_secret,
    }
    # display=1: only the "total" field is needed, so ask for a single item.
    query_params = {"query": keyword, "display": 1}
    try:
        resp = requests.get(endpoint, headers=auth_headers, params=query_params, timeout=5)
        resp.raise_for_status()
        total = resp.json().get("total", 0)
        debug_log(f"fetch_blog_count ๊ฒฐ๊ณผ: {total} for '{keyword}'")
        return total
    except requests.exceptions.HTTPError as http_err:
        debug_log(f"HTTP ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_blog_count for '{keyword}'): {http_err} - ์‘๋‹ต: {resp.text}")
    except requests.exceptions.RequestException as req_err:  # Timeout, ConnectionError, ...
        debug_log(f"์š”์ฒญ ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_blog_count for '{keyword}'): {req_err}")
    except Exception as e:  # JSONDecodeError and other unexpected failures
        debug_log(f"์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_blog_count for '{keyword}'): {e}")
    return 0  # any failure maps to a count of zero
def create_excel_file(df):
    """Write *df* to a temporary ``.xlsx`` file and return the file path.

    Always returns a path, because the Gradio File component expects one.

    Fix over the previous version: an empty DataFrame used to produce a
    zero-byte file with an ``.xlsx`` suffix, which is not a valid workbook
    and cannot be opened. Now the frame is always written through pandas,
    so an empty input yields a valid header-only workbook.
    """
    try:
        # Reserve a temp path first, then let pandas write to it.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False, mode='w+b') as tmp:
            excel_path = tmp.name
        # pandas handles empty frames fine: it emits just the header row.
        df.to_excel(excel_path, index=False, engine='openpyxl')
        debug_log(f"Excel ํŒŒ์ผ ์ƒ์„ฑ๋จ: {excel_path}")
        return excel_path
    except Exception as e:
        debug_log(f"Excel ํŒŒ์ผ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜: {e}")
        # Last-resort fallback: still hand Gradio a file path, even though
        # the workbook could not be written.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            return tmp.name
def process_keyword(keywords: str, include_related: bool):
    """Resolve newline-separated *keywords* into a search-volume table.

    Two-stage pipeline, each stage fanned out over a thread pool:
      1. fetch_related_keywords per input keyword (related-keyword lookup);
      2. fetch_blog_count per unique resulting keyword (blog document count).

    When *include_related* is True, ALL related keywords of the FIRST input
    keyword are kept; for every other keyword only its own row (or the top
    related row as a fallback) is kept.

    :return: ``(DataFrame, xlsx_path)`` — the frame has keyword, PC/mobile/
        total monthly volume and blog-count columns, sorted by total volume.
    """
    debug_log(f"process_keyword ํ˜ธ์ถœ ์‹œ์ž‘, ํ‚ค์›Œ๋“œ๋“ค: '{keywords[:100]}...', ์—ฐ๊ด€๊ฒ€์ƒ‰์–ด ํฌํ•จ: {include_related}")
    input_keywords_orig = [k.strip() for k in keywords.splitlines() if k.strip()]
    if not input_keywords_orig:
        debug_log("์ž…๋ ฅ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
        return pd.DataFrame(columns=["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]), ""
    all_related_keywords_dfs = []
    # Stage 1: related-keyword lookups in parallel.
    debug_log(f"์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์‹œ์ž‘ (์ตœ๋Œ€ ์ž‘์—…์ž ์ˆ˜: {MAX_WORKERS_RELATED_KEYWORDS})")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS_RELATED_KEYWORDS) as executor:
        future_to_keyword_related = {
            executor.submit(fetch_related_keywords, kw): kw for kw in input_keywords_orig
        }
        for i, future in enumerate(as_completed(future_to_keyword_related)):
            kw = future_to_keyword_related[future]
            try:
                df_kw_related = future.result()  # DataFrame per keyword
                if not df_kw_related.empty:
                    # The API searches on the hint keyword, so the keyword
                    # itself usually appears among the related rows — the
                    # response is used as-is.
                    if include_related and kw == input_keywords_orig[0]:
                        # First input keyword with the related option on:
                        # keep every related keyword.
                        all_related_keywords_dfs.append(df_kw_related)
                        debug_log(f"์ฒซ ๋ฒˆ์งธ ํ‚ค์›Œ๋“œ '{kw}'์˜ ๋ชจ๋“  ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ({len(df_kw_related)}๊ฐœ) ์ถ”๊ฐ€๋จ.")
                    else:
                        # Otherwise keep the row matching the keyword, or —
                        # failing that — the first (most related) row.
                        row_kw = df_kw_related[df_kw_related["์ •๋ณดํ‚ค์›Œ๋“œ"] == kw]
                        if not row_kw.empty:
                            all_related_keywords_dfs.append(row_kw)
                            debug_log(f"ํ‚ค์›Œ๋“œ '{kw}'์˜ ์ง์ ‘ ์ •๋ณด ์ถ”๊ฐ€๋จ.")
                        elif not df_kw_related.empty:  # no direct row, but related rows exist
                            all_related_keywords_dfs.append(df_kw_related.head(1))
                            debug_log(f"ํ‚ค์›Œ๋“œ '{kw}'์˜ ์ง์ ‘ ์ •๋ณด๋Š” ์—†์œผ๋‚˜, ๊ฐ€์žฅ ์—ฐ๊ด€์„ฑ ๋†’์€ ํ‚ค์›Œ๋“œ 1๊ฐœ ์ถ”๊ฐ€๋จ.")
                        # else: neither direct nor related info (df is empty)
                debug_log(f"'{kw}' ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ ({i+1}/{len(input_keywords_orig)})")
            except Exception as e:
                debug_log(f"'{kw}' ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ ์ค‘ ๋ณ‘๋ ฌ ์ž‘์—… ์˜ค๋ฅ˜: {e}")
    if not all_related_keywords_dfs:
        debug_log("์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ ๊ฒฐ๊ณผ๊ฐ€ ๋ชจ๋‘ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค.")
        # Return an empty frame that still carries the blog-count column.
        empty_df = pd.DataFrame(columns=["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"])
        empty_df["๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"] = None
        return empty_df, create_excel_file(empty_df)
    result_df = pd.concat(all_related_keywords_dfs, ignore_index=True)
    result_df.drop_duplicates(subset=["์ •๋ณดํ‚ค์›Œ๋“œ"], inplace=True)  # dedupe by keyword
    debug_log(f"์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์™„๋ฃŒ. ํ†ตํ•ฉ๋œ DataFrame shape: {result_df.shape}")
    # Stage 2: blog document counts in parallel, one call per unique keyword.
    keywords_for_blog_count = result_df["์ •๋ณดํ‚ค์›Œ๋“œ"].dropna().unique().tolist()
    blog_counts_map = {}
    if keywords_for_blog_count:
        debug_log(f"๋ธ”๋กœ๊ทธ ๋ฌธ์„œ ์ˆ˜ ์กฐํšŒ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์‹œ์ž‘ (ํ‚ค์›Œ๋“œ {len(keywords_for_blog_count)}๊ฐœ, ์ตœ๋Œ€ ์ž‘์—…์ž ์ˆ˜: {MAX_WORKERS_BLOG_COUNT})")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS_BLOG_COUNT) as executor:
            future_to_keyword_blog = {
                executor.submit(fetch_blog_count, kw): kw for kw in keywords_for_blog_count
            }
            for i, future in enumerate(as_completed(future_to_keyword_blog)):
                kw = future_to_keyword_blog[future]
                try:
                    count = future.result()  # integer count
                    blog_counts_map[kw] = count
                    if (i+1) % 50 == 0:  # throttle progress logging
                        debug_log(f"๋ธ”๋กœ๊ทธ ์ˆ˜ ์กฐํšŒ ์ง„ํ–‰ ์ค‘... ({i+1}/{len(keywords_for_blog_count)})")
                except Exception as e:
                    debug_log(f"'{kw}' ๋ธ”๋กœ๊ทธ ์ˆ˜ ์กฐํšŒ ์ค‘ ๋ณ‘๋ ฌ ์ž‘์—… ์˜ค๋ฅ˜: {e}")
                    blog_counts_map[kw] = 0  # treat failures as zero
        result_df["๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"] = result_df["์ •๋ณดํ‚ค์›Œ๋“œ"].map(blog_counts_map).fillna(0).astype(int)
        debug_log("๋ธ”๋กœ๊ทธ ๋ฌธ์„œ ์ˆ˜ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์™„๋ฃŒ.")
    else:
        result_df["๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"] = 0  # nothing to look up
    result_df.sort_values(by="ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", ascending=False, inplace=True)
    debug_log(f"process_keyword ์ตœ์ข… ์™„๋ฃŒ. DataFrame shape: {result_df.shape}")
    # Fix the final column set and order, filling any missing column.
    final_columns = ["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
    for col in final_columns:
        if col not in result_df.columns:
            result_df[col] = 0 if col != "์ •๋ณดํ‚ค์›Œ๋“œ" else ""  # default-fill missing columns
    result_df = result_df[final_columns]  # fixed column order
    return result_df, create_excel_file(result_df)
# --- ํ˜•ํƒœ์†Œ ๋ถ„์„๊ณผ ๊ฒ€์ƒ‰๋Ÿ‰/๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜ ๋ณ‘ํ•ฉ ---
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
debug_log("morphological_analysis_and_enrich ํ•จ์ˆ˜ ์‹œ์ž‘")
df_freq, _ = analyze_text(text) # ์—‘์…€ ํŒŒ์ผ ๊ฒฝ๋กœ๋Š” ์—ฌ๊ธฐ์„  ์‚ฌ์šฉ ์•ˆ ํ•จ
if df_freq.empty:
debug_log("ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ๊ฐ€ ๋นˆ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์ž…๋‹ˆ๋‹ค.")
return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]), ""
if remove_freq1:
before_count = len(df_freq)
df_freq = df_freq[df_freq["๋นˆ๋„์ˆ˜"] > 1].copy() # .copy() ์ถ”๊ฐ€
debug_log(f"๋นˆ๋„์ˆ˜ 1 ์ œ๊ฑฐ ์ ์šฉ๋จ. {before_count} -> {len(df_freq)}")
if df_freq.empty:
debug_log("๋นˆ๋„์ˆ˜ 1 ์ œ๊ฑฐ ํ›„ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]), ""
keywords_from_morph = "\n".join(df_freq["๋‹จ์–ด"].tolist())
debug_log(f"ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ ({len(df_freq['๋‹จ์–ด'])}๊ฐœ)์— ๋Œ€ํ•œ ์ •๋ณด ์กฐํšŒ ์‹œ์ž‘")
# process_keyword๋Š” ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ๋ฅผ ํฌํ•จํ•˜์ง€ ์•Š๋„๋ก ํ˜ธ์ถœ (include_related=False)
df_keyword_info, _ = process_keyword(keywords_from_morph, include_related=False)
debug_log("ํ˜•ํƒœ์†Œ ๋ถ„์„ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•œ ๊ฒ€์ƒ‰๋Ÿ‰ ๋ฐ ๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜ ์กฐํšŒ ์™„๋ฃŒ")
if df_keyword_info.empty:
debug_log("ํ˜•ํƒœ์†Œ ๋ถ„์„ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•œ API ์ •๋ณด ์กฐํšŒ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
# df_freq์— ๋นˆ ์ปฌ๋Ÿผ๋“ค ์ถ”๊ฐ€
for col in ["PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]:
df_freq[col] = None
merged_df = df_freq
else:
merged_df = pd.merge(df_freq, df_keyword_info, left_on="๋‹จ์–ด", right_on="์ •๋ณดํ‚ค์›Œ๋“œ", how="left")
if "์ •๋ณดํ‚ค์›Œ๋“œ" in merged_df.columns: # merge ํ›„ ์ •๋ณดํ‚ค์›Œ๋“œ ์ปฌ๋Ÿผ์ด ์ƒ๊ฒผ๋‹ค๋ฉด ์‚ญ์ œ
merged_df.drop(columns=["์ •๋ณดํ‚ค์›Œ๋“œ"], inplace=True, errors='ignore')
# ๋ˆ„๋ฝ๋œ ์ปฌ๋Ÿผ ๊ธฐ๋ณธ๊ฐ’์œผ๋กœ ์ฑ„์šฐ๊ธฐ
expected_cols = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
for col in expected_cols:
if col not in merged_df.columns:
merged_df[col] = None if col not in ["๋นˆ๋„์ˆ˜"] else 0
merged_df = merged_df[expected_cols] # ์ปฌ๋Ÿผ ์ˆœ์„œ ๊ณ ์ •
merged_excel_path = create_excel_file(merged_df)
debug_log("morphological_analysis_and_enrich ํ•จ์ˆ˜ ์™„๋ฃŒ")
return merged_df, merged_excel_path
# --- ์ง์ ‘ ํ‚ค์›Œ๋“œ ๋ถ„์„ (๋‹จ๋… ๋ถ„์„) ---
def direct_keyword_analysis(text: str, keyword_input: str):
    """Analyze user-supplied keywords against the blog *text* standalone.

    *keyword_input* is split on newlines/commas; for each keyword the exact
    (case-sensitive) occurrence count in *text* is computed, then search
    volume and blog counts are fetched via ``process_keyword``.

    :return: ``(DataFrame, xlsx_path)`` with keyword, frequency and API
        columns in a fixed order.
    """
    debug_log("direct_keyword_analysis ํ•จ์ˆ˜ ์‹œ์ž‘")
    direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', keyword_input) if kw.strip()]
    debug_log(f"์ž…๋ ฅ๋œ ์ง์ ‘ ํ‚ค์›Œ๋“œ ๋ชฉ๋ก: {direct_keywords_list}")
    if not direct_keywords_list:
        debug_log("์ง์ ‘ ์ž…๋ ฅ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
        return pd.DataFrame(columns=["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜"]), ""
    # 1. Count occurrences of each keyword in the body text.
    results_freq = []
    for kw in direct_keywords_list:
        count = text.count(kw)  # case-sensitive, exact substring count
        results_freq.append({"ํ‚ค์›Œ๋“œ": kw, "๋นˆ๋„์ˆ˜": count})
        debug_log(f"์ง์ ‘ ํ‚ค์›Œ๋“œ '{kw}'์˜ ๋ณธ๋ฌธ ๋‚ด ๋นˆ๋„์ˆ˜: {count}")
    df_direct_freq = pd.DataFrame(results_freq)
    # 2. Fetch search volume and blog counts through the parallel
    #    process_keyword pipeline; only the keywords themselves are needed,
    #    so include_related=False.
    keywords_for_api = "\n".join(direct_keywords_list)
    df_direct_api_info, _ = process_keyword(keywords_for_api, include_related=False)
    # 3. Merge the frequency results with the API results.
    if not df_direct_api_info.empty:
        # Rename the API keyword column so both frames share the merge key.
        df_direct_api_info.rename(columns={"์ •๋ณดํ‚ค์›Œ๋“œ": "ํ‚ค์›Œ๋“œ"}, inplace=True)
        merged_df = pd.merge(df_direct_freq, df_direct_api_info, on="ํ‚ค์›Œ๋“œ", how="left")
    else:
        merged_df = df_direct_freq
        for col in ["PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]:
            merged_df[col] = None  # no API data — add empty columns
    # Default-fill missing columns and fix the column order.
    final_cols = ["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
    for col in final_cols:
        if col not in merged_df.columns:
            merged_df[col] = 0 if col != "ํ‚ค์›Œ๋“œ" else ""
    merged_df = merged_df[final_cols]
    excel_path = create_excel_file(merged_df)
    debug_log("direct_keyword_analysis ํ•จ์ˆ˜ ์™„๋ฃŒ")
    return merged_df, excel_path
# --- ํ†ตํ•ฉ ๋ถ„์„ (ํ˜•ํƒœ์†Œ ๋ถ„์„ + ์ง์ ‘ ํ‚ค์›Œ๋“œ ๋ถ„์„) ---
# --- Combined analysis (morphological analysis + direct keywords) ---
def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
    """Combine MeCab-based frequency analysis with user-entered keywords.

    The morphological result is taken as the base; direct keywords already
    present there are only flagged, while direct keywords missing from it
    are appended with their own frequency/API data. Rows flagged as direct
    input sort first, then by descending frequency.

    :return: ``(DataFrame, xlsx_path)`` with an extra direct-input flag column.
    """
    debug_log("combined_analysis ํ•จ์ˆ˜ ์‹œ์ž‘")
    # 1. Morphological-analysis result, already enriched with API data.
    df_morph, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
    # df_morph columns: word, frequency, PC/mobile/total volume, blog count.
    # 2. Parse the direct keyword input (newline- or comma-separated).
    direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', direct_keyword_input) if kw.strip()]
    debug_log(f"ํ†ตํ•ฉ ๋ถ„์„ - ์ž…๋ ฅ๋œ ์ง์ ‘ ํ‚ค์›Œ๋“œ: {direct_keywords_list}")
    if not direct_keywords_list:  # no direct keywords: return the morph result alone
        if "์ง์ ‘์ž…๋ ฅ" not in df_morph.columns and not df_morph.empty:
            df_morph["์ง์ ‘์ž…๋ ฅ"] = ""  # add the (empty) direct-input flag column
        # Fix column order, default-filling any missing column.
        cols = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"]
        for col in cols:
            if col not in df_morph.columns:
                df_morph[col] = "" if col == "์ง์ ‘์ž…๋ ฅ" else (0 if col != "๋‹จ์–ด" else "")
        df_morph = df_morph[cols]
        return df_morph, create_excel_file(df_morph)
    # Fetch frequency + API info for the direct keywords; its "keyword"
    # column is renamed to match df_morph's word column for merging.
    df_direct_raw, _ = direct_keyword_analysis(blog_text, direct_keyword_input)
    # df_direct_raw columns mirror df_morph's (keyword, frequency, API data).
    df_direct_raw.rename(columns={"ํ‚ค์›Œ๋“œ": "๋‹จ์–ด"}, inplace=True)  # unify the key column name
    # Flag rows of the morph result that were also entered directly.
    if not df_morph.empty:
        df_morph["์ง์ ‘์ž…๋ ฅ"] = df_morph["๋‹จ์–ด"].apply(lambda x: "์ง์ ‘์ž…๋ ฅ" if x in direct_keywords_list else "")
    else:  # the morphological result may be empty
        df_morph = pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"])
    # Direct keywords absent from the morph result are appended from
    # df_direct_raw. NOTE(review): for words present in BOTH frames the
    # morph-side data wins — the two frequency definitions differ (MeCab
    # token count vs. raw substring count), and df_morph is kept as-is.
    df_direct_raw["์ง์ ‘์ž…๋ ฅ"] = "์ง์ ‘์ž…๋ ฅ"  # every direct row carries the flag
    # Collect the direct-only rows, i.e. words not already in df_morph.
    morph_words = df_morph['๋‹จ์–ด'].tolist() if not df_morph.empty else []
    rows_to_add = []
    for idx, row in df_direct_raw.iterrows():
        if row['๋‹จ์–ด'] not in morph_words:
            rows_to_add.append(row)
    if rows_to_add:
        df_to_add = pd.DataFrame(rows_to_add)
        combined_df = pd.concat([df_morph, df_to_add], ignore_index=True)
    else:
        combined_df = df_morph.copy()  # df_morph itself may be empty here
    # Final column cleanup and ordering.
    final_cols_combined = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"]
    for col in final_cols_combined:
        if col not in combined_df.columns:
            # Defaults: flag column -> "", frequency -> 0, word -> "",
            # API columns -> None (Gradio renders None fine).
            if col == "์ง์ ‘์ž…๋ ฅ":
                combined_df[col] = ""
            elif col == "๋นˆ๋„์ˆ˜":
                combined_df[col] = 0
            elif col == "๋‹จ์–ด":
                combined_df[col] = ""
            else:  # API-derived columns
                combined_df[col] = None  # pd.NA would also work
    # API values are left as-is (already normalized in process_keyword);
    # only the frequency column is forced to int.
    if "๋นˆ๋„์ˆ˜" in combined_df.columns:
        combined_df["๋นˆ๋„์ˆ˜"] = combined_df["๋นˆ๋„์ˆ˜"].fillna(0).astype(int)
    combined_df = combined_df[final_cols_combined].drop_duplicates(subset=['๋‹จ์–ด'], keep='first')  # defensive dedupe
    combined_df.sort_values(by=["์ง์ ‘์ž…๋ ฅ", "๋นˆ๋„์ˆ˜"], ascending=[False, False], inplace=True, na_position='last')  # flagged rows first, then by frequency
    combined_df.reset_index(drop=True, inplace=True)
    combined_excel = create_excel_file(combined_df)
    debug_log("combined_analysis ํ•จ์ˆ˜ ์™„๋ฃŒ")
    return combined_df, combined_excel
# --- ๋ถ„์„ ํ•ธ๋“ค๋Ÿฌ ---
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
debug_log(f"analysis_handler ํ•จ์ˆ˜ ์‹œ์ž‘. ์ง์ ‘ ํ‚ค์›Œ๋“œ๋งŒ ๋ถ„์„: {direct_keyword_only}")
start_time = time.time()
if not blog_text or blog_text.strip() == "์Šคํฌ๋ž˜ํ•‘๋œ ๋ธ”๋กœ๊ทธ ๋‚ด์šฉ์ด ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค." or blog_text.strip() == "":
debug_log("๋ถ„์„ํ•  ๋ธ”๋กœ๊ทธ ๋‚ด์šฉ์ด ์—†์Šต๋‹ˆ๋‹ค.")
# ๋นˆ ๊ฒฐ๊ณผ๋ฅผ ๋ฐ˜ํ™˜ํ•˜๊ธฐ ์œ„ํ•œ DataFrame ๊ตฌ์กฐ ๋ช…์‹œ
empty_cols_direct = ["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
empty_cols_combined = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"]
df_empty = pd.DataFrame(columns=empty_cols_direct if direct_keyword_only else empty_cols_combined)
return df_empty, create_excel_file(df_empty)
if direct_keyword_only:
# "์ง์ ‘ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ๋งŒ ๋ถ„์„" ์„ ํƒ ์‹œ ๋‹จ๋… ๋ถ„์„ ์ˆ˜ํ–‰
if not direct_keyword_input or not direct_keyword_input.strip():
debug_log("์ง์ ‘ ํ‚ค์›Œ๋“œ๋งŒ ๋ถ„์„ ์„ ํƒ๋˜์—ˆ์œผ๋‚˜, ์ž…๋ ฅ๋œ ์ง์ ‘ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
empty_cols_direct = ["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
df_empty = pd.DataFrame(columns=empty_cols_direct)
return df_empty, create_excel_file(df_empty)
result_df, excel_path = direct_keyword_analysis(blog_text, direct_keyword_input)
else:
# ๊ธฐ๋ณธ ํ†ตํ•ฉ ๋ถ„์„ ์ˆ˜ํ–‰
result_df, excel_path = combined_analysis(blog_text, remove_freq1, direct_keyword_input)
end_time = time.time()
debug_log(f"analysis_handler ์ด ์‹คํ–‰ ์‹œ๊ฐ„: {end_time - start_time:.2f} ์ดˆ")
return result_df, excel_path
# --- ์Šคํฌ๋ž˜ํ•‘ ์‹คํ–‰ ---
def fetch_blog_content(url: str):
debug_log("fetch_blog_content ํ•จ์ˆ˜ ์‹œ์ž‘")
if not url or not url.strip():
return "๋ธ”๋กœ๊ทธ URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”."
if not url.startswith("http://") and not url.startswith("https://"):
return "์œ ํšจํ•œ URL ํ˜•์‹(http:// ๋˜๋Š” https://)์œผ๋กœ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”."
start_time = time.time()
content = scrape_naver_blog(url)
end_time = time.time()
debug_log(f"fetch_blog_content ์ด ์‹คํ–‰ ์‹œ๊ฐ„: {end_time - start_time:.2f} ์ดˆ. ๋‚ด์šฉ ๊ธธ์ด: {len(content)}")
return content
# --- Custom CSS ---
# Stylesheet injected into the Gradio app via `gr.Blocks(css=custom_css)`.
# Styles the overall container, header, group boxes, buttons, checkboxes,
# result area, and a flexbox centering helper. The CSS comments inside the
# literal are runtime data and are intentionally left untouched.
custom_css = """
/* ์ „์ฒด ์ปจํ…Œ์ด๋„ˆ ์Šคํƒ€์ผ */
.gradio-container {
max-width: 1080px; /* ๋„ˆ๋น„ ํ™•์žฅ */
margin: auto;
font-family: 'Helvetica Neue', Arial, sans-serif;
background: #f5f7fa;
padding: 2rem;
}
/* ํ—ค๋” ์Šคํƒ€์ผ */
.custom-header {
text-align: center;
font-size: 2.5rem;
font-weight: bold;
margin-bottom: 1.5rem;
color: #333;
}
/* ๊ทธ๋ฃน ๋ฐ•์Šค ์Šคํƒ€์ผ */
.custom-group {
background: #ffffff;
border-radius: 8px;
padding: 1.5rem;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
margin-bottom: 1.5rem;
}
/* ๋ฒ„ํŠผ ์Šคํƒ€์ผ */
.custom-button {
background-color: #007bff;
color: #fff;
border: none;
border-radius: 4px;
padding: 0.6rem 1.2rem;
font-size: 1rem;
cursor: pointer;
min-width: 150px; /* ๋ฒ„ํŠผ ์ตœ์†Œ ๋„ˆ๋น„ */
}
.custom-button:hover {
background-color: #0056b3;
}
/* ์ฒดํฌ๋ฐ•์Šค ์Šคํƒ€์ผ */
.custom-checkbox {
margin-right: 1rem;
}
/* ๊ฒฐ๊ณผ ํ…Œ์ด๋ธ” ๋ฐ ๋‹ค์šด๋กœ๋“œ ๋ฒ„ํŠผ */
.custom-result {
margin-top: 1.5rem;
}
/* ๊ฐ€์šด๋ฐ ์ •๋ ฌ */
.centered {
display: flex;
justify-content: center;
align-items: center;
}
"""
# --- Gradio ์ธํ„ฐํŽ˜์ด์Šค ๊ตฌ์„ฑ ---
with gr.Blocks(title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ํ‚ค์›Œ๋“œ ๋ถ„์„ ์„œ๋น„์Šค", css=custom_css) as demo:
gr.HTML("<div class='custom-header'>๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ํ‚ค์›Œ๋“œ ๋ถ„์„ ์„œ๋น„์Šค</div>")
with gr.Row():
with gr.Column(scale=2): # ์™ผ์ชฝ ์ปฌ๋Ÿผ (์ž…๋ ฅ ์˜์—ญ)
with gr.Group(elem_classes="custom-group"):
blog_url_input = gr.Textbox(
label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
placeholder="์˜ˆ: https://blog.naver.com/์•„์ด๋””/๊ธ€๋ฒˆํ˜ธ",
lines=1,
info="๋ถ„์„ํ•  ๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๊ฒŒ์‹œ๋ฌผ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."
)
with gr.Row(elem_classes="centered"):
scrape_button = gr.Button("๋ธ”๋กœ๊ทธ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ", elem_classes="custom-button", variant="primary")
with gr.Group(elem_classes="custom-group"):
blog_content_box = gr.Textbox(
label="๋ธ”๋กœ๊ทธ ๋‚ด์šฉ (์ˆ˜์ • ๊ฐ€๋Šฅ)",
lines=10,
placeholder="์Šคํฌ๋ž˜ํ•‘๋œ ๋ธ”๋กœ๊ทธ ๋‚ด์šฉ์ด ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค. ์ง์ ‘ ์ˆ˜์ •ํ•˜๊ฑฐ๋‚˜ ๋ถ™์—ฌ๋„ฃ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค."
)
with gr.Group(elem_classes="custom-group"):
gr.Markdown("### ๋ถ„์„ ์˜ต์…˜ ์„ค์ •")
with gr.Row():
remove_freq_checkbox = gr.Checkbox(
label="๋นˆ๋„์ˆ˜ 1์ธ ๋‹จ์–ด ์ œ๊ฑฐ (ํ˜•ํƒœ์†Œ ๋ถ„์„ ์‹œ)",
value=True,
elem_classes="custom-checkbox",
info="ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ์—์„œ ๋นˆ๋„์ˆ˜๊ฐ€ 1์ธ ๋‹จ์–ด๋ฅผ ์ œ์™ธํ•ฉ๋‹ˆ๋‹ค."
)
with gr.Row():
direct_keyword_only_checkbox = gr.Checkbox(
label="์ง์ ‘ ํ‚ค์›Œ๋“œ๋งŒ ๋ถ„์„",
value=False,
elem_classes="custom-checkbox",
info="์ด ์˜ต์…˜์„ ์„ ํƒํ•˜๋ฉด ์•„๋ž˜ ์ž…๋ ฅํ•œ ์ง์ ‘ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•ด์„œ๋งŒ ๋ถ„์„์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค (ํ˜•ํƒœ์†Œ ๋ถ„์„ ์ƒ๋žต)."
)
with gr.Row():
direct_keyword_box = gr.Textbox(
label="์ง์ ‘ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ (์—”ํ„ฐ ๋˜๋Š” ','๋กœ ๊ตฌ๋ถ„)",
lines=3,
placeholder="์˜ˆ: ํ‚ค์›Œ๋“œ1, ํ‚ค์›Œ๋“œ2\nํ‚ค์›Œ๋“œ3\n...\n(ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ์™€ ๋ณ„๋„๋กœ ๋ถ„์„ํ•˜๊ฑฐ๋‚˜, ํ†ตํ•ฉ ๋ถ„์„์— ์ถ”๊ฐ€ํ•  ํ‚ค์›Œ๋“œ)",
info="๋ถ„์„์— ํฌํ•จํ•˜๊ฑฐ๋‚˜ ๋‹จ๋…์œผ๋กœ ๋ถ„์„ํ•  ํ‚ค์›Œ๋“œ๋ฅผ ์ง์ ‘ ์ž…๋ ฅํ•ฉ๋‹ˆ๋‹ค."
)
with gr.Group(elem_classes="custom-group"):
with gr.Row(elem_classes="centered"):
analyze_button = gr.Button("ํ‚ค์›Œ๋“œ ๋ถ„์„ ์‹คํ–‰", elem_classes="custom-button", variant="primary")
with gr.Column(scale=3): # ์˜ค๋ฅธ์ชฝ ์ปฌ๋Ÿผ (๊ฒฐ๊ณผ ์˜์—ญ)
with gr.Group(elem_classes="custom-group custom-result"):
gr.Markdown("### ๋ถ„์„ ๊ฒฐ๊ณผ")
result_df_display = gr.Dataframe(
label="ํ†ตํ•ฉ ๋ถ„์„ ๊ฒฐ๊ณผ (๋‹จ์–ด, ๋นˆ๋„์ˆ˜, ๊ฒ€์ƒ‰๋Ÿ‰, ๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜, ์ง์ ‘์ž…๋ ฅ ์—ฌ๋ถ€)",
interactive=False, # ์‚ฌ์šฉ์ž๊ฐ€ ์ง์ ‘ ์ˆ˜์ • ๋ถˆ๊ฐ€
height=600, # ๋†’์ด ์กฐ์ ˆ
wrap=True # ๊ธด ํ…์ŠคํŠธ ์ค„๋ฐ”๊ฟˆ
)
with gr.Group(elem_classes="custom-group"):
gr.Markdown("### ๊ฒฐ๊ณผ ๋‹ค์šด๋กœ๋“œ")
excel_file_display = gr.File(label="๋ถ„์„ ๊ฒฐ๊ณผ Excel ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")
# ์ด๋ฒคํŠธ ์—ฐ๊ฒฐ
scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
analyze_button.click(
fn=analysis_handler,
inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
outputs=[result_df_display, excel_file_display]
)
if __name__ == "__main__":
    # Required Naver API credentials are read from the environment, e.g.:
    #   NAVER_API_KEY / NAVER_SECRET_KEY / NAVER_CUSTOMER_ID
    #   NAVER_SEARCH_CLIENT_ID / NAVER_SEARCH_CLIENT_SECRET
    # Set them in the shell (or load a .env file) before launching the app.
    required_env_vars = (
        "NAVER_API_KEY", "NAVER_SECRET_KEY", "NAVER_CUSTOMER_ID",
        "NAVER_SEARCH_CLIENT_ID", "NAVER_SEARCH_CLIENT_SECRET",
    )
    unset_vars = []
    for var_name in required_env_vars:
        if not os.environ.get(var_name):
            unset_vars.append(var_name)
    if unset_vars:
        # Warn but still launch; API-backed features will fail at call time.
        debug_log(f"๊ฒฝ๊ณ : ๋‹ค์Œ ํ•„์ˆ˜ ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค - {', '.join(unset_vars)}")
        debug_log("API ํ˜ธ์ถœ ๊ธฐ๋Šฅ์ด ์ •์ƒ์ ์œผ๋กœ ๋™์ž‘ํ•˜์ง€ ์•Š์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
        debug_log("์Šคํฌ๋ฆฝํŠธ ์‹คํ–‰ ์ „์— ํ•ด๋‹น ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ์„ค์ •ํ•ด์ฃผ์„ธ์š”.")
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
    demo.launch(debug=True)  # debug=True eases error inspection during development
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")