import gradio as gr
import requests
from bs4 import BeautifulSoup
import urllib.parse  # used to resolve relative iframe paths
import re
import logging
import tempfile
import pandas as pd
import mecab  # python-mecab-ko library
import os
import time
import hmac
import hashlib
import base64
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- Parallel processing settings ---
# Tune these to match your API rate limits.
# Values that are too high may trigger API throttling.
MAX_WORKERS_RELATED_KEYWORDS = 5  # worker count for fetch_related_keywords
MAX_WORKERS_BLOG_COUNT = 10       # worker count for fetch_blog_count

# Debug logging helper
def debug_log(message: str):
    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] [DEBUG] {message}")
# --- Naver blog scraping ---
def scrape_naver_blog(url: str) -> str:
    debug_log("scrape_naver_blog started")
    debug_log(f"Requested URL: {url}")
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/96.0.4664.110 Safari/537.36"
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        debug_log("HTTP GET request (main page) complete")
        if response.status_code != 200:
            debug_log(f"Request failed, status code: {response.status_code}")
            return f"An error occurred. Status code: {response.status_code}"
        soup = BeautifulSoup(response.text, "html.parser")
        debug_log("HTML parsing (main page) complete")
        iframe = soup.select_one("iframe#mainFrame")
        if not iframe:
            debug_log("Could not find the iframe#mainFrame tag.")
            # Some blogs have no mainFrame; try extracting the body directly.
            content_div_direct = soup.select_one('.se-main-container')
            if content_div_direct:
                title_div_direct = soup.select_one('.se-module.se-module-text.se-title-text')
                title = title_div_direct.get_text(strip=True) if title_div_direct else "Title not found."
                content = content_div_direct.get_text("\n", strip=True)
                debug_log("Extracted body directly without an iframe")
                return f"[Title]\n{title}\n\n[Body]\n{content}"
            return "Could not find the body iframe. (Direct body extraction also failed.)"
        iframe_src = iframe.get("src")
        if not iframe_src:
            debug_log("The iframe has no src attribute.")
            return "Could not find the src of the body iframe."
        # Handle the case where iframe_src is not an absolute URL
        if iframe_src.startswith("//"):
            parsed_iframe_url = "https:" + iframe_src
        elif iframe_src.startswith("/"):
            parsed_main_url = urllib.parse.urlparse(url)
            parsed_iframe_url = urllib.parse.urlunparse(
                (parsed_main_url.scheme, parsed_main_url.netloc, iframe_src, None, None, None)
            )
        else:
            parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
        debug_log(f"Requesting iframe page URL: {parsed_iframe_url}")
        iframe_response = requests.get(parsed_iframe_url, headers=headers, timeout=10)
        debug_log("HTTP GET request (iframe page) complete")
        if iframe_response.status_code != 200:
            debug_log(f"iframe request failed, status code: {iframe_response.status_code}")
            return f"An error occurred in the iframe. Status code: {iframe_response.status_code}"
        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
        debug_log("HTML parsing (iframe page) complete")
        # Title extraction (try several layouts)
        title_selectors = [
            '.se-module.se-module-text.se-title-text',  # common SmartEditor ONE layout
            '.title_text',                              # legacy editor or other layouts
            'div[class*="title"] h3',
            'h1', 'h2', 'h3'                            # generic heading tags
        ]
        title = "Title not found."
        for selector in title_selectors:
            title_div = iframe_soup.select_one(selector)
            if title_div:
                title = title_div.get_text(strip=True)
                break
        debug_log(f"Extracted title: {title}")
        # Body extraction (try several layouts)
        content_selectors = [
            '.se-main-container',  # SmartEditor ONE
            'div#content',         # legacy editor
            'div.post_ct',         # some blog layouts
            'article', 'main'      # semantic tags
        ]
        content = "Body not found."
        for selector in content_selectors:
            content_div = iframe_soup.select_one(selector)
            if content_div:
                # Remove unneeded script and style tags
                for s in content_div(['script', 'style']):
                    s.decompose()
                content = content_div.get_text("\n", strip=True)
                break
        debug_log("Body extraction complete")
        result = f"[Title]\n{title}\n\n[Body]\n{content}"
        debug_log("Title and body combined")
        return result
    except requests.exceptions.Timeout:
        debug_log(f"Request timed out: {url}")
        return f"A timeout occurred while scraping: {url}"
    except Exception as e:
        debug_log(f"Scraping error: {str(e)}")
        return f"An error occurred while scraping: {str(e)}"
# --- Morphological analysis (reference code 1) ---
def analyze_text(text: str):
    logging.basicConfig(level=logging.INFO)  # INFO level to avoid excessive logging
    logger = logging.getLogger(__name__)
    # logger.debug("Original text: %s", text)  # may be very long, so commented out
    filtered_text = re.sub(r'[^가-힣a-zA-Z0-9\s]', '', text)  # keep Korean, English letters, digits, whitespace
    # logger.debug("Filtered text: %s", filtered_text)
    if not filtered_text.strip():
        logger.info("No valid text left after filtering.")
        return pd.DataFrame(columns=["Word", "Frequency"]), ""
    try:
        mecab_instance = mecab.MeCab()
        tokens = mecab_instance.pos(filtered_text)
    except Exception as e:
        logger.error(f"Error during MeCab morphological analysis: {e}")
        return pd.DataFrame(columns=["Word", "Frequency"]), ""
    # logger.debug("POS tagging result: %s", tokens)
    freq = {}
    for word, pos in tokens:
        # Keep common nouns (NNG), proper nouns (NNP), foreign words (SL), etc.;
        # single-character words are excluded (optional).
        if word and word.strip() and (pos.startswith("NN") or pos in ["SL", "SH"]) and len(word) > 1:
            freq[word] = freq.get(word, 0) + 1
            # logger.debug("Word: %s, POS: %s, count: %d", word, pos, freq[word])
    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    # logger.debug("Sorted word frequencies: %s", sorted_freq)
    df = pd.DataFrame(sorted_freq, columns=["Word", "Frequency"])
    logger.info(f"Morphological analysis DataFrame created, shape: {df.shape}")
    temp_file_path = ""
    if not df.empty:
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx", mode='w+b') as temp_file:
                df.to_excel(temp_file.name, index=False, engine='openpyxl')
                temp_file_path = temp_file.name
            logger.info(f"Excel file created: {temp_file_path}")
        except Exception as e:
            logger.error(f"Error while saving Excel file: {e}")
            temp_file_path = ""  # reset the path on error
    return df, temp_file_path
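
# Note on the tokenizer above (an assumption: the python-mecab-ko package,
# installed with `pip install python-mecab-ko`, plus a Korean dictionary).
# mecab.MeCab().pos() returns (surface, tag) pairs, roughly like:
#   mecab.MeCab().pos("네이버 블로그 분석")
#   -> [("네이버", "NNP"), ("블로그", "NNG"), ("분석", "NNG")]
# The filter in analyze_text keeps NN* nouns plus SL (foreign words) and
# SH (Chinese characters), and drops single-character tokens.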
# --- Naver Search and SearchAd API helpers (reference code 2) ---
def generate_signature(timestamp, method, uri, secret_key):
    message = f"{timestamp}.{method}.{uri}"
    digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
    return base64.b64encode(digest).decode()
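
# A minimal sketch (never called by the app) of the signature scheme above:
# HMAC-SHA256 over "{timestamp}.{method}.{uri}", base64-encoded, which is what
# the Naver SearchAd API expects in the X-Signature header. The timestamp and
# secret below are made-up placeholders.
def _signature_example():
    ts = "1700000000000"  # milliseconds since the epoch, as a string
    sig = generate_signature(ts, "GET", "/keywordstool", "PLACEHOLDER_SECRET")
    return sig  # a base64 string; the exact value depends on the secret key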
def get_header(method, uri, api_key, secret_key, customer_id):
    timestamp = str(round(time.time() * 1000))
    signature = generate_signature(timestamp, method, uri, secret_key)
    return {
        "Content-Type": "application/json; charset=UTF-8",
        "X-Timestamp": timestamp,
        "X-API-KEY": api_key,
        "X-Customer": str(customer_id),
        "X-Signature": signature
    }

# Helper to read API keys from environment variables
def get_env_variable(var_name):
    value = os.environ.get(var_name)
    if value is None:
        debug_log(f"Environment variable '{var_name}' is not set. API calls may fail.")
        # If needed, raise an exception here or return a default value.
    return value
def fetch_related_keywords(keyword):
    debug_log(f"fetch_related_keywords started, keyword: {keyword}")
    API_KEY = get_env_variable("NAVER_API_KEY")
    SECRET_KEY = get_env_variable("NAVER_SECRET_KEY")
    CUSTOMER_ID = get_env_variable("NAVER_CUSTOMER_ID")
    if not all([API_KEY, SECRET_KEY, CUSTOMER_ID]):
        debug_log(f"Skipping related-keyword lookup for '{keyword}' due to missing Naver SearchAd API credentials.")
        return pd.DataFrame()
    BASE_URL = "https://api.naver.com"
    uri = "/keywordstool"
    method = "GET"
    try:
        headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
        params = {
            "hintKeywords": keyword,  # passed as a single keyword string
            "showDetail": "1"
        }
        # hintKeywords can also accept a list; this code assumes single-keyword
        # calls and passes a string. If the API only accepts a list, change
        # this to [keyword].
        response = requests.get(BASE_URL + uri, params=params, headers=headers, timeout=10)
        response.raise_for_status()  # raise on HTTP errors
        data = response.json()
        if "keywordList" not in data or not data["keywordList"]:
            debug_log(f"No related-keyword results for '{keyword}'.")
            return pd.DataFrame()  # return an empty DataFrame
        df = pd.DataFrame(data["keywordList"])
        # Guard against columns missing from the API response
        df["monthlyPcQcCnt"] = df.get("monthlyPcQcCnt", 0)
        df["monthlyMobileQcCnt"] = df.get("monthlyMobileQcCnt", 0)
        def parse_count(x):
            if pd.isna(x) or str(x).lower() == '< 10':  # the Naver API returns "< 10" for counts under 10
                return 5  # or 0, or another representative value
            try:
                return int(str(x).replace(",", ""))
            except ValueError:
                return 0
        df["PC Monthly Searches"] = df["monthlyPcQcCnt"].apply(parse_count)
        df["Mobile Monthly Searches"] = df["monthlyMobileQcCnt"].apply(parse_count)
        df["Total Monthly Searches"] = df["PC Monthly Searches"] + df["Mobile Monthly Searches"]
        df.rename(columns={"relKeyword": "Keyword"}, inplace=True)
        # Select only the required columns, guarding against missing ones
        required_cols = ["Keyword", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches"]
        result_df = pd.DataFrame(columns=required_cols)
        for col in required_cols:
            if col in df.columns:
                result_df[col] = df[col]
            else:  # fill with defaults when the API response lacks the column
                if col == "Keyword":  # the keyword column is mandatory
                    debug_log(f"'relKeyword' is missing from the API response. Aborting '{keyword}'.")
                    return pd.DataFrame()
                result_df[col] = 0
        debug_log(f"fetch_related_keywords '{keyword}' complete, {len(result_df)} rows")
        return result_df.head(100)  # cap at 100 rows
    except requests.exceptions.HTTPError as http_err:
        debug_log(f"HTTP error (fetch_related_keywords for '{keyword}'): {http_err} - response: {response.text if 'response' in locals() else 'N/A'}")
    except requests.exceptions.RequestException as req_err:
        debug_log(f"Request error (fetch_related_keywords for '{keyword}'): {req_err}")
    except Exception as e:
        debug_log(f"Unexpected error (fetch_related_keywords for '{keyword}'): {e}")
    return pd.DataFrame()  # return an empty DataFrame on error
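
# Note on parse_count above: the SearchAd API returns monthly counts either as
# numbers/numeric strings (possibly with thousands separators, e.g. "1,234")
# or as the literal string "< 10" for keywords searched fewer than 10 times a
# month. parse_count maps "< 10" to the stand-in value 5 and strips commas
# before converting, so "1,234" -> 1234 and "< 10" -> 5.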
def fetch_blog_count(keyword):
    debug_log(f"fetch_blog_count called, keyword: {keyword}")
    client_id = get_env_variable("NAVER_SEARCH_CLIENT_ID")
    client_secret = get_env_variable("NAVER_SEARCH_CLIENT_SECRET")
    if not client_id or not client_secret:
        debug_log(f"Skipping blog-count lookup for '{keyword}' due to missing Naver Search API credentials.")
        return 0
    url = "https://openapi.naver.com/v1/search/blog.json"
    headers = {
        "X-Naver-Client-Id": client_id,
        "X-Naver-Client-Secret": client_secret
    }
    params = {"query": keyword, "display": 1}  # display=1: we only need the total count
    try:
        response = requests.get(url, headers=headers, params=params, timeout=5)
        response.raise_for_status()  # raise on HTTP errors
        data = response.json()
        total_count = data.get("total", 0)
        debug_log(f"fetch_blog_count result: {total_count} for '{keyword}'")
        return total_count
    except requests.exceptions.HTTPError as http_err:
        debug_log(f"HTTP error (fetch_blog_count for '{keyword}'): {http_err} - response: {response.text}")
    except requests.exceptions.RequestException as req_err:  # Timeout, ConnectionError, etc.
        debug_log(f"Request error (fetch_blog_count for '{keyword}'): {req_err}")
    except Exception as e:  # JSONDecodeError and other exceptions
        debug_log(f"Unexpected error (fetch_blog_count for '{keyword}'): {e}")
    return 0  # return 0 on error
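
# For reference, the blog search endpoint responds with JSON shaped roughly
# like the following (a sketch; only "total" is used here):
#   {"lastBuildDate": "...", "total": 12345, "start": 1, "display": 1,
#    "items": [{"title": "...", "link": "...", "description": "...", ...}]}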
def create_excel_file(df):
    if df.empty:
        debug_log("Not writing an Excel file for an empty DataFrame.")
        # We could create an empty file, or return None and let Gradio handle it.
        # Here we create an empty temp file and return its path, since the
        # Gradio File component expects a path.
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            excel_path = tmp.name
        # To write at least the headers into the empty file:
        # pd.DataFrame(columns=df.columns).to_excel(excel_path, index=False)
        # Otherwise just return the empty file.
        return excel_path
    try:
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False, mode='w+b') as tmp:
            excel_path = tmp.name
            df.to_excel(excel_path, index=False, engine='openpyxl')
        debug_log(f"Excel file created: {excel_path}")
        return excel_path
    except Exception as e:
        debug_log(f"Error while creating Excel file: {e}")
        # Return at least an empty file path on error (for Gradio compatibility)
        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
            return tmp.name
def process_keyword(keywords: str, include_related: bool):
    debug_log(f"process_keyword started, keywords: '{keywords[:100]}...', include related: {include_related}")
    input_keywords_orig = [k.strip() for k in keywords.splitlines() if k.strip()]
    if not input_keywords_orig:
        debug_log("No keywords were entered.")
        return pd.DataFrame(columns=["Keyword", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]), ""
    all_related_keywords_dfs = []
    # 1. Fetch related keywords in parallel
    debug_log(f"Starting parallel related-keyword lookups (max workers: {MAX_WORKERS_RELATED_KEYWORDS})")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS_RELATED_KEYWORDS) as executor:
        future_to_keyword_related = {
            executor.submit(fetch_related_keywords, kw): kw for kw in input_keywords_orig
        }
        for i, future in enumerate(as_completed(future_to_keyword_related)):
            kw = future_to_keyword_related[future]
            try:
                df_kw_related = future.result()  # returns a DataFrame
                if not df_kw_related.empty:
                    # The API does not always include the hint keyword itself as
                    # a relKeyword row, so the response is used as-is:
                    # - for the first input keyword with the related-keywords
                    #   option enabled, keep every related keyword;
                    # - otherwise keep only the row matching the keyword itself,
                    #   falling back to the top-ranked row.
                    if include_related and kw == input_keywords_orig[0]:
                        all_related_keywords_dfs.append(df_kw_related)
                        debug_log(f"Added all {len(df_kw_related)} related keywords for the first keyword '{kw}'.")
                    else:
                        # Look for the row matching this keyword; if absent, use the first row the API returned
                        row_kw = df_kw_related[df_kw_related["Keyword"] == kw]
                        if not row_kw.empty:
                            all_related_keywords_dfs.append(row_kw)
                            debug_log(f"Added direct info for keyword '{kw}'.")
                        elif not df_kw_related.empty:  # no direct info, but related keywords exist
                            all_related_keywords_dfs.append(df_kw_related.head(1))  # add the most relevant keyword
                            debug_log(f"No direct info for '{kw}'; added the single most relevant keyword instead.")
                        # else: neither direct nor related info (df_kw_related is empty)
                debug_log(f"Related-keyword processing for '{kw}' complete ({i+1}/{len(input_keywords_orig)})")
            except Exception as e:
                debug_log(f"Parallel task error while fetching related keywords for '{kw}': {e}")
    if not all_related_keywords_dfs:
        debug_log("All related-keyword lookups came back empty.")
        # Add the blog post count column to the empty DataFrame
        empty_df = pd.DataFrame(columns=["Keyword", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches"])
        empty_df["Blog Post Count"] = None
        return empty_df, create_excel_file(empty_df)
    result_df = pd.concat(all_related_keywords_dfs, ignore_index=True)
    result_df.drop_duplicates(subset=["Keyword"], inplace=True)  # remove duplicates
    debug_log(f"Parallel related-keyword lookups complete. Combined DataFrame shape: {result_df.shape}")
    # 2. Fetch blog post counts in parallel
    keywords_for_blog_count = result_df["Keyword"].dropna().unique().tolist()
    blog_counts_map = {}
    if keywords_for_blog_count:
        debug_log(f"Starting parallel blog-count lookups ({len(keywords_for_blog_count)} keywords, max workers: {MAX_WORKERS_BLOG_COUNT})")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS_BLOG_COUNT) as executor:
            future_to_keyword_blog = {
                executor.submit(fetch_blog_count, kw): kw for kw in keywords_for_blog_count
            }
            for i, future in enumerate(as_completed(future_to_keyword_blog)):
                kw = future_to_keyword_blog[future]
                try:
                    count = future.result()  # returns a number
                    blog_counts_map[kw] = count
                    if (i+1) % 50 == 0:  # avoid flooding the log
                        debug_log(f"Blog-count lookups in progress... ({i+1}/{len(keywords_for_blog_count)})")
                except Exception as e:
                    debug_log(f"Parallel task error while fetching blog count for '{kw}': {e}")
                    blog_counts_map[kw] = 0  # treat errors as 0
        result_df["Blog Post Count"] = result_df["Keyword"].map(blog_counts_map).fillna(0).astype(int)
        debug_log("Parallel blog-count lookups complete.")
    else:
        result_df["Blog Post Count"] = 0  # nothing to look up, so fill with 0
    result_df.sort_values(by="Total Monthly Searches", ascending=False, inplace=True)
    debug_log(f"process_keyword finished. DataFrame shape: {result_df.shape}")
    # Fix the final column order and make sure every column exists
    final_columns = ["Keyword", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]
    for col in final_columns:
        if col not in result_df.columns:
            result_df[col] = 0 if col != "Keyword" else ""  # fill missing columns with defaults
    result_df = result_df[final_columns]  # fix the column order
    return result_df, create_excel_file(result_df)
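
# A minimal, self-contained sketch of the fan-out pattern used twice above:
# submit one task per keyword, then collect results as they finish. The worker
# here is a stand-in for fetch_related_keywords / fetch_blog_count.
def _fanout_example():
    def worker(kw):  # placeholder for an API call
        return len(kw)
    results = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(worker, kw): kw for kw in ["alpha", "beta"]}
        for future in as_completed(futures):  # yields futures in completion order
            results[futures[future]] = future.result()
    return results  # {"alpha": 5, "beta": 4}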
# --- Merge morphological analysis with search volume / blog post counts ---
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
    debug_log("morphological_analysis_and_enrich started")
    df_freq, _ = analyze_text(text)  # the temp file path is not used here
    if df_freq.empty:
        debug_log("Morphological analysis returned an empty DataFrame.")
        return pd.DataFrame(columns=["Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]), ""
    if remove_freq1:
        before_count = len(df_freq)
        df_freq = df_freq[df_freq["Frequency"] > 1].copy()  # .copy() to avoid SettingWithCopyWarning
        debug_log(f"Removed frequency-1 words. {before_count} -> {len(df_freq)}")
        if df_freq.empty:
            debug_log("No data left after removing frequency-1 words.")
            return pd.DataFrame(columns=["Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]), ""
    keywords_from_morph = "\n".join(df_freq["Word"].tolist())
    debug_log(f"Starting lookups for {len(df_freq['Word'])} keywords from morphological analysis")
    # Call process_keyword without related keywords (include_related=False)
    df_keyword_info, _ = process_keyword(keywords_from_morph, include_related=False)
    debug_log("Search volume and blog-count lookups for analyzed keywords complete")
    if df_keyword_info.empty:
        debug_log("No API results for the keywords from morphological analysis.")
        # Add empty columns to df_freq
        for col in ["PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]:
            df_freq[col] = None
        merged_df = df_freq
    else:
        merged_df = pd.merge(df_freq, df_keyword_info, left_on="Word", right_on="Keyword", how="left")
        if "Keyword" in merged_df.columns:  # drop the Keyword column left over from the merge
            merged_df.drop(columns=["Keyword"], inplace=True, errors='ignore')
    # Fill missing columns with defaults
    expected_cols = ["Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]
    for col in expected_cols:
        if col not in merged_df.columns:
            merged_df[col] = None if col not in ["Frequency"] else 0
    merged_df = merged_df[expected_cols]  # fix the column order
    merged_excel_path = create_excel_file(merged_df)
    debug_log("morphological_analysis_and_enrich finished")
    return merged_df, merged_excel_path
# --- Direct keyword analysis (standalone) ---
def direct_keyword_analysis(text: str, keyword_input: str):
    debug_log("direct_keyword_analysis started")
    direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', keyword_input) if kw.strip()]
    debug_log(f"Direct keywords entered: {direct_keywords_list}")
    if not direct_keywords_list:
        debug_log("No direct keywords were entered.")
        return pd.DataFrame(columns=["Keyword", "Frequency"]), ""
    # 1. Count occurrences in the body text
    results_freq = []
    for kw in direct_keywords_list:
        count = text.count(kw)  # case-sensitive, exact substring count
        results_freq.append({"Keyword": kw, "Frequency": count})
        debug_log(f"Direct keyword '{kw}' occurs {count} times in the body")
    df_direct_freq = pd.DataFrame(results_freq)
    # 2. Fetch search volumes and blog counts via the APIs (process_keyword handles the parallelism)
    # Only per-keyword info is needed here, so include_related=False
    keywords_for_api = "\n".join(direct_keywords_list)
    df_direct_api_info, _ = process_keyword(keywords_for_api, include_related=False)
    # 3. Merge the frequency results with the API results
    if not df_direct_api_info.empty:
        # Both frames share the "Keyword" column, so merge on it directly
        merged_df = pd.merge(df_direct_freq, df_direct_api_info, on="Keyword", how="left")
    else:
        merged_df = df_direct_freq
        for col in ["PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]:
            merged_df[col] = None  # add empty columns when there is no API info
    # Fix column order and defaults
    final_cols = ["Keyword", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]
    for col in final_cols:
        if col not in merged_df.columns:
            merged_df[col] = 0 if col != "Keyword" else ""
    merged_df = merged_df[final_cols]
    excel_path = create_excel_file(merged_df)
    debug_log("direct_keyword_analysis finished")
    return merged_df, excel_path
# --- Combined analysis (morphological analysis + direct keywords) ---
def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
    debug_log("combined_analysis started")
    # 1. Morphological analysis results enriched with API info
    df_morph, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
    # df_morph columns: "Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"
    # 2. Process directly entered keywords
    direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', direct_keyword_input) if kw.strip()]
    debug_log(f"Combined analysis - direct keywords entered: {direct_keywords_list}")
    if not direct_keywords_list:  # no direct keywords: return the morphological results only
        if "Direct Input" not in df_morph.columns and not df_morph.empty:
            df_morph["Direct Input"] = ""  # add the Direct Input column
        # Adjust the column order
        cols = ["Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count", "Direct Input"]
        for col in cols:
            if col not in df_morph.columns:
                df_morph[col] = "" if col == "Direct Input" else (0 if col != "Word" else "")
        df_morph = df_morph[cols]
        return df_morph, create_excel_file(df_morph)
    # Fetch info (frequency, API data) for the direct keywords.
    # direct_keyword_analysis uses a "Keyword" column, so rename it to match
    # df_morph's "Word" column.
    df_direct_raw, _ = direct_keyword_analysis(blog_text, direct_keyword_input)
    # df_direct_raw columns: "Keyword", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"
    df_direct_raw.rename(columns={"Keyword": "Word"}, inplace=True)  # unify the column name
    # Flag direct-input words in the morphological results
    if not df_morph.empty:
        df_morph["Direct Input"] = df_morph["Word"].apply(lambda x: "Direct Input" if x in direct_keywords_list else "")
    else:  # the morphological results may be empty
        df_morph = pd.DataFrame(columns=["Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count", "Direct Input"])
    # Merge strategy: df_morph takes precedence for words it already contains
    # (note its frequency comes from morphological analysis, while df_direct_raw
    # uses a plain substring count, so the two can differ); keywords that appear
    # only in df_direct_raw are appended below.
    df_direct_raw["Direct Input"] = "Direct Input"
    # Words already present in df_morph
    morph_words = df_morph['Word'].tolist() if not df_morph.empty else []
    rows_to_add = []
    for idx, row in df_direct_raw.iterrows():
        if row['Word'] not in morph_words:
            rows_to_add.append(row)
    if rows_to_add:
        df_to_add = pd.DataFrame(rows_to_add)
        combined_df = pd.concat([df_morph, df_to_add], ignore_index=True)
    else:
        combined_df = df_morph.copy()  # df_morph may also be empty
    # Final column cleanup and ordering
    final_cols_combined = ["Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count", "Direct Input"]
    for col in final_cols_combined:
        if col not in combined_df.columns:
            # Defaults: "" for "Direct Input", 0 or None otherwise (None is acceptable for API values)
            if col == "Direct Input":
                combined_df[col] = ""
            elif col == "Frequency":
                combined_df[col] = 0
            elif col == "Word":
                combined_df[col] = ""
            else:  # API-related columns
                combined_df[col] = None  # pd.NA would also work
    # API values can originally be non-numeric strings (e.g. "< 10"), but
    # process_keyword already normalizes them, and Gradio's Dataframe handles
    # None well, so NA values are left as-is here.
    # Frequency must be an integer
    if "Frequency" in combined_df.columns:
        combined_df["Frequency"] = combined_df["Frequency"].fillna(0).astype(int)
    combined_df = combined_df[final_cols_combined].drop_duplicates(subset=['Word'], keep='first')  # defensive dedup
    combined_df.sort_values(by=["Direct Input", "Frequency"], ascending=[False, False], inplace=True, na_position='last')  # direct-input rows first, then by frequency
    combined_df.reset_index(drop=True, inplace=True)
    combined_excel = create_excel_file(combined_df)
    debug_log("combined_analysis finished")
    return combined_df, combined_excel
# --- Analysis handler ---
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
    debug_log(f"analysis_handler started. Direct keywords only: {direct_keyword_only}")
    start_time = time.time()
    if not blog_text or blog_text.strip() == "The scraped blog content will appear here." or blog_text.strip() == "":
        debug_log("There is no blog content to analyze.")
        # Spell out the DataFrame structure for the empty result
        empty_cols_direct = ["Keyword", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]
        empty_cols_combined = ["Word", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count", "Direct Input"]
        df_empty = pd.DataFrame(columns=empty_cols_direct if direct_keyword_only else empty_cols_combined)
        return df_empty, create_excel_file(df_empty)
    if direct_keyword_only:
        # "Analyze direct keywords only" selected: run the standalone analysis
        if not direct_keyword_input or not direct_keyword_input.strip():
            debug_log("Direct-keywords-only mode selected, but no direct keywords were entered.")
            empty_cols_direct = ["Keyword", "Frequency", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches", "Blog Post Count"]
            df_empty = pd.DataFrame(columns=empty_cols_direct)
            return df_empty, create_excel_file(df_empty)
        result_df, excel_path = direct_keyword_analysis(blog_text, direct_keyword_input)
    else:
        # Run the default combined analysis
        result_df, excel_path = combined_analysis(blog_text, remove_freq1, direct_keyword_input)
    end_time = time.time()
    debug_log(f"analysis_handler total run time: {end_time - start_time:.2f} s")
    return result_df, excel_path
# --- Run the scraper ---
def fetch_blog_content(url: str):
    debug_log("fetch_blog_content started")
    if not url or not url.strip():
        return "Please enter a blog URL."
    if not url.startswith("http://") and not url.startswith("https://"):
        return "Please enter a valid URL (starting with http:// or https://)."
    start_time = time.time()
    content = scrape_naver_blog(url)
    end_time = time.time()
    debug_log(f"fetch_blog_content total run time: {end_time - start_time:.2f} s. Content length: {len(content)}")
    return content
# --- Custom CSS ---
custom_css = """
/* Overall container style */
.gradio-container {
    max-width: 1080px; /* widened layout */
    margin: auto;
    font-family: 'Helvetica Neue', Arial, sans-serif;
    background: #f5f7fa;
    padding: 2rem;
}
/* Header style */
.custom-header {
    text-align: center;
    font-size: 2.5rem;
    font-weight: bold;
    margin-bottom: 1.5rem;
    color: #333;
}
/* Group box style */
.custom-group {
    background: #ffffff;
    border-radius: 8px;
    padding: 1.5rem;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
    margin-bottom: 1.5rem;
}
/* Button style */
.custom-button {
    background-color: #007bff;
    color: #fff;
    border: none;
    border-radius: 4px;
    padding: 0.6rem 1.2rem;
    font-size: 1rem;
    cursor: pointer;
    min-width: 150px; /* minimum button width */
}
.custom-button:hover {
    background-color: #0056b3;
}
/* Checkbox style */
.custom-checkbox {
    margin-right: 1rem;
}
/* Result table and download button */
.custom-result {
    margin-top: 1.5rem;
}
/* Center alignment */
.centered {
    display: flex;
    justify-content: center;
    align-items: center;
}
"""
# --- Gradio interface ---
with gr.Blocks(title="Naver Blog Keyword Analysis Service", css=custom_css) as demo:
    gr.HTML("<div class='custom-header'>Naver Blog Keyword Analysis Service</div>")
    with gr.Row():
        with gr.Column(scale=2):  # left column (inputs)
            with gr.Group(elem_classes="custom-group"):
                blog_url_input = gr.Textbox(
                    label="Naver blog link",
                    placeholder="e.g. https://blog.naver.com/blog_id/post_number",
                    lines=1,
                    info="Enter the URL of the Naver blog post to analyze."
                )
                with gr.Row(elem_classes="centered"):
                    scrape_button = gr.Button("Fetch blog content", elem_classes="custom-button", variant="primary")
            with gr.Group(elem_classes="custom-group"):
                blog_content_box = gr.Textbox(
                    label="Blog content (editable)",
                    lines=10,
                    placeholder="The scraped blog content will appear here. You can edit it or paste text directly."
                )
            with gr.Group(elem_classes="custom-group"):
                gr.Markdown("### Analysis options")
                with gr.Row():
                    remove_freq_checkbox = gr.Checkbox(
                        label="Remove frequency-1 words (morphological analysis)",
                        value=True,
                        elem_classes="custom-checkbox",
                        info="Excludes words that occur only once from the morphological analysis results."
                    )
                with gr.Row():
                    direct_keyword_only_checkbox = gr.Checkbox(
                        label="Analyze direct keywords only",
                        value=False,
                        elem_classes="custom-checkbox",
                        info="When selected, only the directly entered keywords below are analyzed (morphological analysis is skipped)."
                    )
                with gr.Row():
                    direct_keyword_box = gr.Textbox(
                        label="Direct keyword input (separated by newlines or ',')",
                        lines=3,
                        placeholder="e.g. keyword1, keyword2\nkeyword3\n...\n(keywords to analyze separately from, or add to, the morphological analysis)",
                        info="Enter keywords to include in the analysis or to analyze on their own."
                    )
            with gr.Group(elem_classes="custom-group"):
                with gr.Row(elem_classes="centered"):
                    analyze_button = gr.Button("Run keyword analysis", elem_classes="custom-button", variant="primary")
        with gr.Column(scale=3):  # right column (results)
            with gr.Group(elem_classes="custom-group custom-result"):
                gr.Markdown("### Analysis results")
                result_df_display = gr.Dataframe(
                    label="Combined analysis results (word, frequency, search volume, blog post count, direct input)",
                    interactive=False,  # not directly editable by the user
                    height=600,         # adjust height
                    wrap=True           # wrap long text
                )
            with gr.Group(elem_classes="custom-group"):
                gr.Markdown("### Download results")
                excel_file_display = gr.File(label="Download analysis results as an Excel file")
    # Wire up events
    scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
    analyze_button.click(
        fn=analysis_handler,
        inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
        outputs=[result_df_display, excel_file_display]
    )
if __name__ == "__main__":
    # Example environment variable setup (in production, set these as system
    # environment variables or use a .env file):
    # os.environ["NAVER_API_KEY"] = "YOUR_NAVER_API_KEY"
    # os.environ["NAVER_SECRET_KEY"] = "YOUR_NAVER_SECRET_KEY"
    # os.environ["NAVER_CUSTOMER_ID"] = "YOUR_NAVER_CUSTOMER_ID"
    # os.environ["NAVER_SEARCH_CLIENT_ID"] = "YOUR_NAVER_SEARCH_CLIENT_ID"
    # os.environ["NAVER_SEARCH_CLIENT_SECRET"] = "YOUR_NAVER_SEARCH_CLIENT_SECRET"
    # Check that the required environment variables are set
    required_env_vars = [
        "NAVER_API_KEY", "NAVER_SECRET_KEY", "NAVER_CUSTOMER_ID",
        "NAVER_SEARCH_CLIENT_ID", "NAVER_SEARCH_CLIENT_SECRET"
    ]
    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
    if missing_vars:
        debug_log(f"Warning: the following required environment variables are not set - {', '.join(missing_vars)}")
        debug_log("API-backed features may not work correctly.")
        debug_log("Please set these environment variables before running the script.")
        # The Gradio app still launches; API calls may fail, and the user is warned.
    debug_log("Launching the Gradio app")
    demo.launch(debug=True)  # debug=True during development to surface errors
    debug_log("Gradio app exited")