Kims12 committed on
Commit
623047f
·
verified ·
1 Parent(s): bc1cd74

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -617
app.py CHANGED
@@ -1,29 +1,21 @@
1
  import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
- import urllib.parse # iframe ๊ฒฝ๋กœ ๋ณด์ •์„ ์œ„ํ•œ ๋ชจ๋“ˆ
5
  import re
6
  import logging
7
  import tempfile
8
  import pandas as pd
9
- import mecab # python?mecab?ko ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์‚ฌ์šฉ
10
  import os
11
  import time
12
  import hmac
13
  import hashlib
14
  import base64
15
- from concurrent.futures import ThreadPoolExecutor, as_completed
16
-
17
- # --- ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์„ค์ • ---
18
- # API ํ˜ธ์ถœ ์ œํ•œ์— ๋งž์ถฐ ์ ์ ˆํžˆ ์กฐ์ ˆํ•˜์„ธ์š”.
19
- # ๋„ˆ๋ฌด ๋†’์€ ๊ฐ’์€ API ์ œํ•œ์— ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
20
- MAX_WORKERS_RELATED_KEYWORDS = 5 # fetch_related_keywords ๋ณ‘๋ ฌ ์ž‘์—…์ž ์ˆ˜
21
- MAX_WORKERS_BLOG_COUNT = 10 # fetch_blog_count ๋ณ‘๋ ฌ ์ž‘์—…์ž ์ˆ˜
22
-
23
 
24
  # 디버깅(로그)용 함수
25
  def debug_log(message: str):
26
- print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] [DEBUG] {message}")
27
 
28
  # --- 네이버 블로그 스크래핑 ---
29
  def scrape_naver_blog(url: str) -> str:
@@ -37,7 +29,7 @@ def scrape_naver_blog(url: str) -> str:
37
  )
38
  }
39
  try:
40
- response = requests.get(url, headers=headers, timeout=10)
41
  debug_log("HTTP GET ์š”์ฒญ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")
42
  if response.status_code != 200:
43
  debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
@@ -47,127 +39,63 @@ def scrape_naver_blog(url: str) -> str:
47
  iframe = soup.select_one("iframe#mainFrame")
48
  if not iframe:
49
  debug_log("iframe#mainFrame ํƒœ๊ทธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
50
- # ์ผ๋ถ€ ๋ธ”๋กœ๊ทธ๋Š” mainFrame์ด ์—†์„ ์ˆ˜ ์žˆ์Œ. ๋ณธ๋ฌธ ์ง์ ‘ ์‹œ๋„
51
- content_div_direct = soup.select_one('.se-main-container')
52
- if content_div_direct:
53
- title_div_direct = soup.select_one('.se-module.se-module-text.se-title-text')
54
- title = title_div_direct.get_text(strip=True) if title_div_direct else "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
55
- content = content_div_direct.get_text("\n", strip=True)
56
- debug_log("iframe ์—†์ด ๋ณธ๋ฌธ ์ง์ ‘ ์ถ”์ถœ ์™„๋ฃŒ")
57
- return f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
58
- return "๋ณธ๋ฌธ iframe์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. (๋ณธ๋ฌธ ์ง์ ‘ ์ถ”์ถœ ์‹คํŒจ)"
59
-
60
  iframe_src = iframe.get("src")
61
  if not iframe_src:
62
  debug_log("iframe src๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
63
  return "๋ณธ๋ฌธ iframe์˜ src๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
64
-
65
- # iframe_src๊ฐ€ ์ ˆ๋Œ€ URL์ด ์•„๋‹Œ ๊ฒฝ์šฐ๋ฅผ ๋Œ€๋น„
66
- if iframe_src.startswith("//"):
67
- parsed_iframe_url = "https:" + iframe_src
68
- elif iframe_src.startswith("/"):
69
- parsed_main_url = urllib.parse.urlparse(url)
70
- parsed_iframe_url = urllib.parse.urlunparse(
71
- (parsed_main_url.scheme, parsed_main_url.netloc, iframe_src, None, None, None)
72
- )
73
- else:
74
- parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
75
-
76
  debug_log(f"iframe ํŽ˜์ด์ง€ ์š”์ฒญ URL: {parsed_iframe_url}")
77
- iframe_response = requests.get(parsed_iframe_url, headers=headers, timeout=10)
78
  debug_log("HTTP GET ์š”์ฒญ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
79
  if iframe_response.status_code != 200:
80
  debug_log(f"iframe ์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}")
81
  return f"iframe์—์„œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}"
82
  iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
83
  debug_log("HTML ํŒŒ์‹ฑ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
84
-
85
- # ์ œ๋ชฉ ์ถ”์ถœ (๋‹ค์–‘ํ•œ ๊ตฌ์กฐ ์‹œ๋„)
86
- title_selectors = [
87
- '.se-module.se-module-text.se-title-text', # ์ผ๋ฐ˜์ ์ธ ์Šค๋งˆํŠธ์—๋””ํ„ฐ ONE
88
- '.title_text', # ๊ตฌ๋ฒ„์ „ ์—๋””ํ„ฐ ๋˜๋Š” ๋‹ค๋ฅธ ๊ตฌ์กฐ
89
- 'div[class*="title"] h3',
90
- 'h1', 'h2', 'h3' # ์ผ๋ฐ˜์ ์ธ ์ œ๋ชฉ ํƒœ๊ทธ
91
- ]
92
- title = "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
93
- for selector in title_selectors:
94
- title_div = iframe_soup.select_one(selector)
95
- if title_div:
96
- title = title_div.get_text(strip=True)
97
- break
98
  debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")
99
-
100
- # ๋ณธ๋ฌธ ์ถ”์ถœ (๋‹ค์–‘ํ•œ ๊ตฌ์กฐ ์‹œ๋„)
101
- content_selectors = [
102
- '.se-main-container', # ์Šค๋งˆํŠธ์—๋””ํ„ฐ ONE
103
- 'div#content', # ๊ตฌ๋ฒ„์ „ ์—๋””ํ„ฐ
104
- 'div.post_ct', # ์ผ๋ถ€ ๋ธ”๋กœ๊ทธ ๊ตฌ์กฐ
105
- 'article', 'main' # ์‹œ๋งจํ‹ฑ ํƒœ๊ทธ
106
- ]
107
- content = "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
108
- for selector in content_selectors:
109
- content_div = iframe_soup.select_one(selector)
110
- if content_div:
111
- # ๋ถˆํ•„์š”ํ•œ ์Šคํฌ๋ฆฝํŠธ, ์Šคํƒ€์ผ ํƒœ๊ทธ ์ œ๊ฑฐ
112
- for s in content_div(['script', 'style']):
113
- s.decompose()
114
- content = content_div.get_text("\n", strip=True)
115
- break
116
-
117
  debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")
118
  result = f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
119
  debug_log("์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ ํ•ฉ์นจ ์™„๋ฃŒ")
120
  return result
121
- except requests.exceptions.Timeout:
122
- debug_log(f"์š”์ฒญ ์‹œ๊ฐ„ ์ดˆ๊ณผ: {url}")
123
- return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์‹œ๊ฐ„ ์ดˆ๊ณผ๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {url}"
124
  except Exception as e:
125
- debug_log(f"์Šคํฌ๋ž˜ํ•‘ ์—๋Ÿฌ ๋ฐœ์ƒ: {str(e)}")
126
  return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
127
 
128
  # --- 형태소 분석 (참조코드-1) ---
129
  def analyze_text(text: str):
130
- logging.basicConfig(level=logging.INFO) # INFO ๋ ˆ๋ฒจ๋กœ ๋ณ€๊ฒฝํ•˜์—ฌ ๋„ˆ๋ฌด ๋งŽ์€ ๋กœ๊ทธ ๋ฐฉ์ง€
131
  logger = logging.getLogger(__name__)
132
- # logger.debug("์›๋ณธ ํ…์ŠคํŠธ: %s", text) # ๋„ˆ๋ฌด ๊ธธ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์ฃผ์„ ์ฒ˜๋ฆฌ
133
- filtered_text = re.sub(r'[^๊ฐ€-ํžฃa-zA-Z0-9\s]', '', text) # ์˜์–ด, ์ˆซ์ž, ๊ณต๋ฐฑ ํฌํ•จ
134
- # logger.debug("ํ•„ํ„ฐ๋ง๋œ ํ…์ŠคํŠธ: %s", filtered_text)
135
- if not filtered_text.strip():
136
- logger.info("์œ ํšจํ•œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Œ (ํ•„ํ„ฐ๋ง ํ›„).")
137
- return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"]), ""
138
- try:
139
- mecab_instance = mecab.MeCab()
140
- tokens = mecab_instance.pos(filtered_text)
141
- except Exception as e:
142
- logger.error(f"MeCab ํ˜•ํƒœ์†Œ ๋ถ„์„ ์ค‘ ์˜ค๋ฅ˜: {e}")
143
  return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"]), ""
144
-
145
- # logger.debug("ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ: %s", tokens)
 
146
  freq = {}
147
  for word, pos in tokens:
148
- # ์ผ๋ฐ˜๋ช…์‚ฌ(NNG), ๊ณ ์œ ๋ช…์‚ฌ(NNP), ์™ธ๊ตญ์–ด(SL), ์ˆซ์ž(SN) ๋“ฑ ํฌํ•จ, ํ•œ ๊ธ€์ž ๋‹จ์–ด๋Š” ์ œ์™ธ (์„ ํƒ ์‚ฌํ•ญ)
149
- if word and word.strip() and (pos.startswith("NN") or pos in ["SL", "SH"]) and len(word) > 1 :
150
  freq[word] = freq.get(word, 0) + 1
151
- # logger.debug("๋‹จ์–ด: %s, ํ’ˆ์‚ฌ: %s, ๋นˆ๋„: %d", word, pos, freq[word])
152
-
153
  sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
154
- # logger.debug("์ •๋ ฌ๋œ ๋‹จ์–ด ๋นˆ๋„: %s", sorted_freq)
155
  df = pd.DataFrame(sorted_freq, columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"])
156
- logger.info(f"ํ˜•ํƒœ์†Œ ๋ถ„์„ DataFrame ์ƒ์„ฑ๋จ, shape: {df.shape}")
157
-
158
- temp_file_path = ""
159
- if not df.empty:
160
- try:
161
- with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx", mode='w+b') as temp_file:
162
- df.to_excel(temp_file.name, index=False, engine='openpyxl')
163
- temp_file_path = temp_file.name
164
- logger.info(f"Excel ํŒŒ์ผ ์ƒ์„ฑ๋จ: {temp_file_path}")
165
- except Exception as e:
166
- logger.error(f"Excel ํŒŒ์ผ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜: {e}")
167
- temp_file_path = "" # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ๊ฒฝ๋กœ ์ดˆ๊ธฐํ™”
168
-
169
- return df, temp_file_path
170
-
171
 
172
  # --- 네이버 검색 및 광고 API 관련 (참조코드-2) ---
173
  def generate_signature(timestamp, method, uri, secret_key):
@@ -186,483 +114,189 @@ def get_header(method, uri, api_key, secret_key, customer_id):
186
  "X-Signature": signature
187
  }
188
 
189
- # API ํ‚ค ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ํ™•์ธ ํ•จ์ˆ˜
190
- def get_env_variable(var_name):
191
- value = os.environ.get(var_name)
192
- if value is None:
193
- debug_log(f"ํ™˜๊ฒฝ ๋ณ€์ˆ˜ '{var_name}'๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. API ํ˜ธ์ถœ์ด ์‹คํŒจํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
194
- # ํ•„์š”์‹œ ์—ฌ๊ธฐ์„œ raise Exception ๋˜๋Š” ๏ฟฝ๏ฟฝ๏ฟฝ๋ณธ๊ฐ’ ๋ฐ˜ํ™˜
195
- return value
196
-
197
  def fetch_related_keywords(keyword):
198
- debug_log(f"fetch_related_keywords ํ˜ธ์ถœ ์‹œ์ž‘, ํ‚ค์›Œ๋“œ: {keyword}")
199
- API_KEY = get_env_variable("NAVER_API_KEY")
200
- SECRET_KEY = get_env_variable("NAVER_SECRET_KEY")
201
- CUSTOMER_ID = get_env_variable("NAVER_CUSTOMER_ID")
202
-
203
- if not all([API_KEY, SECRET_KEY, CUSTOMER_ID]):
204
- debug_log(f"๋„ค์ด๋ฒ„ ๊ด‘๊ณ  API ํ‚ค ์ •๋ณด ๋ถ€์กฑ์œผ๋กœ '{keyword}' ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ๋ฅผ ๊ฑด๋„ˆ<0xEB><0x8><0xB5>๋‹ˆ๋‹ค.")
205
- return pd.DataFrame()
206
-
207
  BASE_URL = "https://api.naver.com"
208
  uri = "/keywordstool"
209
  method = "GET"
210
-
211
- try:
212
- headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
213
- params = {
214
- "hintKeywords": keyword, # ๋‹จ์ผ ํ‚ค์›Œ๋“œ ๋ฌธ์ž์—ด๋กœ ์ „๋‹ฌ
215
- "showDetail": "1"
216
- }
217
- # hintKeywords๋Š” ๋ฆฌ์ŠคํŠธ๋กœ ๋ฐ›์„ ์ˆ˜ ์žˆ์œผ๋‚˜, ์—ฌ๊ธฐ์„œ๋Š” ๋‹จ์ผ ํ‚ค์›Œ๋“œ ์ฒ˜๋ฆฌ๋ฅผ ๊ฐ€์ •ํ•˜๊ณ  ๋ฌธ์ž์—ด๋กœ ์ „๋‹ฌ
218
- # ๋งŒ์•ฝ API๊ฐ€ hintKeywords๋ฅผ ๋ฆฌ์ŠคํŠธ๋กœ๋งŒ ๋ฐ›๋Š”๋‹ค๋ฉด [keyword]๋กœ ์ˆ˜์ • ํ•„์š”
219
-
220
- response = requests.get(BASE_URL + uri, params=params, headers=headers, timeout=10)
221
- response.raise_for_status() # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์˜ˆ์™ธ ๋ฐœ์ƒ
222
- data = response.json()
223
-
224
- if "keywordList" not in data or not data["keywordList"]:
225
- debug_log(f"'{keyword}'์— ๋Œ€ํ•œ ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ๊ฒฐ๊ณผ ์—†์Œ.")
226
- return pd.DataFrame() # ๋นˆ DataFrame ๋ฐ˜ํ™˜
227
-
228
- df = pd.DataFrame(data["keywordList"])
229
-
230
- # API ์‘๋‹ต์— ํ•ด๋‹น ์ปฌ๋Ÿผ์ด ์—†์„ ๊ฒฝ์šฐ๋ฅผ ๋Œ€๋น„
231
- df["monthlyPcQcCnt"] = df.get("monthlyPcQcCnt", 0)
232
- df["monthlyMobileQcCnt"] = df.get("monthlyMobileQcCnt", 0)
233
-
234
- def parse_count(x):
235
- if pd.isna(x) or str(x).lower() == '< 10': # ๋„ค์ด๋ฒ„ API๋Š” 10 ๋ฏธ๋งŒ์ผ ๋•Œ "< 10"์œผ๋กœ ๋ฐ˜ํ™˜
236
- return 5 # ๋˜๋Š” 0, ๋˜๋Š” ๋‹ค๋ฅธ ๋Œ€ํ‘œ๊ฐ’ (์˜ˆ: 5)
237
- try:
238
- return int(str(x).replace(",", ""))
239
- except ValueError:
240
- return 0
241
-
242
- df["PC์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["monthlyPcQcCnt"].apply(parse_count)
243
- df["๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["monthlyMobileQcCnt"].apply(parse_count)
244
- df["ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["PC์›”๊ฒ€์ƒ‰๋Ÿ‰"] + df["๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰"]
245
- df.rename(columns={"relKeyword": "์ •๋ณดํ‚ค์›Œ๋“œ"}, inplace=True)
246
-
247
- # ํ•„์š”ํ•œ ์ปฌ๋Ÿผ๋งŒ ์„ ํƒ, ์—†๋Š” ๊ฒฝ์šฐ ๋Œ€๋น„
248
- required_cols = ["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"]
249
- result_df = pd.DataFrame(columns=required_cols)
250
- for col in required_cols:
251
- if col in df.columns:
252
- result_df[col] = df[col]
253
- else: # ํ•ด๋‹น ์ปฌ๋Ÿผ์ด API ์‘๋‹ต์— ์—†์„ ๊ฒฝ์šฐ ๊ธฐ๋ณธ๊ฐ’์œผ๋กœ ์ฑ„์›€
254
- if col == "์ •๋ณดํ‚ค์›Œ๋“œ": # ์ •๋ณดํ‚ค์›Œ๋“œ๋Š” ํ•„์ˆ˜
255
- debug_log(f"API ์‘๋‹ต์— 'relKeyword'๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. '{keyword}' ์ฒ˜๋ฆฌ ์ค‘๋‹จ.")
256
- return pd.DataFrame()
257
- result_df[col] = 0
258
-
259
- debug_log(f"fetch_related_keywords '{keyword}' ์™„๋ฃŒ, ๊ฒฐ๊ณผ {len(result_df)}๊ฐœ")
260
- return result_df.head(100) # ์ตœ๋Œ€ 100๊ฐœ๋กœ ์ œํ•œ
261
-
262
- except requests.exceptions.HTTPError as http_err:
263
- debug_log(f"HTTP ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_related_keywords for '{keyword}'): {http_err} - ์‘๋‹ต: {response.text if 'response' in locals() else 'N/A'}")
264
- except requests.exceptions.RequestException as req_err:
265
- debug_log(f"์š”์ฒญ ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_related_keywords for '{keyword}'): {req_err}")
266
- except Exception as e:
267
- debug_log(f"์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_related_keywords for '{keyword}'): {e}")
268
- return pd.DataFrame() # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ๋นˆ DataFrame ๋ฐ˜ํ™˜
269
-
270
 
271
  def fetch_blog_count(keyword):
272
  debug_log(f"fetch_blog_count ํ˜ธ์ถœ, ํ‚ค์›Œ๋“œ: {keyword}")
273
- client_id = get_env_variable("NAVER_SEARCH_CLIENT_ID")
274
- client_secret = get_env_variable("NAVER_SEARCH_CLIENT_SECRET")
275
-
276
- if not client_id or not client_secret:
277
- debug_log(f"๋„ค์ด๋ฒ„ ๊ฒ€์ƒ‰ API ํ‚ค ์ •๋ณด ๋ถ€์กฑ์œผ๋กœ '{keyword}' ๋ธ”๋กœ๊ทธ ์ˆ˜ ์กฐํšŒ๋ฅผ ๊ฑด๋„ˆ<0xEB><0x8><0xB5>๋‹ˆ๋‹ค.")
278
- return 0
279
-
280
  url = "https://openapi.naver.com/v1/search/blog.json"
281
  headers = {
282
  "X-Naver-Client-Id": client_id,
283
  "X-Naver-Client-Secret": client_secret
284
  }
285
- params = {"query": keyword, "display": 1} # display=1๋กœ ์„ค์ •ํ•˜์—ฌ total ๊ฐ’๋งŒ ๋น ๋ฅด๊ฒŒ ํ™•์ธ
286
-
287
- try:
288
- response = requests.get(url, headers=headers, params=params, timeout=5)
289
- response.raise_for_status() # HTTP ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์˜ˆ์™ธ ๋ฐœ์ƒ
290
  data = response.json()
291
- total_count = data.get("total", 0)
292
- debug_log(f"fetch_blog_count ๊ฒฐ๊ณผ: {total_count} for '{keyword}'")
293
- return total_count
294
- except requests.exceptions.HTTPError as http_err:
295
- debug_log(f"HTTP ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_blog_count for '{keyword}'): {http_err} - ์‘๋‹ต: {response.text}")
296
- except requests.exceptions.RequestException as req_err: # Timeout, ConnectionError ๋“ฑ
297
- debug_log(f"์š”์ฒญ ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_blog_count for '{keyword}'): {req_err}")
298
- except Exception as e: # JSONDecodeError ๋“ฑ ๊ธฐํƒ€ ์˜ˆ์™ธ
299
- debug_log(f"์•Œ ์ˆ˜ ์—†๋Š” ์˜ค๋ฅ˜ ๋ฐœ์ƒ (fetch_blog_count for '{keyword}'): {e}")
300
- return 0 # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ 0 ๋ฐ˜ํ™˜
301
 
302
  def create_excel_file(df):
303
- if df.empty:
304
- debug_log("๋นˆ DataFrame์œผ๋กœ Excel ํŒŒ์ผ์„ ์ƒ์„ฑํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
305
- # ๋นˆ ํŒŒ์ผ์„ ์ƒ์„ฑํ•˜๊ฑฐ๋‚˜, None์„ ๋ฐ˜ํ™˜ํ•˜์—ฌ Gradio์—์„œ ์ฒ˜๋ฆฌํ•˜๋„๋ก ํ•  ์ˆ˜ ์žˆ์Œ
306
- # ์—ฌ๊ธฐ์„œ๋Š” ๋นˆ ์ž„์‹œ ํŒŒ์ผ์„ ์ƒ์„ฑํ•˜์—ฌ ๋ฐ˜ํ™˜ (Gradio File ์ปดํฌ๋„ŒํŠธ๊ฐ€ ๊ฒฝ๋กœ๋ฅผ ๊ธฐ๋Œ€ํ•˜๋ฏ€๋กœ)
307
- with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
308
- excel_path = tmp.name
309
- # ๋นˆ ์—‘์…€ ํŒŒ์ผ์— ํ—ค๋”๋งŒ์ด๋ผ๋„ ์จ์ฃผ๋ ค๋ฉด
310
- # pd.DataFrame(columns=df.columns).to_excel(excel_path, index=False)
311
- # ์•„๋‹ˆ๋ฉด ๊ทธ๋ƒฅ ๋นˆ ํŒŒ์ผ์„ ๋ฐ˜ํ™˜
312
- return excel_path
313
-
314
- try:
315
- with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False, mode='w+b') as tmp:
316
- excel_path = tmp.name
317
- df.to_excel(excel_path, index=False, engine='openpyxl')
318
- debug_log(f"Excel ํŒŒ์ผ ์ƒ์„ฑ๋จ: {excel_path}")
319
- return excel_path
320
- except Exception as e:
321
- debug_log(f"Excel ํŒŒ์ผ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜: {e}")
322
- # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ๋นˆ ํŒŒ์ผ ๊ฒฝ๋กœ๋ผ๋„ ๋ฐ˜ํ™˜ (Gradio ํ˜ธํ™˜์„ฑ)
323
- with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
324
- return tmp.name
325
-
326
 
327
  def process_keyword(keywords: str, include_related: bool):
328
- debug_log(f"process_keyword ํ˜ธ์ถœ ์‹œ์ž‘, ํ‚ค์›Œ๋“œ๋“ค: '{keywords[:100]}...', ์—ฐ๊ด€๊ฒ€์ƒ‰์–ด ํฌํ•จ: {include_related}")
329
- input_keywords_orig = [k.strip() for k in keywords.splitlines() if k.strip()]
330
-
331
- if not input_keywords_orig:
332
- debug_log("์ž…๋ ฅ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
333
- return pd.DataFrame(columns=["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]), ""
334
-
335
- all_related_keywords_dfs = []
336
-
337
- # 1. fetch_related_keywords ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ
338
- debug_log(f"์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์‹œ์ž‘ (์ตœ๋Œ€ ์ž‘์—…์ž ์ˆ˜: {MAX_WORKERS_RELATED_KEYWORDS})")
339
- with ThreadPoolExecutor(max_workers=MAX_WORKERS_RELATED_KEYWORDS) as executor:
340
- future_to_keyword_related = {
341
- executor.submit(fetch_related_keywords, kw): kw for kw in input_keywords_orig
342
- }
343
- for i, future in enumerate(as_completed(future_to_keyword_related)):
344
- kw = future_to_keyword_related[future]
345
- try:
346
- df_kw_related = future.result() # DataFrame ๋ฐ˜ํ™˜
347
- if not df_kw_related.empty:
348
- # ์›๋ณธ ํ‚ค์›Œ๋“œ๊ฐ€ ๊ฒฐ๊ณผ์— ํฌํ•จ๋˜์–ด ์žˆ๋Š”์ง€ ํ™•์ธํ•˜๊ณ , ์—†์œผ๋ฉด ์ถ”๊ฐ€ ์‹œ๋„ (API๊ฐ€ ํ•ญ์ƒ relKeyword๋กœ ์ž์‹ ์„ ์ฃผ์ง„ ์•Š์Œ)
349
- # ํ•˜์ง€๋งŒ fetch_related_keywords์—์„œ ์ด๋ฏธ hintKeyword๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ๊ฒ€์ƒ‰ํ•˜๋ฏ€๋กœ,
350
- # ์ผ๋ฐ˜์ ์œผ๋กœ๋Š” ํ•ด๋‹น ํ‚ค์›Œ๋“œ ์ •๋ณด๊ฐ€ ์žˆ๊ฑฐ๋‚˜, ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ๋งŒ ๋‚˜์˜ด.
351
- # ์—ฌ๊ธฐ์„œ๋Š” API ์‘๋‹ต์„ ๊ทธ๋Œ€๋กœ ํ™œ์šฉ.
352
-
353
- # ์ฒซ ๋ฒˆ์งธ ์ž…๋ ฅ ํ‚ค์›Œ๋“œ์ด๊ณ , ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ํฌํ•จ ์˜ต์…˜์ด ์ผœ์ ธ ์žˆ์œผ๋ฉด ๋ชจ๋“  ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ๋ฅผ ์ถ”๊ฐ€
354
- # ๊ทธ ์™ธ์˜ ๊ฒฝ์šฐ์—๋Š” ํ•ด๋‹น ํ‚ค์›Œ๋“œ ์ž์ฒด์˜ ์ •๋ณด๋งŒ (์žˆ๋‹ค๋ฉด) ์‚ฌ์šฉํ•˜๊ฑฐ๋‚˜, ์ตœ์ƒ๋‹จ ํ‚ค์›Œ๋“œ ์‚ฌ์šฉ
355
- if include_related and kw == input_keywords_orig[0]:
356
- all_related_keywords_dfs.append(df_kw_related)
357
- debug_log(f"์ฒซ ๋ฒˆ์งธ ํ‚ค์›Œ๋“œ '{kw}'์˜ ๋ชจ๋“  ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ({len(df_kw_related)}๊ฐœ) ์ถ”๊ฐ€๋จ.")
358
- else:
359
- # ํ•ด๋‹น ํ‚ค์›Œ๋“œ์™€ ์ผ์น˜ํ•˜๋Š” ํ–‰์„ ์ฐพ๊ฑฐ๋‚˜, ์—†์œผ๋ฉด API๊ฐ€ ๋ฐ˜ํ™˜ํ•œ ์ฒซ๋ฒˆ์งธ ํ–‰์„ ์‚ฌ์šฉ
360
- row_kw = df_kw_related[df_kw_related["์ •๋ณดํ‚ค์›Œ๋“œ"] == kw]
361
- if not row_kw.empty:
362
- all_related_keywords_dfs.append(row_kw)
363
- debug_log(f"ํ‚ค์›Œ๋“œ '{kw}'์˜ ์ง์ ‘ ์ •๋ณด ์ถ”๊ฐ€๋จ.")
364
- elif not df_kw_related.empty : # ์ง์ ‘ ์ •๋ณด๋Š” ์—†์ง€๋งŒ ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ๋Š” ์žˆ์„ ๋•Œ
365
- all_related_keywords_dfs.append(df_kw_related.head(1)) # ๊ฐ€์žฅ ์—ฐ๊ด€์„ฑ ๋†’์€ ํ‚ค์›Œ๋“œ ์ถ”๊ฐ€
366
- debug_log(f"ํ‚ค์›Œ๋“œ '{kw}'์˜ ์ง์ ‘ ์ •๋ณด๋Š” ์—†์œผ๋‚˜, ๊ฐ€์žฅ ์—ฐ๊ด€์„ฑ ๋†’์€ ํ‚ค์›Œ๋“œ 1๊ฐœ ์ถ”๊ฐ€๋จ.")
367
- # else: ํ‚ค์›Œ๋“œ ์ •๋ณด๋„, ์—ฐ๊ด€ ์ •๋ณด๋„ ์—†์„ ๋•Œ (df_kw_related๊ฐ€ ๋น„์–ด์žˆ์Œ)
368
-
369
- debug_log(f"'{kw}' ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ ({i+1}/{len(input_keywords_orig)})")
370
- except Exception as e:
371
- debug_log(f"'{kw}' ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ ์ค‘ ๋ณ‘๋ ฌ ์ž‘์—… ์˜ค๋ฅ˜: {e}")
372
-
373
- if not all_related_keywords_dfs:
374
- debug_log("์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ์กฐํšŒ ๊ฒฐ๊ณผ๊ฐ€ ๋ชจ๋‘ ๋น„์–ด์žˆ์Šต๋‹ˆ๋‹ค.")
375
- # ๋นˆ DataFrame์— ๋ธ”๋กœ๊ทธ ๋ฌธ์„œ์ˆ˜ ์ปฌ๋Ÿผ ์ถ”๊ฐ€
376
- empty_df = pd.DataFrame(columns=["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"])
377
- empty_df["๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"] = None
378
- return empty_df, create_excel_file(empty_df)
379
-
380
- result_df = pd.concat(all_related_keywords_dfs, ignore_index=True)
381
- result_df.drop_duplicates(subset=["์ •๋ณดํ‚ค์›Œ๋“œ"], inplace=True) # ์ค‘๋ณต ์ œ๊ฑฐ
382
- debug_log(f"์—ฐ๊ด€ ํ‚ค์›Œ๋“œ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์™„๋ฃŒ. ํ†ตํ•ฉ๋œ DataFrame shape: {result_df.shape}")
383
-
384
- # 2. fetch_blog_count ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ
385
- keywords_for_blog_count = result_df["์ •๋ณดํ‚ค์›Œ๋“œ"].dropna().unique().tolist()
386
- blog_counts_map = {}
387
-
388
- if keywords_for_blog_count:
389
- debug_log(f"๋ธ”๋กœ๊ทธ ๋ฌธ์„œ ์ˆ˜ ์กฐํšŒ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์‹œ์ž‘ (ํ‚ค์›Œ๋“œ {len(keywords_for_blog_count)}๊ฐœ, ์ตœ๋Œ€ ์ž‘์—…์ž ์ˆ˜: {MAX_WORKERS_BLOG_COUNT})")
390
- with ThreadPoolExecutor(max_workers=MAX_WORKERS_BLOG_COUNT) as executor:
391
- future_to_keyword_blog = {
392
- executor.submit(fetch_blog_count, kw): kw for kw in keywords_for_blog_count
393
- }
394
- for i, future in enumerate(as_completed(future_to_keyword_blog)):
395
- kw = future_to_keyword_blog[future]
396
- try:
397
- count = future.result() # ์ˆซ์ž ๋ฐ˜ํ™˜
398
- blog_counts_map[kw] = count
399
- if (i+1) % 50 == 0: # ๋„ˆ๋ฌด ๋งŽ์€ ๋กœ๊ทธ ๋ฐฉ์ง€
400
- debug_log(f"๋ธ”๋กœ๊ทธ ์ˆ˜ ์กฐํšŒ ์ง„ํ–‰ ์ค‘... ({i+1}/{len(keywords_for_blog_count)})")
401
- except Exception as e:
402
- debug_log(f"'{kw}' ๋ธ”๋กœ๊ทธ ์ˆ˜ ์กฐํšŒ ์ค‘ ๋ณ‘๋ ฌ ์ž‘์—… ์˜ค๋ฅ˜: {e}")
403
- blog_counts_map[kw] = 0 # ์˜ค๋ฅ˜ ์‹œ 0์œผ๋กœ ์ฒ˜๋ฆฌ
404
-
405
- result_df["๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"] = result_df["์ •๋ณดํ‚ค์›Œ๋“œ"].map(blog_counts_map).fillna(0).astype(int)
406
- debug_log("๋ธ”๋กœ๊ทธ ๋ฌธ์„œ ์ˆ˜ ๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ ์™„๋ฃŒ.")
407
  else:
408
- result_df["๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"] = 0 # ์กฐํšŒํ•  ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์œผ๋ฉด 0์œผ๋กœ ์ฑ„์›€
409
-
410
  result_df.sort_values(by="ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", ascending=False, inplace=True)
411
- debug_log(f"process_keyword ์ตœ์ข… ์™„๋ฃŒ. DataFrame shape: {result_df.shape}")
412
-
413
- # ์ตœ์ข… ์ปฌ๋Ÿผ ์ˆœ์„œ ๋ฐ ์กด์žฌ ์—ฌ๋ถ€ ํ™•์ธ
414
- final_columns = ["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
415
- for col in final_columns:
416
- if col not in result_df.columns:
417
- result_df[col] = 0 if col != "์ •๋ณดํ‚ค์›Œ๋“œ" else "" # ์—†๋Š” ์ปฌ๋Ÿผ์€ ๊ธฐ๋ณธ๊ฐ’์œผ๋กœ ์ฑ„์›€
418
-
419
- result_df = result_df[final_columns] # ์ปฌ๋Ÿผ ์ˆœ์„œ ๊ณ ์ •
420
-
421
  return result_df, create_excel_file(result_df)
422
 
423
-
424
  # --- 형태소 분석과 검색량/블로그문서수 병합 ---
425
  def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
426
  debug_log("morphological_analysis_and_enrich ํ•จ์ˆ˜ ์‹œ์ž‘")
427
- df_freq, _ = analyze_text(text) # ์—‘์…€ ํŒŒ์ผ ๊ฒฝ๋กœ๋Š” ์—ฌ๊ธฐ์„  ์‚ฌ์šฉ ์•ˆ ํ•จ
428
-
429
  if df_freq.empty:
430
  debug_log("ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ๊ฐ€ ๋นˆ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์ž…๋‹ˆ๋‹ค.")
431
- return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]), ""
432
-
433
  if remove_freq1:
434
- before_count = len(df_freq)
435
- df_freq = df_freq[df_freq["๋นˆ๋„์ˆ˜"] > 1].copy() # .copy() ์ถ”๊ฐ€
436
- debug_log(f"๋นˆ๋„์ˆ˜ 1 ์ œ๊ฑฐ ์ ์šฉ๋จ. {before_count} -> {len(df_freq)}")
437
-
438
- if df_freq.empty:
439
- debug_log("๋นˆ๋„์ˆ˜ 1 ์ œ๊ฑฐ ํ›„ ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
440
- return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]), ""
441
-
442
- keywords_from_morph = "\n".join(df_freq["๋‹จ์–ด"].tolist())
443
- debug_log(f"ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ ({len(df_freq['๋‹จ์–ด'])}๊ฐœ)์— ๋Œ€ํ•œ ์ •๋ณด ์กฐํšŒ ์‹œ์ž‘")
444
-
445
- # process_keyword๋Š” ์—ฐ๊ด€ ํ‚ค์›Œ๋“œ๋ฅผ ํฌํ•จํ•˜์ง€ ์•Š๋„๋ก ํ˜ธ์ถœ (include_related=False)
446
- df_keyword_info, _ = process_keyword(keywords_from_morph, include_related=False)
447
- debug_log("ํ˜•ํƒœ์†Œ ๋ถ„์„ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•œ ๊ฒ€์ƒ‰๋Ÿ‰ ๋ฐ ๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜ ์กฐํšŒ ์™„๋ฃŒ")
448
-
449
- if df_keyword_info.empty:
450
- debug_log("ํ˜•ํƒœ์†Œ ๋ถ„์„ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•œ API ์ •๋ณด ์กฐํšŒ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
451
- # df_freq์— ๋นˆ ์ปฌ๋Ÿผ๋“ค ์ถ”๊ฐ€
452
- for col in ["PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]:
453
- df_freq[col] = None
454
- merged_df = df_freq
455
- else:
456
- merged_df = pd.merge(df_freq, df_keyword_info, left_on="๋‹จ์–ด", right_on="์ •๋ณดํ‚ค์›Œ๋“œ", how="left")
457
- if "์ •๋ณดํ‚ค์›Œ๋“œ" in merged_df.columns: # merge ํ›„ ์ •๋ณดํ‚ค์›Œ๋“œ ์ปฌ๋Ÿผ์ด ์ƒ๊ฒผ๋‹ค๋ฉด ์‚ญ์ œ
458
- merged_df.drop(columns=["์ •๋ณดํ‚ค์›Œ๋“œ"], inplace=True, errors='ignore')
459
-
460
- # ๋ˆ„๋ฝ๋œ ์ปฌ๋Ÿผ ๊ธฐ๋ณธ๊ฐ’์œผ๋กœ ์ฑ„์šฐ๊ธฐ
461
- expected_cols = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
462
- for col in expected_cols:
463
- if col not in merged_df.columns:
464
- merged_df[col] = None if col not in ["๋นˆ๋„์ˆ˜"] else 0
465
-
466
- merged_df = merged_df[expected_cols] # ์ปฌ๋Ÿผ ์ˆœ์„œ ๊ณ ์ •
467
-
468
  merged_excel_path = create_excel_file(merged_df)
469
  debug_log("morphological_analysis_and_enrich ํ•จ์ˆ˜ ์™„๋ฃŒ")
470
  return merged_df, merged_excel_path
471
 
472
-
473
  # --- 직접 키워드 분석 (단독 분석) ---
474
  def direct_keyword_analysis(text: str, keyword_input: str):
475
  debug_log("direct_keyword_analysis ํ•จ์ˆ˜ ์‹œ์ž‘")
476
- direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', keyword_input) if kw.strip()]
477
- debug_log(f"์ž…๋ ฅ๋œ ์ง์ ‘ ํ‚ค์›Œ๋“œ ๋ชฉ๋ก: {direct_keywords_list}")
478
-
479
- if not direct_keywords_list:
480
- debug_log("์ง์ ‘ ์ž…๋ ฅ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
481
- return pd.DataFrame(columns=["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜"]), ""
482
-
483
- # 1. ๋ณธ๋ฌธ ๋‚ด ๋นˆ๋„์ˆ˜ ๊ณ„์‚ฐ
484
- results_freq = []
485
- for kw in direct_keywords_list:
486
- count = text.count(kw) # ๋Œ€์†Œ๋ฌธ์ž ๊ตฌ๋ถ„, ์ •ํ™•ํ•œ ๋ฌธ์ž์—ด ์นด์šดํŠธ
487
- results_freq.append({"ํ‚ค์›Œ๋“œ": kw, "๋นˆ๋„์ˆ˜": count})
488
- debug_log(f"์ง์ ‘ ํ‚ค์›Œ๋“œ '{kw}'์˜ ๋ณธ๋ฌธ ๋‚ด ๋นˆ๋„์ˆ˜: {count}")
489
- df_direct_freq = pd.DataFrame(results_freq)
490
-
491
- # 2. API๋ฅผ ํ†ตํ•ด ๊ฒ€์ƒ‰๋Ÿ‰ ๋ฐ ๋ธ”๋กœ๊ทธ ์ˆ˜ ์กฐํšŒ (๋ณ‘๋ ฌ ์ฒ˜๋ฆฌ๋œ process_keyword ์‚ฌ์šฉ)
492
- # ์—ฌ๊ธฐ์„œ๋Š” ๊ฐ ์ง์ ‘ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•œ ์ •๋ณด๋งŒ ํ•„์š”ํ•˜๋ฏ€๋กœ include_related=False
493
- keywords_for_api = "\n".join(direct_keywords_list)
494
- df_direct_api_info, _ = process_keyword(keywords_for_api, include_related=False)
495
-
496
- # 3. ๋นˆ๋„์ˆ˜ ๊ฒฐ๊ณผ์™€ API ๊ฒฐ๊ณผ ๋ณ‘ํ•ฉ
497
- if not df_direct_api_info.empty:
498
- # API ๊ฒฐ๊ณผ์˜ '์ •๋ณดํ‚ค์›Œ๋“œ'๋ฅผ 'ํ‚ค์›Œ๋“œ'๋กœ ๋ณ€๊ฒฝํ•˜์—ฌ ๋ณ‘ํ•ฉ ๊ธฐ์ค€ ํ†ต์ผ
499
- df_direct_api_info.rename(columns={"์ •๋ณดํ‚ค์›Œ๋“œ": "ํ‚ค์›Œ๋“œ"}, inplace=True)
500
- merged_df = pd.merge(df_direct_freq, df_direct_api_info, on="ํ‚ค์›Œ๋“œ", how="left")
501
- else:
502
- merged_df = df_direct_freq
503
- for col in ["PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]:
504
- merged_df[col] = None # API ์ •๋ณด๊ฐ€ ์—†์„ ๊ฒฝ์šฐ ๋นˆ ์ปฌ๋Ÿผ ์ถ”๊ฐ€
505
-
506
- # ์ปฌ๋Ÿผ ์ˆœ์„œ ๋ฐ ๊ธฐ๋ณธ๊ฐ’ ์ •๋ฆฌ
507
- final_cols = ["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
508
- for col in final_cols:
509
- if col not in merged_df.columns:
510
- merged_df[col] = 0 if col != "ํ‚ค์›Œ๋“œ" else ""
511
- merged_df = merged_df[final_cols]
512
-
513
-
514
- excel_path = create_excel_file(merged_df)
515
  debug_log("direct_keyword_analysis ํ•จ์ˆ˜ ์™„๋ฃŒ")
516
- return merged_df, excel_path
517
-
518
 
519
  # --- 통합 분석 (형태소 분석 + 직접 키워드 분석) ---
520
  def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
521
  debug_log("combined_analysis ํ•จ์ˆ˜ ์‹œ์ž‘")
522
-
523
- # 1. ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ธฐ๋ฐ˜ ๊ฒฐ๊ณผ (API ์ •๋ณด ํฌํ•จ)
524
- df_morph, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
525
- # df_morph ์ปฌ๋Ÿผ: "๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"
526
-
527
- # 2. ์ง์ ‘ ์ž…๋ ฅ ํ‚ค์›Œ๋“œ ์ฒ˜๋ฆฌ
528
- direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', direct_keyword_input) if kw.strip()]
529
- debug_log(f"ํ†ตํ•ฉ ๋ถ„์„ - ์ž…๋ ฅ๋œ ์ง์ ‘ ํ‚ค์›Œ๋“œ: {direct_keywords_list}")
530
-
531
- if not direct_keywords_list: # ์ง์ ‘ ์ž…๋ ฅ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์œผ๋ฉด ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ๋งŒ ๋ฐ˜ํ™˜
532
- if "์ง์ ‘์ž…๋ ฅ" not in df_morph.columns and not df_morph.empty:
533
- df_morph["์ง์ ‘์ž…๋ ฅ"] = "" # ์ง์ ‘์ž…๋ ฅ ์ปฌ๋Ÿผ ์ถ”๊ฐ€
534
- # ์ปฌ๋Ÿผ ์ˆœ์„œ ์กฐ์ •
535
- cols = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"]
536
- for col in cols:
537
- if col not in df_morph.columns:
538
- df_morph[col] = "" if col == "์ง์ ‘์ž…๋ ฅ" else (0 if col != "๋‹จ์–ด" else "")
539
- if not df_morph.empty: # df_morph๊ฐ€ ๋น„์–ด์žˆ์ง€ ์•Š์„ ๋•Œ๋งŒ ์ปฌ๋Ÿผ ์ˆœ์„œ ์ ์šฉ
540
- df_morph = df_morph[cols]
541
- return df_morph, create_excel_file(df_morph)
542
-
543
- # ์ง์ ‘ ์ž…๋ ฅ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•œ ์ •๋ณด (๋นˆ๋„์ˆ˜, API ์ •๋ณด) ๊ฐ€์ ธ์˜ค๊ธฐ
544
- # direct_keyword_analysis๋Š” "ํ‚ค์›Œ๋“œ" ์ปฌ๋Ÿผ์„ ์‚ฌ์šฉํ•˜๋ฏ€๋กœ, df_morph์˜ "๋‹จ์–ด"์™€ ํ†ต์ผ ํ•„์š”
545
- df_direct_raw, _ = direct_keyword_analysis(blog_text, direct_keyword_input)
546
- # df_direct_raw ์ปฌ๋Ÿผ: "ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"
547
- df_direct_raw.rename(columns={"ํ‚ค์›Œ๋“œ": "๋‹จ์–ด"}, inplace=True) # ์ปฌ๋Ÿผ๋ช… ํ†ต์ผ
548
-
549
- # ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ์— '์ง์ ‘์ž…๋ ฅ' ํ‘œ๊ธฐ
550
- if not df_morph.empty:
551
- df_morph["์ง์ ‘์ž…๋ ฅ"] = df_morph["๋‹จ์–ด"].apply(lambda x: "์ง์ ‘์ž…๋ ฅ" if x in direct_keywords_list else "")
552
- else: # ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ๊ฐ€ ๋น„์–ด์žˆ์„ ์ˆ˜ ์žˆ์Œ
553
- df_morph = pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"])
554
-
555
-
556
- # ์ง์ ‘ ์ž…๋ ฅ๋œ ํ‚ค์›Œ๋“œ ์ค‘ ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ์— ์—†๋Š” ๊ฒƒ๋“ค์„ ์ถ”๊ฐ€
557
- # df_direct_raw์—๋Š” ๋ชจ๋“  ์ง์ ‘ ์ž…๋ ฅ ํ‚ค์›Œ๋“œ์˜ ์ •๋ณด๊ฐ€ ์žˆ์Œ
558
-
559
- # df_morph์™€ df_direct_raw๋ฅผ ํ•ฉ์น˜๋˜, '๋‹จ์–ด' ๊ธฐ์ค€์œผ๋กœ ์ค‘๋ณต ์ฒ˜๋ฆฌ
560
- # ๋จผ์ € df_direct_raw์— '์ง์ ‘์ž…๋ ฅ' ์ปฌ๋Ÿผ์„ ์ถ”๊ฐ€ํ•˜๊ณ  "์ง์ ‘์ž…๋ ฅ"์œผ๋กœ ์ฑ„์›€
561
- df_direct_raw["์ง์ ‘์ž…๋ ฅ"] = "์ง์ ‘์ž…๋ ฅ"
562
-
563
- # df_morph์— ์žˆ๋Š” ๋‹จ์–ด๋Š” df_morph ์ •๋ณด๋ฅผ ์šฐ์„  ์‚ฌ์šฉ (์ง์ ‘์ž…๋ ฅ ํ”Œ๋ž˜๊ทธ๋งŒ ์—…๋ฐ์ดํŠธ)
564
- # df_direct_raw์—์„œ df_morph์— ์—†๋Š” ๋‹จ์–ด๋งŒ ๊ณจ๋ผ์„œ ์ถ”๊ฐ€
565
-
566
- # df_morph์˜ '์ง์ ‘์ž…๋ ฅ' ์ปฌ๋Ÿผ์€ ์ด๋ฏธ ์œ„์—์„œ ์ฒ˜๋ฆฌ๋จ.
567
- # ์ด์ œ df_direct_raw์—๋งŒ ์žˆ๏ฟฝ๏ฟฝ๏ฟฝ ํ‚ค์›Œ๋“œ๋ฅผ df_morph์— ์ถ”๊ฐ€
568
-
569
- # df_morph์— ์žˆ๋Š” ๋‹จ์–ด ๋ชฉ๋ก
570
- morph_words = df_morph['๋‹จ์–ด'].tolist() if not df_morph.empty else []
571
-
572
- rows_to_add = []
573
- for idx, row in df_direct_raw.iterrows():
574
- if row['๋‹จ์–ด'] not in morph_words:
575
- rows_to_add.append(row)
576
-
577
- if rows_to_add:
578
- df_to_add = pd.DataFrame(rows_to_add)
579
- combined_df = pd.concat([df_morph, df_to_add], ignore_index=True)
580
- else:
581
- combined_df = df_morph.copy() # df_morph๊ฐ€ ๋น„์–ด์žˆ์„ ์ˆ˜๋„ ์žˆ์Œ
582
-
583
- # ์ตœ์ข… ์ปฌ๋Ÿผ ์ •๋ฆฌ ๋ฐ ์ˆœ์„œ
584
- final_cols_combined = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"]
585
- for col in final_cols_combined:
586
- if col not in combined_df.columns:
587
- # ๊ธฐ๋ณธ๊ฐ’ ์„ค์ •: '์ง์ ‘์ž…๋ ฅ'์€ "", ๋‚˜๋จธ์ง€๋Š” 0 ๋˜๋Š” None (API ๊ฐ’์€ None ํ—ˆ์šฉ)
588
- if col == "์ง์ ‘์ž…๋ ฅ":
589
- combined_df[col] = ""
590
- elif col == "๋นˆ๋„์ˆ˜":
591
- combined_df[col] = 0
592
- elif col == "๋‹จ์–ด":
593
- combined_df[col] = ""
594
- else: # API ๊ด€๋ จ ์ปฌ๋Ÿผ
595
- combined_df[col] = None # pd.NA๋„ ๊ฐ€๋Šฅ
596
-
597
- # NA ๊ฐ’๋“ค์„ ์ ์ ˆํžˆ ์ฒ˜๋ฆฌ (์˜ˆ: 0์œผ๋กœ ์ฑ„์šฐ๊ฑฐ๋‚˜ ๊ทธ๋Œ€๋กœ ๋‘๊ธฐ)
598
- # API ๊ฐ’๋“ค์€ ์ˆซ์ž๊ฐ€ ์•„๋‹ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ (์˜ˆ: "< 10"), process_keyword์—์„œ ์ฒ˜๋ฆฌ๋จ. ์—ฌ๊ธฐ์„œ๋Š” intํ˜• ๋ณ€ํ™˜ ์ „์ด๋ฏ€๋กœ ๊ทธ๋Œ€๋กœ ๋‘ .
599
- # Gradio DataFrame์€ None์„ ์ž˜ ํ‘œ์‹œํ•จ.
600
- # ๋นˆ๋„์ˆ˜๋Š” ์ •์ˆ˜ํ˜•์ด์–ด์•ผ ํ•จ
601
- if "๋นˆ๋„์ˆ˜" in combined_df.columns:
602
- combined_df["๋นˆ๋„์ˆ˜"] = combined_df["๋นˆ๋„์ˆ˜"].fillna(0).astype(int)
603
-
604
-
605
- if not combined_df.empty : # ๋น„์–ด์žˆ์ง€ ์•Š์„ ๋•Œ๋งŒ ์ •๋ ฌ ๋ฐ ์ค‘๋ณต ์ œ๊ฑฐ
606
- combined_df = combined_df[final_cols_combined].drop_duplicates(subset=['๋‹จ์–ด'], keep='first') # ๋งŒ์•ฝ์„ ์œ„ํ•œ ์ค‘๋ณต ์ œ๊ฑฐ
607
- combined_df.sort_values(by=["์ง์ ‘์ž…๋ ฅ", "๋นˆ๋„์ˆ˜"], ascending=[False, False], inplace=True, na_position='last') # ์ง์ ‘์ž…๋ ฅ ์šฐ์„ , ๊ทธ ๋‹ค์Œ ๋นˆ๋„์ˆ˜
608
- combined_df.reset_index(drop=True, inplace=True)
609
-
610
- combined_excel = create_excel_file(combined_df)
611
  debug_log("combined_analysis ํ•จ์ˆ˜ ์™„๋ฃŒ")
612
- return combined_df, combined_excel
613
-
614
 
615
  # --- 분석 핸들러 ---
616
  def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
617
- debug_log(f"analysis_handler ํ•จ์ˆ˜ ์‹œ์ž‘. ์ง์ ‘ ํ‚ค์›Œ๋“œ๋งŒ ๋ถ„์„: {direct_keyword_only}")
618
- start_time = time.time()
619
-
620
- if not blog_text or blog_text.strip() == "์Šคํฌ๋ž˜ํ•‘๋œ ๋ธ”๋กœ๊ทธ ๋‚ด์šฉ์ด ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค." or blog_text.strip() == "":
621
- debug_log("๋ถ„์„ํ•  ๋ธ”๋กœ๊ทธ ๋‚ด์šฉ์ด ์—†์Šต๋‹ˆ๋‹ค.")
622
- # ๋นˆ ๊ฒฐ๊ณผ๋ฅผ ๋ฐ˜ํ™˜ํ•˜๊ธฐ ์œ„ํ•œ DataFrame ๊ตฌ์กฐ ๋ช…์‹œ
623
- empty_cols_direct = ["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
624
- empty_cols_combined = ["๋‹จ์–ด", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜", "์ง์ ‘์ž…๋ ฅ"]
625
- df_empty = pd.DataFrame(columns=empty_cols_direct if direct_keyword_only else empty_cols_combined)
626
- return df_empty, create_excel_file(df_empty)
627
-
628
-
629
  if direct_keyword_only:
630
  # "직접 키워드 입력만 분석" 선택 시 단독 분석 수행
631
- if not direct_keyword_input or not direct_keyword_input.strip():
632
- debug_log("์ง์ ‘ ํ‚ค์›Œ๋“œ๋งŒ ๋ถ„์„ ์„ ํƒ๋˜์—ˆ์œผ๋‚˜, ์ž…๋ ฅ๋œ ์ง์ ‘ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
633
- empty_cols_direct = ["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"]
634
- df_empty = pd.DataFrame(columns=empty_cols_direct)
635
- return df_empty, create_excel_file(df_empty)
636
-
637
- result_df, excel_path = direct_keyword_analysis(blog_text, direct_keyword_input)
638
  else:
639
  # 기본 통합 분석 수행
640
- result_df, excel_path = combined_analysis(blog_text, remove_freq1, direct_keyword_input)
641
-
642
- end_time = time.time()
643
- debug_log(f"analysis_handler ์ด ์‹คํ–‰ ์‹œ๊ฐ„: {end_time - start_time:.2f} ์ดˆ")
644
- return result_df, excel_path
645
-
646
 
647
  # --- 스크래핑 실행 ---
648
  def fetch_blog_content(url: str):
649
  debug_log("fetch_blog_content ํ•จ์ˆ˜ ์‹œ์ž‘")
650
- if not url or not url.strip():
651
- return "๋ธ”๋กœ๊ทธ URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”."
652
- if not url.startswith("http://") and not url.startswith("https://"):
653
- return "์œ ํšจํ•œ URL ํ˜•์‹(http:// ๋˜๋Š” https://)์œผ๋กœ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”."
654
-
655
- start_time = time.time()
656
  content = scrape_naver_blog(url)
657
- end_time = time.time()
658
- debug_log(f"fetch_blog_content ์ด ์‹คํ–‰ ์‹œ๊ฐ„: {end_time - start_time:.2f} ์ดˆ. ๋‚ด์šฉ ๊ธธ์ด: {len(content)}")
659
  return content
660
 
661
  # --- Custom CSS ---
662
  custom_css = """
663
  /* ์ „์ฒด ์ปจํ…Œ์ด๋„ˆ ์Šคํƒ€์ผ */
664
  .gradio-container {
665
- max-width: 1080px; /* ๋„ˆ๋น„ ํ™•์žฅ */
666
  margin: auto;
667
  font-family: 'Helvetica Neue', Arial, sans-serif;
668
  background: #f5f7fa;
@@ -696,12 +330,7 @@ custom_css = """
696
  padding: 0.6rem 1.2rem;
697
  font-size: 1rem;
698
  cursor: pointer;
699
- min-width: 150px; /* ๋ฒ„ํŠผ ์ตœ์†Œ ๋„ˆ๋น„ */
700
  }
701
- .custom-button:hover {
702
- background-color: #0056b3;
703
- }
704
-
705
 
706
  /* ์ฒดํฌ๋ฐ•์Šค ์Šคํƒ€์ผ */
707
  .custom-checkbox {
@@ -722,97 +351,39 @@ custom_css = """
722
  """
723
 
724
  # --- Gradio interface layout ---
# Two-column page: inputs on the left, results on the right.
# NOTE(review): the indentation of this block was lost in the scraped diff;
# the nesting below is reconstructed from statement order and the original
# column comments -- confirm against the real app.py.
with gr.Blocks(title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ํ‚ค์›Œ๋“œ ๋ถ„์„ ์„œ๋น„์Šค", css=custom_css) as demo:
    gr.HTML("<div class='custom-header'>๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ํ‚ค์›Œ๋“œ ๋ถ„์„ ์„œ๋น„์Šค</div>")

    with gr.Row():
        # Left column (input area)
        with gr.Column(scale=2):
            with gr.Group(elem_classes="custom-group"):
                blog_url_input = gr.Textbox(
                    label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
                    placeholder="์˜ˆ: https://blog.naver.com/์•„์ด๋””/๊ธ€๋ฒˆํ˜ธ",
                    lines=1,
                    info="๋ถ„์„ํ•  ๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๊ฒŒ์‹œ๋ฌผ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”.",
                )
                with gr.Row(elem_classes="centered"):
                    scrape_button = gr.Button(
                        "๋ธ”๋กœ๊ทธ ๋‚ด์šฉ ๊ฐ€์ ธ์˜ค๊ธฐ",
                        elem_classes="custom-button",
                        variant="primary",
                    )

            with gr.Group(elem_classes="custom-group"):
                blog_content_box = gr.Textbox(
                    label="๋ธ”๋กœ๊ทธ ๋‚ด์šฉ (์ˆ˜์ • ๊ฐ€๋Šฅ)",
                    lines=10,
                    placeholder="์Šคํฌ๋ž˜ํ•‘๋œ ๋ธ”๋กœ๊ทธ ๋‚ด์šฉ์ด ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค. ์ง์ ‘ ์ˆ˜์ •ํ•˜๊ฑฐ๋‚˜ ๋ถ™์—ฌ๋„ฃ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.",
                )

            with gr.Group(elem_classes="custom-group"):
                gr.Markdown("### ๋ถ„์„ ์˜ต์…˜ ์„ค์ •")
                with gr.Row():
                    remove_freq_checkbox = gr.Checkbox(
                        label="๋นˆ๋„์ˆ˜ 1์ธ ๋‹จ์–ด ์ œ๊ฑฐ (ํ˜•ํƒœ์†Œ ๋ถ„์„ ์‹œ)",
                        value=True,
                        elem_classes="custom-checkbox",
                        info="ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ์—์„œ ๋นˆ๋„์ˆ˜๊ฐ€ 1์ธ ๋‹จ์–ด๋ฅผ ์ œ์™ธํ•ฉ๋‹ˆ๋‹ค.",
                    )
                with gr.Row():
                    direct_keyword_only_checkbox = gr.Checkbox(
                        label="์ง์ ‘ ํ‚ค์›Œ๋“œ๋งŒ ๋ถ„์„",
                        value=False,
                        elem_classes="custom-checkbox",
                        info="์ด ์˜ต์…˜์„ ์„ ํƒํ•˜๋ฉด ์•„๋ž˜ ์ž…๋ ฅํ•œ ์ง์ ‘ ํ‚ค์›Œ๋“œ์— ๋Œ€ํ•ด์„œ๋งŒ ๋ถ„์„์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค (ํ˜•ํƒœ์†Œ ๋ถ„์„ ์ƒ๋žต).",
                    )
                with gr.Row():
                    direct_keyword_box = gr.Textbox(
                        label="์ง์ ‘ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ (์—”ํ„ฐ ๋˜๋Š” ','๋กœ ๊ตฌ๋ถ„)",
                        lines=3,
                        placeholder="์˜ˆ: ํ‚ค์›Œ๋“œ1, ํ‚ค์›Œ๋“œ2\nํ‚ค์›Œ๋“œ3\n...\n(ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ์™€ ๋ณ„๋„๋กœ ๋ถ„์„ํ•˜๊ฑฐ๋‚˜, ํ†ตํ•ฉ ๋ถ„์„์— ์ถ”๊ฐ€ํ•  ํ‚ค์›Œ๋“œ)",
                        info="๋ถ„์„์— ํฌํ•จํ•˜๊ฑฐ๋‚˜ ๋‹จ๋…์œผ๋กœ ๋ถ„์„ํ•  ํ‚ค์›Œ๋“œ๋ฅผ ์ง์ ‘ ์ž…๋ ฅํ•ฉ๋‹ˆ๋‹ค.",
                    )

            with gr.Group(elem_classes="custom-group"):
                with gr.Row(elem_classes="centered"):
                    analyze_button = gr.Button(
                        "ํ‚ค์›Œ๋“œ ๋ถ„์„ ์‹คํ–‰",
                        elem_classes="custom-button",
                        variant="primary",
                    )

        # Right column (result area)
        with gr.Column(scale=3):
            with gr.Group(elem_classes="custom-group custom-result"):
                gr.Markdown("### ๋ถ„์„ ๊ฒฐ๊ณผ")
                # Switched from gr.Dataframe to gr.DataFrame.
                # The height argument is left out for Gradio version
                # compatibility (it was height=600).
                result_df_display = gr.DataFrame(
                    label="ํ†ตํ•ฉ ๋ถ„์„ ๊ฒฐ๊ณผ (๋‹จ์–ด, ๋นˆ๋„์ˆ˜, ๊ฒ€์ƒ‰๋Ÿ‰, ๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜, ์ง์ ‘์ž…๋ ฅ ์—ฌ๋ถ€)",
                    interactive=False,
                    wrap=True,
                )
            with gr.Group(elem_classes="custom-group"):
                gr.Markdown("### ๊ฒฐ๊ณผ ๋‹ค์šด๋กœ๋“œ")
                excel_file_display = gr.File(label="๋ถ„์„ ๊ฒฐ๊ณผ Excel ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")

    # Event wiring
    scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
    analyze_button.click(
        fn=analysis_handler,
        inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
        outputs=[result_df_display, excel_file_display],
    )
795
 
if __name__ == "__main__":
    # Credentials are read from environment variables. Set them in the system
    # environment (or via a .env loader) before starting, for example:
    #   NAVER_API_KEY, NAVER_SECRET_KEY, NAVER_CUSTOMER_ID,
    #   NAVER_SEARCH_CLIENT_ID, NAVER_SEARCH_CLIENT_SECRET

    # Report which of the required variables are unset or empty.
    required_env_vars = [
        "NAVER_API_KEY", "NAVER_SECRET_KEY", "NAVER_CUSTOMER_ID",
        "NAVER_SEARCH_CLIENT_ID", "NAVER_SEARCH_CLIENT_SECRET",
    ]
    missing_vars = [name for name in required_env_vars if not os.environ.get(name)]
    if missing_vars:
        debug_log(f"๊ฒฝ๊ณ : ๋‹ค์Œ ํ•„์ˆ˜ ํ™˜๊ฒฝ ๋ณ€์ˆ˜๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค - {', '.join(missing_vars)}")
        debug_log("API ํ˜ธ์ถœ ๊ธฐ๋Šฅ์ด ์ •์ƒ์ ์œผ๋กœ ๋™์ž‘ํ•˜์ง€ ์•Š์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
        debug_log("์Šคํฌ๋ฆฝํŠธ ์‹คํ–‰ ์ „์— ํ•ด๋‹น ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ์„ค์ •ํ•ด์ฃผ์„ธ์š”.")
        # The app is still started; only a warning is logged here.

    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
    # debug=True makes errors easier to see during development.
    demo.launch(debug=True)
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")
 
1
  import gradio as gr
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ import urllib.parse # iframe ๊ฒฝ๋กœ ๋ณด์ •์„ ์œ„ํ•œ ๋ชจ๋“ˆ
5
  import re
6
  import logging
7
  import tempfile
8
  import pandas as pd
9
+ import mecab # pythonโ€‘mecabโ€‘ko ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์‚ฌ์šฉ
10
  import os
11
  import time
12
  import hmac
13
  import hashlib
14
  import base64
 
 
 
 
 
 
 
 
15
 
16
  # Debugging (logging) helper
def debug_log(message: str):
    """Print *message* to standard output, prefixed with ``[DEBUG]``."""
    print("[DEBUG] {}".format(message))
19
 
20
  # --- Naver blog scraping ---
21
  def scrape_naver_blog(url: str) -> str:
 
29
  )
30
  }
31
  try:
32
+ response = requests.get(url, headers=headers)
33
  debug_log("HTTP GET ์š”์ฒญ(๋ฉ”์ธ ํŽ˜์ด์ง€) ์™„๋ฃŒ")
34
  if response.status_code != 200:
35
  debug_log(f"์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
 
39
  iframe = soup.select_one("iframe#mainFrame")
40
  if not iframe:
41
  debug_log("iframe#mainFrame ํƒœ๊ทธ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
42
+ return "๋ณธ๋ฌธ iframe์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
 
 
 
 
 
 
 
 
 
43
  iframe_src = iframe.get("src")
44
  if not iframe_src:
45
  debug_log("iframe src๊ฐ€ ์กด์žฌํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.")
46
  return "๋ณธ๋ฌธ iframe์˜ src๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
47
+ parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
 
 
 
 
 
 
 
 
 
 
 
48
  debug_log(f"iframe ํŽ˜์ด์ง€ ์š”์ฒญ URL: {parsed_iframe_url}")
49
+ iframe_response = requests.get(parsed_iframe_url, headers=headers)
50
  debug_log("HTTP GET ์š”์ฒญ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
51
  if iframe_response.status_code != 200:
52
  debug_log(f"iframe ์š”์ฒญ ์‹คํŒจ, ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}")
53
  return f"iframe์—์„œ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. ์ƒํƒœ์ฝ”๋“œ: {iframe_response.status_code}"
54
  iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
55
  debug_log("HTML ํŒŒ์‹ฑ(iframe ํŽ˜์ด์ง€) ์™„๋ฃŒ")
56
+ title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
57
+ title = title_div.get_text(strip=True) if title_div else "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
 
 
 
 
 
 
 
 
 
 
 
 
58
  debug_log(f"์ถ”์ถœ๋œ ์ œ๋ชฉ: {title}")
59
+ content_div = iframe_soup.select_one('.se-main-container')
60
+ if content_div:
61
+ content = content_div.get_text("\n", strip=True)
62
+ else:
63
+ content = "๋ณธ๋ฌธ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  debug_log("๋ณธ๋ฌธ ์ถ”์ถœ ์™„๋ฃŒ")
65
  result = f"[์ œ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
66
  debug_log("์ œ๋ชฉ๊ณผ ๋ณธ๋ฌธ ํ•ฉ์นจ ์™„๋ฃŒ")
67
  return result
 
 
 
68
  except Exception as e:
69
+ debug_log(f"์—๋Ÿฌ ๋ฐœ์ƒ: {str(e)}")
70
  return f"์Šคํฌ๋ž˜ํ•‘ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
71
 
72
  # --- Morphological analysis (reference code 1) ---
def analyze_text(text: str):
    """Count noun frequencies in the Hangul part of *text*.

    Every character that is not a Hangul syllable is removed, the remainder
    is tokenised with MeCab, and tokens whose part-of-speech tag starts with
    ``NN`` are counted.

    Args:
        text: Raw text to analyse.

    Returns:
        A ``(DataFrame, path)`` tuple. The frame has a word column and a
        frequency column, sorted by descending frequency. ``path`` is the
        name of a temporary ``.xlsx`` file holding the same table, or ``""``
        (with an empty frame) when *text* contains no Hangul syllables.
    """
    # basicConfig is a no-op once the root logger already has handlers.
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.debug("์›๋ณธ ํ…์ŠคํŠธ: %s", text)
    # Keep Hangul syllables only (U+AC00..U+D7A3). The range is written with
    # escapes because the literal characters had been mis-decoded into a
    # reversed range, which makes re.sub raise re.error.
    filtered_text = re.sub(r'[^\uAC00-\uD7A3]', '', text)
    logger.debug("ํ•„ํ„ฐ๋ง๋œ ํ…์ŠคํŠธ: %s", filtered_text)
    if not filtered_text:
        logger.debug("์œ ํšจํ•œ ํ•œ๊ตญ์–ด ํ…์ŠคํŠธ๊ฐ€ ์—†์Œ.")
        return pd.DataFrame(columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"]), ""
    mecab_instance = mecab.MeCab()
    tokens = mecab_instance.pos(filtered_text)
    logger.debug("ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ: %s", tokens)
    freq = {}
    for word, pos in tokens:
        # Count only non-blank tokens tagged NN*.
        if word and word.strip() and pos.startswith("NN"):
            freq[word] = freq.get(word, 0) + 1
            logger.debug("๋‹จ์–ด: %s, ํ’ˆ์‚ฌ: %s, ๋นˆ๋„: %d", word, pos, freq[word])
    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    logger.debug("์ •๋ ฌ๋œ ๋‹จ์–ด ๋นˆ๋„: %s", sorted_freq)
    df = pd.DataFrame(sorted_freq, columns=["๋‹จ์–ด", "๋นˆ๋„์ˆ˜"])
    logger.debug("ํ˜•ํƒœ์†Œ ๋ถ„์„ DataFrame ์ƒ์„ฑ๋จ, shape: %s", df.shape)
    # Reserve the path and release our handle before pandas opens the file;
    # a second open of a still-open NamedTemporaryFile fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False, engine='openpyxl')
    logger.debug("Excel ํŒŒ์ผ ์ƒ์„ฑ๋จ: %s", excel_path)
    return df, excel_path
 
 
 
 
 
 
 
 
 
99
 
100
  # --- Naver search and ad API helpers (reference code 2) ---
101
  def generate_signature(timestamp, method, uri, secret_key):
 
114
  "X-Signature": signature
115
  }
116
 
 
 
 
 
 
 
 
 
def fetch_related_keywords(keyword):
    """Query the Naver keyword tool for *keyword* and its related keywords.

    Credentials are read from the ``NAVER_API_KEY``, ``NAVER_SECRET_KEY`` and
    ``NAVER_CUSTOMER_ID`` environment variables.

    Args:
        keyword: Hint keyword sent to the ``/keywordstool`` endpoint.

    Returns:
        A DataFrame with the keyword column plus PC, mobile and total monthly
        search volumes (at most 100 rows). An empty DataFrame is returned
        when the request fails, the body is not JSON, or the response has no
        non-empty ``keywordList``.

    Raises:
        KeyError: If one of the credential environment variables is missing.
    """
    debug_log(f"fetch_related_keywords ํ˜ธ์ถœ, ํ‚ค์›Œ๋“œ: {keyword}")
    API_KEY = os.environ["NAVER_API_KEY"]
    SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
    CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
    BASE_URL = "https://api.naver.com"
    uri = "/keywordstool"
    method = "GET"
    headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
    params = {
        "hintKeywords": [keyword],
        "showDetail": "1"
    }
    try:
        # A timeout keeps a stalled connection from hanging the UI handler.
        response = requests.get(BASE_URL + uri, params=params, headers=headers, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a non-JSON body: treat as "no data".
        debug_log(f"fetch_related_keywords request failed: {exc}")
        return pd.DataFrame()
    # An empty keywordList would yield a frame without the count columns.
    if not isinstance(data, dict) or not data.get("keywordList"):
        return pd.DataFrame()
    df = pd.DataFrame(data["keywordList"]).head(100)

    def parse_count(x):
        # Values that are not plain integers (after removing thousands
        # separators) count as zero.
        try:
            return int(str(x).replace(",", ""))
        except ValueError:
            return 0

    df["PC์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["monthlyPcQcCnt"].apply(parse_count)
    df["๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["monthlyMobileQcCnt"].apply(parse_count)
    df["ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"] = df["PC์›”๊ฒ€์ƒ‰๋Ÿ‰"] + df["๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰"]
    df = df.rename(columns={"relKeyword": "์ •๋ณดํ‚ค์›Œ๋“œ"})
    result_df = df[["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"]]
    debug_log("fetch_related_keywords ์™„๋ฃŒ")
    return result_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
def fetch_blog_count(keyword):
    """Return the number of blog posts the Naver search API reports for *keyword*.

    Credentials are read from the ``NAVER_SEARCH_CLIENT_ID`` and
    ``NAVER_SEARCH_CLIENT_SECRET`` environment variables.

    Args:
        keyword: Search query.

    Returns:
        The ``total`` field of the response, or ``0`` when the field is
        absent, the status code is not 200, the request fails, or the body
        is not JSON.

    Raises:
        KeyError: If one of the credential environment variables is missing.
    """
    debug_log(f"fetch_blog_count ํ˜ธ์ถœ, ํ‚ค์›Œ๋“œ: {keyword}")
    client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
    client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
    url = "https://openapi.naver.com/v1/search/blog.json"
    headers = {
        "X-Naver-Client-Id": client_id,
        "X-Naver-Client-Secret": client_secret
    }
    # display=1: only the total is needed, so ask for a single item.
    params = {"query": keyword, "display": 1}
    try:
        # A timeout keeps a stalled connection from hanging the caller.
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except requests.RequestException as exc:
        debug_log(f"fetch_blog_count request failed: {exc}")
        return 0
    if response.status_code != 200:
        debug_log(f"fetch_blog_count ์˜ค๋ฅ˜, ์ƒํƒœ์ฝ”๋“œ: {response.status_code}")
        return 0
    try:
        data = response.json()
    except ValueError as exc:
        debug_log(f"fetch_blog_count invalid JSON: {exc}")
        return 0
    debug_log(f"fetch_blog_count ๊ฒฐ๊ณผ: {data.get('total', 0)}")
    return data.get("total", 0)
 
 
 
 
 
168
 
def create_excel_file(df):
    """Write *df* to a new temporary ``.xlsx`` file and return its path.

    The file is created with ``delete=False``, so it stays on disk after
    this function returns.
    """
    # Create the file only to obtain a unique name, then release the handle
    # before pandas writes to that path.
    handle = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
    try:
        excel_path = handle.name
    finally:
        handle.close()
    df.to_excel(excel_path, index=False)
    debug_log(f"Excel ํŒŒ์ผ ์ƒ์„ฑ๋จ: {excel_path}")
    return excel_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
def process_keyword(keywords: str, include_related: bool):
    """Look up search volume and blog-post count for each keyword line.

    Args:
        keywords: One keyword per line; blank lines are ignored.
        include_related: When true, the related keywords returned for the
            first input keyword are added to the result as well.

    Returns:
        A ``(DataFrame, excel_path)`` tuple. The frame holds one row per
        distinct keyword with its search volumes and blog-post count, sorted
        by descending total monthly search volume.
    """
    debug_log(f"process_keyword ํ˜ธ์ถœ, ํ‚ค์›Œ๋“œ๋“ค: {keywords}, ์—ฐ๊ด€๊ฒ€์ƒ‰์–ด ํฌํ•จ: {include_related}")
    input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
    result_dfs = []
    for idx, kw in enumerate(input_keywords):
        df_kw = fetch_related_keywords(kw)
        if df_kw.empty:
            continue
        row_kw = df_kw[df_kw["์ •๋ณดํ‚ค์›Œ๋“œ"] == kw]
        # Prefer the exact match; otherwise fall back to the first row.
        result_dfs.append(row_kw if not row_kw.empty else df_kw.head(1))
        # Related keywords are only expanded for the first input keyword.
        if include_related and idx == 0:
            df_related = df_kw[df_kw["์ •๋ณดํ‚ค์›Œ๋“œ"] != kw]
            if not df_related.empty:
                result_dfs.append(df_related)
    if result_dfs:
        result_df = pd.concat(result_dfs, ignore_index=True)
        result_df.drop_duplicates(subset=["์ •๋ณดํ‚ค์›Œ๋“œ"], inplace=True)
    else:
        # Same column names as fetch_related_keywords produces, so callers
        # see one schema whether or not anything was found.
        result_df = pd.DataFrame(columns=["์ •๋ณดํ‚ค์›Œ๋“œ", "PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰"])
    result_df["๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜"] = result_df["์ •๋ณดํ‚ค์›Œ๋“œ"].apply(fetch_blog_count)
    result_df.sort_values(by="ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", ascending=False, inplace=True)
    debug_log("process_keyword ์™„๋ฃŒ")
    return result_df, create_excel_file(result_df)
202
 
 
203
  # --- Merge morphological analysis with search volume / blog post counts ---
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
    """Run the noun-frequency analysis and attach keyword statistics.

    Args:
        text: Text to analyse.
        remove_freq1: When true, words that occur exactly once are dropped
            before the keyword lookup.

    Returns:
        A ``(DataFrame, excel_path)`` tuple. When the frequency analysis
        yields nothing, its empty frame and ``""`` are returned instead.
    """
    debug_log("morphological_analysis_and_enrich ํ•จ์ˆ˜ ์‹œ์ž‘")
    df_freq = analyze_text(text)[0]
    if df_freq.empty:
        debug_log("ํ˜•ํƒœ์†Œ ๋ถ„์„ ๊ฒฐ๊ณผ๊ฐ€ ๋นˆ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„์ž…๋‹ˆ๋‹ค.")
        return df_freq, ""

    if remove_freq1:
        before_shape = df_freq.shape
        keep_mask = df_freq["๋นˆ๋„์ˆ˜"] != 1
        df_freq = df_freq[keep_mask]
        debug_log(f"๋นˆ๋„์ˆ˜ 1 ์ œ๊ฑฐ ์ ์šฉ๋จ. {before_shape} -> {df_freq.shape}")

    # process_keyword expects one keyword per line.
    keywords = "\n".join(df_freq["๋‹จ์–ด"].tolist())
    debug_log(f"๋ถ„์„๋œ ํ‚ค์›Œ๋“œ: {keywords}")
    df_keyword_info = process_keyword(keywords, include_related=False)[0]
    debug_log("๊ฒ€์ƒ‰๋Ÿ‰ ๋ฐ ๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜ ์กฐํšŒ ์™„๋ฃŒ")

    # Left join keeps every analysed word even without keyword statistics;
    # the duplicated join key from the right-hand frame is then removed.
    merged_df = pd.merge(
        df_freq,
        df_keyword_info,
        left_on="๋‹จ์–ด",
        right_on="์ •๋ณดํ‚ค์›Œ๋“œ",
        how="left",
    ).drop(columns=["์ •๋ณดํ‚ค์›Œ๋“œ"])
    merged_excel_path = create_excel_file(merged_df)
    debug_log("morphological_analysis_and_enrich ํ•จ์ˆ˜ ์™„๋ฃŒ")
    return merged_df, merged_excel_path
223
 
 
224
  # --- Direct keyword analysis (standalone) ---
def direct_keyword_analysis(text: str, keyword_input: str):
    """Count how often each user-supplied keyword occurs in *text*.

    Args:
        text: Text to search.
        keyword_input: Keywords separated by newlines and/or commas.

    Returns:
        A ``(DataFrame, excel_path)`` tuple with one row per keyword and its
        non-overlapping substring count in *text*.
    """
    debug_log("direct_keyword_analysis ํ•จ์ˆ˜ ์‹œ์ž‘")
    # Split on runs of newlines/commas, then drop blank pieces.
    keywords = [
        piece.strip()
        for piece in re.split(r'[\n,]+', keyword_input)
        if piece.strip()
    ]
    debug_log(f"์ž…๋ ฅ๋œ ํ‚ค์›Œ๋“œ ๋ชฉ๋ก: {keywords}")
    rows = []
    for kw in keywords:
        count = text.count(kw)
        rows.append((kw, count))
        debug_log(f"ํ‚ค์›Œ๋“œ '{kw}'์˜ ๋นˆ๋„์ˆ˜: {count}")
    df = pd.DataFrame(rows, columns=["ํ‚ค์›Œ๋“œ", "๋นˆ๋„์ˆ˜"])
    excel_path = create_excel_file(df)
    debug_log("direct_keyword_analysis ํ•จ์ˆ˜ ์™„๋ฃŒ")
    return df, excel_path
 
239
 
240
  # --- Combined analysis (morphological analysis + direct keyword analysis) ---
def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
    """Combine the morphological analysis with user-supplied keywords.

    Direct keywords already present in the analysis result are flagged in
    the direct-input column. The others are appended as new rows with their
    substring count in *blog_text* and, when available, keyword statistics.

    Args:
        blog_text: Text to analyse.
        remove_freq1: Passed to ``morphological_analysis_and_enrich``.
        direct_keyword_input: Keywords separated by newlines and/or commas.

    Returns:
        A ``(DataFrame, excel_path)`` tuple sorted by descending frequency.
    """
    debug_log("combined_analysis ํ•จ์ˆ˜ ์‹œ์ž‘")
    merged_df = morphological_analysis_and_enrich(blog_text, remove_freq1)[0]
    if "์ง์ ‘์ž…๋ ฅ" not in merged_df.columns:
        merged_df["์ง์ ‘์ž…๋ ฅ"] = ""
    direct_keywords = [
        piece.strip()
        for piece in re.split(r'[\n,]+', direct_keyword_input)
        if piece.strip()
    ]
    debug_log(f"์ž…๋ ฅ๋œ ์ง์ ‘ ํ‚ค์›Œ๋“œ: {direct_keywords}")
    stat_columns = ("PC์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ชจ๋ฐ”์ผ์›”๊ฒ€์ƒ‰๋Ÿ‰", "ํ† ํƒˆ์›”๊ฒ€์ƒ‰๋Ÿ‰", "๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜")
    for dk in direct_keywords:
        # Already in the table (from the analysis or an earlier direct
        # keyword): just flag the row.
        if dk in merged_df["๋‹จ์–ด"].values:
            merged_df.loc[merged_df["๋‹จ์–ด"] == dk, "์ง์ ‘์ž…๋ ฅ"] = "์ง์ ‘์ž…๋ ฅ"
            continue
        freq = blog_text.count(dk)
        df_direct = process_keyword(dk, include_related=False)[0]
        # Statistics stay None unless the lookup has an exact-match row.
        stats = dict.fromkeys(stat_columns)
        if (not df_direct.empty) and (dk in df_direct["์ •๋ณดํ‚ค์›Œ๋“œ"].values):
            match = df_direct[df_direct["์ •๋ณดํ‚ค์›Œ๋“œ"] == dk].iloc[0]
            stats = {column: match.get(column, None) for column in stat_columns}
        new_row = {"๋‹จ์–ด": dk, "๋นˆ๋„์ˆ˜": freq}
        new_row.update(stats)
        new_row["์ง์ ‘์ž…๋ ฅ"] = "์ง์ ‘์ž…๋ ฅ"
        merged_df = pd.concat([merged_df, pd.DataFrame([new_row])], ignore_index=True)
    merged_df = merged_df.sort_values(by="๋นˆ๋„์ˆ˜", ascending=False).reset_index(drop=True)
    combined_excel = create_excel_file(merged_df)
    debug_log("combined_analysis ํ•จ์ˆ˜ ์™„๋ฃŒ")
    return merged_df, combined_excel
 
277
 
278
  # --- Analysis handler ---
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
    """Dispatch the analyse button to the selected analysis mode.

    Args:
        blog_text: Text to analyse.
        remove_freq1: Drop words occurring once (combined mode only).
        direct_keyword_input: User-supplied keywords.
        direct_keyword_only: When true, only the direct keywords are analysed.

    Returns:
        The ``(DataFrame, excel_path)`` tuple of the chosen analysis.
    """
    debug_log("analysis_handler ํ•จ์ˆ˜ ์‹œ์ž‘")
    if not direct_keyword_only:
        # Default: combined analysis.
        return combined_analysis(blog_text, remove_freq1, direct_keyword_input)
    # "Direct keywords only" selected: run the standalone analysis.
    return direct_keyword_analysis(blog_text, direct_keyword_input)
 
 
 
 
 
287
 
288
  # --- Run scraping ---
def fetch_blog_content(url: str):
    """Scrape the blog post at *url* and return the resulting text.

    Thin wrapper around ``scrape_naver_blog`` that logs the start and end
    of the call.
    """
    debug_log("fetch_blog_content ํ•จ์ˆ˜ ์‹œ์ž‘")
    scraped = scrape_naver_blog(url)
    debug_log("fetch_blog_content ํ•จ์ˆ˜ ์™„๋ฃŒ")
    return scraped
294
 
295
  # --- Custom CSS ---
296
  custom_css = """
297
  /* ์ „์ฒด ์ปจํ…Œ์ด๋„ˆ ์Šคํƒ€์ผ */
298
  .gradio-container {
299
+ max-width: 960px;
300
  margin: auto;
301
  font-family: 'Helvetica Neue', Arial, sans-serif;
302
  background: #f5f7fa;
 
330
  padding: 0.6rem 1.2rem;
331
  font-size: 1rem;
332
  cursor: pointer;
 
333
  }
 
 
 
 
334
 
335
  /* ์ฒดํฌ๋ฐ•์Šค ์Šคํƒ€์ผ */
336
  .custom-checkbox {
 
351
  """
352
 
353
  # --- Gradio interface layout ---
# Single-column page: URL + scrape button, editable content, options,
# analyse button, result table, Excel download.
# NOTE(review): the indentation of this block was lost in the scraped diff;
# the nesting below is reconstructed from statement order and the original
# comments -- confirm against the real app.py.
with gr.Blocks(title="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ํ˜•ํƒœ์†Œ ๋ถ„์„ ์„œ๋น„์Šค", css=custom_css) as demo:
    gr.HTML("<div class='custom-header'>๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ํ˜•ํƒœ์†Œ ๋ถ„์„ ์„œ๋น„์Šค</div>")

    # Blog link and scrape button share one group (button centred).
    with gr.Group(elem_classes="custom-group"):
        with gr.Row():
            blog_url_input = gr.Textbox(
                label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ๋งํฌ",
                placeholder="์˜ˆ: https://blog.naver.com/ssboost/222983068507",
                lines=1,
            )
        with gr.Row(elem_classes="centered"):
            scrape_button = gr.Button("์Šคํฌ๋ž˜ํ•‘ ์‹คํ–‰", elem_classes="custom-button")

    with gr.Group(elem_classes="custom-group"):
        blog_content_box = gr.Textbox(
            label="๋ธ”๋กœ๊ทธ ๋‚ด์šฉ (์ˆ˜์ • ๊ฐ€๋Šฅ)",
            lines=10,
            placeholder="์Šคํฌ๋ž˜ํ•‘๋œ ๋ธ”๋กœ๊ทธ ๋‚ด์šฉ์ด ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.",
        )

    with gr.Group(elem_classes="custom-group"):
        with gr.Row():
            remove_freq_checkbox = gr.Checkbox(
                label="๋นˆ๋„์ˆ˜1 ์ œ๊ฑฐ",
                value=True,
                elem_classes="custom-checkbox",
            )
        with gr.Row():
            # The "direct keywords only" checkbox sits below the
            # "remove frequency 1" checkbox.
            direct_keyword_only_checkbox = gr.Checkbox(
                label="์ง์ ‘ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ๋งŒ ๋ถ„์„",
                value=False,
                elem_classes="custom-checkbox",
            )
        with gr.Row():
            direct_keyword_box = gr.Textbox(
                label="์ง์ ‘ ํ‚ค์›Œ๋“œ ์ž…๋ ฅ (์—”ํ„ฐ ๋˜๋Š” ','๋กœ ๊ตฌ๋ถ„)",
                lines=2,
                placeholder="์˜ˆ: ํ‚ค์›Œ๋“œ1, ํ‚ค์›Œ๋“œ2\nํ‚ค์›Œ๋“œ3",
            )

    with gr.Group(elem_classes="custom-group"):
        with gr.Row(elem_classes="centered"):
            analyze_button = gr.Button("๋ถ„์„ ์‹คํ–‰", elem_classes="custom-button")

    with gr.Group(elem_classes="custom-group custom-result"):
        result_df = gr.Dataframe(
            label="ํ†ตํ•ฉ ๋ถ„์„ ๊ฒฐ๊ณผ (๋‹จ์–ด, ๋นˆ๋„์ˆ˜, ๊ฒ€์ƒ‰๋Ÿ‰, ๋ธ”๋กœ๊ทธ๋ฌธ์„œ์ˆ˜, ์ง์ ‘์ž…๋ ฅ)",
            interactive=True,
        )

    with gr.Group(elem_classes="custom-group"):
        excel_file = gr.File(label="Excel ๋‹ค์šด๋กœ๋“œ")

    # Event wiring
    scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
    analyze_button.click(
        fn=analysis_handler,
        inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
        outputs=[result_df, excel_file],
    )
 
 
385
 
if __name__ == "__main__":
    # Start the Gradio app when run as a script, logging before launch and
    # again once launch() has returned.
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์‹œ์ž‘")
    demo.launch()
    debug_log("Gradio ์•ฑ ์‹คํ–‰ ์ข…๋ฃŒ")