Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,29 +1,21 @@
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
-
import urllib.parse
|
5 |
import re
|
6 |
import logging
|
7 |
import tempfile
|
8 |
import pandas as pd
|
9 |
-
import mecab
|
10 |
import os
|
11 |
import time
|
12 |
import hmac
|
13 |
import hashlib
|
14 |
import base64
|
15 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
16 |
-
|
17 |
-
# --- ๋ณ๋ ฌ ์ฒ๋ฆฌ ์ค์ ---
|
18 |
-
# API ํธ์ถ ์ ํ์ ๋ง์ถฐ ์ ์ ํ ์กฐ์ ํ์ธ์.
|
19 |
-
# ๋๋ฌด ๋์ ๊ฐ์ API ์ ํ์ ๊ฑธ๋ฆด ์ ์์ต๋๋ค.
|
20 |
-
MAX_WORKERS_RELATED_KEYWORDS = 5 # fetch_related_keywords ๋ณ๋ ฌ ์์
์ ์
|
21 |
-
MAX_WORKERS_BLOG_COUNT = 10 # fetch_blog_count ๋ณ๋ ฌ ์์
์ ์
|
22 |
-
|
23 |
|
24 |
# ๋๋ฒ๊น
(๋ก๊ทธ)์ฉ ํจ์
|
25 |
def debug_log(message: str):
|
26 |
-
print(f"[
|
27 |
|
28 |
# --- ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํ ---
|
29 |
def scrape_naver_blog(url: str) -> str:
|
@@ -37,7 +29,7 @@ def scrape_naver_blog(url: str) -> str:
|
|
37 |
)
|
38 |
}
|
39 |
try:
|
40 |
-
response = requests.get(url, headers=headers
|
41 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
42 |
if response.status_code != 200:
|
43 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
@@ -47,127 +39,63 @@ def scrape_naver_blog(url: str) -> str:
|
|
47 |
iframe = soup.select_one("iframe#mainFrame")
|
48 |
if not iframe:
|
49 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
50 |
-
|
51 |
-
content_div_direct = soup.select_one('.se-main-container')
|
52 |
-
if content_div_direct:
|
53 |
-
title_div_direct = soup.select_one('.se-module.se-module-text.se-title-text')
|
54 |
-
title = title_div_direct.get_text(strip=True) if title_div_direct else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
55 |
-
content = content_div_direct.get_text("\n", strip=True)
|
56 |
-
debug_log("iframe ์์ด ๋ณธ๋ฌธ ์ง์ ์ถ์ถ ์๋ฃ")
|
57 |
-
return f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
58 |
-
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค. (๋ณธ๋ฌธ ์ง์ ์ถ์ถ ์คํจ)"
|
59 |
-
|
60 |
iframe_src = iframe.get("src")
|
61 |
if not iframe_src:
|
62 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
63 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
64 |
-
|
65 |
-
# iframe_src๊ฐ ์ ๋ URL์ด ์๋ ๊ฒฝ์ฐ๋ฅผ ๋๋น
|
66 |
-
if iframe_src.startswith("//"):
|
67 |
-
parsed_iframe_url = "https:" + iframe_src
|
68 |
-
elif iframe_src.startswith("/"):
|
69 |
-
parsed_main_url = urllib.parse.urlparse(url)
|
70 |
-
parsed_iframe_url = urllib.parse.urlunparse(
|
71 |
-
(parsed_main_url.scheme, parsed_main_url.netloc, iframe_src, None, None, None)
|
72 |
-
)
|
73 |
-
else:
|
74 |
-
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
75 |
-
|
76 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
77 |
-
iframe_response = requests.get(parsed_iframe_url, headers=headers
|
78 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
79 |
if iframe_response.status_code != 200:
|
80 |
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
81 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
82 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
83 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
84 |
-
|
85 |
-
|
86 |
-
title_selectors = [
|
87 |
-
'.se-module.se-module-text.se-title-text', # ์ผ๋ฐ์ ์ธ ์ค๋งํธ์๋ํฐ ONE
|
88 |
-
'.title_text', # ๊ตฌ๋ฒ์ ์๋ํฐ ๋๋ ๋ค๋ฅธ ๊ตฌ์กฐ
|
89 |
-
'div[class*="title"] h3',
|
90 |
-
'h1', 'h2', 'h3' # ์ผ๋ฐ์ ์ธ ์ ๋ชฉ ํ๊ทธ
|
91 |
-
]
|
92 |
-
title = "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
93 |
-
for selector in title_selectors:
|
94 |
-
title_div = iframe_soup.select_one(selector)
|
95 |
-
if title_div:
|
96 |
-
title = title_div.get_text(strip=True)
|
97 |
-
break
|
98 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
'div.post_ct', # ์ผ๋ถ ๋ธ๋ก๊ทธ ๊ตฌ์กฐ
|
105 |
-
'article', 'main' # ์๋งจํฑ ํ๊ทธ
|
106 |
-
]
|
107 |
-
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
108 |
-
for selector in content_selectors:
|
109 |
-
content_div = iframe_soup.select_one(selector)
|
110 |
-
if content_div:
|
111 |
-
# ๋ถํ์ํ ์คํฌ๋ฆฝํธ, ์คํ์ผ ํ๊ทธ ์ ๊ฑฐ
|
112 |
-
for s in content_div(['script', 'style']):
|
113 |
-
s.decompose()
|
114 |
-
content = content_div.get_text("\n", strip=True)
|
115 |
-
break
|
116 |
-
|
117 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
118 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
119 |
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ํฉ์นจ ์๋ฃ")
|
120 |
return result
|
121 |
-
except requests.exceptions.Timeout:
|
122 |
-
debug_log(f"์์ฒญ ์๊ฐ ์ด๊ณผ: {url}")
|
123 |
-
return f"์คํฌ๋ํ ์ค ์๊ฐ ์ด๊ณผ๊ฐ ๋ฐ์ํ์ต๋๋ค: {url}"
|
124 |
except Exception as e:
|
125 |
-
debug_log(f"
|
126 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
127 |
|
128 |
# --- ํํ์ ๋ถ์ (์ฐธ์กฐ์ฝ๋-1) ---
|
129 |
def analyze_text(text: str):
|
130 |
-
logging.basicConfig(level=logging.
|
131 |
logger = logging.getLogger(__name__)
|
132 |
-
|
133 |
-
filtered_text = re.sub(r'[^๊ฐ-ํฃ
|
134 |
-
|
135 |
-
if not filtered_text
|
136 |
-
logger.
|
137 |
-
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
138 |
-
try:
|
139 |
-
mecab_instance = mecab.MeCab()
|
140 |
-
tokens = mecab_instance.pos(filtered_text)
|
141 |
-
except Exception as e:
|
142 |
-
logger.error(f"MeCab ํํ์ ๋ถ์ ์ค ์ค๋ฅ: {e}")
|
143 |
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
144 |
-
|
145 |
-
|
|
|
146 |
freq = {}
|
147 |
for word, pos in tokens:
|
148 |
-
|
149 |
-
if word and word.strip() and (pos.startswith("NN") or pos in ["SL", "SH"]) and len(word) > 1 :
|
150 |
freq[word] = freq.get(word, 0) + 1
|
151 |
-
|
152 |
-
|
153 |
sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
|
154 |
-
|
155 |
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
156 |
-
logger.
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
163 |
-
temp_file_path = temp_file.name
|
164 |
-
logger.info(f"Excel ํ์ผ ์์ฑ๋จ: {temp_file_path}")
|
165 |
-
except Exception as e:
|
166 |
-
logger.error(f"Excel ํ์ผ ์ ์ฅ ์ค ์ค๋ฅ: {e}")
|
167 |
-
temp_file_path = "" # ์ค๋ฅ ๋ฐ์ ์ ๊ฒฝ๋ก ์ด๊ธฐํ
|
168 |
-
|
169 |
-
return df, temp_file_path
|
170 |
-
|
171 |
|
172 |
# --- ๋ค์ด๋ฒ ๊ฒ์ ๋ฐ ๊ด๊ณ API ๊ด๋ จ (์ฐธ์กฐ์ฝ๋-2) ---
|
173 |
def generate_signature(timestamp, method, uri, secret_key):
|
@@ -186,483 +114,189 @@ def get_header(method, uri, api_key, secret_key, customer_id):
|
|
186 |
"X-Signature": signature
|
187 |
}
|
188 |
|
189 |
-
# API ํค ํ๊ฒฝ ๋ณ์ ํ์ธ ํจ์
|
190 |
-
def get_env_variable(var_name):
|
191 |
-
value = os.environ.get(var_name)
|
192 |
-
if value is None:
|
193 |
-
debug_log(f"ํ๊ฒฝ ๋ณ์ '{var_name}'๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค. API ํธ์ถ์ด ์คํจํ ์ ์์ต๋๋ค.")
|
194 |
-
# ํ์์ ์ฌ๊ธฐ์ raise Exception ๋๋ ๏ฟฝ๏ฟฝ๏ฟฝ๋ณธ๊ฐ ๋ฐํ
|
195 |
-
return value
|
196 |
-
|
197 |
def fetch_related_keywords(keyword):
|
198 |
-
debug_log(f"fetch_related_keywords
|
199 |
-
API_KEY =
|
200 |
-
SECRET_KEY =
|
201 |
-
CUSTOMER_ID =
|
202 |
-
|
203 |
-
if not all([API_KEY, SECRET_KEY, CUSTOMER_ID]):
|
204 |
-
debug_log(f"๋ค์ด๋ฒ ๊ด๊ณ API ํค ์ ๋ณด ๋ถ์กฑ์ผ๋ก '{keyword}' ์ฐ๊ด ํค์๋ ์กฐํ๋ฅผ ๊ฑด๋<0xEB><0x8><0xB5>๋๋ค.")
|
205 |
-
return pd.DataFrame()
|
206 |
-
|
207 |
BASE_URL = "https://api.naver.com"
|
208 |
uri = "/keywordstool"
|
209 |
method = "GET"
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
return
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
def parse_count(x):
|
235 |
-
if pd.isna(x) or str(x).lower() == '< 10': # ๋ค์ด๋ฒ API๋ 10 ๋ฏธ๋ง์ผ ๋ "< 10"์ผ๋ก ๋ฐํ
|
236 |
-
return 5 # ๋๋ 0, ๋๋ ๋ค๋ฅธ ๋ํ๊ฐ (์: 5)
|
237 |
-
try:
|
238 |
-
return int(str(x).replace(",", ""))
|
239 |
-
except ValueError:
|
240 |
-
return 0
|
241 |
-
|
242 |
-
df["PC์๊ฒ์๋"] = df["monthlyPcQcCnt"].apply(parse_count)
|
243 |
-
df["๋ชจ๋ฐ์ผ์๊ฒ์๋"] = df["monthlyMobileQcCnt"].apply(parse_count)
|
244 |
-
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
245 |
-
df.rename(columns={"relKeyword": "์ ๋ณดํค์๋"}, inplace=True)
|
246 |
-
|
247 |
-
# ํ์ํ ์ปฌ๋ผ๋ง ์ ํ, ์๋ ๊ฒฝ์ฐ ๋๋น
|
248 |
-
required_cols = ["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"]
|
249 |
-
result_df = pd.DataFrame(columns=required_cols)
|
250 |
-
for col in required_cols:
|
251 |
-
if col in df.columns:
|
252 |
-
result_df[col] = df[col]
|
253 |
-
else: # ํด๋น ์ปฌ๋ผ์ด API ์๋ต์ ์์ ๊ฒฝ์ฐ ๊ธฐ๋ณธ๊ฐ์ผ๋ก ์ฑ์
|
254 |
-
if col == "์ ๋ณดํค์๋": # ์ ๋ณดํค์๋๋ ํ์
|
255 |
-
debug_log(f"API ์๋ต์ 'relKeyword'๊ฐ ์์ต๋๋ค. '{keyword}' ์ฒ๋ฆฌ ์ค๋จ.")
|
256 |
-
return pd.DataFrame()
|
257 |
-
result_df[col] = 0
|
258 |
-
|
259 |
-
debug_log(f"fetch_related_keywords '{keyword}' ์๋ฃ, ๊ฒฐ๊ณผ {len(result_df)}๊ฐ")
|
260 |
-
return result_df.head(100) # ์ต๋ 100๊ฐ๋ก ์ ํ
|
261 |
-
|
262 |
-
except requests.exceptions.HTTPError as http_err:
|
263 |
-
debug_log(f"HTTP ์ค๋ฅ ๋ฐ์ (fetch_related_keywords for '{keyword}'): {http_err} - ์๋ต: {response.text if 'response' in locals() else 'N/A'}")
|
264 |
-
except requests.exceptions.RequestException as req_err:
|
265 |
-
debug_log(f"์์ฒญ ์ค๋ฅ ๋ฐ์ (fetch_related_keywords for '{keyword}'): {req_err}")
|
266 |
-
except Exception as e:
|
267 |
-
debug_log(f"์ ์ ์๋ ์ค๋ฅ ๋ฐ์ (fetch_related_keywords for '{keyword}'): {e}")
|
268 |
-
return pd.DataFrame() # ์ค๋ฅ ๋ฐ์ ์ ๋น DataFrame ๋ฐํ
|
269 |
-
|
270 |
|
271 |
def fetch_blog_count(keyword):
|
272 |
debug_log(f"fetch_blog_count ํธ์ถ, ํค์๋: {keyword}")
|
273 |
-
client_id =
|
274 |
-
client_secret =
|
275 |
-
|
276 |
-
if not client_id or not client_secret:
|
277 |
-
debug_log(f"๋ค์ด๋ฒ ๊ฒ์ API ํค ์ ๋ณด ๋ถ์กฑ์ผ๋ก '{keyword}' ๋ธ๋ก๊ทธ ์ ์กฐํ๋ฅผ ๊ฑด๋<0xEB><0x8><0xB5>๋๋ค.")
|
278 |
-
return 0
|
279 |
-
|
280 |
url = "https://openapi.naver.com/v1/search/blog.json"
|
281 |
headers = {
|
282 |
"X-Naver-Client-Id": client_id,
|
283 |
"X-Naver-Client-Secret": client_secret
|
284 |
}
|
285 |
-
params = {"query": keyword, "display": 1}
|
286 |
-
|
287 |
-
|
288 |
-
response = requests.get(url, headers=headers, params=params, timeout=5)
|
289 |
-
response.raise_for_status() # HTTP ์ค๋ฅ ๋ฐ์ ์ ์์ธ ๋ฐ์
|
290 |
data = response.json()
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
except requests.exceptions.RequestException as req_err: # Timeout, ConnectionError ๋ฑ
|
297 |
-
debug_log(f"์์ฒญ ์ค๋ฅ ๋ฐ์ (fetch_blog_count for '{keyword}'): {req_err}")
|
298 |
-
except Exception as e: # JSONDecodeError ๋ฑ ๊ธฐํ ์์ธ
|
299 |
-
debug_log(f"์ ์ ์๋ ์ค๋ฅ ๋ฐ์ (fetch_blog_count for '{keyword}'): {e}")
|
300 |
-
return 0 # ์ค๋ฅ ๋ฐ์ ์ 0 ๋ฐํ
|
301 |
|
302 |
def create_excel_file(df):
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
excel_path = tmp.name
|
309 |
-
# ๋น ์์
ํ์ผ์ ํค๋๋ง์ด๋ผ๋ ์จ์ฃผ๋ ค๋ฉด
|
310 |
-
# pd.DataFrame(columns=df.columns).to_excel(excel_path, index=False)
|
311 |
-
# ์๋๋ฉด ๊ทธ๋ฅ ๋น ํ์ผ์ ๋ฐํ
|
312 |
-
return excel_path
|
313 |
-
|
314 |
-
try:
|
315 |
-
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False, mode='w+b') as tmp:
|
316 |
-
excel_path = tmp.name
|
317 |
-
df.to_excel(excel_path, index=False, engine='openpyxl')
|
318 |
-
debug_log(f"Excel ํ์ผ ์์ฑ๋จ: {excel_path}")
|
319 |
-
return excel_path
|
320 |
-
except Exception as e:
|
321 |
-
debug_log(f"Excel ํ์ผ ์์ฑ ์ค ์ค๋ฅ: {e}")
|
322 |
-
# ์ค๋ฅ ๋ฐ์ ์ ๋น ํ์ผ ๊ฒฝ๋ก๋ผ๋ ๋ฐํ (Gradio ํธํ์ฑ)
|
323 |
-
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
|
324 |
-
return tmp.name
|
325 |
-
|
326 |
|
327 |
def process_keyword(keywords: str, include_related: bool):
|
328 |
-
debug_log(f"process_keyword
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
if not df_kw_related.empty:
|
348 |
-
# ์๋ณธ ํค์๋๊ฐ ๊ฒฐ๊ณผ์ ํฌํจ๋์ด ์๋์ง ํ์ธํ๊ณ , ์์ผ๋ฉด ์ถ๊ฐ ์๋ (API๊ฐ ํญ์ relKeyword๋ก ์์ ์ ์ฃผ์ง ์์)
|
349 |
-
# ํ์ง๋ง fetch_related_keywords์์ ์ด๋ฏธ hintKeyword๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ๊ฒ์ํ๋ฏ๋ก,
|
350 |
-
# ์ผ๋ฐ์ ์ผ๋ก๋ ํด๋น ํค์๋ ์ ๋ณด๊ฐ ์๊ฑฐ๋, ์ฐ๊ด ํค์๋๋ง ๋์ด.
|
351 |
-
# ์ฌ๊ธฐ์๋ API ์๋ต์ ๊ทธ๋๋ก ํ์ฉ.
|
352 |
-
|
353 |
-
# ์ฒซ ๋ฒ์งธ ์
๋ ฅ ํค์๋์ด๊ณ , ์ฐ๊ด ํค์๋ ํฌํจ ์ต์
์ด ์ผ์ ธ ์์ผ๋ฉด ๋ชจ๋ ์ฐ๊ด ํค์๋๋ฅผ ์ถ๊ฐ
|
354 |
-
# ๊ทธ ์ธ์ ๊ฒฝ์ฐ์๋ ํด๋น ํค์๋ ์์ฒด์ ์ ๋ณด๋ง (์๋ค๋ฉด) ์ฌ์ฉํ๊ฑฐ๋, ์ต์๋จ ํค์๋ ์ฌ์ฉ
|
355 |
-
if include_related and kw == input_keywords_orig[0]:
|
356 |
-
all_related_keywords_dfs.append(df_kw_related)
|
357 |
-
debug_log(f"์ฒซ ๋ฒ์งธ ํค์๋ '{kw}'์ ๋ชจ๋ ์ฐ๊ด ํค์๋ ({len(df_kw_related)}๊ฐ) ์ถ๊ฐ๋จ.")
|
358 |
-
else:
|
359 |
-
# ํด๋น ํค์๋์ ์ผ์นํ๋ ํ์ ์ฐพ๊ฑฐ๋, ์์ผ๋ฉด API๊ฐ ๋ฐํํ ์ฒซ๋ฒ์งธ ํ์ ์ฌ์ฉ
|
360 |
-
row_kw = df_kw_related[df_kw_related["์ ๋ณดํค์๋"] == kw]
|
361 |
-
if not row_kw.empty:
|
362 |
-
all_related_keywords_dfs.append(row_kw)
|
363 |
-
debug_log(f"ํค์๋ '{kw}'์ ์ง์ ์ ๋ณด ์ถ๊ฐ๋จ.")
|
364 |
-
elif not df_kw_related.empty : # ์ง์ ์ ๋ณด๋ ์์ง๋ง ์ฐ๊ด ํค์๋๋ ์์ ๋
|
365 |
-
all_related_keywords_dfs.append(df_kw_related.head(1)) # ๊ฐ์ฅ ์ฐ๊ด์ฑ ๋์ ํค์๋ ์ถ๊ฐ
|
366 |
-
debug_log(f"ํค์๋ '{kw}'์ ์ง์ ์ ๋ณด๋ ์์ผ๋, ๊ฐ์ฅ ์ฐ๊ด์ฑ ๋์ ํค์๋ 1๊ฐ ์ถ๊ฐ๋จ.")
|
367 |
-
# else: ํค์๋ ์ ๋ณด๋, ์ฐ๊ด ์ ๋ณด๋ ์์ ๋ (df_kw_related๊ฐ ๋น์ด์์)
|
368 |
-
|
369 |
-
debug_log(f"'{kw}' ์ฐ๊ด ํค์๋ ์ฒ๋ฆฌ ์๋ฃ ({i+1}/{len(input_keywords_orig)})")
|
370 |
-
except Exception as e:
|
371 |
-
debug_log(f"'{kw}' ์ฐ๊ด ํค์๋ ์กฐํ ์ค ๋ณ๋ ฌ ์์
์ค๋ฅ: {e}")
|
372 |
-
|
373 |
-
if not all_related_keywords_dfs:
|
374 |
-
debug_log("์ฐ๊ด ํค์๋ ์กฐํ ๊ฒฐ๊ณผ๊ฐ ๋ชจ๋ ๋น์ด์์ต๋๋ค.")
|
375 |
-
# ๋น DataFrame์ ๋ธ๋ก๊ทธ ๋ฌธ์์ ์ปฌ๋ผ ์ถ๊ฐ
|
376 |
-
empty_df = pd.DataFrame(columns=["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"])
|
377 |
-
empty_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = None
|
378 |
-
return empty_df, create_excel_file(empty_df)
|
379 |
-
|
380 |
-
result_df = pd.concat(all_related_keywords_dfs, ignore_index=True)
|
381 |
-
result_df.drop_duplicates(subset=["์ ๋ณดํค์๋"], inplace=True) # ์ค๋ณต ์ ๊ฑฐ
|
382 |
-
debug_log(f"์ฐ๊ด ํค์๋ ๋ณ๋ ฌ ์ฒ๋ฆฌ ์๋ฃ. ํตํฉ๋ DataFrame shape: {result_df.shape}")
|
383 |
-
|
384 |
-
# 2. fetch_blog_count ๋ณ๋ ฌ ์ฒ๋ฆฌ
|
385 |
-
keywords_for_blog_count = result_df["์ ๋ณดํค์๋"].dropna().unique().tolist()
|
386 |
-
blog_counts_map = {}
|
387 |
-
|
388 |
-
if keywords_for_blog_count:
|
389 |
-
debug_log(f"๋ธ๋ก๊ทธ ๋ฌธ์ ์ ์กฐํ ๋ณ๋ ฌ ์ฒ๋ฆฌ ์์ (ํค์๋ {len(keywords_for_blog_count)}๊ฐ, ์ต๋ ์์
์ ์: {MAX_WORKERS_BLOG_COUNT})")
|
390 |
-
with ThreadPoolExecutor(max_workers=MAX_WORKERS_BLOG_COUNT) as executor:
|
391 |
-
future_to_keyword_blog = {
|
392 |
-
executor.submit(fetch_blog_count, kw): kw for kw in keywords_for_blog_count
|
393 |
-
}
|
394 |
-
for i, future in enumerate(as_completed(future_to_keyword_blog)):
|
395 |
-
kw = future_to_keyword_blog[future]
|
396 |
-
try:
|
397 |
-
count = future.result() # ์ซ์ ๋ฐํ
|
398 |
-
blog_counts_map[kw] = count
|
399 |
-
if (i+1) % 50 == 0: # ๋๋ฌด ๋ง์ ๋ก๊ทธ ๋ฐฉ์ง
|
400 |
-
debug_log(f"๋ธ๋ก๊ทธ ์ ์กฐํ ์งํ ์ค... ({i+1}/{len(keywords_for_blog_count)})")
|
401 |
-
except Exception as e:
|
402 |
-
debug_log(f"'{kw}' ๋ธ๋ก๊ทธ ์ ์กฐํ ์ค ๋ณ๋ ฌ ์์
์ค๋ฅ: {e}")
|
403 |
-
blog_counts_map[kw] = 0 # ์ค๋ฅ ์ 0์ผ๋ก ์ฒ๋ฆฌ
|
404 |
-
|
405 |
-
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].map(blog_counts_map).fillna(0).astype(int)
|
406 |
-
debug_log("๋ธ๋ก๊ทธ ๋ฌธ์ ์ ๋ณ๋ ฌ ์ฒ๋ฆฌ ์๋ฃ.")
|
407 |
else:
|
408 |
-
result_df["
|
409 |
-
|
410 |
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
411 |
-
debug_log(
|
412 |
-
|
413 |
-
# ์ต์ข
์ปฌ๋ผ ์์ ๋ฐ ์กด์ฌ ์ฌ๋ถ ํ์ธ
|
414 |
-
final_columns = ["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์"]
|
415 |
-
for col in final_columns:
|
416 |
-
if col not in result_df.columns:
|
417 |
-
result_df[col] = 0 if col != "์ ๋ณดํค์๋" else "" # ์๋ ์ปฌ๋ผ์ ๊ธฐ๋ณธ๊ฐ์ผ๋ก ์ฑ์
|
418 |
-
|
419 |
-
result_df = result_df[final_columns] # ์ปฌ๋ผ ์์ ๊ณ ์
|
420 |
-
|
421 |
return result_df, create_excel_file(result_df)
|
422 |
|
423 |
-
|
424 |
# --- ํํ์ ๋ถ์๊ณผ ๊ฒ์๋/๋ธ๋ก๊ทธ๋ฌธ์์ ๋ณํฉ ---
|
425 |
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
|
426 |
debug_log("morphological_analysis_and_enrich ํจ์ ์์")
|
427 |
-
df_freq, _ = analyze_text(text)
|
428 |
-
|
429 |
if df_freq.empty:
|
430 |
debug_log("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ๊ฐ ๋น ๋ฐ์ดํฐํ๋ ์์
๋๋ค.")
|
431 |
-
return
|
432 |
-
|
433 |
if remove_freq1:
|
434 |
-
|
435 |
-
df_freq = df_freq[df_freq["๋น๋์"]
|
436 |
-
debug_log(f"๋น๋์ 1 ์ ๊ฑฐ ์ ์ฉ๋จ. {
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
debug_log(f"ํํ์ ๋ถ์ ๊ธฐ๋ฐ ํค์๋ ({len(df_freq['๋จ์ด'])}๊ฐ)์ ๋ํ ์ ๋ณด ์กฐํ ์์")
|
444 |
-
|
445 |
-
# process_keyword๋ ์ฐ๊ด ํค์๋๋ฅผ ํฌํจํ์ง ์๋๋ก ํธ์ถ (include_related=False)
|
446 |
-
df_keyword_info, _ = process_keyword(keywords_from_morph, include_related=False)
|
447 |
-
debug_log("ํํ์ ๋ถ์ ํค์๋์ ๋ํ ๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ์๋ฃ")
|
448 |
-
|
449 |
-
if df_keyword_info.empty:
|
450 |
-
debug_log("ํํ์ ๋ถ์ ํค์๋์ ๋ํ API ์ ๋ณด ์กฐํ ๊ฒฐ๊ณผ๊ฐ ์์ต๋๋ค.")
|
451 |
-
# df_freq์ ๋น ์ปฌ๋ผ๋ค ์ถ๊ฐ
|
452 |
-
for col in ["PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์"]:
|
453 |
-
df_freq[col] = None
|
454 |
-
merged_df = df_freq
|
455 |
-
else:
|
456 |
-
merged_df = pd.merge(df_freq, df_keyword_info, left_on="๋จ์ด", right_on="์ ๋ณดํค์๋", how="left")
|
457 |
-
if "์ ๋ณดํค์๋" in merged_df.columns: # merge ํ ์ ๋ณดํค์๋ ์ปฌ๋ผ์ด ์๊ฒผ๋ค๋ฉด ์ญ์
|
458 |
-
merged_df.drop(columns=["์ ๋ณดํค์๋"], inplace=True, errors='ignore')
|
459 |
-
|
460 |
-
# ๋๋ฝ๋ ์ปฌ๋ผ ๊ธฐ๋ณธ๊ฐ์ผ๋ก ์ฑ์ฐ๊ธฐ
|
461 |
-
expected_cols = ["๋จ์ด", "๋น๋์", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์"]
|
462 |
-
for col in expected_cols:
|
463 |
-
if col not in merged_df.columns:
|
464 |
-
merged_df[col] = None if col not in ["๋น๋์"] else 0
|
465 |
-
|
466 |
-
merged_df = merged_df[expected_cols] # ์ปฌ๋ผ ์์ ๊ณ ์
|
467 |
-
|
468 |
merged_excel_path = create_excel_file(merged_df)
|
469 |
debug_log("morphological_analysis_and_enrich ํจ์ ์๋ฃ")
|
470 |
return merged_df, merged_excel_path
|
471 |
|
472 |
-
|
473 |
# --- ์ง์ ํค์๋ ๋ถ์ (๋จ๋
๋ถ์) ---
|
474 |
def direct_keyword_analysis(text: str, keyword_input: str):
|
475 |
debug_log("direct_keyword_analysis ํจ์ ์์")
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
count = text.count(kw) # ๋์๋ฌธ์ ๊ตฌ๋ถ, ์ ํํ ๋ฌธ์์ด ์นด์ดํธ
|
487 |
-
results_freq.append({"ํค์๋": kw, "๋น๋์": count})
|
488 |
-
debug_log(f"์ง์ ํค์๋ '{kw}'์ ๋ณธ๋ฌธ ๋ด ๋น๋์: {count}")
|
489 |
-
df_direct_freq = pd.DataFrame(results_freq)
|
490 |
-
|
491 |
-
# 2. API๋ฅผ ํตํด ๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ ์ ์กฐํ (๋ณ๋ ฌ ์ฒ๋ฆฌ๋ process_keyword ์ฌ์ฉ)
|
492 |
-
# ์ฌ๊ธฐ์๋ ๊ฐ ์ง์ ํค์๋์ ๋ํ ์ ๋ณด๋ง ํ์ํ๋ฏ๋ก include_related=False
|
493 |
-
keywords_for_api = "\n".join(direct_keywords_list)
|
494 |
-
df_direct_api_info, _ = process_keyword(keywords_for_api, include_related=False)
|
495 |
-
|
496 |
-
# 3. ๋น๋์ ๊ฒฐ๊ณผ์ API ๊ฒฐ๊ณผ ๋ณํฉ
|
497 |
-
if not df_direct_api_info.empty:
|
498 |
-
# API ๊ฒฐ๊ณผ์ '์ ๋ณดํค์๋'๋ฅผ 'ํค์๋'๋ก ๋ณ๊ฒฝํ์ฌ ๋ณํฉ ๊ธฐ์ค ํต์ผ
|
499 |
-
df_direct_api_info.rename(columns={"์ ๋ณดํค์๋": "ํค์๋"}, inplace=True)
|
500 |
-
merged_df = pd.merge(df_direct_freq, df_direct_api_info, on="ํค์๋", how="left")
|
501 |
-
else:
|
502 |
-
merged_df = df_direct_freq
|
503 |
-
for col in ["PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์"]:
|
504 |
-
merged_df[col] = None # API ์ ๋ณด๊ฐ ์์ ๊ฒฝ์ฐ ๋น ์ปฌ๋ผ ์ถ๊ฐ
|
505 |
-
|
506 |
-
# ์ปฌ๋ผ ์์ ๋ฐ ๊ธฐ๋ณธ๊ฐ ์ ๋ฆฌ
|
507 |
-
final_cols = ["ํค์๋", "๋น๋์", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์"]
|
508 |
-
for col in final_cols:
|
509 |
-
if col not in merged_df.columns:
|
510 |
-
merged_df[col] = 0 if col != "ํค์๋" else ""
|
511 |
-
merged_df = merged_df[final_cols]
|
512 |
-
|
513 |
-
|
514 |
-
excel_path = create_excel_file(merged_df)
|
515 |
debug_log("direct_keyword_analysis ํจ์ ์๋ฃ")
|
516 |
-
return
|
517 |
-
|
518 |
|
519 |
# --- ํตํฉ ๋ถ์ (ํํ์ ๋ถ์ + ์ง์ ํค์๋ ๋ถ์) ---
|
520 |
def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
|
521 |
debug_log("combined_analysis ํจ์ ์์")
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
|
541 |
-
|
542 |
-
|
543 |
-
|
544 |
-
|
545 |
-
|
546 |
-
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
# ์ง์ ์
๋ ฅ๋ ํค์๋ ์ค ํํ์ ๋ถ์ ๊ฒฐ๊ณผ์ ์๋ ๊ฒ๋ค์ ์ถ๊ฐ
|
557 |
-
# df_direct_raw์๋ ๋ชจ๋ ์ง์ ์
๋ ฅ ํค์๋์ ์ ๋ณด๊ฐ ์์
|
558 |
-
|
559 |
-
# df_morph์ df_direct_raw๋ฅผ ํฉ์น๋, '๋จ์ด' ๊ธฐ์ค์ผ๋ก ์ค๋ณต ์ฒ๋ฆฌ
|
560 |
-
# ๋จผ์ df_direct_raw์ '์ง์ ์
๋ ฅ' ์ปฌ๋ผ์ ์ถ๊ฐํ๊ณ "์ง์ ์
๋ ฅ"์ผ๋ก ์ฑ์
|
561 |
-
df_direct_raw["์ง์ ์
๋ ฅ"] = "์ง์ ์
๋ ฅ"
|
562 |
-
|
563 |
-
# df_morph์ ์๋ ๋จ์ด๋ df_morph ์ ๋ณด๋ฅผ ์ฐ์ ์ฌ์ฉ (์ง์ ์
๋ ฅ ํ๋๊ทธ๋ง ์
๋ฐ์ดํธ)
|
564 |
-
# df_direct_raw์์ df_morph์ ์๋ ๋จ์ด๋ง ๊ณจ๋ผ์ ์ถ๊ฐ
|
565 |
-
|
566 |
-
# df_morph์ '์ง์ ์
๋ ฅ' ์ปฌ๋ผ์ ์ด๋ฏธ ์์์ ์ฒ๋ฆฌ๋จ.
|
567 |
-
# ์ด์ df_direct_raw์๋ง ์๏ฟฝ๏ฟฝ๏ฟฝ ํค์๋๋ฅผ df_morph์ ์ถ๊ฐ
|
568 |
-
|
569 |
-
# df_morph์ ์๋ ๋จ์ด ๋ชฉ๋ก
|
570 |
-
morph_words = df_morph['๋จ์ด'].tolist() if not df_morph.empty else []
|
571 |
-
|
572 |
-
rows_to_add = []
|
573 |
-
for idx, row in df_direct_raw.iterrows():
|
574 |
-
if row['๋จ์ด'] not in morph_words:
|
575 |
-
rows_to_add.append(row)
|
576 |
-
|
577 |
-
if rows_to_add:
|
578 |
-
df_to_add = pd.DataFrame(rows_to_add)
|
579 |
-
combined_df = pd.concat([df_morph, df_to_add], ignore_index=True)
|
580 |
-
else:
|
581 |
-
combined_df = df_morph.copy() # df_morph๊ฐ ๋น์ด์์ ์๋ ์์
|
582 |
-
|
583 |
-
# ์ต์ข
์ปฌ๋ผ ์ ๋ฆฌ ๋ฐ ์์
|
584 |
-
final_cols_combined = ["๋จ์ด", "๋น๋์", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์", "์ง์ ์
๋ ฅ"]
|
585 |
-
for col in final_cols_combined:
|
586 |
-
if col not in combined_df.columns:
|
587 |
-
# ๊ธฐ๋ณธ๊ฐ ์ค์ : '์ง์ ์
๋ ฅ'์ "", ๋๋จธ์ง๋ 0 ๋๋ None (API ๊ฐ์ None ํ์ฉ)
|
588 |
-
if col == "์ง์ ์
๋ ฅ":
|
589 |
-
combined_df[col] = ""
|
590 |
-
elif col == "๋น๋์":
|
591 |
-
combined_df[col] = 0
|
592 |
-
elif col == "๋จ์ด":
|
593 |
-
combined_df[col] = ""
|
594 |
-
else: # API ๊ด๋ จ ์ปฌ๋ผ
|
595 |
-
combined_df[col] = None # pd.NA๋ ๊ฐ๋ฅ
|
596 |
-
|
597 |
-
# NA ๊ฐ๋ค์ ์ ์ ํ ์ฒ๋ฆฌ (์: 0์ผ๋ก ์ฑ์ฐ๊ฑฐ๋ ๊ทธ๋๋ก ๋๊ธฐ)
|
598 |
-
# API ๊ฐ๋ค์ ์ซ์๊ฐ ์๋ ์ ์์ผ๋ฏ๋ก (์: "< 10"), process_keyword์์ ์ฒ๋ฆฌ๋จ. ์ฌ๊ธฐ์๋ intํ ๋ณํ ์ ์ด๋ฏ๋ก ๊ทธ๋๋ก ๋ .
|
599 |
-
# Gradio DataFrame์ None์ ์ ํ์ํจ.
|
600 |
-
# ๋น๋์๋ ์ ์ํ์ด์ด์ผ ํจ
|
601 |
-
if "๋น๋์" in combined_df.columns:
|
602 |
-
combined_df["๋น๋์"] = combined_df["๋น๋์"].fillna(0).astype(int)
|
603 |
-
|
604 |
-
|
605 |
-
if not combined_df.empty : # ๋น์ด์์ง ์์ ๋๋ง ์ ๋ ฌ ๋ฐ ์ค๋ณต ์ ๊ฑฐ
|
606 |
-
combined_df = combined_df[final_cols_combined].drop_duplicates(subset=['๋จ์ด'], keep='first') # ๋ง์ฝ์ ์ํ ์ค๋ณต ์ ๊ฑฐ
|
607 |
-
combined_df.sort_values(by=["์ง์ ์
๋ ฅ", "๋น๋์"], ascending=[False, False], inplace=True, na_position='last') # ์ง์ ์
๋ ฅ ์ฐ์ , ๊ทธ ๋ค์ ๋น๋์
|
608 |
-
combined_df.reset_index(drop=True, inplace=True)
|
609 |
-
|
610 |
-
combined_excel = create_excel_file(combined_df)
|
611 |
debug_log("combined_analysis ํจ์ ์๋ฃ")
|
612 |
-
return
|
613 |
-
|
614 |
|
615 |
# --- ๋ถ์ ํธ๋ค๋ฌ ---
|
616 |
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
|
617 |
-
debug_log(
|
618 |
-
start_time = time.time()
|
619 |
-
|
620 |
-
if not blog_text or blog_text.strip() == "์คํฌ๋ํ๋ ๋ธ๋ก๊ทธ ๋ด์ฉ์ด ์ฌ๊ธฐ์ ํ์๋ฉ๋๋ค." or blog_text.strip() == "":
|
621 |
-
debug_log("๋ถ์ํ ๋ธ๋ก๊ทธ ๋ด์ฉ์ด ์์ต๋๋ค.")
|
622 |
-
# ๋น ๊ฒฐ๊ณผ๋ฅผ ๋ฐํํ๊ธฐ ์ํ DataFrame ๊ตฌ์กฐ ๋ช
์
|
623 |
-
empty_cols_direct = ["ํค์๋", "๋น๋์", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์"]
|
624 |
-
empty_cols_combined = ["๋จ์ด", "๋น๋์", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์", "์ง์ ์
๋ ฅ"]
|
625 |
-
df_empty = pd.DataFrame(columns=empty_cols_direct if direct_keyword_only else empty_cols_combined)
|
626 |
-
return df_empty, create_excel_file(df_empty)
|
627 |
-
|
628 |
-
|
629 |
if direct_keyword_only:
|
630 |
# "์ง์ ํค์๋ ์
๋ ฅ๋ง ๋ถ์" ์ ํ ์ ๋จ๋
๋ถ์ ์ํ
|
631 |
-
|
632 |
-
debug_log("์ง์ ํค์๋๋ง ๋ถ์ ์ ํ๋์์ผ๋, ์
๋ ฅ๋ ์ง์ ํค์๋๊ฐ ์์ต๋๋ค.")
|
633 |
-
empty_cols_direct = ["ํค์๋", "๋น๋์", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์"]
|
634 |
-
df_empty = pd.DataFrame(columns=empty_cols_direct)
|
635 |
-
return df_empty, create_excel_file(df_empty)
|
636 |
-
|
637 |
-
result_df, excel_path = direct_keyword_analysis(blog_text, direct_keyword_input)
|
638 |
else:
|
639 |
# ๊ธฐ๋ณธ ํตํฉ ๋ถ์ ์ํ
|
640 |
-
|
641 |
-
|
642 |
-
end_time = time.time()
|
643 |
-
debug_log(f"analysis_handler ์ด ์คํ ์๊ฐ: {end_time - start_time:.2f} ์ด")
|
644 |
-
return result_df, excel_path
|
645 |
-
|
646 |
|
647 |
# --- ์คํฌ๋ํ ์คํ ---
|
648 |
def fetch_blog_content(url: str):
|
649 |
debug_log("fetch_blog_content ํจ์ ์์")
|
650 |
-
if not url or not url.strip():
|
651 |
-
return "๋ธ๋ก๊ทธ URL์ ์
๋ ฅํด์ฃผ์ธ์."
|
652 |
-
if not url.startswith("http://") and not url.startswith("https://"):
|
653 |
-
return "์ ํจํ URL ํ์(http:// ๋๋ https://)์ผ๋ก ์
๋ ฅํด์ฃผ์ธ์."
|
654 |
-
|
655 |
-
start_time = time.time()
|
656 |
content = scrape_naver_blog(url)
|
657 |
-
|
658 |
-
debug_log(f"fetch_blog_content ์ด ์คํ ์๊ฐ: {end_time - start_time:.2f} ์ด. ๋ด์ฉ ๊ธธ์ด: {len(content)}")
|
659 |
return content
|
660 |
|
661 |
# --- Custom CSS ---
|
662 |
custom_css = """
|
663 |
/* ์ ์ฒด ์ปจํ
์ด๋ ์คํ์ผ */
|
664 |
.gradio-container {
|
665 |
-
max-width:
|
666 |
margin: auto;
|
667 |
font-family: 'Helvetica Neue', Arial, sans-serif;
|
668 |
background: #f5f7fa;
|
@@ -696,12 +330,7 @@ custom_css = """
|
|
696 |
padding: 0.6rem 1.2rem;
|
697 |
font-size: 1rem;
|
698 |
cursor: pointer;
|
699 |
-
min-width: 150px; /* ๋ฒํผ ์ต์ ๋๋น */
|
700 |
}
|
701 |
-
.custom-button:hover {
|
702 |
-
background-color: #0056b3;
|
703 |
-
}
|
704 |
-
|
705 |
|
706 |
/* ์ฒดํฌ๋ฐ์ค ์คํ์ผ */
|
707 |
.custom-checkbox {
|
@@ -722,97 +351,39 @@ custom_css = """
|
|
722 |
"""
|
723 |
|
724 |
# --- Gradio ์ธํฐํ์ด์ค ๊ตฌ์ฑ ---
|
725 |
-
with gr.Blocks(title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ
|
726 |
-
gr.HTML("<div class='custom-header'>๋ค์ด๋ฒ ๋ธ๋ก๊ทธ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
727 |
|
728 |
-
with gr.Row():
|
729 |
-
with gr.Column(scale=2): # ์ผ์ชฝ ์ปฌ๋ผ (์
๋ ฅ ์์ญ)
|
730 |
-
with gr.Group(elem_classes="custom-group"):
|
731 |
-
blog_url_input = gr.Textbox(
|
732 |
-
label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ",
|
733 |
-
placeholder="์: https://blog.naver.com/์์ด๋/๊ธ๋ฒํธ",
|
734 |
-
lines=1,
|
735 |
-
info="๋ถ์ํ ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๊ฒ์๋ฌผ URL์ ์
๋ ฅํ์ธ์."
|
736 |
-
)
|
737 |
-
with gr.Row(elem_classes="centered"):
|
738 |
-
scrape_button = gr.Button("๋ธ๋ก๊ทธ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ", elem_classes="custom-button", variant="primary")
|
739 |
-
|
740 |
-
with gr.Group(elem_classes="custom-group"):
|
741 |
-
blog_content_box = gr.Textbox(
|
742 |
-
label="๋ธ๋ก๊ทธ ๋ด์ฉ (์์ ๊ฐ๋ฅ)",
|
743 |
-
lines=10,
|
744 |
-
placeholder="์คํฌ๋ํ๋ ๋ธ๋ก๊ทธ ๋ด์ฉ์ด ์ฌ๊ธฐ์ ํ์๋ฉ๋๋ค. ์ง์ ์์ ํ๊ฑฐ๋ ๋ถ์ฌ๋ฃ์ ์ ์์ต๋๋ค."
|
745 |
-
)
|
746 |
-
|
747 |
-
with gr.Group(elem_classes="custom-group"):
|
748 |
-
gr.Markdown("### ๋ถ์ ์ต์
์ค์ ")
|
749 |
-
with gr.Row():
|
750 |
-
remove_freq_checkbox = gr.Checkbox(
|
751 |
-
label="๋น๋์ 1์ธ ๋จ์ด ์ ๊ฑฐ (ํํ์ ๋ถ์ ์)",
|
752 |
-
value=True,
|
753 |
-
elem_classes="custom-checkbox",
|
754 |
-
info="ํํ์ ๋ถ์ ๊ฒฐ๊ณผ์์ ๋น๋์๊ฐ 1์ธ ๋จ์ด๋ฅผ ์ ์ธํฉ๋๋ค."
|
755 |
-
)
|
756 |
-
with gr.Row():
|
757 |
-
direct_keyword_only_checkbox = gr.Checkbox(
|
758 |
-
label="์ง์ ํค์๋๋ง ๋ถ์",
|
759 |
-
value=False,
|
760 |
-
elem_classes="custom-checkbox",
|
761 |
-
info="์ด ์ต์
์ ์ ํํ๋ฉด ์๋ ์
๋ ฅํ ์ง์ ํค์๋์ ๋ํด์๋ง ๋ถ์์ ์ํํฉ๋๋ค (ํํ์ ๋ถ์ ์๋ต)."
|
762 |
-
)
|
763 |
-
with gr.Row():
|
764 |
-
direct_keyword_box = gr.Textbox(
|
765 |
-
label="์ง์ ํค์๋ ์
๋ ฅ (์ํฐ ๋๋ ','๋ก ๊ตฌ๋ถ)",
|
766 |
-
lines=3,
|
767 |
-
placeholder="์: ํค์๋1, ํค์๋2\nํค์๋3\n...\n(ํํ์ ๋ถ์ ๊ฒฐ๊ณผ์ ๋ณ๋๋ก ๋ถ์ํ๊ฑฐ๋, ํตํฉ ๋ถ์์ ์ถ๊ฐํ ํค์๋)",
|
768 |
-
info="๋ถ์์ ํฌํจํ๊ฑฐ๋ ๋จ๋
์ผ๋ก ๋ถ์ํ ํค์๋๋ฅผ ์ง์ ์
๋ ฅํฉ๋๋ค."
|
769 |
-
)
|
770 |
-
|
771 |
-
with gr.Group(elem_classes="custom-group"):
|
772 |
-
with gr.Row(elem_classes="centered"):
|
773 |
-
analyze_button = gr.Button("ํค์๋ ๋ถ์ ์คํ", elem_classes="custom-button", variant="primary")
|
774 |
-
|
775 |
-
with gr.Column(scale=3): # ์ค๋ฅธ์ชฝ ์ปฌ๋ผ (๊ฒฐ๊ณผ ์์ญ)
|
776 |
-
with gr.Group(elem_classes="custom-group custom-result"):
|
777 |
-
gr.Markdown("### ๋ถ์ ๊ฒฐ๊ณผ")
|
778 |
-
result_df_display = gr.DataFrame( # gr.Dataframe -> gr.DataFrame ์ผ๋ก ๋ณ๊ฒฝ
|
779 |
-
label="ํตํฉ ๋ถ์ ๊ฒฐ๊ณผ (๋จ์ด, ๋น๋์, ๊ฒ์๋, ๋ธ๋ก๊ทธ๋ฌธ์์, ์ง์ ์
๋ ฅ ์ฌ๋ถ)",
|
780 |
-
interactive=False,
|
781 |
-
# height=600, # Gradio ๋ฒ์ ํธํ์ฑ์ ์ํด height ํ๋ผ๋ฏธํฐ ์ ๊ฑฐ ๋๋ ์ฃผ์ ์ฒ๋ฆฌ
|
782 |
-
wrap=True
|
783 |
-
)
|
784 |
-
with gr.Group(elem_classes="custom-group"):
|
785 |
-
gr.Markdown("### ๊ฒฐ๊ณผ ๋ค์ด๋ก๋")
|
786 |
-
excel_file_display = gr.File(label="๋ถ์ ๊ฒฐ๊ณผ Excel ํ์ผ ๋ค์ด๋ก๋")
|
787 |
-
|
788 |
# ์ด๋ฒคํธ ์ฐ๊ฒฐ
|
789 |
scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
|
790 |
-
analyze_button.click(
|
791 |
-
|
792 |
-
|
793 |
-
outputs=[result_df_display, excel_file_display]
|
794 |
-
)
|
795 |
|
796 |
if __name__ == "__main__":
|
797 |
-
# ํ๊ฒฝ ๋ณ์ ์ค์ ์์ (์ค์ ์คํ ์์๋ ์์คํ
ํ๊ฒฝ ๋ณ์๋ก ์ค์ ํ๊ฑฐ๋, .env ํ์ผ ๋ฑ์ ์ฌ์ฉ)
|
798 |
-
# os.environ["NAVER_API_KEY"] = "YOUR_NAVER_API_KEY"
|
799 |
-
# os.environ["NAVER_SECRET_KEY"] = "YOUR_NAVER_SECRET_KEY"
|
800 |
-
# os.environ["NAVER_CUSTOMER_ID"] = "YOUR_NAVER_CUSTOMER_ID"
|
801 |
-
# os.environ["NAVER_SEARCH_CLIENT_ID"] = "YOUR_NAVER_SEARCH_CLIENT_ID"
|
802 |
-
# os.environ["NAVER_SEARCH_CLIENT_SECRET"] = "YOUR_NAVER_SEARCH_CLIENT_SECRET"
|
803 |
-
|
804 |
-
# ํ๊ฒฝ ๋ณ์ ์ค์ ํ์ธ
|
805 |
-
required_env_vars = [
|
806 |
-
"NAVER_API_KEY", "NAVER_SECRET_KEY", "NAVER_CUSTOMER_ID",
|
807 |
-
"NAVER_SEARCH_CLIENT_ID", "NAVER_SEARCH_CLIENT_SECRET"
|
808 |
-
]
|
809 |
-
missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
|
810 |
-
if missing_vars:
|
811 |
-
debug_log(f"๊ฒฝ๊ณ : ๋ค์ ํ์ ํ๊ฒฝ ๋ณ์๊ฐ ์ค์ ๋์ง ์์์ต๋๋ค - {', '.join(missing_vars)}")
|
812 |
-
debug_log("API ํธ์ถ ๊ธฐ๋ฅ์ด ์ ์์ ์ผ๋ก ๋์ํ์ง ์์ ์ ์์ต๋๋ค.")
|
813 |
-
debug_log("์คํฌ๋ฆฝํธ ์คํ ์ ์ ํด๋น ํ๊ฒฝ ๋ณ์๋ฅผ ์ค์ ํด์ฃผ์ธ์.")
|
814 |
-
# Gradio ์ฑ์ ์คํํ๋, API ํธ์ถ ์ ์ค๋ฅ๊ฐ ๋ฐ์ํ ์ ์์์ ์ฌ์ฉ์์๊ฒ ์๋ฆผ.
|
815 |
-
|
816 |
debug_log("Gradio ์ฑ ์คํ ์์")
|
817 |
-
demo.launch(
|
818 |
-
debug_log("Gradio ์ฑ ์คํ ์ข
๋ฃ")
|
|
|
1 |
import gradio as gr
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
+
import urllib.parse # iframe ๊ฒฝ๋ก ๋ณด์ ์ ์ํ ๋ชจ๋
|
5 |
import re
|
6 |
import logging
|
7 |
import tempfile
|
8 |
import pandas as pd
|
9 |
+
import mecab # pythonโmecabโko ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ฌ์ฉ
|
10 |
import os
|
11 |
import time
|
12 |
import hmac
|
13 |
import hashlib
|
14 |
import base64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
# ๋๋ฒ๊น
(๋ก๊ทธ)์ฉ ํจ์
|
17 |
def debug_log(message: str):
|
18 |
+
print(f"[DEBUG] {message}")
|
19 |
|
20 |
# --- ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํ ---
|
21 |
def scrape_naver_blog(url: str) -> str:
|
|
|
29 |
)
|
30 |
}
|
31 |
try:
|
32 |
+
response = requests.get(url, headers=headers)
|
33 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
34 |
if response.status_code != 200:
|
35 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
|
|
39 |
iframe = soup.select_one("iframe#mainFrame")
|
40 |
if not iframe:
|
41 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
42 |
+
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
iframe_src = iframe.get("src")
|
44 |
if not iframe_src:
|
45 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
46 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
47 |
+
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
49 |
+
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
50 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
51 |
if iframe_response.status_code != 200:
|
52 |
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
53 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
54 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
55 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
56 |
+
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
57 |
+
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
59 |
+
content_div = iframe_soup.select_one('.se-main-container')
|
60 |
+
if content_div:
|
61 |
+
content = content_div.get_text("\n", strip=True)
|
62 |
+
else:
|
63 |
+
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
65 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
66 |
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ํฉ์นจ ์๋ฃ")
|
67 |
return result
|
|
|
|
|
|
|
68 |
except Exception as e:
|
69 |
+
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
70 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
71 |
|
72 |
# --- ํํ์ ๋ถ์ (์ฐธ์กฐ์ฝ๋-1) ---
|
73 |
def analyze_text(text: str):
    """Count noun frequencies in Korean text and export them to an Excel file.

    Every character outside the Hangul-syllable block is removed, the remainder
    is POS-tagged with MeCab, and tokens whose tag starts with "NN" are counted.

    Args:
        text: Raw text to analyse.

    Returns:
        A ``(DataFrame, path)`` tuple. The frame holds one row per noun sorted
        by descending frequency (ties keep first-seen order); ``path`` is the
        ``.xlsx`` temp file containing the same table, or ``""`` when the text
        contains no Hangul syllables.
    """
    from collections import Counter

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    columns = ["๋จ์ด", "๋น๋์"]

    # NOTE(review): "\x85" stands for the line-separator character embedded in
    # these literals in the source text — confirm against the file's encoding.
    logger.debug("์๋ณธ ํ\x85์คํธ: %s", text)
    # Escaped endpoints (U+AC00..U+D7A3) keep the Hangul range intact even if
    # this file is re-encoded; a mis-decoded literal range matches the wrong set.
    filtered_text = re.sub(r'[^\uac00-\ud7a3]', '', text)
    logger.debug("ํํฐ๋ง๋ ํ\x85์คํธ: %s", filtered_text)
    if not filtered_text:
        logger.debug("์ ํจํ ํ๊ตญ์ด ํ\x85์คํธ๊ฐ ์์.")
        return pd.DataFrame(columns=columns), ""

    tokens = mecab.MeCab().pos(filtered_text)
    logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)

    # Keep nouns only; most_common() sorts by count descending and is stable.
    noun_counts = Counter(
        word for word, pos in tokens
        if word and word.strip() and pos.startswith("NN")
    )
    sorted_freq = noun_counts.most_common()
    logger.debug("์ ๋ ฌ๋ ๋จ์ด ๋น๋: %s", sorted_freq)

    df = pd.DataFrame(sorted_freq, columns=columns)
    logger.debug("ํํ์ ๋ถ์ DataFrame ์์ฑ๋จ, shape: %s", df.shape)

    # Reserve the path and close the handle *before* pandas reopens the file by
    # name; writing while the handle is still open fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False, engine='openpyxl')
    logger.debug("Excel ํ์ผ ์์ฑ๋จ: %s", excel_path)
    return df, excel_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
# --- Naver Search and Ads API helpers (reference code 2) ---
|
101 |
def generate_signature(timestamp, method, uri, secret_key):
|
|
|
114 |
"X-Signature": signature
|
115 |
}
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
def fetch_related_keywords(keyword):
    """Query the Naver keyword tool for *keyword* and return search volumes.

    Args:
        keyword: Hint keyword sent as the ``hintKeywords`` query parameter.

    Returns:
        DataFrame with the keyword column plus PC, mobile and total monthly
        search counts, limited to the first 100 rows of the response. An empty
        DataFrame is returned when the request fails, the body is not JSON, or
        the payload has no ``keywordList`` entry.

    Raises:
        KeyError: If NAVER_API_KEY, NAVER_SECRET_KEY or NAVER_CUSTOMER_ID is
            not set in the environment.
    """
    debug_log(f"fetch_related_keywords ํธ์ถ, ํค์๋: {keyword}")
    API_KEY = os.environ["NAVER_API_KEY"]
    SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
    CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
    BASE_URL = "https://api.naver.com"
    uri = "/keywordstool"
    method = "GET"
    headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
    params = {
        "hintKeywords": [keyword],
        "showDetail": "1",
    }

    # A timeout keeps a stalled connection from hanging the UI callback, and a
    # transport/JSON failure is treated like "no keywordList" (empty result).
    try:
        response = requests.get(BASE_URL + uri, params=params, headers=headers, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        debug_log(f"fetch_related_keywords request failed: {exc}")
        return pd.DataFrame()

    if "keywordList" not in data:
        return pd.DataFrame()

    df = pd.DataFrame(data["keywordList"]).head(100)

    def parse_count(x):
        # Non-numeric values fail int() and are counted as zero.
        try:
            return int(str(x).replace(",", ""))
        except ValueError:
            return 0

    df["PC์๊ฒ์๋"] = df["monthlyPcQcCnt"].apply(parse_count)
    df["๋ชจ๋ฐ์ผ์๊ฒ์๋"] = df["monthlyMobileQcCnt"].apply(parse_count)
    df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
    df = df.rename(columns={"relKeyword": "์ ๋ณดํค์๋"})
    result_df = df[["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"]]
    debug_log("fetch_related_keywords ์๋ฃ")
    return result_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
|
150 |
def fetch_blog_count(keyword):
    """Return the number of Naver blog posts matching *keyword*.

    Args:
        keyword: Search query sent to the Naver blog search API.

    Returns:
        The ``total`` field of the response, or 0 when the field is absent, the
        status code is not 200, or the request/JSON decoding fails.

    Raises:
        KeyError: If NAVER_SEARCH_CLIENT_ID or NAVER_SEARCH_CLIENT_SECRET is
            not set in the environment.
    """
    debug_log(f"fetch_blog_count ํธ์ถ, ํค์๋: {keyword}")
    client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
    client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
    url = "https://openapi.naver.com/v1/search/blog.json"
    headers = {
        "X-Naver-Client-Id": client_id,
        "X-Naver-Client-Secret": client_secret,
    }
    params = {"query": keyword, "display": 1}

    # This function is applied row by row over a DataFrame; one stalled or
    # failed request must not hang or abort the whole batch, so fall back to 0.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except requests.RequestException as exc:
        debug_log(f"fetch_blog_count request failed: {exc}")
        return 0

    if response.status_code != 200:
        debug_log(f"fetch_blog_count ์ค๋ฅ, ์ํ์ฝ๋: {response.status_code}")
        return 0

    try:
        data = response.json()
    except ValueError as exc:
        debug_log(f"fetch_blog_count invalid JSON: {exc}")
        return 0

    debug_log(f"fetch_blog_count ๊ฒฐ๊ณผ: {data.get('total', 0)}")
    return data.get("total", 0)
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
def create_excel_file(df):
    """Write *df* to a new temporary ``.xlsx`` file and return its path.

    The file is created with ``delete=False``, so it stays on disk after this
    function returns.
    """
    placeholder = tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False)
    try:
        excel_path = placeholder.name
    finally:
        # Only the reserved name is needed; pandas reopens the path itself.
        placeholder.close()
    df.to_excel(excel_path, index=False)
    debug_log(f"Excel ํ์ผ ์์ฑ๋จ: {excel_path}")
    return excel_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
|
176 |
def process_keyword(keywords: str, include_related: bool):
    """Look up search volume and blog-post count for each input keyword.

    Args:
        keywords: Newline-separated keywords; blank lines are ignored.
        include_related: When true, the related keywords returned for the
            first input keyword are added to the result as well.

    Returns:
        A ``(DataFrame, excel_path)`` tuple. The frame has one row per distinct
        keyword, sorted by total monthly search volume in descending order,
        with a blog-document-count column filled via ``fetch_blog_count``.
    """
    debug_log(f"process_keyword ํธ์ถ, ํค์๋๋ค: {keywords}, ์ฐ๊ด๊ฒ์์ด ํฌํจ: {include_related}")
    keyword_col = "์ ๋ณดํค์๋"
    total_col = "ํ ํ์๊ฒ์๋"
    input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]

    result_dfs = []
    for idx, kw in enumerate(input_keywords):
        df_kw = fetch_related_keywords(kw)
        if df_kw.empty:
            continue
        exact = df_kw[keyword_col] == kw
        row_kw = df_kw[exact]
        # Fall back to the first returned row when the API has no exact match.
        result_dfs.append(row_kw if not row_kw.empty else df_kw.head(1))
        if include_related and idx == 0:
            df_related = df_kw[~exact]
            if not df_related.empty:
                result_dfs.append(df_related)

    if result_dfs:
        result_df = pd.concat(result_dfs, ignore_index=True)
        result_df.drop_duplicates(subset=[keyword_col], inplace=True)
    else:
        # Same column names as the non-empty path. The previous literal for the
        # mobile column contained replacement characters, so the empty frame
        # had a different schema from a populated one.
        result_df = pd.DataFrame(
            columns=[keyword_col, "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", total_col]
        )

    result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df[keyword_col].apply(fetch_blog_count)
    result_df.sort_values(by=total_col, ascending=False, inplace=True)
    debug_log("process_keyword ์๋ฃ")
    return result_df, create_excel_file(result_df)
|
202 |
|
|
|
203 |
# --- Merge morphological analysis with search volume / blog document count ---
|
204 |
def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
    """Run noun-frequency analysis and attach keyword statistics to each word.

    Args:
        text: Text passed to ``analyze_text``.
        remove_freq1: When true, rows whose frequency equals 1 are dropped
            before the keyword lookup.

    Returns:
        A ``(DataFrame, excel_path)`` tuple: the frequency table left-joined
        with the ``process_keyword`` result, and the Excel export of it. When
        the frequency table is empty it is returned as-is with ``""``.
    """
    word_col = "๋จ์ด"
    info_key = "์ ๋ณดํค์๋"

    debug_log("morphological_analysis_and_enrich ํจ์ ์์")
    freq_table, _ = analyze_text(text)
    if freq_table.empty:
        # NOTE(review): "\x85" stands for the line-separator character embedded
        # in this literal in the source text — confirm against the file.
        debug_log("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ๊ฐ ๋น ๋ฐ์ดํฐํ๋ ์์\x85๋๋ค.")
        return freq_table, ""

    if remove_freq1:
        shape_before = freq_table.shape
        freq_table = freq_table[freq_table["๋น๋์"] != 1]
        debug_log(f"๋น๋์ 1 ์ ๊ฑฐ ์ ์ฉ๋จ. {shape_before} -> {freq_table.shape}")

    # process_keyword expects one keyword per line.
    keyword_lines = "\n".join(freq_table[word_col].tolist())
    debug_log(f"๋ถ์๋ ํค์๋: {keyword_lines}")
    keyword_stats, _ = process_keyword(keyword_lines, include_related=False)
    debug_log("๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ์๋ฃ")

    # Left join keeps every analysed word even when no statistics were found.
    enriched = pd.merge(
        freq_table,
        keyword_stats,
        left_on=word_col,
        right_on=info_key,
        how="left",
    )
    enriched.drop(columns=[info_key], inplace=True)

    export_path = create_excel_file(enriched)
    debug_log("morphological_analysis_and_enrich ํจ์ ์๋ฃ")
    return enriched, export_path
|
223 |
|
|
|
224 |
# --- Direct keyword analysis (standalone) ---
|
225 |
def direct_keyword_analysis(text: str, keyword_input: str):
    """Count how often each user-supplied keyword occurs in *text*.

    Args:
        text: Text to search.
        keyword_input: Keywords separated by newlines and/or commas; blank
            entries are ignored.

    Returns:
        A ``(DataFrame, excel_path)`` tuple with one row per keyword and its
        non-overlapping occurrence count (``str.count``).
    """
    debug_log("direct_keyword_analysis ํจ์ ์์")
    terms = [
        piece.strip()
        for piece in re.split(r'[\n,]+', keyword_input)
        if piece.strip()
    ]
    # NOTE(review): "\x85" stands for the line-separator character embedded in
    # this literal in the source text — confirm against the file.
    debug_log(f"์\x85๋ ฅ๋ ํค์๋ ๋ชฉ๋ก: {terms}")

    rows = []
    for term in terms:
        occurrences = text.count(term)
        rows.append((term, occurrences))
        debug_log(f"ํค์๋ '{term}'์ ๋น๋์: {occurrences}")

    table = pd.DataFrame(rows, columns=["ํค์๋", "๋น๋์"])
    export_path = create_excel_file(table)
    debug_log("direct_keyword_analysis ํจ์ ์๋ฃ")
    return table, export_path
|
|
|
239 |
|
240 |
# --- Combined analysis (morphological analysis + direct keyword analysis) ---
|
241 |
def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
    """Merge the morphological analysis with user-supplied keywords.

    Keywords already present in the analysis table are flagged in the
    direct-input column. Any other keyword is appended as a new row carrying
    its occurrence count in *blog_text* and whatever statistics
    ``process_keyword`` returns for it (``None`` when there is no exact match).

    Args:
        blog_text: Text to analyse.
        remove_freq1: Forwarded to ``morphological_analysis_and_enrich``.
        direct_keyword_input: Keywords separated by newlines and/or commas.

    Returns:
        A ``(DataFrame, excel_path)`` tuple sorted by frequency, descending.
    """
    # NOTE(review): "\x85" stands for the line-separator character embedded in
    # these literals in the source text — confirm against the file.
    direct_flag = "์ง์ ์\x85๋ ฅ"
    word_col = "๋จ์ด"
    freq_col = "๋น๋์"
    info_key = "์ ๋ณดํค์๋"
    stat_cols = ("PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋", "๋ธ๋ก๊ทธ๋ฌธ์์")

    debug_log("combined_analysis ํจ์ ์์")
    merged_df, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
    if direct_flag not in merged_df.columns:
        merged_df[direct_flag] = ""

    direct_keywords = [
        piece.strip()
        for piece in re.split(r'[\n,]+', direct_keyword_input)
        if piece.strip()
    ]
    debug_log(f"์\x85๋ ฅ๋ ์ง์ ํค์๋: {direct_keywords}")

    for dk in direct_keywords:
        # Already analysed: just mark the existing row(s).
        if dk in merged_df[word_col].values:
            merged_df.loc[merged_df[word_col] == dk, direct_flag] = direct_flag
            continue

        occurrences = blog_text.count(dk)
        lookup, _ = process_keyword(dk, include_related=False)
        stats = dict.fromkeys(stat_cols)
        if not lookup.empty and dk in lookup[info_key].values:
            match = lookup[lookup[info_key] == dk].iloc[0]
            stats = {col: match.get(col, None) for col in stat_cols}

        appended = {word_col: dk, freq_col: occurrences}
        appended.update(stats)
        appended[direct_flag] = direct_flag
        merged_df = pd.concat([merged_df, pd.DataFrame([appended])], ignore_index=True)

    merged_df = merged_df.sort_values(by=freq_col, ascending=False).reset_index(drop=True)
    combined_excel = create_excel_file(merged_df)
    debug_log("combined_analysis ํจ์ ์๋ฃ")
    return merged_df, combined_excel
|
|
|
277 |
|
278 |
# --- Analysis handler ---
|
279 |
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
    """Dispatch the analyze-button click to the selected analysis mode.

    Returns the ``(DataFrame, excel_path)`` tuple produced by
    ``direct_keyword_analysis`` when *direct_keyword_only* is truthy, otherwise
    the one produced by ``combined_analysis``.
    """
    debug_log("analysis_handler ํจ์ ์์")
    if not direct_keyword_only:
        # Default mode: morphological analysis merged with the direct keywords.
        return combined_analysis(blog_text, remove_freq1, direct_keyword_input)
    # Only the user-supplied keywords are analysed.
    return direct_keyword_analysis(blog_text, direct_keyword_input)
|
|
|
|
|
|
|
|
|
|
|
287 |
|
288 |
# --- Scraping entry point ---
|
289 |
def fetch_blog_content(url: str):
    """Scrape the blog post at *url* and return what ``scrape_naver_blog`` yields.

    Thin wrapper used as the scrape-button callback; it only adds start/end
    log lines around the call.
    """
    debug_log("fetch_blog_content ํจ์ ์์")
    scraped = scrape_naver_blog(url)
    debug_log("fetch_blog_content ํจ์ ์๋ฃ")
    return scraped
|
294 |
|
295 |
# --- Custom CSS ---
|
296 |
custom_css = """
|
297 |
/* ์ ์ฒด ์ปจํ
์ด๋ ์คํ์ผ */
|
298 |
.gradio-container {
|
299 |
+
max-width: 960px;
|
300 |
margin: auto;
|
301 |
font-family: 'Helvetica Neue', Arial, sans-serif;
|
302 |
background: #f5f7fa;
|
|
|
330 |
padding: 0.6rem 1.2rem;
|
331 |
font-size: 1rem;
|
332 |
cursor: pointer;
|
|
|
333 |
}
|
|
|
|
|
|
|
|
|
334 |
|
335 |
/* ์ฒดํฌ๋ฐ์ค ์คํ์ผ */
|
336 |
.custom-checkbox {
|
|
|
351 |
"""
|
352 |
|
353 |
# --- Gradio interface layout ---
|
354 |
+
# NOTE(review): the nesting below is reconstructed from component order and the
# inline comments because the source text carries no indentation — confirm.
# "\x85" stands for the line-separator character embedded in some literals.
with gr.Blocks(title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ํํ์ ๋ถ์ ์๋น์ค", css=custom_css) as demo:
    gr.HTML("<div class='custom-header'>๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ํํ์ ๋ถ์ ์๋น์ค</div>")

    # Blog link and scrape button share one group (button centred).
    with gr.Group(elem_classes="custom-group"):
        with gr.Row():
            blog_url_input = gr.Textbox(
                label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ",
                placeholder="์: https://blog.naver.com/ssboost/222983068507",
                lines=1,
            )
        with gr.Row(elem_classes="centered"):
            scrape_button = gr.Button("์คํฌ๋ํ ์คํ", elem_classes="custom-button")

    # Scraped text lands here and is passed on to the analysis callback.
    with gr.Group(elem_classes="custom-group"):
        blog_content_box = gr.Textbox(
            label="๋ธ๋ก๊ทธ ๋ด์ฉ (์์ ๊ฐ๋ฅ)",
            lines=10,
            placeholder="์คํฌ๋ํ๋ ๋ธ๋ก๊ทธ ๋ด์ฉ์ด ์ฌ๊ธฐ์ ํ์๋ฉ๋๋ค.",
        )

    # Analysis options.
    with gr.Group(elem_classes="custom-group"):
        with gr.Row():
            remove_freq_checkbox = gr.Checkbox(
                label="๋น๋์1 ์ ๊ฑฐ",
                value=True,
                elem_classes="custom-checkbox",
            )
        with gr.Row():
            # Direct-keywords-only checkbox sits below the frequency filter.
            direct_keyword_only_checkbox = gr.Checkbox(
                label="์ง์ ํค์๋ ์\x85๋ ฅ๋ง ๋ถ์",
                value=False,
                elem_classes="custom-checkbox",
            )
        with gr.Row():
            direct_keyword_box = gr.Textbox(
                label="์ง์ ํค์๋ ์\x85๋ ฅ (์ํฐ ๋๋ ','๋ก ๊ตฌ๋ถ)",
                lines=2,
                placeholder="์: ํค์๋1, ํค์๋2\nํค์๋3",
            )

    with gr.Group(elem_classes="custom-group"):
        with gr.Row(elem_classes="centered"):
            analyze_button = gr.Button("๋ถ์ ์คํ", elem_classes="custom-button")

    # Outputs: result table and downloadable Excel export.
    with gr.Group(elem_classes="custom-group custom-result"):
        result_df = gr.Dataframe(
            label="ํตํฉ ๋ถ์ ๊ฒฐ๊ณผ (๋จ์ด, ๋น๋์, ๊ฒ์๋, ๋ธ๋ก๊ทธ๋ฌธ์์, ์ง์ ์\x85๋ ฅ)",
            interactive=True,
        )
    with gr.Group(elem_classes="custom-group"):
        excel_file = gr.File(label="Excel ๋ค์ด๋ก๋")

    # Event wiring.
    scrape_button.click(
        fn=fetch_blog_content,
        inputs=blog_url_input,
        outputs=blog_content_box,
    )
    analyze_button.click(
        fn=analysis_handler,
        inputs=[
            blog_content_box,
            remove_freq_checkbox,
            direct_keyword_box,
            direct_keyword_only_checkbox,
        ],
        outputs=[result_df, excel_file],
    )

if __name__ == "__main__":
    # Log immediately before launching and again once launch() returns.
    debug_log("Gradio ์ฑ ์คํ ์์")
    demo.launch()
    debug_log("Gradio ์ฑ ์คํ ์ข\x85๋ฃ")
|