Update app.py

app.py CHANGED
@@ -1,21 +1,29 @@
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
-import urllib.parse
 import re
 import logging
 import tempfile
 import pandas as pd
-import mecab
 import os
 import time
 import hmac
 import hashlib
 import base64

 # Debug logging helper
 def debug_log(message: str):
-    print(f"[DEBUG] {message}")

 # --- Naver blog scraping ---
 def scrape_naver_blog(url: str) -> str:
@@ -29,7 +37,7 @@ def scrape_naver_blog(url: str) -> str:
         )
     }
     try:
-        response = requests.get(url, headers=headers)
         debug_log("HTTP GET request (main page) complete")
         if response.status_code != 200:
             debug_log(f"Request failed, status code: {response.status_code}")
@@ -39,63 +47,127 @@ def scrape_naver_blog(url: str) -> str:
         iframe = soup.select_one("iframe#mainFrame")
         if not iframe:
             debug_log("Could not find the iframe#mainFrame tag.")
-            …
         iframe_src = iframe.get("src")
         if not iframe_src:
             debug_log("iframe src does not exist.")
             return "Could not find the src of the content iframe."
-        …
         debug_log(f"iframe page request URL: {parsed_iframe_url}")
-        iframe_response = requests.get(parsed_iframe_url, headers=headers)
         debug_log("HTTP GET request (iframe page) complete")
         if iframe_response.status_code != 200:
             debug_log(f"iframe request failed, status code: {iframe_response.status_code}")
             return f"An error occurred in the iframe. Status code: {iframe_response.status_code}"
         iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
         debug_log("HTML parsing (iframe page) complete")
-        …
         debug_log(f"Extracted title: {title}")
-        …
-        content …
         debug_log("Content extraction complete")
         result = f"[Title]\n{title}\n\n[Content]\n{content}"
         debug_log("Title and content merged")
         return result
     except Exception as e:
-        debug_log(f"Error occurred: {str(e)}")
         return f"An error occurred during scraping: {str(e)}"

 # --- Morphological analysis (reference code 1) ---
 def analyze_text(text: str):
-    logging.basicConfig(level=logging.…
     logger = logging.getLogger(__name__)
-    logger.debug("Original text: %s", text)
-    filtered_text = re.sub(r'[^가-힣]', '', text)
-    logger.debug("Filtered text: %s", filtered_text)
-    if not filtered_text:
-        logger.…
         return pd.DataFrame(columns=["Word", "Frequency"]), ""
-    …
-    logger.debug("Morphological analysis result: %s", tokens)
     freq = {}
     for word, pos in tokens:
-        …
         freq[word] = freq.get(word, 0) + 1
-        logger.debug("Word: %s, POS: %s, frequency: %d", word, pos, freq[word])
     sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
-    logger.debug("Sorted word frequencies: %s", sorted_freq)
     df = pd.DataFrame(sorted_freq, columns=["Word", "Frequency"])
-    logger.…
-    …

 # --- Naver search and ad API (reference code 2) ---
 def generate_signature(timestamp, method, uri, secret_key):
@@ -114,189 +186,489 @@ def get_header(method, uri, api_key, secret_key, customer_id):
         "X-Signature": signature
     }

 def fetch_related_keywords(keyword):
-    debug_log(f"fetch_related_keywords …
-    API_KEY = …
-    SECRET_KEY = …
-    CUSTOMER_ID = …
     BASE_URL = "https://api.naver.com"
     uri = "/keywordstool"
     method = "GET"
-    …
-    return …
-    …
 def fetch_blog_count(keyword):
     debug_log(f"fetch_blog_count called, keyword: {keyword}")
-    client_id = …
-    client_secret = …
     url = "https://openapi.naver.com/v1/search/blog.json"
     headers = {
         "X-Naver-Client-Id": client_id,
         "X-Naver-Client-Secret": client_secret
     }
-    params = {"query": keyword, "display": 1}
-    …
     data = response.json()
-    …

 def create_excel_file(df):
-    …

 def process_keyword(keywords: str, include_related: bool):
-    debug_log(f"process_keyword …
-    …
     else:
-        result_df …
-        …
     result_df.sort_values(by="TotalMonthlySearches", ascending=False, inplace=True)
-    debug_log("process_keyword …
     return result_df, create_excel_file(result_df)
 # --- Merge morphological analysis with search volume / blog doc count ---
 def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich function start")
-    df_freq, _ = analyze_text(text)
     if df_freq.empty:
         debug_log("Morphological analysis result is an empty DataFrame.")
-        return …
     if remove_freq1:
-        …
-        df_freq = df_freq[df_freq["Frequency"] …
-        debug_log(f"Frequency-1 removal applied. {…
-        …
     merged_excel_path = create_excel_file(merged_df)
     debug_log("morphological_analysis_and_enrich function complete")
     return merged_df, merged_excel_path

 # --- Direct keyword analysis (standalone) ---
 def direct_keyword_analysis(text: str, keyword_input: str):
     debug_log("direct_keyword_analysis function start")
-    …
     debug_log("direct_keyword_analysis function complete")
-    return …

 # --- Combined analysis (morphological analysis + direct keyword analysis) ---
 def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
     debug_log("combined_analysis function start")
-    …
-    for …
-    …
     debug_log("combined_analysis function complete")
-    return …
 # --- Analysis handler ---
 def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
-    debug_log("analysis_handler function …
     if direct_keyword_only:
         # Run the standalone analysis when "analyze direct keyword input only" is selected
-        …
     else:
         # Run the default combined analysis
-        …

 # --- Run scraping ---
 def fetch_blog_content(url: str):
     debug_log("fetch_blog_content function start")
     content = scrape_naver_blog(url)
-    …
     return content

 # --- Custom CSS ---
 custom_css = """
 /* Overall container style */
 .gradio-container {
-    max-width: …
     margin: auto;
     font-family: 'Helvetica Neue', Arial, sans-serif;
     background: #f5f7fa;
@@ -330,7 +702,12 @@ custom_css = """
     padding: 0.6rem 1.2rem;
     font-size: 1rem;
     cursor: pointer;
 }

 /* Checkbox style */
 .custom-checkbox {
@@ -351,39 +728,97 @@ custom_css = """
 """

 # --- Gradio interface layout ---
-with gr.Blocks(title="Naver Blog …
-    gr.HTML("<div class='custom-header'>Naver Blog …
-    # Place the blog link and the scrape button in one group (button centered)
-    with gr.Group(elem_classes="custom-group"):
-        with gr.Row():
-            blog_url_input = gr.Textbox(label="Naver blog link", placeholder="e.g. https://blog.naver.com/ssboost/222983068507", lines=1)
-        with gr.Row(elem_classes="centered"):
-            scrape_button = gr.Button("Run scraping", elem_classes="custom-button")
-    with gr.Group(elem_classes="custom-group"):
-        blog_content_box = gr.Textbox(label="Blog content (editable)", lines=10, placeholder="The scraped blog content will appear here.")
-    with gr.Group(elem_classes="custom-group"):
-        with gr.Row():
-            remove_freq_checkbox = gr.Checkbox(label="Remove frequency-1 words", value=True, elem_classes="custom-checkbox")
-        with gr.Row():
-            # Place the "analyze direct keyword input only" checkbox below "remove frequency-1 words"
-            direct_keyword_only_checkbox = gr.Checkbox(label="Analyze direct keyword input only", value=False, elem_classes="custom-checkbox")
-        with gr.Row():
-            direct_keyword_box = gr.Textbox(label="Direct keyword input (separate with Enter or ',')", lines=2, placeholder="e.g. keyword1, keyword2\nkeyword3")
-    with gr.Group(elem_classes="custom-group"):
-        with gr.Row(elem_classes="centered"):
-            analyze_button = gr.Button("Run analysis", elem_classes="custom-button")
-    with gr.Group(elem_classes="custom-group custom-result"):
-        result_df = gr.Dataframe(label="Combined analysis results (word, frequency, search volume, blog doc count, direct input)", interactive=True)
-    with gr.Group(elem_classes="custom-group"):
-        excel_file = gr.File(label="Excel download")

     # Event wiring
     scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
-    analyze_button.click(
-        …

 if __name__ == "__main__":
     debug_log("Gradio app launch starting")
-    demo.launch()
-    debug_log("Gradio app run ended")
 import gradio as gr
 import requests
 from bs4 import BeautifulSoup
+import urllib.parse  # module for fixing iframe paths
 import re
 import logging
 import tempfile
 import pandas as pd
+import mecab  # uses the python-mecab-ko library
 import os
 import time
 import hmac
 import hashlib
 import base64
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# --- Parallel processing settings ---
+# Adjust appropriately to match your API call limits.
+# Values that are too high may run into API rate limits.
+MAX_WORKERS_RELATED_KEYWORDS = 5   # number of parallel workers for fetch_related_keywords
+MAX_WORKERS_BLOG_COUNT = 10        # number of parallel workers for fetch_blog_count
+

 # Debug logging helper
 def debug_log(message: str):
+    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] [DEBUG] {message}")

 # --- Naver blog scraping ---
 def scrape_naver_blog(url: str) -> str:
 …
         )
     }
     try:
+        response = requests.get(url, headers=headers, timeout=10)
         debug_log("HTTP GET request (main page) complete")
         if response.status_code != 200:
             debug_log(f"Request failed, status code: {response.status_code}")
 …
         iframe = soup.select_one("iframe#mainFrame")
         if not iframe:
             debug_log("Could not find the iframe#mainFrame tag.")
+            # Some blogs may have no mainFrame; try extracting the content directly
+            content_div_direct = soup.select_one('.se-main-container')
+            if content_div_direct:
+                title_div_direct = soup.select_one('.se-module.se-module-text.se-title-text')
+                title = title_div_direct.get_text(strip=True) if title_div_direct else "Could not find the title."
+                content = content_div_direct.get_text("\n", strip=True)
+                debug_log("Direct content extraction without an iframe complete")
+                return f"[Title]\n{title}\n\n[Content]\n{content}"
+            return "Could not find the content iframe. (Direct content extraction failed.)"
+
         iframe_src = iframe.get("src")
         if not iframe_src:
             debug_log("iframe src does not exist.")
             return "Could not find the src of the content iframe."
+
+        # Handle the case where iframe_src is not an absolute URL
+        if iframe_src.startswith("//"):
+            parsed_iframe_url = "https:" + iframe_src
+        elif iframe_src.startswith("/"):
+            parsed_main_url = urllib.parse.urlparse(url)
+            parsed_iframe_url = urllib.parse.urlunparse(
+                (parsed_main_url.scheme, parsed_main_url.netloc, iframe_src, None, None, None)
+            )
+        else:
+            parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
+
         debug_log(f"iframe page request URL: {parsed_iframe_url}")
+        iframe_response = requests.get(parsed_iframe_url, headers=headers, timeout=10)
         debug_log("HTTP GET request (iframe page) complete")
         if iframe_response.status_code != 200:
             debug_log(f"iframe request failed, status code: {iframe_response.status_code}")
             return f"An error occurred in the iframe. Status code: {iframe_response.status_code}"
         iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
         debug_log("HTML parsing (iframe page) complete")
+
+        # Title extraction (try several structures)
+        title_selectors = [
+            '.se-module.se-module-text.se-title-text',  # common SmartEditor ONE layout
+            '.title_text',                              # older editor or other layouts
+            'div[class*="title"] h3',
+            'h1', 'h2', 'h3'                            # generic heading tags
+        ]
+        title = "Could not find the title."
+        for selector in title_selectors:
+            title_div = iframe_soup.select_one(selector)
+            if title_div:
+                title = title_div.get_text(strip=True)
+                break
         debug_log(f"Extracted title: {title}")
+
+        # Content extraction (try several structures)
+        content_selectors = [
+            '.se-main-container',  # SmartEditor ONE
+            'div#content',         # older editor
+            'div.post_ct',         # some blog layouts
+            'article', 'main'      # semantic tags
+        ]
+        content = "Could not find the content."
+        for selector in content_selectors:
+            content_div = iframe_soup.select_one(selector)
+            if content_div:
+                # Remove unneeded script and style tags
+                for s in content_div(['script', 'style']):
+                    s.decompose()
+                content = content_div.get_text("\n", strip=True)
+                break
+
         debug_log("Content extraction complete")
         result = f"[Title]\n{title}\n\n[Content]\n{content}"
         debug_log("Title and content merged")
         return result
+    except requests.exceptions.Timeout:
+        debug_log(f"Request timed out: {url}")
+        return f"A timeout occurred during scraping: {url}"
     except Exception as e:
+        debug_log(f"Scraping error occurred: {str(e)}")
         return f"An error occurred during scraping: {str(e)}"

 # --- Morphological analysis (reference code 1) ---
 def analyze_text(text: str):
+    logging.basicConfig(level=logging.INFO)  # changed to INFO level to prevent overly verbose logging
     logger = logging.getLogger(__name__)
+    # logger.debug("Original text: %s", text)  # can be very long, so commented out
+    filtered_text = re.sub(r'[^가-힣a-zA-Z0-9\s]', '', text)  # keep Korean, English letters, digits, whitespace
+    # logger.debug("Filtered text: %s", filtered_text)
+    if not filtered_text.strip():
+        logger.info("No valid text left (after filtering).")
+        return pd.DataFrame(columns=["Word", "Frequency"]), ""
+    try:
+        mecab_instance = mecab.MeCab()
+        tokens = mecab_instance.pos(filtered_text)
+    except Exception as e:
+        logger.error(f"Error during MeCab morphological analysis: {e}")
         return pd.DataFrame(columns=["Word", "Frequency"]), ""
+
+    # logger.debug("Morphological analysis result: %s", tokens)
     freq = {}
     for word, pos in tokens:
+        # Include common nouns (NNG), proper nouns (NNP), foreign words (SL), numbers (SN), etc.; exclude single-character words (an optional choice)
+        if word and word.strip() and (pos.startswith("NN") or pos in ["SL", "SH"]) and len(word) > 1:
             freq[word] = freq.get(word, 0) + 1
+            # logger.debug("Word: %s, POS: %s, frequency: %d", word, pos, freq[word])
+
     sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
+    # logger.debug("Sorted word frequencies: %s", sorted_freq)
     df = pd.DataFrame(sorted_freq, columns=["Word", "Frequency"])
+    logger.info(f"Morphological analysis DataFrame created, shape: {df.shape}")
+
+    temp_file_path = ""
+    if not df.empty:
+        try:
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx", mode='w+b') as temp_file:
+                df.to_excel(temp_file.name, index=False, engine='openpyxl')
+                temp_file_path = temp_file.name
+                logger.info(f"Excel file created: {temp_file_path}")
+        except Exception as e:
+            logger.error(f"Error while saving the Excel file: {e}")
+            temp_file_path = ""  # reset the path on error
+
+    return df, temp_file_path
+

 # --- Naver search and ad API (reference code 2) ---
 def generate_signature(timestamp, method, uri, secret_key):
 …
         "X-Signature": signature
     }

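Note: the bodies of generate_signature and get_header are collapsed in this diff view. As a reference only, a minimal sketch of the HMAC-SHA256 signing scheme the Naver Search Ad API conventionally expects, inferred from the hmac/hashlib/base64 imports and the surviving "X-Signature" context above (an assumption, not code taken from this commit):

def generate_signature(timestamp, method, uri, secret_key):
    # Sign "{timestamp}.{method}.{uri}" with the secret key and base64-encode the digest
    message = f"{timestamp}.{method}.{uri}"
    digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
    return base64.b64encode(digest).decode("utf-8")

def get_header(method, uri, api_key, secret_key, customer_id):
    timestamp = str(round(time.time() * 1000))  # millisecond timestamp
    signature = generate_signature(timestamp, method, uri, secret_key)
    return {
        "Content-Type": "application/json; charset=UTF-8",
        "X-Timestamp": timestamp,
        "X-API-KEY": api_key,
        "X-Customer": str(customer_id),
        "X-Signature": signature
    }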
+# Helper to check API-key environment variables
+def get_env_variable(var_name):
+    value = os.environ.get(var_name)
+    if value is None:
+        debug_log(f"Environment variable '{var_name}' is not set. API calls may fail.")
+        # If needed, raise an exception here or return a default value
+    return value
+
 def fetch_related_keywords(keyword):
+    debug_log(f"fetch_related_keywords call started, keyword: {keyword}")
+    API_KEY = get_env_variable("NAVER_API_KEY")
+    SECRET_KEY = get_env_variable("NAVER_SECRET_KEY")
+    CUSTOMER_ID = get_env_variable("NAVER_CUSTOMER_ID")
+
+    if not all([API_KEY, SECRET_KEY, CUSTOMER_ID]):
+        debug_log(f"Skipping the related-keyword lookup for '{keyword}' because Naver ad API credentials are missing.")
+        return pd.DataFrame()
+
     BASE_URL = "https://api.naver.com"
     uri = "/keywordstool"
     method = "GET"
+
+    try:
+        headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
+        params = {
+            "hintKeywords": keyword,  # passed as a single keyword string
+            "showDetail": "1"
+        }
+        # hintKeywords can accept a list, but here we assume single-keyword handling and pass a string.
+        # If the API only accepts a list for hintKeywords, change this to [keyword].
+
+        response = requests.get(BASE_URL + uri, params=params, headers=headers, timeout=10)
+        response.raise_for_status()  # raise an exception on error
+        data = response.json()
+
+        if "keywordList" not in data or not data["keywordList"]:
+            debug_log(f"No related-keyword results for '{keyword}'.")
+            return pd.DataFrame()  # return an empty DataFrame
+
+        df = pd.DataFrame(data["keywordList"])
+
+        # Guard against columns missing from the API response
+        df["monthlyPcQcCnt"] = df.get("monthlyPcQcCnt", 0)
+        df["monthlyMobileQcCnt"] = df.get("monthlyMobileQcCnt", 0)
+
+        def parse_count(x):
+            if pd.isna(x) or str(x).lower() == '< 10':  # the Naver API returns "< 10" for counts below 10
+                return 5  # or 0, or another representative value (e.g. 5)
+            try:
+                return int(str(x).replace(",", ""))
+            except ValueError:
+                return 0
+
+        df["MonthlyPCSearches"] = df["monthlyPcQcCnt"].apply(parse_count)
+        df["MonthlyMobileSearches"] = df["monthlyMobileQcCnt"].apply(parse_count)
+        df["TotalMonthlySearches"] = df["MonthlyPCSearches"] + df["MonthlyMobileSearches"]
+        df.rename(columns={"relKeyword": "InfoKeyword"}, inplace=True)
+
+        # Select only the needed columns; guard against missing ones
+        required_cols = ["InfoKeyword", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches"]
+        result_df = pd.DataFrame(columns=required_cols)
+        for col in required_cols:
+            if col in df.columns:
+                result_df[col] = df[col]
+            else:  # if the column is missing from the API response, fill it with a default
+                if col == "InfoKeyword":  # InfoKeyword is mandatory
+                    debug_log(f"The API response has no 'relKeyword'. Aborting handling of '{keyword}'.")
+                    return pd.DataFrame()
+                result_df[col] = 0
+
+        debug_log(f"fetch_related_keywords '{keyword}' complete, {len(result_df)} results")
+        return result_df.head(100)  # cap at 100 rows
+
+    except requests.exceptions.HTTPError as http_err:
+        debug_log(f"HTTP error (fetch_related_keywords for '{keyword}'): {http_err} - response: {response.text if 'response' in locals() else 'N/A'}")
+    except requests.exceptions.RequestException as req_err:
+        debug_log(f"Request error (fetch_related_keywords for '{keyword}'): {req_err}")
+    except Exception as e:
+        debug_log(f"Unknown error (fetch_related_keywords for '{keyword}'): {e}")
+    return pd.DataFrame()  # return an empty DataFrame on error
+

 def fetch_blog_count(keyword):
     debug_log(f"fetch_blog_count called, keyword: {keyword}")
+    client_id = get_env_variable("NAVER_SEARCH_CLIENT_ID")
+    client_secret = get_env_variable("NAVER_SEARCH_CLIENT_SECRET")
+
+    if not client_id or not client_secret:
+        debug_log(f"Skipping the blog-count lookup for '{keyword}' because Naver search API credentials are missing.")
+        return 0
+
     url = "https://openapi.naver.com/v1/search/blog.json"
     headers = {
         "X-Naver-Client-Id": client_id,
         "X-Naver-Client-Secret": client_secret
     }
+    params = {"query": keyword, "display": 1}  # display=1 so only the total count is fetched, quickly
+
+    try:
+        response = requests.get(url, headers=headers, params=params, timeout=5)
+        response.raise_for_status()  # raise an exception on HTTP errors
         data = response.json()
+        total_count = data.get("total", 0)
+        debug_log(f"fetch_blog_count result: {total_count} for '{keyword}'")
+        return total_count
+    except requests.exceptions.HTTPError as http_err:
+        debug_log(f"HTTP error (fetch_blog_count for '{keyword}'): {http_err} - response: {response.text}")
+    except requests.exceptions.RequestException as req_err:  # Timeout, ConnectionError, etc.
+        debug_log(f"Request error (fetch_blog_count for '{keyword}'): {req_err}")
+    except Exception as e:  # JSONDecodeError and other exceptions
+        debug_log(f"Unknown error (fetch_blog_count for '{keyword}'): {e}")
+    return 0  # return 0 on error

 def create_excel_file(df):
+    if df.empty:
+        debug_log("Not creating an Excel file from an empty DataFrame.")
+        # We could create an empty file, or return None and let Gradio handle it.
+        # Here we create and return an empty temp file (the Gradio File component expects a path).
+        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
+            excel_path = tmp.name
+        # To write at least the headers into the empty Excel file:
+        # pd.DataFrame(columns=df.columns).to_excel(excel_path, index=False)
+        # Otherwise just return the empty file.
+        return excel_path
+
+    try:
+        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False, mode='w+b') as tmp:
+            excel_path = tmp.name
+            df.to_excel(excel_path, index=False, engine='openpyxl')
+            debug_log(f"Excel file created: {excel_path}")
+        return excel_path
+    except Exception as e:
+        debug_log(f"Error while creating the Excel file: {e}")
+        # On error, still return an (empty) file path for Gradio compatibility
+        with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
+            return tmp.name
+

 def process_keyword(keywords: str, include_related: bool):
+    debug_log(f"process_keyword call started, keywords: '{keywords[:100]}...', include related keywords: {include_related}")
+    input_keywords_orig = [k.strip() for k in keywords.splitlines() if k.strip()]
+
+    if not input_keywords_orig:
+        debug_log("No keywords were entered.")
+        return pd.DataFrame(columns=["InfoKeyword", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]), ""
+
+    all_related_keywords_dfs = []
+
+    # 1. Run fetch_related_keywords in parallel
+    debug_log(f"Starting parallel related-keyword lookups (max workers: {MAX_WORKERS_RELATED_KEYWORDS})")
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS_RELATED_KEYWORDS) as executor:
+        future_to_keyword_related = {
+            executor.submit(fetch_related_keywords, kw): kw for kw in input_keywords_orig
+        }
+        for i, future in enumerate(as_completed(future_to_keyword_related)):
+            kw = future_to_keyword_related[future]
+            try:
+                df_kw_related = future.result()  # returns a DataFrame
+                if not df_kw_related.empty:
+                    # Check whether the original keyword is in the results and try to add it if not
+                    # (the API does not always return the keyword itself as relKeyword).
+                    # Since fetch_related_keywords already searches based on the hint keyword,
+                    # usually either the keyword's own info is missing or only related keywords are listed.
+                    # Here the API response is used as-is.
+
+                    # For the first input keyword, when the include-related option is on, add all related keywords;
+                    # otherwise use only the keyword's own row (if present) or the top row.
+                    if include_related and kw == input_keywords_orig[0]:
+                        all_related_keywords_dfs.append(df_kw_related)
+                        debug_log(f"All related keywords for the first keyword '{kw}' added ({len(df_kw_related)} rows).")
+                    else:
+                        # Find the row matching this keyword, or fall back to the first row the API returned
+                        row_kw = df_kw_related[df_kw_related["InfoKeyword"] == kw]
+                        if not row_kw.empty:
+                            all_related_keywords_dfs.append(row_kw)
+                            debug_log(f"Direct info for keyword '{kw}' added.")
+                        elif not df_kw_related.empty:  # no direct info, but related keywords exist
+                            all_related_keywords_dfs.append(df_kw_related.head(1))  # add the most closely related keyword
+                            debug_log(f"No direct info for keyword '{kw}'; added the single most related keyword.")
+                        # else: neither keyword info nor related info (df_kw_related is empty)
+
+                debug_log(f"Related-keyword handling for '{kw}' complete ({i+1}/{len(input_keywords_orig)})")
+            except Exception as e:
+                debug_log(f"Parallel-task error while looking up related keywords for '{kw}': {e}")
+
+    if not all_related_keywords_dfs:
+        debug_log("All related-keyword lookups returned empty results.")
+        # Add the blog-document-count column to the empty DataFrame
+        empty_df = pd.DataFrame(columns=["InfoKeyword", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches"])
+        empty_df["BlogDocCount"] = None
+        return empty_df, create_excel_file(empty_df)
+
+    result_df = pd.concat(all_related_keywords_dfs, ignore_index=True)
+    result_df.drop_duplicates(subset=["InfoKeyword"], inplace=True)  # remove duplicates
+    debug_log(f"Parallel related-keyword phase complete. Combined DataFrame shape: {result_df.shape}")
+
+    # 2. Run fetch_blog_count in parallel
+    keywords_for_blog_count = result_df["InfoKeyword"].dropna().unique().tolist()
+    blog_counts_map = {}
+
+    if keywords_for_blog_count:
+        debug_log(f"Starting parallel blog-count lookups ({len(keywords_for_blog_count)} keywords, max workers: {MAX_WORKERS_BLOG_COUNT})")
+        with ThreadPoolExecutor(max_workers=MAX_WORKERS_BLOG_COUNT) as executor:
+            future_to_keyword_blog = {
+                executor.submit(fetch_blog_count, kw): kw for kw in keywords_for_blog_count
+            }
+            for i, future in enumerate(as_completed(future_to_keyword_blog)):
+                kw = future_to_keyword_blog[future]
+                try:
+                    count = future.result()  # returns a number
+                    blog_counts_map[kw] = count
+                    if (i+1) % 50 == 0:  # avoid flooding the log
+                        debug_log(f"Blog-count lookups in progress... ({i+1}/{len(keywords_for_blog_count)})")
+                except Exception as e:
+                    debug_log(f"Parallel-task error while looking up the blog count for '{kw}': {e}")
+                    blog_counts_map[kw] = 0  # treat as 0 on error
+
+        result_df["BlogDocCount"] = result_df["InfoKeyword"].map(blog_counts_map).fillna(0).astype(int)
+        debug_log("Parallel blog-count phase complete.")
     else:
+        result_df["BlogDocCount"] = 0  # fill with 0 if there were no keywords to look up
+
     result_df.sort_values(by="TotalMonthlySearches", ascending=False, inplace=True)
+    debug_log(f"process_keyword finished. DataFrame shape: {result_df.shape}")
+
+    # Fix the final column order and make sure the columns exist
+    final_columns = ["InfoKeyword", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]
+    for col in final_columns:
+        if col not in result_df.columns:
+            result_df[col] = 0 if col != "InfoKeyword" else ""  # fill missing columns with defaults
+
+    result_df = result_df[final_columns]  # fix the column order
+
     return result_df, create_excel_file(result_df)

+
 # --- Merge morphological analysis with search volume / blog doc count ---
 def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich function start")
+    df_freq, _ = analyze_text(text)  # the temp-file path is not used here
+
     if df_freq.empty:
         debug_log("Morphological analysis result is an empty DataFrame.")
+        return pd.DataFrame(columns=["Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]), ""
+
     if remove_freq1:
+        before_count = len(df_freq)
+        df_freq = df_freq[df_freq["Frequency"] > 1].copy()  # .copy() added
+        debug_log(f"Frequency-1 removal applied. {before_count} -> {len(df_freq)}")
+
+    if df_freq.empty:
+        debug_log("No data left after removing frequency-1 words.")
+        return pd.DataFrame(columns=["Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]), ""
+
+    keywords_from_morph = "\n".join(df_freq["Word"].tolist())
+    debug_log(f"Starting info lookups for the {len(df_freq['Word'])} keywords from morphological analysis")
+
+    # Call process_keyword without related keywords (include_related=False)
+    df_keyword_info, _ = process_keyword(keywords_from_morph, include_related=False)
+    debug_log("Search-volume and blog-count lookups for the morphological-analysis keywords complete")
+
+    if df_keyword_info.empty:
+        debug_log("No API info results for the morphological-analysis keywords.")
+        # Add empty columns to df_freq
+        for col in ["MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]:
+            df_freq[col] = None
+        merged_df = df_freq
+    else:
+        merged_df = pd.merge(df_freq, df_keyword_info, left_on="Word", right_on="InfoKeyword", how="left")
+        if "InfoKeyword" in merged_df.columns:  # drop the InfoKeyword column created by the merge
+            merged_df.drop(columns=["InfoKeyword"], inplace=True, errors='ignore')
+
+    # Fill missing columns with defaults
+    expected_cols = ["Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]
+    for col in expected_cols:
+        if col not in merged_df.columns:
+            merged_df[col] = None if col not in ["Frequency"] else 0
+
+    merged_df = merged_df[expected_cols]  # fix the column order
+
     merged_excel_path = create_excel_file(merged_df)
     debug_log("morphological_analysis_and_enrich function complete")
     return merged_df, merged_excel_path

+
 # --- Direct keyword analysis (standalone) ---
 def direct_keyword_analysis(text: str, keyword_input: str):
     debug_log("direct_keyword_analysis function start")
+    direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', keyword_input) if kw.strip()]
+    debug_log(f"Direct keywords entered: {direct_keywords_list}")
+
+    if not direct_keywords_list:
+        debug_log("No direct keywords were entered.")
+        return pd.DataFrame(columns=["Keyword", "Frequency"]), ""
+
+    # 1. Count frequencies in the text
+    results_freq = []
+    for kw in direct_keywords_list:
+        count = text.count(kw)  # case-sensitive, exact substring count
+        results_freq.append({"Keyword": kw, "Frequency": count})
+        debug_log(f"Frequency of direct keyword '{kw}' in the text: {count}")
+    df_direct_freq = pd.DataFrame(results_freq)
+
+    # 2. Fetch search volume and blog counts via the API (process_keyword handles the parallelism)
+    # Only per-keyword info is needed here, so include_related=False
+    keywords_for_api = "\n".join(direct_keywords_list)
+    df_direct_api_info, _ = process_keyword(keywords_for_api, include_related=False)
+
+    # 3. Merge the frequency results with the API results
+    if not df_direct_api_info.empty:
+        # Rename 'InfoKeyword' to 'Keyword' so both frames share the merge key
+        df_direct_api_info.rename(columns={"InfoKeyword": "Keyword"}, inplace=True)
+        merged_df = pd.merge(df_direct_freq, df_direct_api_info, on="Keyword", how="left")
+    else:
+        merged_df = df_direct_freq
+        for col in ["MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]:
+            merged_df[col] = None  # add empty columns when no API info is available
+
+    # Tidy the column order and defaults
+    final_cols = ["Keyword", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]
+    for col in final_cols:
+        if col not in merged_df.columns:
+            merged_df[col] = 0 if col != "Keyword" else ""
+    merged_df = merged_df[final_cols]
+
+    excel_path = create_excel_file(merged_df)
     debug_log("direct_keyword_analysis function complete")
+    return merged_df, excel_path
+

 # --- Combined analysis (morphological analysis + direct keyword analysis) ---
 def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
     debug_log("combined_analysis function start")
+
+    # 1. Morphological-analysis-based results (with API info)
+    df_morph, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
+    # df_morph columns: "Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"
+
+    # 2. Handle the directly entered keywords
+    direct_keywords_list = [kw.strip() for kw in re.split(r'[\n,]+', direct_keyword_input) if kw.strip()]
+    debug_log(f"Combined analysis - direct keywords entered: {direct_keywords_list}")
+
+    if not direct_keywords_list:  # no direct keywords: return only the morphological-analysis results
+        if "DirectInput" not in df_morph.columns and not df_morph.empty:
+            df_morph["DirectInput"] = ""  # add the DirectInput column
+        # Adjust the column order
+        cols = ["Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount", "DirectInput"]
+        for col in cols:
+            if col not in df_morph.columns:
+                df_morph[col] = "" if col == "DirectInput" else (0 if col != "Word" else "")
+        df_morph = df_morph[cols]
+        return df_morph, create_excel_file(df_morph)
+
+    # Fetch info (frequency, API data) for the directly entered keywords.
+    # direct_keyword_analysis uses a "Keyword" column, so it must be unified with df_morph's "Word".
+    df_direct_raw, _ = direct_keyword_analysis(blog_text, direct_keyword_input)
+    # df_direct_raw columns: "Keyword", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"
+    df_direct_raw.rename(columns={"Keyword": "Word"}, inplace=True)  # unify the column names
+
+    # Mark 'DirectInput' on the morphological-analysis rows
+    if not df_morph.empty:
+        df_morph["DirectInput"] = df_morph["Word"].apply(lambda x: "DirectInput" if x in direct_keywords_list else "")
+    else:  # the morphological-analysis result may be empty
+        df_morph = pd.DataFrame(columns=["Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount", "DirectInput"])
+
+    # Add the directly entered keywords that are missing from the morphological-analysis results.
+    # df_direct_raw has info for every direct keyword.
+
+    # Concatenate df_morph and df_direct_raw, de-duplicating on 'Word'.
+    # First add the 'DirectInput' column to df_direct_raw, filled with "DirectInput".
+    df_direct_raw["DirectInput"] = "DirectInput"
+
+    # For words already in df_morph, prefer df_morph's info (only the DirectInput flag is updated);
+    # from df_direct_raw, add only the words missing from df_morph.
+
+    # Merge: add/update df_direct_raw's info onto df_morph.
+    # On pandas >= 0.25.0, combine_first's overwrite behavior can differ slightly, so merge is worth considering.
+
+    # 1. Updating df_morph's words with df_direct_raw's info (API data etc.):
+    #    decide whether to keep both frequency computations or prefer one side.
+    #    df_morph's frequency (morphology-based) and df_direct_raw's (simple count) can differ.
+    #    For now, keep df_morph as the base and only add the missing direct keywords from df_direct_raw.
+
+    # df_morph's 'DirectInput' column was already handled above.
+    # Now append the keywords that exist only in df_direct_raw.
+
+    # List of words already in df_morph
+    morph_words = df_morph['Word'].tolist() if not df_morph.empty else []
+
+    rows_to_add = []
+    for idx, row in df_direct_raw.iterrows():
+        if row['Word'] not in morph_words:
+            rows_to_add.append(row)
+
+    if rows_to_add:
+        df_to_add = pd.DataFrame(rows_to_add)
+        combined_df = pd.concat([df_morph, df_to_add], ignore_index=True)
+    else:
+        combined_df = df_morph.copy()  # df_morph may also be empty
+
+    # Final column tidy-up and ordering
+    final_cols_combined = ["Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount", "DirectInput"]
+    for col in final_cols_combined:
+        if col not in combined_df.columns:
+            # Defaults: "" for 'DirectInput', 0 or None for the rest (None is fine for API values)
+            if col == "DirectInput":
+                combined_df[col] = ""
+            elif col == "Frequency":
+                combined_df[col] = 0
+            elif col == "Word":
+                combined_df[col] = ""
+            else:  # API-related columns
+                combined_df[col] = None  # pd.NA also works
+
+    # Handle NA values appropriately (e.g. fill with 0 or leave as-is).
+    # API values may be non-numeric (e.g. "< 10"); that is handled in process_keyword, so no int cast here.
+    # The Gradio Dataframe displays None well enough.
+    # The frequency must be an integer
+    if "Frequency" in combined_df.columns:
+        combined_df["Frequency"] = combined_df["Frequency"].fillna(0).astype(int)
+
+    combined_df = combined_df[final_cols_combined].drop_duplicates(subset=['Word'], keep='first')  # just-in-case de-duplication
+    combined_df.sort_values(by=["DirectInput", "Frequency"], ascending=[False, False], inplace=True, na_position='last')  # direct input first, then frequency
+    combined_df.reset_index(drop=True, inplace=True)
+
+    combined_excel = create_excel_file(combined_df)
     debug_log("combined_analysis function complete")
+    return combined_df, combined_excel
+

 # --- Analysis handler ---
 def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
+    debug_log(f"analysis_handler function start. Direct-keywords-only analysis: {direct_keyword_only}")
+    start_time = time.time()
+
+    if not blog_text or blog_text.strip() == "The scraped blog content will appear here." or blog_text.strip() == "":
+        debug_log("There is no blog content to analyze.")
+        # Spell out the DataFrame structure for the empty result
+        empty_cols_direct = ["Keyword", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]
+        empty_cols_combined = ["Word", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount", "DirectInput"]
+        df_empty = pd.DataFrame(columns=empty_cols_direct if direct_keyword_only else empty_cols_combined)
+        return df_empty, create_excel_file(df_empty)
+
     if direct_keyword_only:
         # Run the standalone analysis when "analyze direct keyword input only" is selected
+        if not direct_keyword_input or not direct_keyword_input.strip():
+            debug_log("Direct-keywords-only analysis was selected, but no direct keywords were entered.")
+            empty_cols_direct = ["Keyword", "Frequency", "MonthlyPCSearches", "MonthlyMobileSearches", "TotalMonthlySearches", "BlogDocCount"]
+            df_empty = pd.DataFrame(columns=empty_cols_direct)
+            return df_empty, create_excel_file(df_empty)
+
+        result_df, excel_path = direct_keyword_analysis(blog_text, direct_keyword_input)
     else:
         # Run the default combined analysis
+        result_df, excel_path = combined_analysis(blog_text, remove_freq1, direct_keyword_input)
+
+    end_time = time.time()
+    debug_log(f"analysis_handler total runtime: {end_time - start_time:.2f} s")
+    return result_df, excel_path
+

 # --- Run scraping ---
 def fetch_blog_content(url: str):
     debug_log("fetch_blog_content function start")
+    if not url or not url.strip():
+        return "Please enter a blog URL."
+    if not url.startswith("http://") and not url.startswith("https://"):
+        return "Please enter a valid URL (starting with http:// or https://)."
+
+    start_time = time.time()
     content = scrape_naver_blog(url)
+    end_time = time.time()
+    debug_log(f"fetch_blog_content total runtime: {end_time - start_time:.2f} s. Content length: {len(content)}")
     return content

 # --- Custom CSS ---
 custom_css = """
 /* Overall container style */
 .gradio-container {
+    max-width: 1080px; /* widened */
     margin: auto;
     font-family: 'Helvetica Neue', Arial, sans-serif;
     background: #f5f7fa;
 …
     padding: 0.6rem 1.2rem;
     font-size: 1rem;
     cursor: pointer;
+    min-width: 150px; /* minimum button width */
 }
+.custom-button:hover {
+    background-color: #0056b3;
+}
+

 /* Checkbox style */
 .custom-checkbox {
 …
 """

 # --- Gradio interface layout ---
+with gr.Blocks(title="Naver Blog Keyword Analysis Service", css=custom_css) as demo:
+    gr.HTML("<div class='custom-header'>Naver Blog Keyword Analysis Service</div>")

+    with gr.Row():
+        with gr.Column(scale=2):  # left column (input area)
+            with gr.Group(elem_classes="custom-group"):
+                blog_url_input = gr.Textbox(
+                    label="Naver blog link",
+                    placeholder="e.g. https://blog.naver.com/<id>/<post-number>",
+                    lines=1,
+                    info="Enter the URL of the Naver blog post to analyze."
+                )
+                with gr.Row(elem_classes="centered"):
+                    scrape_button = gr.Button("Fetch blog content", elem_classes="custom-button", variant="primary")
+
+            with gr.Group(elem_classes="custom-group"):
+                blog_content_box = gr.Textbox(
+                    label="Blog content (editable)",
+                    lines=10,
+                    placeholder="The scraped blog content will appear here. You can also edit it or paste your own."
+                )
+
+            with gr.Group(elem_classes="custom-group"):
+                gr.Markdown("### Analysis options")
+                with gr.Row():
+                    remove_freq_checkbox = gr.Checkbox(
+                        label="Remove words with frequency 1 (morphological analysis)",
+                        value=True,
+                        elem_classes="custom-checkbox",
+                        info="Excludes words whose frequency is 1 from the morphological-analysis results."
+                    )
+                with gr.Row():
+                    direct_keyword_only_checkbox = gr.Checkbox(
+                        label="Analyze direct keywords only",
+                        value=False,
+                        elem_classes="custom-checkbox",
+                        info="When selected, only the direct keywords entered below are analyzed (morphological analysis is skipped)."
+                    )
+                with gr.Row():
+                    direct_keyword_box = gr.Textbox(
+                        label="Direct keyword input (separate with Enter or ',')",
+                        lines=3,
+                        placeholder="e.g. keyword1, keyword2\nkeyword3\n...\n(keywords to analyze on their own or to add to the combined analysis)",
+                        info="Enter the keywords to include in the analysis or to analyze on their own."
+                    )
+
+            with gr.Group(elem_classes="custom-group"):
+                with gr.Row(elem_classes="centered"):
+                    analyze_button = gr.Button("Run keyword analysis", elem_classes="custom-button", variant="primary")
+
+        with gr.Column(scale=3):  # right column (results area)
+            with gr.Group(elem_classes="custom-group custom-result"):
+                gr.Markdown("### Analysis results")
+                result_df_display = gr.Dataframe(
+                    label="Combined analysis results (word, frequency, search volume, blog doc count, direct-input flag)",
+                    interactive=False,  # not directly editable by the user
+                    height=600,  # adjusted height
+                    wrap=True  # wrap long text
+                )
+            with gr.Group(elem_classes="custom-group"):
+                gr.Markdown("### Download results")
+                excel_file_display = gr.File(label="Download the analysis results as an Excel file")
+
     # Event wiring
     scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
+    analyze_button.click(
+        fn=analysis_handler,
+        inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
+        outputs=[result_df_display, excel_file_display]
+    )

 if __name__ == "__main__":
+    # Example environment-variable setup (for real runs, set system environment variables or use a .env file)
+    # os.environ["NAVER_API_KEY"] = "YOUR_NAVER_API_KEY"
+    # os.environ["NAVER_SECRET_KEY"] = "YOUR_NAVER_SECRET_KEY"
+    # os.environ["NAVER_CUSTOMER_ID"] = "YOUR_NAVER_CUSTOMER_ID"
+    # os.environ["NAVER_SEARCH_CLIENT_ID"] = "YOUR_NAVER_SEARCH_CLIENT_ID"
+    # os.environ["NAVER_SEARCH_CLIENT_SECRET"] = "YOUR_NAVER_SEARCH_CLIENT_SECRET"
+
+    # Verify the environment variables
+    required_env_vars = [
+        "NAVER_API_KEY", "NAVER_SECRET_KEY", "NAVER_CUSTOMER_ID",
+        "NAVER_SEARCH_CLIENT_ID", "NAVER_SEARCH_CLIENT_SECRET"
+    ]
+    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
+    if missing_vars:
+        debug_log(f"Warning: the following required environment variables are not set - {', '.join(missing_vars)}")
+        debug_log("API-based features may not work correctly.")
+        debug_log("Set these environment variables before running the script.")
+        # The Gradio app still launches, but users are warned that API calls may fail.
+
     debug_log("Gradio app launch starting")
+    demo.launch(debug=True)  # debug=True during development makes errors easier to inspect
+    debug_log("Gradio app run ended")