AIRider committed on
Commit
7b2bf17
·
verified ·
1 Parent(s): 7dc2cb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -160
app.py CHANGED
@@ -2,12 +2,10 @@ import gradio as gr
2
  from gradio_client import Client
3
  import json
4
  import logging
5
- import ast
6
  import openai
7
  import os
8
- import random
9
- import re
10
 
 
11
  logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
12
  format='%(asctime)s - %(levelname)s - %(message)s')
13
 
@@ -23,59 +21,53 @@ def parse_api_response(response):
23
  raise ValueError(f"μ˜ˆμƒμΉ˜ λͺ»ν•œ 응닡 ν˜•μ‹μž…λ‹ˆλ‹€. 받은 데이터 νƒ€μž…: {type(response)}")
24
  return response
25
  except Exception as e:
 
26
  raise ValueError(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")
27
 
28
- def split_sentences(text):
29
- sentences = re.split(r"(λ‹ˆλ‹€|μ—μš”|κ΅¬λ‚˜|ν•΄μš”|κ΅°μš”|κ² μ–΄μš”|μ‹œμ˜€|해라|μ˜ˆμš”|μ•„μš”|λ°μš”|λŒ€μš”|μ„Έμš”|μ–΄μš”|κ²Œμš”|κ΅¬μš”|κ³ μš”|λ‚˜μš”|ν•˜μ£ )(?![\w])", text)
30
- combined_sentences = []
31
- current_sentence = ""
32
- for i in range(0, len(sentences), 2):
33
- if i + 1 < len(sentences):
34
- sentence = sentences[i] + sentences[i + 1]
35
- else:
36
- sentence = sentences[i]
37
- if len(current_sentence) + len(sentence) > 100:
38
- combined_sentences.append(current_sentence.strip())
39
- current_sentence = sentence.strip()
40
- else:
41
- current_sentence += sentence
42
- if sentence.endswith(('.', '?', '!')):
43
- combined_sentences.append(current_sentence.strip())
44
- current_sentence = ""
45
- if current_sentence:
46
- combined_sentences.append(current_sentence.strip())
47
- return combined_sentences
48
-
49
  def get_youtube_script(url):
50
  logging.info(f"슀크립트 μΆ”μΆœ μ‹œμž‘: URL = {url}")
51
-
52
  client = Client("whispersound/YT_Ts_R")
53
-
54
  try:
55
- logging.debug("API 호좜 μ‹œμž‘")
56
  result = client.predict(youtube_url=url, api_name="/predict")
57
- logging.debug("API 호좜 μ™„λ£Œ")
58
-
59
  parsed_result = parse_api_response(result)
60
 
61
  if 'data' not in parsed_result or not parsed_result['data']:
62
  raise ValueError("API 응닡에 μœ νš¨ν•œ 데이터가 μ—†μŠ΅λ‹ˆλ‹€.")
63
 
64
- title = parsed_result["data"][0].get("title", "제λͺ© μ—†μŒ")
65
- transcription_text = parsed_result["data"][0].get("transcriptionAsText", "")
66
- sections = parsed_result["data"][0].get("sections", [])
 
67
 
68
  if not transcription_text:
69
  raise ValueError("μΆ”μΆœλœ μŠ€ν¬λ¦½νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
70
 
71
  logging.info("슀크립트 μΆ”μΆœ μ™„λ£Œ")
72
  return title, transcription_text, sections
73
-
74
  except Exception as e:
75
- error_msg = f"슀크립트 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
76
- logging.exception(error_msg)
77
  raise
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def call_api(prompt, max_tokens, temperature, top_p):
80
  try:
81
  response = openai.ChatCompletion.create(
@@ -90,67 +82,6 @@ def call_api(prompt, max_tokens, temperature, top_p):
90
  logging.exception("LLM API 호좜 쀑 였λ₯˜ λ°œμƒ")
91
  raise
92
 
93
- def summarize_section(section_text):
94
- prompt = f"""
95
- λ‹€μŒ 유튜브 λŒ€λ³Έ μ„Ήμ…˜μ˜ 핡심 λ‚΄μš©μ„ κ°„κ²°ν•˜κ²Œ μš”μ•½ν•˜μ„Έμš”:
96
- 1. ν•œκΈ€λ‘œ μž‘μ„±ν•˜μ„Έμš”.
97
- 2. μ£Όμš” 논점과 μ€‘μš”ν•œ 세뢀사항을 ν¬ν•¨ν•˜μ„Έμš”.
98
- 3. μš”μ•½μ€ 2-3λ¬Έμž₯으둜 μ œν•œν•˜μ„Έμš”.
99
-
100
- μ„Ήμ…˜ λ‚΄μš©:
101
- {section_text}
102
- """
103
- return call_api(prompt, max_tokens=150, temperature=0.3, top_p=0.9)
104
-
105
- def format_time(seconds):
106
- minutes, seconds = divmod(seconds, 60)
107
- hours, minutes = divmod(minutes, 60)
108
- return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
109
-
110
- def generate_timeline_summary(sections):
111
- combined_sections = "\n\n".join([f"{format_time(section['start_time'])}: {section['text']}" for section in sections])
112
-
113
- prompt = f"""
114
- λ‹€μŒμ€ 유튜브 μ˜μƒμ˜ νƒ€μž„λΌμΈκ³Ό 각 μ„Ήμ…˜μ˜ λ‚΄μš©μž…λ‹ˆλ‹€. 이λ₯Ό λ°”νƒ•μœΌλ‘œ νƒ€μž„λΌμΈ μš”μ•½μ„ μƒμ„±ν•΄μ£Όμ„Έμš”:
115
-
116
- 1. 각 μ„Ήμ…˜μ˜ μ‹œμž‘ μ‹œκ°„μ„ μœ μ§€ν•˜λ©΄μ„œ 핡심 λ‚΄μš©μ„ κ°„κ²°ν•˜κ²Œ μš”μ•½ν•˜μ„Έμš”.
117
- 2. μš”μ•½μ€ ν•œκΈ€λ‘œ μž‘μ„±ν•˜μ„Έμš”.
118
- 3. 각 μ„Ήμ…˜μ˜ μš”μ•½μ€ 1-2λ¬Έμž₯으둜 μ œν•œν•˜μ„Έμš”.
119
- 4. 전체 λ§₯락을 κ³ λ €ν•˜μ—¬ μš”μ•½ν•˜λ˜, 각 μ„Ήμ…˜μ˜ κ³ μœ ν•œ λ‚΄μš©μ„ λ†“μΉ˜μ§€ λ§ˆμ„Έμš”.
120
- 5. 좜λ ₯ ν˜•μ‹μ€ λ‹€μŒκ³Ό 같이 μœ μ§€ν•˜μ„Έμš”:
121
- [μ‹œμž‘ μ‹œκ°„] μ„Ήμ…˜ μš”μ•½
122
-
123
- μ„Ήμ…˜ λ‚΄μš©:
124
- {combined_sections}
125
- """
126
-
127
- try:
128
- response = call_api(prompt, max_tokens=1000, temperature=0.3, top_p=0.9)
129
-
130
- # 응닡을 쀄 λ‹¨μœ„λ‘œ λΆ„λ¦¬ν•˜κ³  각 쀄을 HTML ν˜•μ‹μœΌλ‘œ λ³€ν™˜
131
- timeline_items = response.strip().split('\n')
132
- formatted_timeline = []
133
-
134
- for item in timeline_items:
135
- if ':' in item: # μ‹œκ°„ 정보가 μžˆλŠ” ν•­λͺ©λ§Œ 처리
136
- time, summary = item.split(':', 1)
137
- formatted_timeline.append(f"<p><strong>{time.strip()}</strong>:{summary.strip()}</p>")
138
-
139
- timeline_html = "\n".join(formatted_timeline)
140
-
141
- if not timeline_html:
142
- raise ValueError("μœ νš¨ν•œ νƒ€μž„λΌμΈ μš”μ•½μ„ μƒμ„±ν•˜μ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€.")
143
-
144
- return f"""
145
- <h3>νƒ€μž„λΌμΈ μš”μ•½:</h3>
146
- <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
147
- {timeline_html}
148
- </div>
149
- """
150
- except Exception as e:
151
- logging.exception("νƒ€μž„λΌμΈ μš”μ•½ 생성 쀑 였λ₯˜ λ°œμƒ")
152
- return "<p>νƒ€μž„λΌμΈ μš”μ•½μ„ μƒμ„±ν•˜λŠ” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”.</p>"
153
-
154
  def summarize_text(text):
155
  prompt = f"""
156
  1. λ‹€μŒ μ£Όμ–΄μ§€λŠ” 유튜브 λŒ€λ³Έμ˜ 핡심 μ£Όμ œμ™€ λͺ¨λ“  μ£Όμš” λ‚΄μš©μ„ μƒμ„Έν•˜κ²Œ μš”μ•½ν•˜λΌ
@@ -163,86 +94,55 @@ def summarize_text(text):
163
  8. λŒ€λ³Έμ—μ„œ μ „λ‹¬ν•˜λŠ” κ°μ •μ΄λ‚˜ λΆ„μœ„κΈ°λ„ 포함
164
  9. λ°˜λ“œμ‹œ 기술적 μš©μ–΄λ‚˜ μ „λ¬Έ μš©μ–΄κ°€ μžˆμ„ 경우, 이λ₯Ό μ •ν™•ν•˜κ²Œ μ‚¬μš©
165
  10. λŒ€λ³Έμ˜ λͺ©μ μ΄λ‚˜ μ˜λ„λ₯Ό νŒŒμ•…ν•˜κ³ , 이λ₯Ό μš”μ•½μ— λ°˜λ“œμ‹œ 반영
166
- 11. 전체글을 보고
167
 
168
- ---
 
 
 
169
 
170
- 이 ν”„λ‘¬ν”„νŠΈκ°€ 도움이 λ˜μ‹œκΈΈ λ°”λžλ‹ˆλ‹€.
171
- \n\n
172
- {text}"""
 
 
 
173
 
174
- return call_api(prompt, max_tokens=10000, temperature=0.3, top_p=0.9)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
 
176
  with gr.Blocks() as demo:
177
  gr.Markdown("## YouTube 슀크립트 μΆ”μΆœ 및 μš”μ•½ 도ꡬ")
178
-
179
  youtube_url_input = gr.Textbox(label="YouTube URL μž…λ ₯")
180
  analyze_button = gr.Button("λΆ„μ„ν•˜κΈ°")
181
  script_output = gr.HTML(label="슀크립트")
182
  timeline_output = gr.HTML(label="νƒ€μž„λΌμΈ μš”μ•½")
183
  summary_output = gr.HTML(label="전체 μš”μ•½")
184
-
185
  cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})
186
 
187
- def extract_and_cache(url, cache):
188
- if url == cache["url"]:
189
- return cache["title"], cache["script"], cache["sections"], cache
190
-
191
- try:
192
- title, script, sections = get_youtube_script(url)
193
- new_cache = {"url": url, "title": title, "script": script, "sections": sections}
194
- return title, script, sections, new_cache
195
- except Exception as e:
196
- logging.exception("데이터 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ")
197
- raise gr.Error(f"슀크립트 μΆ”μΆœ μ‹€νŒ¨: {str(e)}")
198
-
199
- def display_script(title, script):
200
- formatted_script = "\n".join(split_sentences(script))
201
- script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
202
- <details>
203
- <summary><h3>원문 슀크립트 (ν΄λ¦­ν•˜μ—¬ 펼치기)</h3></summary>
204
- <div style="white-space: pre-wrap;">{formatted_script}</div>
205
- </details>"""
206
- return script_html
207
-
208
- def display_timeline(sections):
209
- timeline_summary = generate_timeline_summary(sections)
210
- timeline_html = f"""
211
- <h3>νƒ€μž„λΌμΈ μš”μ•½:</h3>
212
- <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
213
- {timeline_summary}
214
- </div>
215
- """
216
- return timeline_html
217
-
218
- def generate_summary(script):
219
- summary = summarize_text(script)
220
- summary_html = f"""
221
- <h3>전체 μš”μ•½:</h3>
222
- <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
223
- {summary}
224
- </div>
225
- """
226
- return summary_html
227
-
228
- def analyze(url, cache):
229
- try:
230
- title, script, sections, new_cache = extract_and_cache(url, cache)
231
- script_html = display_script(title, script)
232
- timeline_html = generate_timeline_summary(sections)
233
- summary_html = generate_summary(script)
234
- return script_html, timeline_html, summary_html, new_cache
235
- except gr.Error as e:
236
- return str(e), "", "", cache
237
- except Exception as e:
238
- error_msg = f"처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
239
- logging.exception(error_msg)
240
- return error_msg, "", "", cache
241
-
242
  analyze_button.click(
243
  analyze,
244
  inputs=[youtube_url_input, cached_data],
245
  outputs=[script_output, timeline_output, summary_output, cached_data]
246
  )
247
 
248
- demo.launch(share=True)
 
 
2
  from gradio_client import Client
3
  import json
4
  import logging
 
5
  import openai
6
  import os
 
 
7
 
8
+ # Logging configuration
9
  logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
10
  format='%(asctime)s - %(levelname)s - %(message)s')
11
 
 
21
  raise ValueError(f"μ˜ˆμƒμΉ˜ λͺ»ν•œ 응닡 ν˜•μ‹μž…λ‹ˆλ‹€. 받은 데이터 νƒ€μž…: {type(response)}")
22
  return response
23
  except Exception as e:
24
+ logging.error(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")
25
  raise ValueError(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
def get_youtube_script(url):
    """Fetch the title, transcript and sections of a YouTube video.

    Calls the ``/predict`` endpoint of the ``whispersound/YT_Ts_R`` Space
    through ``gradio_client``, parses the reply with ``parse_api_response``
    and reads the first entry of its ``data`` list.

    Args:
        url: YouTube video URL, forwarded as the ``youtube_url`` argument.

    Returns:
        A ``(title, transcription_text, sections)`` tuple. ``title`` falls
        back to a placeholder and ``sections`` to an empty list when the
        response omits them or sets them to ``None``.

    Raises:
        ValueError: If the response has no usable ``data`` entry or the
            transcript is empty.
        Exception: Any other failure (including client construction) is
            logged with its traceback and re-raised unchanged.
    """
    logging.info("스크립트 추출 시작: URL = %s", url)
    try:
        # Created inside the ``try`` so connection failures are logged too.
        client = Client("whispersound/YT_Ts_R")
        result = client.predict(youtube_url=url, api_name="/predict")
        parsed_result = parse_api_response(result)

        if 'data' not in parsed_result or not parsed_result['data']:
            raise ValueError("API 응답에 유효한 데이터가 없습니다.")

        data = parsed_result["data"][0]
        if not isinstance(data, dict):
            # Without a mapping the ``.get`` lookups below cannot work.
            raise ValueError("API 응답에 유효한 데이터가 없습니다.")

        # ``or`` (rather than a ``.get`` default) also covers explicit nulls.
        title = data.get("title") or "제목 없음"
        transcription_text = data.get("transcriptionAsText") or ""
        sections = data.get("sections") or []

        if not transcription_text:
            raise ValueError("추출된 스크립트가 없습니다.")

        logging.info("스크립트 추출 완료")
        return title, transcription_text, sections
    except Exception:
        logging.exception("스크립트 추출 중 오류 발생")
        raise
50
 
51
def format_time(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        seconds: Duration in seconds. Ints, floats and numeric strings are
            accepted; fractional seconds are truncated.

    Returns:
        The formatted timestamp. Negative, missing (``None``), non-numeric
        or non-finite values are rendered as ``"00:00:00"`` instead of
        raising or producing a negative field.
    """
    try:
        total = int(float(seconds))
    except (TypeError, ValueError, OverflowError):
        # None / junk text / NaN / infinity: fall back to the start of the video.
        total = 0
    total = max(total, 0)
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"
55
+
56
def generate_timeline_summary(sections):
    """Render transcript sections as an HTML timeline.

    Args:
        sections: Iterable of mappings read through their ``start_time``
            (seconds) and ``text`` keys. ``None`` is treated as empty.

    Returns:
        An HTML fragment: a heading followed by a scrollable ``<div>``
        holding one ``<p>`` per section, each prefixed with its
        ``HH:MM:SS`` start time.
    """
    import html  # local import: not part of the file's import block

    timeline_items = []
    for section in sections or []:
        # Missing or null start times fall back to the beginning of the video.
        start_time = format_time(section.get('start_time') or 0)
        # Section text comes from a remote service; escape it so markup in
        # the transcript cannot inject HTML into the page.
        text = html.escape(str(section.get('text') or ''))
        timeline_items.append(f"<p><strong>{start_time}</strong>: {text}</p>")

    timeline_html = "\n".join(timeline_items)
    return f"""
<h3>타임라인 요약:</h3>
<div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
{timeline_html}
</div>
"""
70
+
71
  def call_api(prompt, max_tokens, temperature, top_p):
72
  try:
73
  response = openai.ChatCompletion.create(
 
82
  logging.exception("LLM API 호좜 쀑 였λ₯˜ λ°œμƒ")
83
  raise
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def summarize_text(text):
86
  prompt = f"""
87
  1. λ‹€μŒ μ£Όμ–΄μ§€λŠ” 유튜브 λŒ€λ³Έμ˜ 핡심 μ£Όμ œμ™€ λͺ¨λ“  μ£Όμš” λ‚΄μš©μ„ μƒμ„Έν•˜κ²Œ μš”μ•½ν•˜λΌ
 
94
  8. λŒ€λ³Έμ—μ„œ μ „λ‹¬ν•˜λŠ” κ°μ •μ΄λ‚˜ λΆ„μœ„κΈ°λ„ 포함
95
  9. λ°˜λ“œμ‹œ 기술적 μš©μ–΄λ‚˜ μ „λ¬Έ μš©μ–΄κ°€ μžˆμ„ 경우, 이λ₯Ό μ •ν™•ν•˜κ²Œ μ‚¬μš©
96
  10. λŒ€λ³Έμ˜ λͺ©μ μ΄λ‚˜ μ˜λ„λ₯Ό νŒŒμ•…ν•˜κ³ , 이λ₯Ό μš”μ•½μ— λ°˜λ“œμ‹œ 반영
 
97
 
98
+ λŒ€λ³Έ:
99
+ {text}
100
+ """
101
+ return call_api(prompt, max_tokens=2000, temperature=0.3, top_p=0.9)
102
 
103
def display_script(title, script):
    """Build the HTML fragment showing the video title and full transcript.

    Args:
        title: Video title, shown as the heading.
        script: Transcript text, placed inside a collapsible ``<details>``.

    Returns:
        An HTML string. Both values are HTML-escaped before interpolation,
        because they originate from a remote service and would otherwise be
        interpreted as markup; ``None`` is rendered as an empty string.
    """
    import html  # local import: not part of the file's import block

    safe_title = html.escape("" if title is None else str(title))
    safe_script = html.escape("" if script is None else str(script))
    return f"""<h2 style='font-size:24px;'>{safe_title}</h2>
<details>
<summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
<div style="white-space: pre-wrap;">{safe_script}</div>
</details>"""
109
 
110
def analyze(url, cache):
    """Run the full pipeline for one URL: transcript, timeline and summary.

    Args:
        url: YouTube URL typed by the user.
        cache: Dict with ``url``, ``title``, ``script`` and ``sections``
            keys holding the previously fetched video.

    Returns:
        A ``(script_html, timeline_html, summary_html, cache)`` tuple. On
        failure the first element carries an escaped error message, the two
        other HTML slots are empty and the cache is returned as received.
    """
    import html  # local import: not part of the file's import block

    url = (url or "").strip()
    if not url:
        # The initial cache key is "", so an empty URL would otherwise be
        # treated as a cache hit and the LLM called on an empty script.
        return "YouTube URL을 입력해 주세요.", "", "", cache

    try:
        if url == cache["url"]:
            logging.info("캐시된 데이터 사용: URL = %s", url)
            title, script, sections = cache["title"], cache["script"], cache["sections"]
        else:
            logging.info("새로운 데이터 추출 시작: URL = %s", url)
            title, script, sections = get_youtube_script(url)
            cache = {"url": url, "title": title, "script": script, "sections": sections}

        script_html = display_script(title, script)
        timeline_html = generate_timeline_summary(sections)
        # The summary is plain LLM text shown in an HTML component: escape
        # it and keep its line breaks.
        summary_text = summarize_text(script)
        summary_html = (
            '<div style="white-space: pre-wrap;">'
            f"{html.escape(str(summary_text or ''))}</div>"
        )

        logging.info("분석 완료")
        return script_html, timeline_html, summary_html, cache
    except Exception as e:
        error_msg = f"처리 중 오류 발생: {str(e)}"
        logging.exception(error_msg)
        # Exception text may contain markup-like characters.
        return html.escape(error_msg), "", "", cache
130
 
131
+ # Gradio interface
132
# UI layout: one URL box, one button, three HTML panes, plus per-session
# state holding the most recently fetched video.
with gr.Blocks() as demo:
    gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
    youtube_url_input = gr.Textbox(label="YouTube URL 입력")
    analyze_button = gr.Button("분석하기")
    script_output = gr.HTML(label="스크립트")
    timeline_output = gr.HTML(label="타임라인 요약")
    summary_output = gr.HTML(label="전체 요약")
    # Same keys that ``analyze`` reads and writes; the empty ``url`` marks
    # "nothing cached yet".
    cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})

    # ``analyze`` returns four values, matching the four outputs in order.
    analyze_button.click(
        analyze,
        inputs=[youtube_url_input, cached_data],
        outputs=[script_output, timeline_output, summary_output, cached_data]
    )

# Only start the server when run as a script, not on import.
if __name__ == "__main__":
    demo.launch(share=True)