AIRider committed on
Commit
d3555b8
Β·
verified Β·
1 Parent(s): 3c311b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -143
app.py CHANGED
@@ -7,74 +7,66 @@ import openai
7
  import os
8
  import random
9
  import re
10
-
11
- logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
12
- format='%(asctime)s - %(levelname)s - %(message)s')
13
-
14
- openai.api_key = os.getenv("OPENAI_API_KEY")
 
 
 
 
 
 
 
 
 
 
15
 
16
  def parse_api_response(response):
17
  try:
18
  if isinstance(response, str):
19
- response = json.loads(response)
20
- if isinstance(response, list) and len(response) > 0:
21
- response = response[0]
22
  if not isinstance(response, dict):
23
  raise ValueError(f"μ˜ˆμƒμΉ˜ λͺ»ν•œ 응닡 ν˜•μ‹μž…λ‹ˆλ‹€. 받은 데이터 νƒ€μž…: {type(response)}")
24
  return response
25
  except Exception as e:
26
  raise ValueError(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")
27
 
28
- def split_sentences(text):
29
- sentences = re.split(r"(λ‹ˆλ‹€|μ—μš”|κ΅¬λ‚˜|ν•΄μš”|κ΅°μš”|κ² μ–΄μš”|μ‹œμ˜€|해라|μ˜ˆμš”|μ•„μš”|λ°μš”|λŒ€μš”|μ„Έμš”|μ–΄μš”|κ²Œμš”|κ΅¬μš”|κ³ μš”|λ‚˜μš”|ν•˜μ£ )(?![\w])", text)
30
- combined_sentences = []
31
- current_sentence = ""
32
- for i in range(0, len(sentences), 2):
33
- if i + 1 < len(sentences):
34
- sentence = sentences[i] + sentences[i + 1]
35
- else:
36
- sentence = sentences[i]
37
- if len(current_sentence) + len(sentence) > 100:
38
- combined_sentences.append(current_sentence.strip())
39
- current_sentence = sentence.strip()
40
- else:
41
- current_sentence += sentence
42
- if sentence.endswith(('.', '?', '!')):
43
- combined_sentences.append(current_sentence.strip())
44
- current_sentence = ""
45
- if current_sentence:
46
- combined_sentences.append(current_sentence.strip())
47
- return combined_sentences
48
-
49
  def get_youtube_script(url):
50
  logging.info(f"슀크립트 μΆ”μΆœ μ‹œμž‘: URL = {url}")
51
-
52
  client = Client("whispersound/YT_Ts_R")
53
-
54
  try:
55
  logging.debug("API 호좜 μ‹œμž‘")
56
  result = client.predict(youtube_url=url, api_name="/predict")
57
  logging.debug("API 호좜 μ™„λ£Œ")
58
-
59
  parsed_result = parse_api_response(result)
60
-
61
- if 'data' not in parsed_result or not parsed_result['data']:
62
- raise ValueError("API 응닡에 μœ νš¨ν•œ 데이터가 μ—†μŠ΅λ‹ˆλ‹€.")
63
 
64
- title = parsed_result["data"][0].get("title", "제λͺ© μ—†μŒ")
65
- transcription_text = parsed_result["data"][0].get("transcriptionAsText", "")
66
- sections = parsed_result["data"][0].get("sections", [])
 
67
 
68
- if not transcription_text:
69
- raise ValueError("μΆ”μΆœλœ μŠ€ν¬λ¦½νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
 
 
 
70
 
71
  logging.info("슀크립트 μΆ”μΆœ μ™„λ£Œ")
72
- return title, transcription_text, sections
73
-
 
 
 
 
74
  except Exception as e:
75
  error_msg = f"슀크립트 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
76
  logging.exception(error_msg)
77
- raise
 
 
 
78
 
79
  def call_api(prompt, max_tokens, temperature, top_p):
80
  try:
@@ -88,125 +80,126 @@ def call_api(prompt, max_tokens, temperature, top_p):
88
  return response['choices'][0]['message']['content']
89
  except Exception as e:
90
  logging.exception("LLM API 호좜 쀑 였λ₯˜ λ°œμƒ")
91
- raise
 
 
 
 
 
 
 
 
 
 
92
 
93
  def summarize_section(section_text):
94
- prompt = f"""
95
- λ‹€μŒ 유튜브 λŒ€λ³Έ μ„Ήμ…˜μ˜ 핡심 λ‚΄μš©μ„ κ°„κ²°ν•˜κ²Œ μš”μ•½ν•˜μ„Έμš”:
96
- 1. ν•œκΈ€λ‘œ μž‘μ„±ν•˜μ„Έμš”.
97
- 2. μ£Όμš” 논점과 μ€‘μš”ν•œ 세뢀사항을 ν¬ν•¨ν•˜μ„Έμš”.
98
- 3. μš”μ•½μ€ 2-3λ¬Έμž₯으둜 μ œν•œν•˜μ„Έμš”.
99
 
100
- μ„Ήμ…˜ λ‚΄μš©:
101
  {section_text}
102
- """
103
- return call_api(prompt, max_tokens=150, temperature=0.3, top_p=0.9)
104
 
105
- def format_time(seconds):
106
- minutes, seconds = divmod(seconds, 60)
107
- hours, minutes = divmod(minutes, 60)
108
- return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
109
-
110
- def generate_timeline_summary(sections):
111
- timeline_summary = ""
112
- for i, section in enumerate(sections, 1):
113
- start_time = format_time(section['start_time'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  summary = summarize_section(section['text'])
115
- timeline_summary += f"{start_time} {i}. {summary}\n\n"
116
- return timeline_summary
117
-
118
- def summarize_text(text):
119
- prompt = f"""
120
- 1. λ‹€μŒ μ£Όμ–΄μ§€λŠ” 유튜브 λŒ€λ³Έμ˜ 핡심 μ£Όμ œμ™€ λͺ¨λ“  μ£Όμš” λ‚΄μš©μ„ μƒμ„Έν•˜κ²Œ μš”μ•½ν•˜λΌ
121
- 2. λ°˜λ“œμ‹œ ν•œκΈ€λ‘œ μž‘μ„±ν•˜λΌ
122
- 3. μš”μ•½λ¬Έλ§ŒμœΌλ‘œλ„ μ˜μƒμ„ 직접 μ‹œμ²­ν•œ 것과 λ™μΌν•œ μˆ˜μ€€μœΌλ‘œ λ‚΄μš©μ„ 이해할 수 μžˆλ„λ‘ μƒμ„Ένžˆ μž‘μ„±
123
- 4. 글을 λ„ˆλ¬΄ μ••μΆ•ν•˜κ±°λ‚˜ ν•¨μΆ•ν•˜μ§€ 말고, μ€‘μš”ν•œ λ‚΄μš©κ³Ό 세뢀사항을 λͺ¨λ‘ 포함
124
- 5. λ°˜λ“œμ‹œ λŒ€λ³Έμ˜ 흐름과 논리 ꡬ쑰λ₯Ό μœ μ§€
125
- 6. λ°˜λ“œμ‹œ μ‹œκ°„ μˆœμ„œλ‚˜ μ‚¬κ±΄μ˜ μ „κ°œ 과정을 λͺ…ν™•ν•˜κ²Œ 반영
126
- 7. λ“±μž₯인물, μž₯μ†Œ, 사건 λ“± μ€‘μš”ν•œ μš”μ†Œλ₯Ό μ •ν™•ν•˜κ²Œ μž‘μ„±
127
- 8. λŒ€λ³Έμ—μ„œ μ „λ‹¬ν•˜λŠ” κ°μ •μ΄λ‚˜ λΆ„μœ„κΈ°λ„ 포함
128
- 9. λ°˜λ“œμ‹œ 기술적 μš©μ–΄λ‚˜ μ „λ¬Έ μš©μ–΄κ°€ μžˆμ„ 경우, 이λ₯Ό μ •ν™•ν•˜κ²Œ μ‚¬μš©
129
- 10. λŒ€λ³Έμ˜ λͺ©μ μ΄λ‚˜ μ˜λ„λ₯Ό νŒŒμ•…ν•˜κ³ , 이λ₯Ό μš”μ•½μ— λ°˜λ“œμ‹œ 반영
130
- 11. 전체글을 보고
131
-
132
- ---
133
-
134
- 이 ν”„λ‘¬ν”„νŠΈκ°€ 도움이 λ˜μ‹œκΈΈ λ°”λžλ‹ˆλ‹€.
135
- \n\n
136
- {text}"""
137
-
138
- return call_api(prompt, max_tokens=10000, temperature=0.3, top_p=0.9)
139
 
140
  with gr.Blocks() as demo:
141
  gr.Markdown("## YouTube 슀크립트 μΆ”μΆœ 및 μš”μ•½ 도ꡬ")
142
-
143
  youtube_url_input = gr.Textbox(label="YouTube URL μž…λ ₯")
144
  analyze_button = gr.Button("λΆ„μ„ν•˜κΈ°")
145
  script_output = gr.HTML(label="슀크립트")
146
- timeline_output = gr.HTML(label="νƒ€μž„λΌμΈ μš”μ•½")
147
- summary_output = gr.HTML(label="전체 μš”μ•½")
148
-
149
- cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})
150
 
151
  def extract_and_cache(url, cache):
152
- if url == cache["url"]:
153
- return cache["title"], cache["script"], cache["sections"], cache
154
-
155
- try:
156
- title, script, sections = get_youtube_script(url)
157
- new_cache = {"url": url, "title": title, "script": script, "sections": sections}
158
- return title, script, sections, new_cache
159
- except Exception as e:
160
- logging.exception("데이터 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ")
161
- raise gr.Error(f"슀크립트 μΆ”μΆœ μ‹€νŒ¨: {str(e)}")
162
-
163
- def display_script(title, script):
164
- formatted_script = "\n".join(split_sentences(script))
165
- script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
166
- <details>
167
- <summary><h3>원문 슀크립트 (ν΄λ¦­ν•˜μ—¬ 펼치기)</h3></summary>
168
- <div style="white-space: pre-wrap;">{formatted_script}</div>
169
- </details>"""
170
  return script_html
171
 
172
- def display_timeline(sections):
173
- timeline_summary = generate_timeline_summary(sections)
174
- timeline_html = f"""
175
- <h3>νƒ€μž„λΌμΈ μš”μ•½:</h3>
176
- <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
177
- {timeline_summary}
178
- </div>
179
- """
180
- return timeline_html
181
-
182
- def generate_summary(script):
183
- summary = summarize_text(script)
184
- summary_html = f"""
185
- <h3>전체 μš”μ•½:</h3>
186
- <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
187
- {summary}
188
- </div>
189
- """
190
- return summary_html
191
-
192
- def analyze(url, cache):
193
  try:
194
- title, script, sections, new_cache = extract_and_cache(url, cache)
195
- script_html = display_script(title, script)
196
- timeline_html = display_timeline(sections)
197
- summary_html = generate_summary(script)
198
- return script_html, timeline_html, summary_html, new_cache
199
- except gr.Error as e:
200
- return str(e), "", "", cache
 
201
  except Exception as e:
202
- error_msg = f"처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
203
- logging.exception(error_msg)
204
- return error_msg, "", "", cache
205
 
206
  analyze_button.click(
207
- analyze,
208
- inputs=[youtube_url_input, cached_data],
209
- outputs=[script_output, timeline_output, summary_output, cached_data]
 
 
 
 
210
  )
211
 
212
- demo.launch(share=True)
 
7
  import os
8
  import random
9
  import re
10
+ import nltk
11
+ import numpy as np
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ import urllib.parse
15
+
16
+ # nltk 데이터 λ‹€μš΄λ‘œλ“œ (졜초 ν•œ 번 μ‹€ν–‰)
17
+ nltk.download('punkt')
18
+
19
+ # λ‘œκΉ… μ„€μ •
20
+ logging.basicConfig(
21
+ filename='youtube_script_extractor.log',
22
+ level=logging.DEBUG,
23
+ format='%(asctime)s - %(levelname)s - %(message)s'
24
+ )
25
 
26
def parse_api_response(response):
    """Normalize the upstream API response into a dict.

    The whispersound endpoint may return the payload as an
    already-parsed dict, a JSON string, or a Python-literal
    (``repr``-style) string.

    Raises:
        ValueError: if the response cannot be parsed into a dict.
    """
    try:
        if isinstance(response, str):
            # BUG FIX: `ast` was used here but never imported at module
            # level, so any string response raised NameError. Import it
            # locally and try strict JSON first, falling back to Python
            # literals for repr()-style payloads.
            import ast
            try:
                response = json.loads(response)
            except (json.JSONDecodeError, ValueError):
                response = ast.literal_eval(response)
        if not isinstance(response, dict):
            raise ValueError(f"μ˜ˆμƒμΉ˜ λͺ»ν•œ 응닡 ν˜•μ‹μž…λ‹ˆλ‹€. 받은 데이터 νƒ€μž…: {type(response)}")
        return response
    except Exception as e:
        raise ValueError(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
def get_youtube_script(url):
    """Fetch a YouTube video's transcript via the whispersound API.

    Returns a ``(title, script_json)`` tuple where *script_json* is a
    JSON string holding the title, the timed transcription entries, and
    the flat transcript text. Best-effort: on any failure the error is
    logged and ``("", "")`` is returned instead of raising.
    """
    logging.info(f"슀크립트 μΆ”μΆœ μ‹œμž‘: URL = {url}")
    client = Client("whispersound/YT_Ts_R")
    try:
        logging.debug("API 호좜 μ‹œμž‘")
        raw_result = client.predict(youtube_url=url, api_name="/predict")
        logging.debug("API 호좜 μ™„λ£Œ")
        payload = parse_api_response(raw_result)

        # The response carries a list of results under "data".
        entries = payload.get("data", [])
        if not entries:
            raise ValueError("데이터λ₯Ό κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€.")

        # Only the first result is used.
        first = entries[0]
        title = first.get("title", "")

        logging.info("슀크립트 μΆ”μΆœ μ™„λ£Œ")
        script_json = json.dumps({
            "title": title,
            "transcription": first.get("transcription", []),
            "transcriptionAsText": first.get("transcriptionAsText", ""),
        })
        return title, script_json
    except Exception as e:
        error_msg = f"슀크립트 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
        logging.exception(error_msg)
        return "", ""
67
+
68
+ # OpenAI API ν‚€ μ„€μ •
69
+ openai.api_key = os.getenv("OPENAI_API_KEY")
70
 
71
  def call_api(prompt, max_tokens, temperature, top_p):
72
  try:
 
80
  return response['choices'][0]['message']['content']
81
  except Exception as e:
82
  logging.exception("LLM API 호좜 쀑 였λ₯˜ λ°œμƒ")
83
+ return "μš”μ•½μ„ μƒμ„±ν•˜λŠ” λ™μ•ˆ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. λ‚˜μ€‘μ— λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."
84
+
85
def extract_video_id(url):
    """Extract the video id from a YouTube URL.

    Generalized beyond plain ``watch?v=`` links: also accepts the
    ``m.youtube.com`` mobile host and ``/embed/`` or ``/shorts/`` path
    forms, all of which are common in shared links.

    Returns:
        The video id string, or None when the URL is not recognized.
    """
    parsed_url = urllib.parse.urlparse(url)
    host = parsed_url.hostname or ""
    if host in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
        # /embed/<id> and /shorts/<id> carry the id in the path, not the query.
        for prefix in ('/embed/', '/shorts/'):
            if parsed_url.path.startswith(prefix):
                return parsed_url.path[len(prefix):].split('/')[0] or None
        query_params = urllib.parse.parse_qs(parsed_url.query)
        return query_params.get('v', [None])[0]
    elif host == 'youtu.be':
        return parsed_url.path[1:]
    else:
        return None
94
 
95
def summarize_section(section_text):
    """Ask the LLM for a concise Korean summary of one transcript section."""
    instructions = f"""λ‹€μŒ λ‚΄μš©μ˜ 핡심을 μš”μ•½ν•΄ μ£Όμ„Έμš”:

{section_text}

μš”μ•½μ€ ν•œκ΅­μ–΄λ‘œ κ°„κ²°ν•˜κ²Œ μž‘μ„±ν•΄ μ£Όμ„Έμš”.
"""
    llm_params = dict(max_tokens=500, temperature=0.3, top_p=0.9)
    return call_api(instructions, **llm_params)
103
+
104
def segment_transcript(transcript):
    """Split a subtitle transcript into topically coherent sections.

    Each *transcript* entry is expected to carry a 'subtitle' string and
    a 'start' timestamp (seconds) — assumption based on the keys read
    here; confirm against the upstream API. Sentences are tokenized with
    nltk, vectorized with TF-IDF, and a new section begins wherever the
    cosine similarity between consecutive sentences falls below a fixed
    threshold.

    Returns:
        A list of {'text', 'start_time'} dicts; empty list when the
        transcript yields no sentences.
    """
    sentences = []
    start_times = []
    for entry in transcript:
        subtitle = entry.get('subtitle', '')
        if not subtitle:
            continue
        pieces = nltk.tokenize.sent_tokenize(subtitle)
        sentences.extend(pieces)
        start_times.extend([entry.get('start', 0)] * len(pieces))

    if not sentences:
        return []

    vectors = TfidfVectorizer().fit_transform(sentences).toarray()

    # Open a section boundary wherever adjacent sentences diverge.
    threshold = 0.3
    boundaries = [0]
    for idx in range(1, len(sentences)):
        similarity = cosine_similarity([vectors[idx - 1]], [vectors[idx]])[0][0]
        if similarity < threshold:
            boundaries.append(idx)
    boundaries.append(len(sentences))

    # Each (lo, hi) pair of consecutive boundaries delimits one section;
    # the section inherits the start time of its first sentence.
    return [
        {
            'text': ' '.join(sentences[lo:hi]),
            'start_time': start_times[lo],
        }
        for lo, hi in zip(boundaries, boundaries[1:])
    ]
142
+
143
def generate_summary(sections, url):
    """Build the summary HTML: one LLM summary per section, each headed
    by a clickable timestamp link back into the YouTube video."""
    video_id = extract_video_id(url)
    summary_html = "<h3>μš”μ•½:</h3>"
    for section in sections:
        seconds_total = section['start_time']
        # Render HH:MM:SS for display and a &t=<s>s deep link for the URL.
        hrs = int(seconds_total // 3600)
        mins = int((seconds_total % 3600) // 60)
        secs = int(seconds_total % 60)
        timestamp_str = f"{hrs:02d}:{mins:02d}:{secs:02d}"
        timestamp_link = f"https://www.youtube.com/watch?v={video_id}&t={int(seconds_total)}s"
        summary = summarize_section(section['text'])
        summary_html += f"""
        <h4><a href="{timestamp_link}" target="_blank">{timestamp_str}</a></h4>
        <div style="white-space: pre-wrap; margin-bottom: 20px;">{summary}</div>
        """
    return summary_html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
with gr.Blocks() as demo:
    gr.Markdown("## YouTube 슀크립트 μΆ”μΆœ 및 μš”μ•½ 도ꡬ")
    youtube_url_input = gr.Textbox(label="YouTube URL μž…λ ₯")
    analyze_button = gr.Button("λΆ„μ„ν•˜κΈ°")
    script_output = gr.HTML(label="슀크립트")
    summary_output = gr.HTML(label="μš”μ•½")
    # Per-session cache: re-analyzing the same URL skips the extraction API.
    cached_data = gr.State({"url": "", "title": "", "script": ""})

    def display_script(title):
        """Render the video title as an HTML heading."""
        script_html = f"""<h2 style='font-size:24px;'>{title}</h2>"""
        return script_html

    def extract_and_cache(url, cache):
        """Fetch (or reuse cached) script data for *url*.

        Returns (script_html, new_cache) for the click handler.
        BUG FIX: the original returned the raw title string, so
        display_script was never called and the gr.HTML component
        received plain text instead of the formatted heading.
        """
        if url == cache.get("url"):
            return display_script(cache["title"]), cache
        title, script = get_youtube_script(url)
        new_cache = {"url": url, "title": title, "script": script}
        return display_script(title), new_cache

    def update_summary(cache):
        """Build the per-section summary HTML from the cached script JSON."""
        if not cache.get("script"):
            return "μŠ€ν¬λ¦½νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€. λ¨Όμ € YouTube URL을 μž…λ ₯ν•˜κ³  뢄석을 μ‹€ν–‰ν•΄μ£Όμ„Έμš”."
        try:
            parsed_result = json.loads(cache["script"])
            transcript = parsed_result.get("transcription", [])
            if not transcript:
                return "트랜슀크립트λ₯Ό κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€."
            sections = segment_transcript(transcript)
            if not sections:
                return "μ„Ήμ…˜μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."
            return generate_summary(sections, cache["url"])
        except Exception:
            logging.exception("μš”μ•½ 생성 쀑 였λ₯˜ λ°œμƒ")
            return "μš”μ•½μ„ μƒμ„±ν•˜λŠ” λ™μ•ˆ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. λ‚˜μ€‘μ— λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."

    # First render the script heading, then chain the (slow) summary step.
    analyze_button.click(
        extract_and_cache,
        inputs=[youtube_url_input, cached_data],
        outputs=[script_output, cached_data]
    ).then(
        update_summary,
        inputs=cached_data,
        outputs=summary_output
    )

demo.launch(share=True)