AIRider committed (verified)
Commit 57d8b30 · Parent: ea538de

Update app.py

Files changed (1)
app.py (+125 -88)
app.py CHANGED
@@ -5,15 +5,12 @@ import logging
import ast
import openai
import os
+ import random
import re
- from sklearn.feature_extraction.text import TfidfVectorizer
- from multiprocessing import Pool, cpu_count

logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

- openai.api_key = os.getenv("OPENAI_API_KEY")
-
def parse_api_response(response):
    try:
        if isinstance(response, str):
@@ -26,8 +23,30 @@ def parse_api_response(response):
    except Exception as e:
        raise ValueError(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")

+ def split_sentences(text):
+     sentences = re.split(r"(λ‹ˆλ‹€|μ—μš”|κ΅¬λ‚˜|ν•΄μš”|κ΅°μš”|κ² μ–΄μš”|μ‹œμ˜€|해라|μ˜ˆμš”|μ•„μš”|λ°μš”|λŒ€μš”|μ„Έμš”|μ–΄μš”|κ²Œμš”|κ΅¬μš”|κ³ μš”|λ‚˜μš”|ν•˜μ£ )(?![\w])", text)
+     combined_sentences = []
+     current_sentence = ""
+     for i in range(0, len(sentences), 2):
+         if i + 1 < len(sentences):
+             sentence = sentences[i] + sentences[i + 1]
+         else:
+             sentence = sentences[i]
+         if len(current_sentence) + len(sentence) > 100:
+             combined_sentences.append(current_sentence.strip())
+             current_sentence = sentence.strip()
+         else:
+             current_sentence += sentence
+         if sentence.endswith(('.', '?', '!')):
+             combined_sentences.append(current_sentence.strip())
+             current_sentence = ""
+     if current_sentence:
+         combined_sentences.append(current_sentence.strip())
+     return combined_sentences
+
def get_youtube_script(url):
    logging.info(f"슀크립트 μΆ”μΆœ μ‹œμž‘: URL = {url}")
+
    client = Client("whispersound/YT_Ts_R")

    try:
@@ -39,48 +58,31 @@ def get_youtube_script(url):

        title = parsed_result["data"][0]["title"]
        transcription_text = parsed_result["data"][0]["transcriptionAsText"]
-         original_sections = parsed_result["data"][0]["sections"]
-
-         merged_sections = merge_sections(original_sections)
-         processed_sections = process_merged_sections_parallel(merged_sections)
-
-         logging.info("슀크립트 μΆ”μΆœ 및 처리 μ™„λ£Œ")
-         return title, transcription_text, processed_sections
+         sections = parsed_result["data"][0]["sections"]
+
+         logging.info("슀크립트 μΆ”μΆœ μ™„λ£Œ")
+         return title, transcription_text, sections

    except Exception as e:
        error_msg = f"슀크립트 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ: {str(e)}"
        logging.exception(error_msg)
        return "", "", []

- def is_same_topic_tfidf(text1, text2, threshold=0.3):
-     vectorizer = TfidfVectorizer().fit([text1, text2])
-     vectors = vectorizer.transform([text1, text2])
-     similarity = (vectors[0] * vectors[1].T).A[0][0]
-     return similarity > threshold
+ openai.api_key = os.getenv("OPENAI_API_KEY")

- def merge_sections(sections, min_duration=60, max_duration=300):
-     merged_sections = []
-     current_section = sections[0].copy()
-
-     for section in sections[1:]:
-         duration = current_section['end_time'] - current_section['start_time']
-
-         if duration < min_duration:
-             current_section['end_time'] = section['end_time']
-             current_section['text'] += ' ' + section['text']
-         elif duration >= max_duration:
-             merged_sections.append(current_section)
-             current_section = section.copy()
-         else:
-             if is_same_topic_tfidf(current_section['text'], section['text']):
-                 current_section['end_time'] = section['end_time']
-                 current_section['text'] += ' ' + section['text']
-             else:
-                 merged_sections.append(current_section)
-                 current_section = section.copy()
-
-     merged_sections.append(current_section)
-     return merged_sections
+ def call_api(prompt, max_tokens, temperature, top_p):
+     try:
+         response = openai.ChatCompletion.create(
+             model="gpt-4o-mini",
+             messages=[{"role": "user", "content": prompt}],
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p
+         )
+         return response['choices'][0]['message']['content']
+     except Exception as e:
+         logging.exception("LLM API 호좜 쀑 였λ₯˜ λ°œμƒ")
+         return "μš”μ•½μ„ μƒμ„±ν•˜λŠ” λ™μ•ˆ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. λ‚˜μ€‘μ— λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."

def summarize_section(section_text):
    prompt = f"""
@@ -92,79 +94,114 @@ def summarize_section(section_text):
    μ„Ήμ…˜ λ‚΄μš©:
    {section_text}
    """
-     try:
-         response = openai.ChatCompletion.create(
-             model="gpt-4o-mini",
-             messages=[{"role": "user", "content": prompt}],
-             max_tokens=150,
-             temperature=0.3,
-             top_p=0.9
-         )
-         return response['choices'][0]['message']['content']
-     except Exception as e:
-         logging.exception("μš”μ•½ 생성 쀑 였λ₯˜ λ°œμƒ")
-         return "μš”μ•½μ„ μƒμ„±ν•˜λŠ” λ™μ•ˆ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€."
-
- def process_section(section):
-     summary = summarize_section(section['text'])
-     return {
-         'start_time': section['start_time'],
-         'end_time': section['end_time'],
-         'summary': summary
-     }
-
- def process_merged_sections_parallel(merged_sections):
-     with Pool(processes=cpu_count()) as pool:
-         return pool.map(process_section, merged_sections)
+     return call_api(prompt, max_tokens=150, temperature=0.3, top_p=0.9)

def format_time(seconds):
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

- def generate_timeline_summary(processed_sections):
+ def generate_timeline_summary(sections):
    timeline_summary = ""
-     for i, section in enumerate(processed_sections, 1):
+     for i, section in enumerate(sections, 1):
        start_time = format_time(section['start_time'])
-         end_time = format_time(section['end_time'])
-         timeline_summary += f"{start_time} - {end_time} {i}. {section['summary']}\n\n"
+         summary = summarize_section(section['text'])
+         timeline_summary += f"{start_time} {i}. {summary}\n\n"
    return timeline_summary

- def display_script_and_summary(title, script, processed_sections):
-     timeline_summary = generate_timeline_summary(processed_sections)
-
-     script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
-     <h3>νƒ€μž„λΌμΈ μš”μ•½:</h3>
-     <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
-     {timeline_summary}
-     </div>
-     <details>
-     <summary><h3>원문 슀크립트 (ν΄λ¦­ν•˜μ—¬ 펼치기)</h3></summary>
-     <div style="white-space: pre-wrap;">{script}</div>
-     </details>"""
-     return script_html
+ def summarize_text(text):
+     prompt = f"""
+     1. λ‹€μŒ μ£Όμ–΄μ§€λŠ” 유튜브 λŒ€λ³Έμ˜ 핡심 μ£Όμ œμ™€ λͺ¨λ“  μ£Όμš” λ‚΄μš©μ„ μƒμ„Έν•˜κ²Œ μš”μ•½ν•˜λΌ
+     2. λ°˜λ“œμ‹œ ν•œκΈ€λ‘œ μž‘μ„±ν•˜λΌ
+     3. μš”μ•½λ¬Έλ§ŒμœΌλ‘œλ„ μ˜μƒμ„ 직접 μ‹œμ²­ν•œ 것과 λ™μΌν•œ μˆ˜μ€€μœΌλ‘œ λ‚΄μš©μ„ 이해할 수 μžˆλ„λ‘ μƒμ„Ένžˆ μž‘μ„±
+     4. 글을 λ„ˆλ¬΄ μ••μΆ•ν•˜κ±°λ‚˜ ν•¨μΆ•ν•˜μ§€ 말고, μ€‘μš”ν•œ λ‚΄μš©κ³Ό 세뢀사항을 λͺ¨λ‘ 포함
+     5. λ°˜λ“œμ‹œ λŒ€λ³Έμ˜ 흐름과 논리 ꡬ쑰λ₯Ό μœ μ§€
+     6. λ°˜λ“œμ‹œ μ‹œκ°„ μˆœμ„œλ‚˜ μ‚¬κ±΄μ˜ μ „κ°œ 과정을 λͺ…ν™•ν•˜κ²Œ 반영
+     7. λ“±μž₯인물, μž₯μ†Œ, 사건 λ“± μ€‘μš”ν•œ μš”μ†Œλ₯Ό μ •ν™•ν•˜κ²Œ μž‘μ„±
+     8. λŒ€λ³Έμ—μ„œ μ „λ‹¬ν•˜λŠ” κ°μ •μ΄λ‚˜ λΆ„μœ„κΈ°λ„ 포함
+     9. λ°˜λ“œμ‹œ 기술적 μš©μ–΄λ‚˜ μ „λ¬Έ μš©μ–΄κ°€ μžˆμ„ 경우, 이λ₯Ό μ •ν™•ν•˜κ²Œ μ‚¬μš©
+     10. λŒ€λ³Έμ˜ λͺ©μ μ΄λ‚˜ μ˜λ„λ₯Ό νŒŒμ•…ν•˜κ³ , 이λ₯Ό μš”μ•½μ— λ°˜λ“œμ‹œ 반영
+     11. 전체글을 보고
+
+     ---
+
+     이 ν”„λ‘¬ν”„νŠΈκ°€ 도움이 λ˜μ‹œκΈΈ λ°”λžλ‹ˆλ‹€.
+     \n\n
+     {text}"""
+
+     try:
+         return call_api(prompt, max_tokens=10000, temperature=0.3, top_p=0.9)
+     except Exception as e:
+         logging.exception("μš”μ•½ 생성 쀑 였λ₯˜ λ°œμƒ")
+         return "μš”μ•½μ„ μƒμ„±ν•˜λŠ” λ™μ•ˆ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. λ‚˜μ€‘μ— λ‹€μ‹œ μ‹œλ„ν•΄ μ£Όμ„Έμš”."

with gr.Blocks() as demo:
    gr.Markdown("## YouTube 슀크립트 μΆ”μΆœ 및 μš”μ•½ 도ꡬ")

    youtube_url_input = gr.Textbox(label="YouTube URL μž…λ ₯")
    analyze_button = gr.Button("λΆ„μ„ν•˜κΈ°")
-     output = gr.HTML(label="κ²°κ³Ό")
+     script_output = gr.HTML(label="슀크립트")
+     timeline_output = gr.HTML(label="νƒ€μž„λΌμΈ μš”μ•½")
+     summary_output = gr.HTML(label="전체 μš”μ•½")

-     cached_data = gr.State({"url": "", "title": "", "script": "", "processed_sections": []})
+     cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})

-     def analyze(url, cache):
+     def extract_and_cache(url, cache):
        if url == cache["url"]:
-             return display_script_and_summary(cache["title"], cache["script"], cache["processed_sections"]), cache
+             return cache["title"], cache["script"], cache["sections"], cache
+
+         title, script, sections = get_youtube_script(url)
+         new_cache = {"url": url, "title": title, "script": script, "sections": sections}
+         return title, script, sections, new_cache
+
+     def display_script(title, script):
+         formatted_script = "\n".join(split_sentences(script))
+         script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
+         <details>
+         <summary><h3>원문 슀크립트 (ν΄λ¦­ν•˜μ—¬ 펼치기)</h3></summary>
+         <div style="white-space: pre-wrap;">{formatted_script}</div>
+         </details>"""
+         return script_html
+
+     def display_timeline(sections):
+         timeline_summary = generate_timeline_summary(sections)
+         timeline_html = f"""
+         <h3>νƒ€μž„λΌμΈ μš”μ•½:</h3>
+         <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+         {timeline_summary}
+         </div>
+         """
+         return timeline_html
+
+     def generate_summary(script):
+         summary = summarize_text(script)
+         summary_html = f"""
+         <h3>전체 μš”μ•½:</h3>
+         <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+         {summary}
+         </div>
+         """
+         return summary_html
+
+     def analyze(url, cache):
+         title, script, sections, new_cache = extract_and_cache(url, cache)
+         script_html = display_script(title, script)
+         timeline_html = display_timeline(sections)
+         return script_html, timeline_html, new_cache

-         title, script, processed_sections = get_youtube_script(url)
-         new_cache = {"url": url, "title": title, "script": script, "processed_sections": processed_sections}
-         return display_script_and_summary(title, script, processed_sections), new_cache
+     def update_summary(cache):
+         if not cache["script"]:
+             return "μŠ€ν¬λ¦½νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€. λ¨Όμ € YouTube URL을 μž…λ ₯ν•˜κ³  뢄석을 μ‹€ν–‰ν•΄μ£Όμ„Έμš”."
+         return generate_summary(cache["script"])

    analyze_button.click(
        analyze,
        inputs=[youtube_url_input, cached_data],
-         outputs=[output, cached_data]
+         outputs=[script_output, timeline_output, cached_data]
+     ).then(
+         update_summary,
+         inputs=[cached_data],
+         outputs=summary_output
    )

    demo.launch(share=True)
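
Note: the reworked UI returns the script and timeline HTML from analyze and only then fills summary_output through the chained update_summary step, so the fast results render before the slow full-text summarization call finishes. Below is a minimal, self-contained sketch of that click(...).then(...) pattern in Gradio; the component names and callbacks are illustrative stand-ins, not code from this commit.

import time
import gradio as gr

def fast_step(text):
    # Returned immediately, analogous to the script/timeline outputs in app.py.
    return f"<p>echo: {text}</p>"

def slow_step(text):
    time.sleep(2)  # stand-in for a long-running LLM summary call
    return f"<p>summary of: {text}</p>"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="input")
    btn = gr.Button("run")
    fast_out = gr.HTML(label="fast result")
    slow_out = gr.HTML(label="slow result")

    # The first callback fills fast_out; the chained callback fills slow_out afterwards.
    btn.click(fast_step, inputs=inp, outputs=fast_out).then(
        slow_step, inputs=inp, outputs=slow_out
    )

demo.launch()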