Spaces:
Running
Running
import gradio as gr | |
from gradio_client import Client | |
import json | |
import logging | |
import openai | |
import os | |
import re | |
import html | |
# Logging configuration: records at DEBUG level and above go to a local log file.
logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')
# The OpenAI API key is taken from the environment (None when the variable is unset).
openai.api_key = os.getenv("OPENAI_API_KEY")
def parse_api_response(response):
    """Normalize a raw API response into a dict.

    A string is decoded as JSON first; a non-empty list is reduced to its
    first element. The resulting value must be a dict.

    Args:
        response: A JSON string, a list, or a dict.

    Returns:
        The dict extracted from ``response``.

    Raises:
        ValueError: If decoding fails or the final value is not a dict. The
            failure is logged before being re-raised.
    """
    try:
        payload = json.loads(response) if isinstance(response, str) else response
        if isinstance(payload, list) and payload:
            # Only the first element of a list response is used.
            payload = payload[0]
        if isinstance(payload, dict):
            return payload
        raise ValueError(f"μμμΉ λͺ»ν μλ΅ νμμ λλ€. λ°μ λ°μ΄ν° νμ : {type(payload)}")
    except Exception as err:
        # Every failure (including the ValueError above) is logged and
        # surfaced to the caller as a ValueError.
        logging.error(f"API μλ΅ νμ± μ€ν¨: {err}")
        raise ValueError(f"API μλ΅ νμ± μ€ν¨: {err}")
def get_youtube_script(url):
    """Fetch title, description and transcript text for a YouTube URL.

    Calls the ``/predict`` endpoint of the "whispersound/YT_Ts_R" client and
    reads the first entry of the ``data`` field of the parsed response.

    Args:
        url: The YouTube URL forwarded to the remote endpoint.

    Returns:
        A ``(title, description, transcription_text)`` tuple.

    Raises:
        ValueError: If the response has no usable ``data`` or the transcript
            text is empty.
        Exception: Any other error is logged with its traceback and re-raised
            unchanged.
    """
    logging.info(f"μ€ν¬λ¦½νΈ μΆμΆ μμ: URL = {url}")
    client = Client("whispersound/YT_Ts_R")
    try:
        raw_result = client.predict(youtube_url=url, api_name="/predict")
        parsed_result = parse_api_response(raw_result)
        # Missing key and empty value are treated the same way.
        if not parsed_result.get('data'):
            raise ValueError("API μλ΅μ μ ν¨ν λ°μ΄ν°κ° μμ΅λλ€.")

        first_entry = parsed_result["data"][0]
        title = first_entry.get("title", "μ λͺ© μμ")
        description = first_entry.get("description", "μ€λͺ μμ")
        transcription_text = first_entry.get("transcriptionAsText", "")
        if not transcription_text:
            raise ValueError("μΆμΆλ μ€ν¬λ¦½νΈκ° μμ΅λλ€.")

        logging.info("μ€ν¬λ¦½νΈ μΆμΆ μλ£")
        return title, description, transcription_text
    except Exception:
        # Log with traceback, then let the caller see the original error.
        logging.exception("μ€ν¬λ¦½νΈ μΆμΆ μ€ μ€λ₯ λ°μ")
        raise
def call_api(prompt, max_tokens, temperature, top_p):
    """Send a single-user-message chat request and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        max_tokens: Forwarded as the ``max_tokens`` request field.
        temperature: Forwarded as the ``temperature`` request field.
        top_p: Forwarded as the ``top_p`` request field.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        Exception: Any error from the call is logged with its traceback and
            re-raised unchanged.
    """
    # NOTE(review): openai.ChatCompletion looks like the pre-1.0 interface of
    # the openai package — confirm the installed version still provides it.
    request_fields = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }
    try:
        completion = openai.ChatCompletion.create(**request_fields)
        first_choice = completion['choices'][0]
        return first_choice['message']['content']
    except Exception:
        logging.exception("LLM API νΈμΆ μ€ μ€λ₯ λ°μ")
        raise
def summarize_text(title, description, text):
    """Build the summarization prompt and return the model's answer.

    The prompt is a fixed block of numbered instructions followed by the
    video title, description and transcript, interpolated verbatim.

    Args:
        title: Video title inserted into the prompt.
        description: Video description inserted into the prompt.
        text: Transcript text inserted at the end of the prompt.

    Returns:
        Whatever ``call_api`` returns for the assembled prompt.
    """
    # The literal below is sent to the model as-is; do not reformat it.
    prompt = f"""
[μ νλΈ μμ½ κ·μΉ]
1. λλ μ νλΈ μμ μ λ¬Έ ν΄μ€κ°λ‘μ μ§μΉ¨μ λ§κ² μ΄ κΈμ μμ±νλΌ
2. μλμ μ λͺ©κ³Ό μ€λͺ μ μ΄ μ νλΈ μμμ μλ³Έ λ©νλ°μ΄ν°μ΄λ€.
3. λ°λμ μ λͺ©κ³Ό μ€λͺ μΌλ‘ μ£Όμ μ λ¬Έλ§₯μ λ¨Όμ νμ νκ³ , μλμ λλ³Έμ λ°λμ μ§μΉ¨μ λ§κ² μμΈνκ² μμ½νλΌ
4. λ°λμ νκΈλ‘ μμ±νλΌ
5. λ°λμ 'μ΄ μ νλΈ λλ³Έμ', 'μ΄ μμμ', 'μ΄ μ νλΈλ'λ±μ μκ°μ ννμ μ μΈνλΌ
6. μμ½λ¬Έλ§μΌλ‘λ μμμ μ§μ μμ²ν κ²κ³Ό λμΌν μμ€μΌλ‘ λ΄μ©μ μ΄ν΄ν μ μλλ‘ μμΈν μμ±
7. κΈμ λ무 μμΆνκ±°λ ν¨μΆνμ§ λ§κ³ , μ€μν λ΄μ©κ³Ό μΈλΆμ¬νμ λͺ¨λ ν¬ν¨
8. λ°λμ λλ³Έμ νλ¦κ³Ό λ Όλ¦¬ ꡬ쑰λ₯Ό μ μ§
9. λλ³Έμ λͺ©μ μ΄λ μλλ₯Ό νμ νκ³ , μ΄λ₯Ό μμ½μ λ°λμ λ°μ
10. λ°λμ μκ° μμλ μ¬κ±΄μ μ κ° κ³Όμ μ λͺ ννκ² λ°μ
11. λ±μ₯μΈλ¬Ό, μ₯μ, μ¬κ±΄ λ± μ€μν μμλ₯Ό μ ννκ² μμ±
12. λλ³Έμμ μ λ¬νλ κ°μ μ΄λ λΆμκΈ°λ ν¬ν¨
13. λ°λμ κΈ°μ μ μ©μ΄λ μ λ¬Έ μ©μ΄κ° μμ κ²½μ°, μ΄λ₯Ό μ ννκ² μ¬μ©
14. λ°λμ ν΅μ¬ μΉμ (μμ£Όμ )λ₯Ό νμ νμ¬ μΉμ μ λ§κ² κΈμ μμ½νλΌ(κΈμ μμ κ³ λ €νμ¬ μΉμ μ κ°μλ₯Ό νλ ₯μ μΌλ‘ μ€μ )
15. κ° μΉμ μ μ λͺ©(μμ£Όμ )μλ λ΄μ©κ³Ό μ΄μΈλ¦¬λ μ μ ν μ΄λͺ¨μ§λ‘ μμ£Όμ λ₯Ό μμνλΌ
16. κ° μΉμ μ λ΄μ©μ Bullet Pointλ₯Ό μ¬μ©νμ¬ κ°λ μ±μ λμ¬λΌ(λ¬Έμ₯ λ¨μλ‘ κ΅¬λΆ)
[μμ]
(λ³κ²½μ )
- μ νλΈλ₯Ό μ²μ μμνλ μ¬λλ€μ ꡬλ μ μμ μ‘°νμμ ν° κ΄μ¬μ λκ³ λ§€μΌ μ νλΈ μ€νλμ€λ₯Ό νμΈνκ² λλ€. κ·Έλ¬λ ꡬλ μκ° 100λͺ , 1,000λͺ μ λλ¬νλ κ²λ§μΌλ‘λ μ§μμ μΈ μ±μ₯μ λμμ΄ λμ§ μλλ€. ꡬλ μ μκ° λμ΄λ νμλ μ νλΈ μ±λ μ΄μμ λν κ°μ μ‘μ§ λͺ»ν΄ ν¬κΈ°νλ κ²½μ°κ° λ§λ€.
(λ³κ²½ν)
- μ νλΈλ₯Ό μ²μ μμνλ μ¬λλ€μ ꡬλ μ μμ μ‘°νμμ ν° κ΄μ¬μ λκ³ λ§€μΌ μ νλΈ μ€νλμ€λ₯Ό νμΈνκ² λλ€.
- κ·Έλ¬λ ꡬλ μκ° 100λͺ , 1,000λͺ μ λλ¬νλ κ²λ§μΌλ‘λ μ§μμ μΈ μ±μ₯μ λμμ΄ λμ§ μλλ€.
- ꡬλ μ μκ° λμ΄λ νμλ μ νλΈ μ±λ μ΄μμ λν κ°μ μ‘μ§ λͺ»ν΄ ν¬κΈ°νλ κ²½μ°κ° λ§λ€.
17. κ° μΉμ μ λ΄μ©μ λ°λμ μΆ©μ€νκ² μμ±
μ λͺ©: {title}
μ€λͺ : {description}
λλ³Έ:
{text}
"""
    # Sampling settings are fixed here: up to 8000 output tokens, low temperature.
    return call_api(prompt, max_tokens=8000, temperature=0.35, top_p=0.95)
def split_sentences(text, max_length=100):
    """Split transcript text into display chunks.

    The text is cut after any of the sentence-ending tokens listed in the
    pattern (when not followed by a word character). Consecutive segments are
    merged into one chunk until adding the next segment would exceed
    ``max_length`` characters, or until a segment ends in ``.``, ``?`` or
    ``!``.

    Unlike the previous implementation, empty or whitespace-only chunks are
    never emitted (they used to appear when the very first segment was longer
    than the limit, or when the text was only whitespace).

    Args:
        text: The transcript text to split.
        max_length: Maximum combined length before the current chunk is
            flushed. Defaults to 100, the previously hard-coded value.

    Returns:
        A list of stripped, non-empty chunks in their original order.
    """
    pieces = re.split(r"(λλ€|μμ|ꡬλ|ν΄μ|κ΅°μ|κ² μ΄μ|μμ€|ν΄λΌ|μμ|μμ|λ°μ|λμ|μΈμ|μ΄μ|κ²μ|ꡬμ|κ³ μ|λμ|νμ£ )(?![\w])", text)
    combined_sentences = []
    current_sentence = ""

    def flush(chunk):
        # Store a chunk only when it has visible content.
        chunk = chunk.strip()
        if chunk:
            combined_sentences.append(chunk)

    # With a capturing group, re.split alternates: body, ending, body, ending, ...
    for i in range(0, len(pieces), 2):
        ending = pieces[i + 1] if i + 1 < len(pieces) else ""
        sentence = pieces[i] + ending

        if len(current_sentence) + len(sentence) > max_length:
            # Adding this segment would overflow: close the running chunk
            # and start a new one with the segment.
            flush(current_sentence)
            current_sentence = sentence.strip()
        else:
            current_sentence += sentence

        # Explicit punctuation always terminates the running chunk.
        if sentence.endswith(('.', '?', '!')):
            flush(current_sentence)
            current_sentence = ""

    flush(current_sentence)
    return combined_sentences
def display_script(title, script):
    """Render the original transcript as a collapsible HTML panel.

    The transcript is split into chunks with ``split_sentences`` and the
    chunks are separated by blank lines inside a ``<pre>`` element.

    The title and transcript come from a remote service, so both are
    HTML-escaped before being interpolated into the markup; otherwise any
    ``<``, ``>`` or ``&`` in them would be interpreted as HTML.

    Args:
        title: Video title shown as the panel heading.
        script: Raw transcript text.

    Returns:
        An HTML string containing the panel.
    """
    script_sentences = split_sentences(script)
    # Escape after joining so the blank-line separators are preserved as-is.
    formatted_script = html.escape("\n\n".join(script_sentences))
    safe_title = html.escape(str(title))
    return f"""<div style="background-color: #f0f0f0; padding: 20px; border-radius: 10px;">
<h3>μλ¬Έ μ€ν¬λ¦½νΈ</h3>
<details>
<summary>ν΄λ¦νμ¬ νΌμΉκΈ°</summary>
<h2>{safe_title}</h2>
<pre style="white-space: pre-wrap;">{formatted_script}</pre>
</details>
</div>"""
def display_summary(title, summary):
    """Render the summary panel as HTML.

    The title is HTML-escaped so that markup characters in it are shown as
    text, matching how ``analyze`` builds the same panel. ``summary`` is
    inserted unchanged, so it must already be safe HTML.

    Args:
        title: Video title shown as the panel heading.
        summary: HTML fragment placed below the heading.

    Returns:
        An HTML string containing the panel.
    """
    safe_title = html.escape(str(title))
    return f"""<div style="background-color: #e6f3ff; padding: 20px; border-radius: 10px; margin-top: 20px;">
<h3>μμ½</h3>
<h2>{safe_title}</h2>
{summary}
</div>"""
def analyze(url):
    """Extract a video's transcript, summarize it, and stream UI updates.

    This is a generator. Each yielded value is a ``(script_html,
    summary_html)`` pair:

    1. placeholder text in both slots while the transcript is fetched,
    2. the rendered transcript plus a "summary in progress" placeholder,
    3. the rendered transcript plus the rendered summary.

    Summary lines starting with ``# ``, ``## `` or ``### `` become ``<h1>``,
    ``<h2>`` or ``<h3>``; every other line becomes a ``<p>``. Line content is
    HTML-escaped.

    Args:
        url: The YouTube URL to analyze.

    Yields:
        Tuples of two HTML/text strings as described above.
    """
    # Step 1: transcript extraction.
    yield "μ€ν¬λ¦½νΈ μΆμΆ μ€...", "μ€ν¬λ¦½νΈ μΆμΆ μ€..."
    title, description, script = get_youtube_script(url)
    script_content = display_script(title, script)

    # Step 2: show the transcript while the summary is being generated.
    yield script_content, "μμ½ μμ± μ€..."

    # Step 3: summarize and convert the markdown-like headings to HTML.
    summary = summarize_text(title, description, script)
    heading_tags = (("# ", "h1"), ("## ", "h2"), ("### ", "h3"))
    rendered = []
    for raw_line in summary.split('\n'):
        for marker, tag in heading_tags:
            if raw_line.startswith(marker):
                body = html.escape(raw_line[len(marker):])
                rendered.append(f"<{tag}>{body}</{tag}>")
                break
        else:
            # No heading marker matched: plain paragraph.
            rendered.append(f"<p>{html.escape(raw_line)}</p>")
    formatted_summary = '\n'.join(rendered)

    summary_content = f"""<div style="background-color: #e6f3ff; padding: 20px; border-radius: 10px; margin-top: 20px;">
<h3>μμ½</h3>
<h2>{html.escape(title)}</h2>
{formatted_summary}
</div>"""

    # Final result.
    yield script_content, summary_content
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## YouTube μ€ν¬λ¦½νΈ μΆμΆ λ° μμ½ λꡬ")
    # Input row: URL textbox and the button that triggers the analysis.
    youtube_url_input = gr.Textbox(label="YouTube URL μ λ ₯")
    analyze_button = gr.Button("λΆμνκΈ°")
    # Output panels: transcript first, summary second.
    script_output = gr.HTML(label="μλ¬Έ μ€ν¬λ¦½νΈ")
    summary_output = gr.HTML(label="μμ½")
    # analyze is a generator; each pair it yields maps onto the two outputs below.
    analyze_button.click(
        analyze,
        inputs=[youtube_url_input],
        outputs=[script_output, summary_output]
    )
# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()