# YouTube Script Extractor with Summary — Hugging Face Space (Gradio app).
import ast
import html
import json
import logging
import os
import random
import re

import gradio as gr
import openai  # OpenAI client library
from gradio_client import Client
# Logging configuration: route DEBUG-and-above records to a dedicated file.
logging.basicConfig(
    filename='youtube_script_extractor.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
def parse_api_response(response):
    """Normalize the raw Space API response into a dict.

    Accepts a dict, a Python-literal string, or a list whose first element
    is the payload dict.

    Args:
        response: Raw value returned by the transcription Space.

    Returns:
        The payload as a dict.

    Raises:
        ValueError: If the response cannot be normalized to a dict.
    """
    try:
        if isinstance(response, str):
            # The Space may return a Python-literal string; literal_eval
            # parses it safely (no code execution, unlike eval).
            response = ast.literal_eval(response)
        if isinstance(response, list) and len(response) > 0:
            response = response[0]
        if not isinstance(response, dict):
            raise ValueError(f"μμμΉ λͺ»ν μλ΅ νμμ λλ€. λ°μ λ°μ΄ν° νμ : {type(response)}")
        return response
    except Exception as e:
        # Fix: chain the original exception so the root cause stays visible
        # in tracebacks instead of being swallowed by the re-wrap.
        raise ValueError(f"API μλ΅ νμ± μ€ν¨: {str(e)}") from e
# Sentence splitting helper (Korean).
def split_sentences(text):
    """Split Korean text into sentence chunks of at most ~100 characters.

    The regex splits on common Korean sentence endings; each fragment is
    re-joined with the ending that follows it, then fragments are greedily
    packed into chunks. A chunk is flushed when adding the next sentence
    would exceed 100 characters, or when a sentence ends in '.', '?', '!'.

    Args:
        text: Raw transcript text.

    Returns:
        List of stripped, non-empty sentence chunks.
    """
    sentences = re.split(r"(λλ€|μμ|ꡬλ|ν΄μ|κ΅°μ|κ² μ΄μ|μμ€|ν΄λΌ|μμ|μμ|λ°μ|λμ|μΈμ|μ΄μ|κ²μ|ꡬμ|κ³ μ|λμ|νμ£ )(?![\w])", text)
    combined_sentences = []
    current_sentence = ""
    # re.split with one capture group yields [text, ending, text, ending, ...,
    # tail]; stitch each text piece back together with its ending.
    for i in range(0, len(sentences), 2):
        if i + 1 < len(sentences):
            sentence = sentences[i] + sentences[i + 1]
        else:
            sentence = sentences[i]
        if len(current_sentence) + len(sentence) > 100:  # chunk would exceed 100 chars
            # Fix: only flush a non-empty chunk. Previously, an oversized
            # first sentence appended an empty string to the result.
            if current_sentence:
                combined_sentences.append(current_sentence.strip())
            current_sentence = sentence.strip()
        else:
            current_sentence += sentence
        if sentence.endswith(('.', '?', '!')):
            combined_sentences.append(current_sentence.strip())
            current_sentence = ""
    if current_sentence:
        combined_sentences.append(current_sentence.strip())
    return combined_sentences
def get_youtube_script(url):
    """Fetch the title and full transcript text for a YouTube video.

    Calls the remote transcription Space via gradio_client and normalizes
    the payload with parse_api_response.

    Args:
        url: YouTube video URL.

    Returns:
        (title, transcript_text) on success, ("", "") on any failure
        (the error is logged, not raised).
    """
    logging.info(f"μ€ν¬λ¦½νΈ μΆμΆ μμ: URL = {url}")
    # Endpoint was switched to this Space (kept from the original note).
    client = Client("whispersound/YT_Ts_R")
    try:
        logging.debug("API νΈμΆ μμ")
        raw = client.predict(youtube_url=url, api_name="/predict")
        logging.debug("API νΈμΆ μλ£")
        # Normalize the response and pull the first data entry.
        entry = parse_api_response(raw)["data"][0]
        title = entry["title"]
        transcription_text = entry["transcriptionAsText"]
        logging.info("μ€ν¬λ¦½νΈ μΆμΆ μλ£")
        return title, transcription_text
    except Exception as e:
        error_msg = f"μ€ν¬λ¦½νΈ μΆμΆ μ€ μ€λ₯ λ°μ: {str(e)}"
        logging.exception(error_msg)
        return "", ""
# Configure the OpenAI API key from the environment.
openai.api_key = os.environ.get("OPENAI_API_KEY")
# LLM API call helper.
def call_api(prompt, max_tokens, temperature, top_p):
    """Send a single user message to the chat model and return its reply.

    Args:
        prompt: Full prompt text sent as one user message.
        max_tokens: Completion token limit.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.

    Returns:
        The model's reply text, or a Korean fallback message on any error
        (the error is logged, never raised).
    """
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-4o-mini",  # model switched to gpt-4o-mini
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        return completion["choices"][0]["message"]["content"]
    except Exception:
        logging.exception("LLM API νΈμΆ μ€ μ€λ₯ λ°μ")
        return "μμ½μ μμ±νλ λμ μ€λ₯κ° λ°μνμ΅λλ€. λμ€μ λ€μ μλν΄ μ£ΌμΈμ."
# Text summarization helper.
def summarize_text(text):
    """Summarize a YouTube transcript in Korean via the LLM.

    Wraps *text* in a structured Korean prompt (topic / key points /
    conclusions) and delegates to call_api with conservative sampling.

    Args:
        text: Full transcript text to summarize.

    Returns:
        The model's Korean summary, or a Korean fallback error message.
    """
    prompt = f"""λΉμ μ YouTube λΉλμ€ μ€ν¬λ¦½νΈλ₯Ό μμ½νλ AI μ΄μμ€ν΄νΈμ λλ€.
μλ μ 곡λ μ€ν¬λ¦½νΈλ₯Ό κ°κ²°νλ©΄μλ ν¬κ΄μ μΌλ‘ μμ½ν΄μ£ΌμΈμ.
λΉλμ€μ μ£Όμ μ£Όμ , ν΅μ¬ ν¬μΈνΈ, μ λ°μ μΈ λ©μμ§μ μ΄μ μ λ§μΆμΈμ.
μμ½μ μ ꡬ쑰νλκ³ μ΄ν΄νκΈ° μ¬μμΌ νλ©°, λ΄μ©μ λ³Έμ§μ ν¬μ°©ν΄μΌ ν©λλ€.
λ°λμ νκ΅μ΄λ‘ μμ½μ μ 곡νμΈμ.
λ€μ κ΅¬μ‘°λ‘ μμ½μ μμ±ν΄μ£ΌμΈμ:
1. λΉλμ€μ μ£Όμ μ£Όμ λλ ν λ§
2. μ μλ μ£Όμ ν¬μΈνΈ λλ λ Όμ
3. μ€μν κ²°λ‘ λλ μμ¬μ
μμ½ν μ€ν¬λ¦½νΈ:
{text}
μ μ€ν¬λ¦½νΈμ λν μμ½μ μ 곡ν΄μ£ΌμΈμ. μμ½μ κ°κ²°νλ©΄μλ μ λ³΄κ° νλΆν΄μΌ νλ©°, λΉλμ€ λ΄μ©μ ν΅μ¬μ ν¬μ°©ν΄μΌ ν©λλ€."""
    try:
        return call_api(prompt, max_tokens=2000, temperature=0.3, top_p=0.9)
    except Exception as e:
        # NOTE(review): call_api already catches Exception internally and
        # returns a fallback string, so this handler appears unreachable —
        # kept as defense in depth.
        logging.exception("μμ½ μμ± μ€ μ€λ₯ λ°μ")
        return "μμ½μ μμ±νλ λμ μ€λ₯κ° λ°μνμ΅λλ€. λμ€μ λ€μ μλν΄ μ£ΌμΈμ."
# Gradio interface: URL in -> cached transcript extraction -> LLM summary.
with gr.Blocks() as demo:
    gr.Markdown("## YouTube Script Extractor with Summary")
    youtube_url_input = gr.Textbox(label="YouTube URL μ λ ₯")
    analyze_button = gr.Button("λΆμνκΈ°")
    script_output = gr.HTML(label="μ€ν¬λ¦½νΈ")
    summary_output = gr.HTML(label="μμ½")

    # Per-session state so re-analyzing the same URL skips the remote call.
    cached_data = gr.State({"url": "", "title": "", "script": ""})

    def extract_and_cache(url, cache):
        # Cache hit: same URL as last run — reuse title/script unchanged.
        if url == cache["url"]:
            return cache["title"], cache["script"], cache
        title, script = get_youtube_script(url)
        new_cache = {"url": url, "title": title, "script": script}
        return title, script, new_cache

    def display_script(title, script):
        # Re-flow the transcript into sentence chunks, one per line.
        formatted_script = "\n".join(split_sentences(script))
        # Fix: escape untrusted title/transcript so '<', '>' and '&' from
        # video metadata cannot break the markup or inject HTML.
        script_html = f"""<h2 style='font-size:24px;'>{html.escape(title)}</h2>
<details>
<summary><h3>μλ¬Έ μ€ν¬λ¦½νΈ (ν΄λ¦νμ¬ νΌμΉκΈ°)</h3></summary>
<pre>{html.escape(formatted_script)}</pre>
</details>"""
        return script_html

    def generate_summary(script):
        summary = summarize_text(script)
        # Escape the model output as well — it is untrusted text.
        summary_html = f"<h3>μμ½:</h3>\n<pre>{html.escape(summary)}</pre>"
        return summary_html

    def analyze(url, cache):
        # Extract (or reuse) the script and render it; summary comes later.
        title, script, new_cache = extract_and_cache(url, cache)
        script_html = display_script(title, script)
        return script_html, new_cache

    def update_summary(cache):
        # Guard: no script yet — prompt the user instead of calling the LLM.
        if not cache["script"]:
            return "μ€ν¬λ¦½νΈκ° μμ΅λλ€. λ¨Όμ YouTube URLμ μ λ ₯νκ³ λΆμμ μ€νν΄μ£ΌμΈμ."
        return generate_summary(cache["script"])

    # Button click: render the script first, then fill in the summary.
    analyze_button.click(
        analyze,
        inputs=[youtube_url_input, cached_data],
        outputs=[script_output, cached_data]
    ).then(
        update_summary,
        inputs=[cached_data],
        outputs=summary_output
    )

# Run the interface (public share link enabled).
demo.launch(share=True)