|
import ast
import html
import json
import logging
import os
import random
import re

import gradio as gr
import openai
from gradio_client import Client
|
|
|
|
|
# Route every record (DEBUG and above) to a log file, each line prefixed
# with its timestamp and severity.
logging.basicConfig(
    filename='youtube_script_extractor.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
|
|
|
def parse_api_response(response):
    """Normalize a raw API response into a single payload dict.

    Args:
        response: A dict, a non-empty list whose first element is the
            payload, or a string containing JSON or a Python literal of
            either of those.

    Returns:
        The payload dict.

    Raises:
        ValueError: If a string response cannot be parsed, or the final
            value is not a dict. The message wraps the underlying error.
    """
    try:
        if isinstance(response, str):
            # Try JSON first (it understands true/false/null), then fall
            # back to Python literal syntax (single quotes, True/None).
            try:
                response = json.loads(response)
            except ValueError:
                response = ast.literal_eval(response)
        if isinstance(response, list) and response:
            response = response[0]
        if not isinstance(response, dict):
            raise ValueError(
                f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}"
            )
        return response
    except Exception as e:
        # Every failure is reported as ValueError; keep the cause chained
        # so the original traceback is not lost.
        raise ValueError(f"API 응답 파싱 실패: {str(e)}") from e
|
|
|
|
|
def split_sentences(text):
    """Split Korean text into display lines of at most about 100 characters.

    The text is cut after common sentence endings (when the ending is not
    directly followed by a word character). The resulting sentences are
    packed greedily: a sentence that would push the current line past 100
    characters starts a new line, and a sentence ending in '.', '?' or '!'
    closes the current line.

    Args:
        text: The text to split. Empty or None yields an empty list.

    Returns:
        A list of stripped, non-empty lines.
    """
    if not text:
        return []

    # NOTE(review): this ending list was restored from mis-encoded source
    # text. A few entries (에요/예요/아요 and 대요/나요) could not be decoded
    # unambiguously - confirm against the original file.
    pieces = re.split(
        r"(니다|에요|구나|해요|군요|겠어요|시오|해라|예요|아요|데요|대요|세요|어요|게요|구요|고요|나요|하죠)(?![\w])",
        text,
    )

    # With one capture group re.split returns
    # [body, ending, body, ending, ..., tail]; pair each body with the
    # ending that followed it (the tail has none).
    bodies = pieces[::2]
    endings = pieces[1::2] + [""]

    lines = []
    current = ""
    for body, ending in zip(bodies, endings):
        sentence = body + ending
        if len(current) + len(sentence) > 100:
            # Skip the flush when nothing is buffered yet, otherwise an
            # empty line would be emitted.
            if current.strip():
                lines.append(current.strip())
            current = sentence.strip()
        else:
            current += sentence
            if sentence.endswith(('.', '?', '!')):
                lines.append(current.strip())
                current = ""

    if current.strip():
        lines.append(current.strip())
    return lines
|
|
|
def get_youtube_script(url): |
|
logging.info(f"μ€ν¬λ¦½νΈ μΆμΆ μμ: URL = {url}") |
|
|
|
|
|
client = Client("whispersound/YT_Ts_R") |
|
|
|
try: |
|
logging.debug("API νΈμΆ μμ") |
|
result = client.predict(youtube_url=url, api_name="/predict") |
|
logging.debug("API νΈμΆ μλ£") |
|
|
|
|
|
parsed_result = parse_api_response(result) |
|
|
|
title = parsed_result["data"][0]["title"] |
|
transcription_text = parsed_result["data"][0]["transcriptionAsText"] |
|
|
|
logging.info("μ€ν¬λ¦½νΈ μΆμΆ μλ£") |
|
return title, transcription_text |
|
|
|
except Exception as e: |
|
error_msg = f"μ€ν¬λ¦½νΈ μΆμΆ μ€ μ€λ₯ λ°μ: {str(e)}" |
|
logging.exception(error_msg) |
|
return "", "" |
|
|
|
|
|
# Take the OpenAI credential from the environment; the value is None when
# OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
|
|
|
|
def call_api(prompt, max_tokens, temperature, top_p):
    """Send a single-turn chat request to the "gpt-4o-mini" model.

    Args:
        prompt: Text sent as the only user message.
        max_tokens: Passed through as the completion's max_tokens.
        temperature: Passed through as the sampling temperature.
        top_p: Passed through as the nucleus-sampling top_p.

    Returns:
        The content of the first choice's message. If the request or the
        response lookup fails, the error is logged with its traceback and a
        fixed Korean apology message is returned instead; this function
        does not raise.
    """
    try:
        # NOTE(review): openai.ChatCompletion looks like the pre-1.0
        # interface of the openai package - confirm the pinned version.
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        return response['choices'][0]['message']['content']
    except Exception:
        logging.exception("LLM API 호출 중 오류 발생")
        return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
|
|
|
|
|
def summarize_text(text):
    """Request a summary of *text* from the LLM.

    Args:
        text: The text to send; it is used verbatim as the prompt.

    Returns:
        Whatever call_api returns for the prompt (max_tokens=2000,
        temperature=0.3, top_p=0.9). If call_api raises, the error is
        logged and a fixed Korean apology message is returned.
    """
    # NOTE(review): the text is sent as the entire prompt with no explicit
    # summarization instruction - confirm this is intended.
    try:
        return call_api(text, max_tokens=2000, temperature=0.3, top_p=0.9)
    except Exception:
        logging.exception("요약 생성 중 오류 발생")
        return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
|
|
|
|
|
with gr.Blocks() as demo: |
|
gr.Markdown("## YouTube μ€ν¬λ¦½νΈ μΆμΆ λ° μμ½ λꡬ") |
|
|
|
youtube_url_input = gr.Textbox(label="YouTube URL μ
λ ₯") |
|
analyze_button = gr.Button("λΆμνκΈ°") |
|
script_output = gr.HTML(label="μ€ν¬λ¦½νΈ") |
|
summary_output = gr.HTML(label="μμ½") |
|
|
|
|
|
cached_data = gr.State({"url": "", "title": "", "script": ""}) |
|
|
|
def extract_and_cache(url, cache): |
|
if url == cache["url"]: |
|
return cache["title"], cache["script"], cache |
|
|
|
title, script = get_youtube_script(url) |
|
new_cache = {"url": url, "title": title, "script": script} |
|
return title, script, new_cache |
|
|
|
def display_script(title, script): |
|
formatted_script = "\n".join(split_sentences(script)) |
|
script_html = f"""<h2 style='font-size:24px;'>{title}</h2> |
|
<details> |
|
<summary><h3>μλ¬Έ μ€ν¬λ¦½νΈ (ν΄λ¦νμ¬ νΌμΉκΈ°)</h3></summary> |
|
<div style="white-space: pre-wrap;">{formatted_script}</div> |
|
</details>""" |
|
return script_html |
|
|
|
def generate_summary(script): |
|
summary = summarize_text(script) |
|
|
|
summary_html = f""" |
|
<h3>μμ½:</h3> |
|
<div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;"> |
|
{summary} |
|
</div> |
|
""" |
|
return summary_html |
|
|
|
def analyze(url, cache): |
|
title, script, new_cache = extract_and_cache(url, cache) |
|
script_html = display_script(title, script) |
|
return script_html, new_cache |
|
|
|
def update_summary(cache): |
|
if not cache["script"]: |
|
return "μ€ν¬λ¦½νΈκ° μμ΅λλ€. λ¨Όμ YouTube URLμ μ
λ ₯νκ³ λΆμμ μ€νν΄μ£ΌμΈμ." |
|
return generate_summary(cache["script"]) |
|
|
|
|
|
analyze_button.click( |
|
analyze, |
|
inputs=[youtube_url_input, cached_data], |
|
outputs=[script_output, cached_data] |
|
).then( |
|
update_summary, |
|
inputs=[cached_data], |
|
outputs=summary_output |
|
) |
|
|
|
|
|
# Start serving the interface, requesting a shareable link as well.
demo.launch(
    share=True,
)