import gradio as gr
from gradio_client import Client
import json
import logging
import ast
import openai
import os
import nltk
import urllib.parse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK tokenizer data (only needs to run once).
# Newer NLTK releases may additionally require the 'punkt_tab' resource.
nltk.download('punkt')

# Logging configuration
logging.basicConfig(
    filename='youtube_script_extractor.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def parse_api_response(response):
    try:
        if isinstance(response, str):
            response = ast.literal_eval(response)
        if isinstance(response, list) and len(response) > 0:
            response = response[0]
        if not isinstance(response, dict):
            raise ValueError(f"Unexpected response format. Received data type: {type(response)}")
        return response
    except Exception as e:
        raise ValueError(f"Failed to parse API response: {str(e)}")
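
# Assumed shape of the whispersound/YT_Ts_R response after parsing, inferred
# from how the fields are read below (the upstream schema is not documented here):
# {
#     "data": [
#         {
#             "title": "<video title>",
#             "transcription": [
#                 {"subtitle": "<caption text>", "start": <seconds>},
#                 ...
#             ]
#         }
#     ]
# }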

def get_youtube_script(url):
    logging.info(f"Starting script extraction: URL = {url}")
    client = Client("whispersound/YT_Ts_R")
    try:
        logging.debug("Starting API call")
        result = client.predict(youtube_url=url, api_name="/predict")
        logging.debug("API call finished")
        parsed_result = parse_api_response(result)
        title = parsed_result["data"][0]["title"]
        transcription = parsed_result["data"][0]["transcription"]
        logging.info("Script extraction finished")
        script_json = json.dumps({
            "title": title,
            "transcription": transcription
        })
        return title, script_json
    except Exception as e:
        error_msg = f"Error during script extraction: {str(e)}"
        logging.exception(error_msg)
        return "", ""

# Set the OpenAI API key from the environment
openai.api_key = os.getenv("OPENAI_API_KEY")

def call_api(prompt, max_tokens, temperature, top_p):
    # Uses the legacy openai<1.0 ChatCompletion interface, which matches the
    # dict-style response indexing below.
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p
        )
        return response['choices'][0]['message']['content']
    except Exception:
        logging.exception("Error during LLM API call")
        return "An error occurred while generating the summary. Please try again later."

def extract_video_id(url):
    parsed_url = urllib.parse.urlparse(url)
    if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
        query_params = urllib.parse.parse_qs(parsed_url.query)
        return query_params.get('v', [None])[0]
    elif parsed_url.hostname == 'youtu.be':
        return parsed_url.path[1:]
    else:
        return None
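
# For example, extract_video_id("https://www.youtube.com/watch?v=abc123") and
# extract_video_id("https://youtu.be/abc123") would both return "abc123"
# (a hypothetical ID); URLs on any other host return None.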

def summarize_section(section_text):
    prompt = f"""Summarize the key points of the following content:

{section_text}

Write the summary concisely in Korean.
"""
    return call_api(prompt, max_tokens=500, temperature=0.3, top_p=0.9)

def segment_transcript(transcript):
    sentences = []
    start_times = []
    for entry in transcript:
        subtitle = entry['subtitle']
        start_time = entry['start']
        split_sentences = nltk.tokenize.sent_tokenize(subtitle)
        sentences.extend(split_sentences)
        start_times.extend([start_time] * len(split_sentences))

    # Vectorize sentences with TF-IDF; a section boundary is placed wherever
    # the cosine similarity between consecutive sentences drops below the
    # threshold.
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    vectors = tfidf_matrix.toarray()

    boundaries = [0]
    threshold = 0.3
    for i in range(1, len(sentences)):
        similarity = cosine_similarity([vectors[i - 1]], [vectors[i]])[0][0]
        if similarity < threshold:
            boundaries.append(i)
    boundaries.append(len(sentences))

    sections = []
    for i in range(len(boundaries) - 1):
        start_idx = boundaries[i]
        end_idx = boundaries[i + 1]
        section_sentences = sentences[start_idx:end_idx]
        section_text = ' '.join(section_sentences)
        section_start_time = start_times[start_idx]
        sections.append({
            'text': section_text,
            'start_time': section_start_time
        })
    return sections
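
# Illustrative walkthrough (assumed numbers): for sentences A, B, C with
# sim(A, B) = 0.6 and sim(B, C) = 0.1, the 0.3 threshold gives boundaries
# [0, 2, 3], i.e. sections [A B] and [C], each stamped with the start time
# of its first sentence.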

def generate_summary(sections, url):
    video_id = extract_video_id(url)
    summary_html = "<h3>Summary:</h3>"
    for idx, section in enumerate(sections):
        start_time = section['start_time']
        hours = int(start_time // 3600)
        minutes = int((start_time % 3600) // 60)
        seconds = int(start_time % 60)
        timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        timestamp_link = f"https://www.youtube.com/watch?v={video_id}&t={int(start_time)}s"
        summary = summarize_section(section['text'])
        summary_html += f"""
        <h4><a href="{timestamp_link}" target="_blank">{timestamp_str}</a></h4>
        <div style="white-space: pre-wrap; margin-bottom: 20px;">{summary}</div>
        """
    return summary_html
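
# Note: each section triggers one summarize_section call, so a long video with
# many detected boundaries issues many sequential LLM requests; the &t=<seconds>s
# parameter in each link opens the video at that section's start.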

with gr.Blocks() as demo:
    gr.Markdown("## YouTube Script Extraction and Summarization Tool")
    youtube_url_input = gr.Textbox(label="Enter YouTube URL")
    analyze_button = gr.Button("Analyze")
    script_output = gr.HTML(label="Script")
    summary_output = gr.HTML(label="Summary")
    cached_data = gr.State({"url": "", "title": "", "script": ""})

    def display_script(title, script):
        return f"""<h2 style='font-size:24px;'>{title}</h2>"""

    def extract_and_cache(url, cache):
        # Return the rendered script HTML and the updated cache, matching the
        # two outputs wired to analyze_button.click below.
        if url == cache["url"]:
            return display_script(cache["title"], cache["script"]), cache
        title, script = get_youtube_script(url)
        new_cache = {"url": url, "title": title, "script": script}
        return display_script(title, script), new_cache

    def update_summary(cache):
        if not cache["script"]:
            return "No script available. Enter a YouTube URL and run the analysis first."
        try:
            parsed_result = json.loads(cache["script"])
            transcript = parsed_result.get("transcription", [])
            if not transcript:
                return "Could not retrieve the transcript."
            sections = segment_transcript(transcript)
            return generate_summary(sections, cache["url"])
        except Exception:
            logging.exception("Error while generating summary")
            return "An error occurred while generating the summary. Please try again later."

    analyze_button.click(
        extract_and_cache,
        inputs=[youtube_url_input, cached_data],
        outputs=[script_output, cached_data]
    ).then(
        update_summary,
        inputs=[cached_data],
        outputs=summary_output
    )

demo.launch(share=True)
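
# share=True asks Gradio to create a temporary public link in addition to the
# local server; set it to False (the default) to serve on localhost only.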