import gradio as gr from gradio_client import Client import json import logging import ast import openai import os import random import re import nltk import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import urllib.parse # nltk 데이터 다운로드 (최초 한 번 실행) nltk.download('punkt') # 로깅 설정 logging.basicConfig( filename='youtube_script_extractor.log', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s' ) def parse_api_response(response): try: if isinstance(response, str): response = ast.literal_eval(response) if not isinstance(response, dict): raise ValueError(f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}") return response except Exception as e: raise ValueError(f"API 응답 파싱 실패: {str(e)}") def get_youtube_script(url): logging.info(f"스크립트 추출 시작: URL = {url}") client = Client("whispersound/YT_Ts_R") try: logging.debug("API 호출 시작") result = client.predict(youtube_url=url, api_name="/predict") logging.debug("API 호출 완료") parsed_result = parse_api_response(result) # 데이터 구조에 맞게 수정 data_list = parsed_result.get("data", []) if not data_list: raise ValueError("데이터를 가져올 수 없습니다.") # 첫 번째 데이터 사용 data = data_list[0] title = data.get("title", "") transcription = data.get("transcription", []) transcription_as_text = data.get("transcriptionAsText", "") logging.info("스크립트 추출 완료") script_json = json.dumps({ "title": title, "transcription": transcription, "transcriptionAsText": transcription_as_text }) return title, script_json except Exception as e: error_msg = f"스크립트 추출 중 오류 발생: {str(e)}" logging.exception(error_msg) return "", "" # OpenAI API 키 설정 openai.api_key = os.getenv("OPENAI_API_KEY") def call_api(prompt, max_tokens, temperature, top_p): try: response = openai.ChatCompletion.create( model="gpt-4o-mini", messages=[{"role": "user", "content": prompt}], max_tokens=max_tokens, temperature=temperature, top_p=top_p ) return response['choices'][0]['message']['content'] except Exception as e: logging.exception("LLM API 호출 중 오류 발생") return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요." def extract_video_id(url): parsed_url = urllib.parse.urlparse(url) if parsed_url.hostname in ('www.youtube.com', 'youtube.com'): query_params = urllib.parse.parse_qs(parsed_url.query) return query_params.get('v', [None])[0] elif parsed_url.hostname == 'youtu.be': return parsed_url.path[1:] else: return None def summarize_section(section_text): prompt = f"""다음 내용의 핵심을 요약해 주세요: {section_text} 요약은 한국어로 간결하게 작성해 주세요. """ return call_api(prompt, max_tokens=500, temperature=0.3, top_p=0.9) def segment_transcript(transcript): sentences = [] start_times = [] for entry in transcript: subtitle = entry.get('subtitle', '') start_time = entry.get('start', 0) if not subtitle: continue split_sentences = nltk.tokenize.sent_tokenize(subtitle) sentences.extend(split_sentences) start_times.extend([start_time] * len(split_sentences)) if not sentences: return [] vectorizer = TfidfVectorizer().fit_transform(sentences) vectors = vectorizer.toarray() boundaries = [0] threshold = 0.3 for i in range(1, len(sentences)): similarity = cosine_similarity([vectors[i - 1]], [vectors[i]])[0][0] if similarity < threshold: boundaries.append(i) boundaries.append(len(sentences)) sections = [] for i in range(len(boundaries) - 1): start_idx = boundaries[i] end_idx = boundaries[i + 1] section_sentences = sentences[start_idx:end_idx] section_text = ' '.join(section_sentences) section_start_time = start_times[start_idx] sections.append({ 'text': section_text, 'start_time': section_start_time }) return sections def generate_summary(sections, url): video_id = extract_video_id(url) summary_html = "

요약:

" for idx, section in enumerate(sections): start_time = section['start_time'] hours = int(start_time // 3600) minutes = int((start_time % 3600) // 60) seconds = int(start_time % 60) timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}" timestamp_link = f"https://www.youtube.com/watch?v={video_id}&t={int(start_time)}s" summary = summarize_section(section['text']) summary_html += f"""

{timestamp_str}

{summary}
""" return summary_html with gr.Blocks() as demo: gr.Markdown("## YouTube 스크립트 추출 및 요약 도구") youtube_url_input = gr.Textbox(label="YouTube URL 입력") analyze_button = gr.Button("분석하기") script_output = gr.HTML(label="스크립트") summary_output = gr.HTML(label="요약") cached_data = gr.State({"url": "", "title": "", "script": ""}) def extract_and_cache(url, cache): if url == cache.get("url"): return cache["title"], cache title, script = get_youtube_script(url) new_cache = {"url": url, "title": title, "script": script} return title, new_cache def display_script(title): script_html = f"""

{title}

""" return script_html def update_summary(cache): if not cache.get("script"): return "스크립트가 없습니다. 먼저 YouTube URL을 입력하고 분석을 실행해주세요." try: parsed_result = json.loads(cache["script"]) transcript = parsed_result.get("transcription", []) if not transcript: return "트랜스크립트를 가져올 수 없습니다." sections = segment_transcript(transcript) if not sections: return "섹션을 생성할 수 없습니다." return generate_summary(sections, cache["url"]) except Exception as e: logging.exception("요약 생성 중 오류 발생") return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요." analyze_button.click( extract_and_cache, inputs=[youtube_url_input, cached_data], outputs=[script_output, cached_data] ).then( update_summary, inputs=cached_data, outputs=summary_output ) demo.launch(share=True)