import gradio as gr
from gradio_client import Client
import json
import logging
import ast
import openai
import os
import random
import re
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import urllib.parse

# Download nltk sentence-tokenizer data (only fetched on the first run)
nltk.download('punkt')

# Logging setup
logging.basicConfig(
    filename='youtube_script_extractor.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def parse_api_response(response):
    """Coerce the API response into a dict, accepting a repr-style string."""
    try:
        if isinstance(response, str):
            response = ast.literal_eval(response)
        if not isinstance(response, dict):
            raise ValueError(f"Unexpected response format. Received type: {type(response)}")
        return response
    except Exception as e:
        raise ValueError(f"Failed to parse API response: {str(e)}")

def get_youtube_script(url):
    logging.info(f"Starting script extraction: URL = {url}")
    client = Client("whispersound/YT_Ts_R")
    try:
        logging.debug("API call started")
        result = client.predict(youtube_url=url, api_name="/predict")
        logging.debug("API call finished")
        parsed_result = parse_api_response(result)

        # The response nests its payload under "data"
        data_list = parsed_result.get("data", [])
        if not data_list:
            raise ValueError("No data could be retrieved.")

        # Use the first entry
        data = data_list[0]
        title = data.get("title", "")
        transcription = data.get("transcription", [])
        transcription_as_text = data.get("transcriptionAsText", "")

        logging.info("Script extraction finished")
        script_json = json.dumps({
            "title": title,
            "transcription": transcription,
            "transcriptionAsText": transcription_as_text
        })
        return title, script_json
    except Exception as e:
        logging.exception(f"Error during script extraction: {str(e)}")
        return "", ""

# Set the OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

def call_api(prompt, max_tokens, temperature, top_p):
    """Send a single-turn chat request (legacy pre-1.0 openai SDK interface)."""
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        logging.exception("Error during LLM API call")
        return "An error occurred while generating the summary. Please try again later."

def extract_video_id(url):
    """Return the video id from a youtube.com or youtu.be URL, else None."""
    parsed_url = urllib.parse.urlparse(url)
    if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
        query_params = urllib.parse.parse_qs(parsed_url.query)
        return query_params.get('v', [None])[0]
    elif parsed_url.hostname == 'youtu.be':
        return parsed_url.path[1:]
    else:
        return None

def summarize_section(section_text):
    prompt = f"""Please summarize the key points of the following content:
{section_text}
Write the summary concisely, in Korean.
"""
    return call_api(prompt, max_tokens=500, temperature=0.3, top_p=0.9)

def segment_transcript(transcript):
    """Split subtitle entries into topical sections using TF-IDF similarity."""
    sentences = []
    start_times = []
    for entry in transcript:
        subtitle = entry.get('subtitle', '')
        start_time = entry.get('start', 0)
        if not subtitle:
            continue
        split_sentences = nltk.tokenize.sent_tokenize(subtitle)
        sentences.extend(split_sentences)
        start_times.extend([start_time] * len(split_sentences))

    if not sentences:
        return []

    # Vectorize every sentence; a new section begins wherever two adjacent
    # sentences' TF-IDF vectors fall below the cosine-similarity threshold.
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    vectors = tfidf_matrix.toarray()

    boundaries = [0]
    threshold = 0.3
    for i in range(1, len(sentences)):
        similarity = cosine_similarity([vectors[i - 1]], [vectors[i]])[0][0]
        if similarity < threshold:
            boundaries.append(i)
    boundaries.append(len(sentences))

    # Rebuild each section's text, carrying the start time of its first sentence
    sections = []
    for i in range(len(boundaries) - 1):
        start_idx = boundaries[i]
        end_idx = boundaries[i + 1]
        section_text = ' '.join(sentences[start_idx:end_idx])
        sections.append({
            'text': section_text,
            'start_time': start_times[start_idx]
        })
    return sections
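
# A minimal sketch of the segmentation step, assuming made-up subtitle entries
# (sample_transcript below is hypothetical): when run as a script, it prints
# one line per detected section with the start timestamp of its first sentence.
if __name__ == "__main__":
    sample_transcript = [
        {'subtitle': 'Welcome back to the channel. Today we cover giant pandas.', 'start': 0.0},
        {'subtitle': 'Pandas mostly eat bamboo. Bamboo is a fast-growing grass.', 'start': 4.8},
        {'subtitle': 'On a completely different note, rocket engines burn liquid fuel.', 'start': 11.5},
    ]
    for section in segment_transcript(sample_transcript):
        print(f"[{section['start_time']:5.1f}s] {section['text']}")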
""" return call_api(prompt, max_tokens=500, temperature=0.3, top_p=0.9) def segment_transcript(transcript): sentences = [] start_times = [] for entry in transcript: subtitle = entry.get('subtitle', '') start_time = entry.get('start', 0) if not subtitle: continue split_sentences = nltk.tokenize.sent_tokenize(subtitle) sentences.extend(split_sentences) start_times.extend([start_time] * len(split_sentences)) if not sentences: return [] vectorizer = TfidfVectorizer().fit_transform(sentences) vectors = vectorizer.toarray() boundaries = [0] threshold = 0.3 for i in range(1, len(sentences)): similarity = cosine_similarity([vectors[i - 1]], [vectors[i]])[0][0] if similarity < threshold: boundaries.append(i) boundaries.append(len(sentences)) sections = [] for i in range(len(boundaries) - 1): start_idx = boundaries[i] end_idx = boundaries[i + 1] section_sentences = sentences[start_idx:end_idx] section_text = ' '.join(section_sentences) section_start_time = start_times[start_idx] sections.append({ 'text': section_text, 'start_time': section_start_time }) return sections def generate_summary(sections, url): video_id = extract_video_id(url) summary_html = "