YT_Script_Shorts

Running

App Files Community

AIRider committed on Sep 29, 2024

Commit

d3f7e6f

verified ·

1 Parent(s): d3555b8

Update app.py

Browse files

Files changed (1) hide show

app.py +165 -137

app.py CHANGED Viewed

@@ -7,66 +7,74 @@ import openai
 import os
 import random
 import re
-import nltk
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
-import urllib.parse
-# nltk 데이터 다운로드 (최초 한 번 실행)
-nltk.download('punkt')
-# 로깅 설정
-logging.basicConfig(
-    filename='youtube_script_extractor.log',
-    level=logging.DEBUG,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
 def parse_api_response(response):
     try:
         if isinstance(response, str):
-            response = ast.literal_eval(response)
         if not isinstance(response, dict):
             raise ValueError(f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}")
         return response
     except Exception as e:
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
     client = Client("whispersound/YT_Ts_R")
     try:
         logging.debug("API 호출 시작")
         result = client.predict(youtube_url=url, api_name="/predict")
         logging.debug("API 호출 완료")
         parsed_result = parse_api_response(result)
-        # 데이터 구조에 맞게 수정
-        data_list = parsed_result.get("data", [])
-        if not data_list:
-            raise ValueError("데이터를 가져올 수 없습니다.")
-        # 첫 번째 데이터 사용
-        data = data_list[0]
-        title = data.get("title", "")
-        transcription = data.get("transcription", [])
-        transcription_as_text = data.get("transcriptionAsText", "")
         logging.info("스크립트 추출 완료")
-        script_json = json.dumps({
-            "title": title,
-            "transcription": transcription,
-            "transcriptionAsText": transcription_as_text
-        })
-        return title, script_json
     except Exception as e:
         error_msg = f"스크립트 추출 중 오류 발생: {str(e)}"
         logging.exception(error_msg)
-        return "", ""
-# OpenAI API 키 설정
-openai.api_key = os.getenv("OPENAI_API_KEY")
 def call_api(prompt, max_tokens, temperature, top_p):
     try:
@@ -80,126 +88,146 @@ def call_api(prompt, max_tokens, temperature, top_p):
         return response['choices'][0]['message']['content']
     except Exception as e:
         logging.exception("LLM API 호출 중 오류 발생")
-        return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
-def extract_video_id(url):
-    parsed_url = urllib.parse.urlparse(url)
-    if parsed_url.hostname in ('www.youtube.com', 'youtube.com'):
-        query_params = urllib.parse.parse_qs(parsed_url.query)
-        return query_params.get('v', [None])[0]
-    elif parsed_url.hostname == 'youtu.be':
-        return parsed_url.path[1:]
-    else:
-        return None
 def summarize_section(section_text):
-    prompt = f"""다음 내용의 핵심을 요약해 주세요:
 {section_text}
-요약은 한국어로 간결하게 작성해 주세요.
 """
-    return call_api(prompt, max_tokens=500, temperature=0.3, top_p=0.9)
-def segment_transcript(transcript):
-    sentences = []
-    start_times = []
-    for entry in transcript:
-        subtitle = entry.get('subtitle', '')
-        start_time = entry.get('start', 0)
-        if not subtitle:
-            continue
-        split_sentences = nltk.tokenize.sent_tokenize(subtitle)
-        sentences.extend(split_sentences)
-        start_times.extend([start_time] * len(split_sentences))
-    if not sentences:
-        return []
-    vectorizer = TfidfVectorizer().fit_transform(sentences)
-    vectors = vectorizer.toarray()
-    boundaries = [0]
-    threshold = 0.3
-    for i in range(1, len(sentences)):
-        similarity = cosine_similarity([vectors[i - 1]], [vectors[i]])[0][0]
-        if similarity < threshold:
-            boundaries.append(i)
-    boundaries.append(len(sentences))
-    sections = []
-    for i in range(len(boundaries) - 1):
-        start_idx = boundaries[i]
-        end_idx = boundaries[i + 1]
-        section_sentences = sentences[start_idx:end_idx]
-        section_text = ' '.join(section_sentences)
-        section_start_time = start_times[start_idx]
-        sections.append({
-            'text': section_text,
-            'start_time': section_start_time
-        })
-    return sections
-def generate_summary(sections, url):
-    video_id = extract_video_id(url)
-    summary_html = "<h3>요약:</h3>"
-    for idx, section in enumerate(sections):
-        start_time = section['start_time']
-        hours = int(start_time // 3600)
-        minutes = int((start_time % 3600) // 60)
-        seconds = int(start_time % 60)
-        timestamp_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
-        timestamp_link = f"https://www.youtube.com/watch?v={video_id}&t={int(start_time)}s"
-        summary = summarize_section(section['text'])
-        summary_html += f"""
-        <h4><a href="{timestamp_link}" target="_blank">{timestamp_str}</a></h4>
-        <div style="white-space: pre-wrap; margin-bottom: 20px;">{summary}</div>
-        """
-    return summary_html
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
     script_output = gr.HTML(label="스크립트")
-    summary_output = gr.HTML(label="요약")
-    cached_data = gr.State({"url": "", "title": "", "script": ""})
     def extract_and_cache(url, cache):
-        if url == cache.get("url"):
-            return cache["title"], cache
-        title, script = get_youtube_script(url)
-        new_cache = {"url": url, "title": title, "script": script}
-        return title, new_cache
-    def display_script(title):
-        script_html = f"""<h2 style='font-size:24px;'>{title}</h2>"""
         return script_html
-    def update_summary(cache):
-        if not cache.get("script"):
-            return "스크립트가 없습니다. 먼저 YouTube URL을 입력하고 분석을 실행해주세요."
         try:
-            parsed_result = json.loads(cache["script"])
-            transcript = parsed_result.get("transcription", [])
-            if not transcript:
-                return "트랜스크립트를 가져올 수 없습니다."
-            sections = segment_transcript(transcript)
-            if not sections:
-                return "섹션을 생성할 수 없습니다."
-            return generate_summary(sections, cache["url"])
         except Exception as e:
-            logging.exception("요약 생성 중 오류 발생")
-            return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
     analyze_button.click(
-        extract_and_cache,
-        inputs=[youtube_url_input, cached_data],
-        outputs=[script_output, cached_data]
-    ).then(
-        update_summary,
-        inputs=cached_data,
-        outputs=summary_output
     )
-demo.launch(share=True)

 import os
 import random
 import re
+logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
+                    format='%(asctime)s - %(levelname)s - %(message)s')
+openai.api_key = os.getenv("OPENAI_API_KEY")
 def parse_api_response(response):
     try:
         if isinstance(response, str):
+            response = json.loads(response)
+        if isinstance(response, list) and len(response) > 0:
+            response = response[0]
         if not isinstance(response, dict):
             raise ValueError(f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}")
         return response
     except Exception as e:
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
+def split_sentences(text):
+    sentences = re.split(r"(니다|에요|구나|해요|군요|겠어요|시오|해라|예요|아요|데요|대요|세요|어요|게요|구요|고요|나요|하죠)(?![\w])", text)
+    combined_sentences = []
+    current_sentence = ""
+    for i in range(0, len(sentences), 2):
+        if i + 1 < len(sentences):
+            sentence = sentences[i] + sentences[i + 1]
+        else:
+            sentence = sentences[i]
+        if len(current_sentence) + len(sentence) > 100:
+            combined_sentences.append(current_sentence.strip())
+            current_sentence = sentence.strip()
+        else:
+            current_sentence += sentence
+        if sentence.endswith(('.', '?', '!')):
+            combined_sentences.append(current_sentence.strip())
+            current_sentence = ""
+    if current_sentence:
+        combined_sentences.append(current_sentence.strip())
+    return combined_sentences
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
     client = Client("whispersound/YT_Ts_R")
     try:
         logging.debug("API 호출 시작")
         result = client.predict(youtube_url=url, api_name="/predict")
         logging.debug("API 호출 완료")
         parsed_result = parse_api_response(result)
+        if 'data' not in parsed_result or not parsed_result['data']:
+            raise ValueError("API 응답에 유효한 데이터가 없습니다.")
+        title = parsed_result["data"][0].get("title", "제목 없음")
+        transcription_text = parsed_result["data"][0].get("transcriptionAsText", "")
+        sections = parsed_result["data"][0].get("sections", [])
+        if not transcription_text:
+            raise ValueError("추출된 스크립트가 없습니다.")
         logging.info("스크립트 추출 완료")
+        return title, transcription_text, sections
     except Exception as e:
         error_msg = f"스크립트 추출 중 오류 발생: {str(e)}"
         logging.exception(error_msg)
+        raise
 def call_api(prompt, max_tokens, temperature, top_p):
     try:
         return response['choices'][0]['message']['content']
     except Exception as e:
         logging.exception("LLM API 호출 중 오류 발생")
+        raise
 def summarize_section(section_text):
+    prompt = f"""
+다음 유튜브 대본 섹션의 핵심 내용을 간결하게 요약하세요:
+1. 한글로 작성하세요.
+2. 주요 논점과 중요한 세부사항을 포함하세요.
+3. 요약은 2-3문장으로 제한하세요.
+섹션 내용:
 {section_text}
 """
+    return call_api(prompt, max_tokens=150, temperature=0.3, top_p=0.9)
+def format_time(seconds):
+    minutes, seconds = divmod(seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
+def generate_timeline_summary(sections):
+    combined_sections = "\n\n".join([f"{format_time(section['start_time'])}: {section['text']}" for section in sections])
+    prompt = f"""
+다음은 유튜브 영상의 타임라인과 각 섹션의 내용입니다. 이를 바탕으로 타임라인 요약을 생성해주세요:
+1. 각 섹션의 시작 시간을 유지하면서 핵심 내용을 간결하게 요약하세요.
+2. 요약은 한글로 작성하세요.
+3. 각 섹션의 요약은 1-2문장으로 제한하세요.
+4. 전체 맥락을 고려하여 요약하되, 각 섹션의 고유한 내용을 놓치지 마세요.
+5. 출력 형식은 다음과 같이 유지하세요:
+   [시작 시간] 섹션 요약
+섹션 내용:
+{combined_sections}
+"""
+    response = call_api(prompt, max_tokens=1000, temperature=0.3, top_p=0.9)
+    # 응답을 줄 단위로 분리하고 각 줄을 HTML 형식으로 변환
+    timeline_html = "<br>".join(response.split('\n'))
+    return f"""
+    <h3>타임라인 요약:</h3>
+    <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+        {timeline_html}
+    </div>
+    """
+def summarize_text(text):
+    prompt = f"""
+1. 다음 주어지는 유튜브 대본의 핵심 주제와 모든 주요 내용을 상세하게 요약하라
+2. 반드시 한글로 작성하라
+3. 요약문만으로도 영상을 직접 시청한 것과 동일한 수준으로 내용을 이해할 수 있도록 상세히 작성
+4. 글을 너무 압축하거나 함축하지 말고, 중요한 내용과 세부사항을 모두 포함
+5. 반드시 대본의 흐름과 논리 구조를 유지
+6. 반드시 시간 순서나 사건의 전개 과정을 명확하게 반영
+7. 등장인물, 장소, 사건 등 중요한 요소를 정확하게 작성
+8. 대본에서 전달하는 감정이나 분위기도 포함
+9. 반드시 기술적 용어나 전문 용어가 있을 경우, 이를 정확하게 사용
+10. 대본의 목적이나 의도를 파악하고, 이를 요약에 반드시 반영
+11. 전체글을 보고
+---
+이 프롬프트가 도움이 되시길 바랍니다.
+    \n\n
+    {text}"""
+    return call_api(prompt, max_tokens=10000, temperature=0.3, top_p=0.9)
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
     script_output = gr.HTML(label="스크립트")
+    timeline_output = gr.HTML(label="타임라인 요약")
+    summary_output = gr.HTML(label="전체 요약")
+    cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})
     def extract_and_cache(url, cache):
+        if url == cache["url"]:
+            return cache["title"], cache["script"], cache["sections"], cache
+        try:
+            title, script, sections = get_youtube_script(url)
+            new_cache = {"url": url, "title": title, "script": script, "sections": sections}
+            return title, script, sections, new_cache
+        except Exception as e:
+            logging.exception("데이터 추출 중 오류 발생")
+            raise gr.Error(f"스크립트 추출 실패: {str(e)}")
+    def display_script(title, script):
+        formatted_script = "\n".join(split_sentences(script))
+        script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
+        <details>
+            <summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
+            <div style="white-space: pre-wrap;">{formatted_script}</div>
+        </details>"""
         return script_html
+    def display_timeline(sections):
+        timeline_summary = generate_timeline_summary(sections)
+        timeline_html = f"""
+        <h3>타임라인 요약:</h3>
+        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+            {timeline_summary}
+        </div>
+        """
+        return timeline_html
+    def generate_summary(script):
+        summary = summarize_text(script)
+        summary_html = f"""
+        <h3>전체 요약:</h3>
+        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+            {summary}
+        </div>
+        """
+        return summary_html
+    def analyze(url, cache):
         try:
+            title, script, sections, new_cache = extract_and_cache(url, cache)
+            script_html = display_script(title, script)
+            timeline_html = generate_timeline_summary(sections)
+            summary_html = generate_summary(script)
+            return script_html, timeline_html, summary_html, new_cache
+        except gr.Error as e:
+            return str(e), "", "", cache
         except Exception as e:
+            error_msg = f"처리 중 오류 발생: {str(e)}"
+            logging.exception(error_msg)
+            return error_msg, "", "", cache
     analyze_button.click(
+        analyze,
+        inputs=[youtube_url_input, cached_data],
+        outputs=[script_output, timeline_output, summary_output, cached_data]
     )
+demo.launch(share=True)