YT_Script_Shorts

Running

App Files Files Community

AIRider committed on Sep 29, 2024

Commit

7b2bf17

verified ·

1 Parent(s): 7dc2cb4

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -160

app.py CHANGED Viewed

@@ -2,12 +2,10 @@ import gradio as gr
 from gradio_client import Client
 import json
 import logging
-import ast
 import openai
 import os
-import random
-import re
 logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
@@ -23,59 +21,53 @@ def parse_api_response(response):
             raise ValueError(f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}")
         return response
     except Exception as e:
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
-def split_sentences(text):
-    sentences = re.split(r"(니다|에요|구나|해요|군요|겠어요|시오|해라|예요|아요|데요|대요|세요|어요|게요|구요|고요|나요|하죠)(?![\w])", text)
-    combined_sentences = []
-    current_sentence = ""
-    for i in range(0, len(sentences), 2):
-        if i + 1 < len(sentences):
-            sentence = sentences[i] + sentences[i + 1]
-        else:
-            sentence = sentences[i]
-        if len(current_sentence) + len(sentence) > 100:
-            combined_sentences.append(current_sentence.strip())
-            current_sentence = sentence.strip()
-        else:
-            current_sentence += sentence
-        if sentence.endswith(('.', '?', '!')):
-            combined_sentences.append(current_sentence.strip())
-            current_sentence = ""
-    if current_sentence:
-        combined_sentences.append(current_sentence.strip())
-    return combined_sentences
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
     client = Client("whispersound/YT_Ts_R")
     try:
-        logging.debug("API 호출 시작")
         result = client.predict(youtube_url=url, api_name="/predict")
-        logging.debug("API 호출 완료")
         parsed_result = parse_api_response(result)
         if 'data' not in parsed_result or not parsed_result['data']:
             raise ValueError("API 응답에 유효한 데이터가 없습니다.")
-        title = parsed_result["data"][0].get("title", "제목 없음")
-        transcription_text = parsed_result["data"][0].get("transcriptionAsText", "")
-        sections = parsed_result["data"][0].get("sections", [])
         if not transcription_text:
             raise ValueError("추출된 스크립트가 없습니다.")
         logging.info("스크립트 추출 완료")
         return title, transcription_text, sections
     except Exception as e:
-        error_msg = f"스크립트 추출 중 오류 발생: {str(e)}"
-        logging.exception(error_msg)
         raise
 def call_api(prompt, max_tokens, temperature, top_p):
     try:
         response = openai.ChatCompletion.create(
@@ -90,67 +82,6 @@ def call_api(prompt, max_tokens, temperature, top_p):
         logging.exception("LLM API 호출 중 오류 발생")
         raise
-def summarize_section(section_text):
-    prompt = f"""
-다음 유튜브 대본 섹션의 핵심 내용을 간결하게 요약하세요:
-1. 한글로 작성하세요.
-2. 주요 논점과 중요한 세부사항을 포함하세요.
-3. 요약은 2-3문장으로 제한하세요.
-섹션 내용:
-{section_text}
-"""
-    return call_api(prompt, max_tokens=150, temperature=0.3, top_p=0.9)
-def format_time(seconds):
-    minutes, seconds = divmod(seconds, 60)
-    hours, minutes = divmod(minutes, 60)
-    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
-def generate_timeline_summary(sections):
-    combined_sections = "\n\n".join([f"{format_time(section['start_time'])}: {section['text']}" for section in sections])
-    prompt = f"""
-다음은 유튜브 영상의 타임라인과 각 섹션의 내용입니다. 이를 바탕으로 타임라인 요약을 생성해주세요:
-1. 각 섹션의 시작 시간을 유지하면서 핵심 내용을 간결하게 요약하세요.
-2. 요약은 한글로 작성하세요.
-3. 각 섹션의 요약은 1-2문장으로 제한하세요.
-4. 전체 맥락을 고려하여 요약하되, 각 섹션의 고유한 내용을 놓치지 마세요.
-5. 출력 형식은 다음과 같이 유지하세요:
-   [시작 시간] 섹션 요약
-섹션 내용:
-{combined_sections}
-"""
-    try:
-        response = call_api(prompt, max_tokens=1000, temperature=0.3, top_p=0.9)
-        # 응답을 줄 단위로 분리하고 각 줄을 HTML 형식으로 변환
-        timeline_items = response.strip().split('\n')
-        formatted_timeline = []
-        for item in timeline_items:
-            if ':' in item:  # 시간 정보가 있는 항목만 처리
-                time, summary = item.split(':', 1)
-                formatted_timeline.append(f"<p><strong>{time.strip()}</strong>:{summary.strip()}</p>")
-        timeline_html = "\n".join(formatted_timeline)
-        if not timeline_html:
-            raise ValueError("유효한 타임라인 요약을 생성하지 못했습니다.")
-        return f"""
-        <h3>타임라인 요약:</h3>
-        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
-            {timeline_html}
-        </div>
-        """
-    except Exception as e:
-        logging.exception("타임라인 요약 생성 중 오류 발생")
-        return "<p>타임라인 요약을 생성하는 중 오류가 발생했습니다. 다시 시도해 주세요.</p>"
 def summarize_text(text):
     prompt = f"""
 1. 다음 주어지는 유튜브 대본의 핵심 주제와 모든 주요 내용을 상세하게 요약하라
@@ -163,86 +94,55 @@ def summarize_text(text):
 8. 대본에서 전달하는 감정이나 분위기도 포함
 9. 반드시 기술적 용어나 전문 용어가 있을 경우, 이를 정확하게 사용
 10. 대본의 목적이나 의도를 파악하고, 이를 요약에 반드시 반영
-11. 전체글을 보고
----
-이 프롬프트가 도움이 되시길 바랍니다.
-    \n\n
-    {text}"""
-    return call_api(prompt, max_tokens=10000, temperature=0.3, top_p=0.9)
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
     script_output = gr.HTML(label="스크립트")
     timeline_output = gr.HTML(label="타임라인 요약")
     summary_output = gr.HTML(label="전체 요약")
     cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})
-    def extract_and_cache(url, cache):
-        if url == cache["url"]:
-            return cache["title"], cache["script"], cache["sections"], cache
-        try:
-            title, script, sections = get_youtube_script(url)
-            new_cache = {"url": url, "title": title, "script": script, "sections": sections}
-            return title, script, sections, new_cache
-        except Exception as e:
-            logging.exception("데이터 추출 중 오류 발생")
-            raise gr.Error(f"스크립트 추출 실패: {str(e)}")
-    def display_script(title, script):
-        formatted_script = "\n".join(split_sentences(script))
-        script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
-        <details>
-            <summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
-            <div style="white-space: pre-wrap;">{formatted_script}</div>
-        </details>"""
-        return script_html
-    def display_timeline(sections):
-        timeline_summary = generate_timeline_summary(sections)
-        timeline_html = f"""
-        <h3>타임라인 요약:</h3>
-        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
-            {timeline_summary}
-        </div>
-        """
-        return timeline_html
-    def generate_summary(script):
-        summary = summarize_text(script)
-        summary_html = f"""
-        <h3>전체 요약:</h3>
-        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
-            {summary}
-        </div>
-        """
-        return summary_html
-    def analyze(url, cache):
-        try:
-            title, script, sections, new_cache = extract_and_cache(url, cache)
-            script_html = display_script(title, script)
-            timeline_html = generate_timeline_summary(sections)
-            summary_html = generate_summary(script)
-            return script_html, timeline_html, summary_html, new_cache
-        except gr.Error as e:
-            return str(e), "", "", cache
-        except Exception as e:
-            error_msg = f"처리 중 오류 발생: {str(e)}"
-            logging.exception(error_msg)
-            return error_msg, "", "", cache
     analyze_button.click(
         analyze,
         inputs=[youtube_url_input, cached_data],
         outputs=[script_output, timeline_output, summary_output, cached_data]
     )
-demo.launch(share=True)

 from gradio_client import Client
 import json
 import logging
 import openai
 import os
+# 로깅 설정
 logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
             raise ValueError(f"예상치 못한 응답 형식입니다. 받은 데이터 타입: {type(response)}")
         return response
     except Exception as e:
+        logging.error(f"API 응답 파싱 실패: {str(e)}")
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
     client = Client("whispersound/YT_Ts_R")
     try:
         result = client.predict(youtube_url=url, api_name="/predict")
         parsed_result = parse_api_response(result)
         if 'data' not in parsed_result or not parsed_result['data']:
             raise ValueError("API 응답에 유효한 데이터가 없습니다.")
+        data = parsed_result["data"][0]
+        title = data.get("title", "제목 없음")
+        transcription_text = data.get("transcriptionAsText", "")
+        sections = data.get("sections", [])
         if not transcription_text:
             raise ValueError("추출된 스크립트가 없습니다.")
         logging.info("스크립트 추출 완료")
         return title, transcription_text, sections
     except Exception as e:
+        logging.exception("스크립트 추출 중 오류 발생")
         raise
def format_time(seconds):
    """Format an offset in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        seconds: Offset in seconds (int or float). Fractional seconds are
            truncated. Negative values are clamped to zero.

    Returns:
        The formatted timestamp, e.g. ``"01:01:01"`` for ``3661``. Hours are
        not wrapped at 24 and may occupy more than two digits.
    """
    # Floor division on a negative value would otherwise yield output such as
    # "-1:59:55"; clamp so the result is always a well-formed timestamp.
    total = max(int(seconds), 0)
    minutes, secs = divmod(total, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"
def generate_timeline_summary(sections):
    """Render transcript sections as an HTML timeline fragment.

    Args:
        sections: Iterable of mappings, each providing ``'start_time'``
            (seconds, formatted with ``format_time``) and ``'text'``.
            ``None`` is treated as an empty timeline.

    Returns:
        An HTML string: a heading followed by a scrollable box holding one
        ``<p>`` element per section, in input order.

    Raises:
        KeyError: If a section lacks ``'start_time'`` or ``'text'``.
    """
    from html import escape  # local import: only the HTML renderers need it

    # The section text is interpolated into markup, so escape it; otherwise
    # any "<", ">" or "&" in the transcript would be interpreted as HTML.
    timeline_items = [
        f"<p><strong>{format_time(section['start_time'])}</strong>: "
        f"{escape(str(section['text']))}</p>"
        for section in (sections or [])
    ]
    timeline_html = "\n".join(timeline_items)
    return f"""
    <h3>타임라인 요약:</h3>
    <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
        {timeline_html}
    </div>
    """
 def call_api(prompt, max_tokens, temperature, top_p):
     try:
         response = openai.ChatCompletion.create(
         logging.exception("LLM API 호출 중 오류 발생")
         raise
 def summarize_text(text):
     prompt = f"""
 1. 다음 주어지는 유튜브 대본의 핵심 주제와 모든 주요 내용을 상세하게 요약하라
 8. 대본에서 전달하는 감정이나 분위기도 포함
 9. 반드시 기술적 용어나 전문 용어가 있을 경우, 이를 정확하게 사용
 10. 대본의 목적이나 의도를 파악하고, 이를 요약에 반드시 반영
+대본:
+{text}
+"""
+    return call_api(prompt, max_tokens=2000, temperature=0.3, top_p=0.9)
def display_script(title, script):
    """Build the HTML fragment showing the video title and its transcript.

    The transcript is placed inside a collapsed ``<details>`` element so the
    page stays compact until the user expands it.

    Args:
        title: Video title.
        script: Full transcript text.

    Returns:
        An HTML string with the title as a heading and the transcript in a
        ``white-space: pre-wrap`` container.
    """
    from html import escape  # local import: only the HTML renderers need it

    # Both values are interpolated into markup; escape them so characters
    # such as "<" or "&" are displayed literally instead of parsed as HTML.
    safe_title = escape(str(title))
    safe_script = escape(str(script))
    return f"""<h2 style='font-size:24px;'>{safe_title}</h2>
    <details>
        <summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
        <div style="white-space: pre-wrap;">{safe_script}</div>
    </details>"""
def analyze(url, cache):
    """Run extraction and summarization for one URL and build the panel HTML.

    Args:
        url: YouTube URL entered by the user.
        cache: Dict with keys ``"url"``, ``"title"``, ``"script"`` and
            ``"sections"`` holding the most recent extraction; reused when
            ``url`` matches ``cache["url"]``.

    Returns:
        A 4-tuple ``(script_html, timeline_html, summary_html, cache)``.
        On failure the first element carries the error message, the two
        other HTML strings are empty, and the cache is returned as it stood
        when the failure occurred.
    """
    from html import escape  # local import: only the HTML renderers need it

    url = (url or "").strip()
    if not url:
        # The initial cache also has url == "", so an empty submission would
        # otherwise count as a cache hit and send an empty script to the LLM.
        logging.warning("URL이 입력되지 않았습니다.")
        return "YouTube URL을 입력해 주세요.", "", "", cache

    try:
        if url == cache["url"]:
            logging.info(f"캐시된 데이터 사용: URL = {url}")
            title, script, sections = cache["title"], cache["script"], cache["sections"]
        else:
            logging.info(f"새로운 데이터 추출 시작: URL = {url}")
            title, script, sections = get_youtube_script(url)
            cache = {"url": url, "title": title, "script": script, "sections": sections}
        script_html = display_script(title, script)
        timeline_html = generate_timeline_summary(sections)
        summary = summarize_text(script)
        # The summary is plain text shown in an HTML output: escape it and
        # keep its line breaks, using the same box style as the timeline.
        summary_html = (
            "<h3>전체 요약:</h3>\n"
            '<div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; '
            'border: 1px solid #ccc; padding: 10px;">'
            f"{escape(str(summary))}</div>"
        )
        logging.info("분석 완료")
        return script_html, timeline_html, summary_html, cache
    except Exception as e:
        # UI boundary: log the full traceback, show only the message.
        error_msg = f"처리 중 오류 발생: {str(e)}"
        logging.exception(error_msg)
        return escape(error_msg), "", "", cache
+# Gradio 인터페이스
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
     script_output = gr.HTML(label="스크립트")
     timeline_output = gr.HTML(label="타임라인 요약")
     summary_output = gr.HTML(label="전체 요약")
     cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})
     analyze_button.click(
         analyze,
         inputs=[youtube_url_input, cached_data],
         outputs=[script_output, timeline_output, summary_output, cached_data]
     )
+if __name__ == "__main__":
+    demo.launch(share=True)