YT_Script_Shorts

Running

App Files Files Community

AIRider committed on Sep 29, 2024

Commit

57d8b30

verified ·

1 Parent(s): ea538de

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -88

app.py CHANGED Viewed

@@ -5,15 +5,12 @@ import logging
 import ast
 import openai
 import os
 import re
-from sklearn.feature_extraction.text import TfidfVectorizer
-from multiprocessing import Pool, cpu_count
 logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
-openai.api_key = os.getenv("OPENAI_API_KEY")
 def parse_api_response(response):
     try:
         if isinstance(response, str):
@@ -26,8 +23,30 @@ def parse_api_response(response):
     except Exception as e:
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
     client = Client("whispersound/YT_Ts_R")
     try:
@@ -39,48 +58,31 @@ def get_youtube_script(url):
         title = parsed_result["data"][0]["title"]
         transcription_text = parsed_result["data"][0]["transcriptionAsText"]
-        original_sections = parsed_result["data"][0]["sections"]
-        merged_sections = merge_sections(original_sections)
-        processed_sections = process_merged_sections_parallel(merged_sections)
-        logging.info("스크립트 추출 및 처리 완료")
-        return title, transcription_text, processed_sections
     except Exception as e:
         error_msg = f"스크립트 추출 중 오류 발생: {str(e)}"
         logging.exception(error_msg)
         return "", "", []
-def is_same_topic_tfidf(text1, text2, threshold=0.3):
-    vectorizer = TfidfVectorizer().fit([text1, text2])
-    vectors = vectorizer.transform([text1, text2])
-    similarity = (vectors[0] * vectors[1].T).A[0][0]
-    return similarity > threshold
-def merge_sections(sections, min_duration=60, max_duration=300):
-    merged_sections = []
-    current_section = sections[0].copy()
-    for section in sections[1:]:
-        duration = current_section['end_time'] - current_section['start_time']
-        if duration < min_duration:
-            current_section['end_time'] = section['end_time']
-            current_section['text'] += ' ' + section['text']
-        elif duration >= max_duration:
-            merged_sections.append(current_section)
-            current_section = section.copy()
-        else:
-            if is_same_topic_tfidf(current_section['text'], section['text']):
-                current_section['end_time'] = section['end_time']
-                current_section['text'] += ' ' + section['text']
-            else:
-                merged_sections.append(current_section)
-                current_section = section.copy()
-    merged_sections.append(current_section)
-    return merged_sections
 def summarize_section(section_text):
     prompt = f"""
@@ -92,79 +94,114 @@ def summarize_section(section_text):
 섹션 내용:
 {section_text}
 """
-    try:
-        response = openai.ChatCompletion.create(
-            model="gpt-4o-mini",
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=150,
-            temperature=0.3,
-            top_p=0.9
-        )
-        return response['choices'][0]['message']['content']
-    except Exception as e:
-        logging.exception("요약 생성 중 오류 발생")
-        return "요약을 생성하는 동안 오류가 발생했습니다."
-def process_section(section):
-    summary = summarize_section(section['text'])
-    return {
-        'start_time': section['start_time'],
-        'end_time': section['end_time'],
-        'summary': summary
-    }
-def process_merged_sections_parallel(merged_sections):
-    with Pool(processes=cpu_count()) as pool:
-        return pool.map(process_section, merged_sections)
 def format_time(seconds):
     minutes, seconds = divmod(seconds, 60)
     hours, minutes = divmod(minutes, 60)
     return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
-def generate_timeline_summary(processed_sections):
     timeline_summary = ""
-    for i, section in enumerate(processed_sections, 1):
         start_time = format_time(section['start_time'])
-        end_time = format_time(section['end_time'])
-        timeline_summary += f"{start_time} - {end_time} {i}. {section['summary']}\n\n"
     return timeline_summary
-def display_script_and_summary(title, script, processed_sections):
-    timeline_summary = generate_timeline_summary(processed_sections)
-    script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
-    <h3>타임라인 요약:</h3>
-    <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
-        {timeline_summary}
-    </div>
-    <details>
-        <summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
-        <div style="white-space: pre-wrap;">{script}</div>
-    </details>"""
-    return script_html
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
-    output = gr.HTML(label="결과")
-    cached_data = gr.State({"url": "", "title": "", "script": "", "processed_sections": []})
-    def analyze(url, cache):
         if url == cache["url"]:
-            return display_script_and_summary(cache["title"], cache["script"], cache["processed_sections"]), cache
-        title, script, processed_sections = get_youtube_script(url)
-        new_cache = {"url": url, "title": title, "script": script, "processed_sections": processed_sections}
-        return display_script_and_summary(title, script, processed_sections), new_cache
     analyze_button.click(
         analyze,
         inputs=[youtube_url_input, cached_data],
-        outputs=[output, cached_data]
     )
 demo.launch(share=True)

 import ast
 import openai
 import os
+import random
 import re
 logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 def parse_api_response(response):
     try:
         if isinstance(response, str):
     except Exception as e:
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
def split_sentences(text):
    """Split transcript text into display-sized chunks at Korean sentence endings.

    The text is cut after a fixed list of Korean sentence endings (for example
    "니다" or "세요") when the ending is not followed by a word character.
    Consecutive pieces are accumulated into one chunk until adding the next
    piece would push the chunk past 100 characters, or until a piece ends with
    '.', '?' or '!'.

    Args:
        text: Raw transcript text.

    Returns:
        A list of stripped, non-empty chunks in their original order. Empty or
        whitespace-only input yields an empty list.
    """
    # The capturing group makes re.split return
    # [body, ending, body, ending, ..., tail]: every even index is a body and
    # the odd index after it is the ending that terminated it.
    pieces = re.split(r"(니다|에요|구나|해요|군요|겠어요|시오|해라|예요|아요|데요|대요|세요|어요|게요|구요|고요|나요|하죠)(?![\w])", text)
    chunks = []
    current = ""

    def flush(chunk):
        # Never emit empty chunks: the accumulator is empty when the very first
        # piece already exceeds the length limit, and whitespace-only when the
        # input is blank.
        chunk = chunk.strip()
        if chunk:
            chunks.append(chunk)

    for i in range(0, len(pieces), 2):
        ending = pieces[i + 1] if i + 1 < len(pieces) else ""
        sentence = pieces[i] + ending
        if len(current) + len(sentence) > 100:
            # The accumulated chunk is full; start a new one with this piece.
            flush(current)
            current = sentence.strip()
        else:
            current += sentence
        if sentence.endswith(('.', '?', '!')):
            # Explicit punctuation always closes the current chunk.
            flush(current)
            current = ""
    flush(current)
    return chunks
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
     client = Client("whispersound/YT_Ts_R")
     try:
         title = parsed_result["data"][0]["title"]
         transcription_text = parsed_result["data"][0]["transcriptionAsText"]
+        sections = parsed_result["data"][0]["sections"]
+        logging.info("스크립트 추출 완료")
+        return title, transcription_text, sections
     except Exception as e:
         error_msg = f"스크립트 추출 중 오류 발생: {str(e)}"
         logging.exception(error_msg)
         return "", "", []
+openai.api_key = os.getenv("OPENAI_API_KEY")
def call_api(prompt, max_tokens, temperature, top_p):
    """Send one user prompt to the chat model and return the reply text.

    Args:
        prompt: Text sent as the single user message.
        max_tokens: Forwarded to the API as ``max_tokens``.
        temperature: Forwarded to the API as ``temperature``.
        top_p: Forwarded to the API as ``top_p``.

    Returns:
        The content of the first choice's message. If anything raises, the
        exception is logged and a fixed Korean error message is returned
        instead, so this function does not propagate exceptions from the call.
    """
    # NOTE(review): openai.ChatCompletion is the interface of the pre-1.0
    # openai SDK — confirm the pinned openai version still provides it.
    request = {
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
        first_choice = completion['choices'][0]
        return first_choice['message']['content']
    except Exception:
        logging.exception("LLM API 호출 중 오류 발생")
        return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
 def summarize_section(section_text):
     prompt = f"""
 섹션 내용:
 {section_text}
 """
+    return call_api(prompt, max_tokens=150, temperature=0.3, top_p=0.9)
 def format_time(seconds):
     minutes, seconds = divmod(seconds, 60)
     hours, minutes = divmod(minutes, 60)
     return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
def generate_timeline_summary(sections):
    """Build the timeline text: one summarised entry per section.

    Args:
        sections: Iterable of mappings; each entry's ``'start_time'`` and
            ``'text'`` keys are read.

    Returns:
        A string with one ``"<HH:MM:SS> <n>. <summary>"`` entry per section,
        numbered from 1, each followed by a blank line. Empty input gives "".
    """
    entries = []
    for index, section in enumerate(sections, 1):
        stamp = format_time(section['start_time'])
        # One summarize_section call is made per section, in order.
        section_summary = summarize_section(section['text'])
        entries.append(f"{stamp} {index}. {section_summary}\n\n")
    return "".join(entries)
def summarize_text(text):
    """Ask the model for a detailed Korean summary of the whole transcript.

    Args:
        text: Transcript text appended to the end of the instruction prompt.

    Returns:
        Whatever ``call_api`` returns for the prompt; if ``call_api`` raises,
        the exception is logged and a fixed Korean error message is returned.
    """
    # NOTE(review): item 11 and the sentence after "---" at the end of the
    # template look like unfinished/leftover prompt text — confirm with the
    # author. They are sent to the model exactly as written.
    prompt = f"""
1. 다음 주어지는 유튜브 대본의 핵심 주제와 모든 주요 내용을 상세하게 요약하라
2. 반드시 한글로 작성하라
3. 요약문만으로도 영상을 직접 시청한 것과 동일한 수준으로 내용을 이해할 수 있도록 상세히 작성
4. 글을 너무 압축하거나 함축하지 말고, 중요한 내용과 세부사항을 모두 포함
5. 반드시 대본의 흐름과 논리 구조를 유지
6. 반드시 시간 순서나 사건의 전개 과정을 명확하게 반영
7. 등장인물, 장소, 사건 등 중요한 요소를 정확하게 작성
8. 대본에서 전달하는 감정이나 분위기도 포함
9. 반드시 기술적 용어나 전문 용어가 있을 경우, 이를 정확하게 사용
10. 대본의 목적이나 의도를 파악하고, 이를 요약에 반드시 반영
11. 전체글을 보고
---
이 프롬프트가 도움이 되시길 바랍니다.
    \n\n
    {text}"""
    try:
        result = call_api(prompt, max_tokens=10000, temperature=0.3, top_p=0.9)
    except Exception:
        logging.exception("요약 생성 중 오류 발생")
        return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
    return result
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
+    script_output = gr.HTML(label="스크립트")
+    timeline_output = gr.HTML(label="타임라인 요약")
+    summary_output = gr.HTML(label="전체 요약")
+    cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})
+    def extract_and_cache(url, cache):
         if url == cache["url"]:
+            return cache["title"], cache["script"], cache["sections"], cache
+        title, script, sections = get_youtube_script(url)
+        new_cache = {"url": url, "title": title, "script": script, "sections": sections}
+        return title, script, sections, new_cache
+    def display_script(title, script):
+        formatted_script = "\n".join(split_sentences(script))
+        script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
+        <details>
+            <summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
+            <div style="white-space: pre-wrap;">{formatted_script}</div>
+        </details>"""
+        return script_html
+    def display_timeline(sections):
+        timeline_summary = generate_timeline_summary(sections)
+        timeline_html = f"""
+        <h3>타임라인 요약:</h3>
+        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+            {timeline_summary}
+        </div>
+        """
+        return timeline_html
+    def generate_summary(script):
+        summary = summarize_text(script)
+        summary_html = f"""
+        <h3>전체 요약:</h3>
+        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+            {summary}
+        </div>
+        """
+        return summary_html
+    def analyze(url, cache):
+        title, script, sections, new_cache = extract_and_cache(url, cache)
+        script_html = display_script(title, script)
+        timeline_html = display_timeline(sections)
+        return script_html, timeline_html, new_cache
+    def update_summary(cache):
+        if not cache["script"]:
+            return "스크립트가 없습니다. 먼저 YouTube URL을 입력하고 분석을 실행해주세요."
+        return generate_summary(cache["script"])
     analyze_button.click(
         analyze,
         inputs=[youtube_url_input, cached_data],
+        outputs=[script_output, timeline_output, cached_data]
+    ).then(
+        update_summary,
+        inputs=[cached_data],
+        outputs=summary_output
     )
 demo.launch(share=True)