YT_Script_Shorts

Running

App Files Community

AIRider committed on Sep 29, 2024

Commit

1966494

verified ·

1 Parent(s): 4a45930

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -97

app.py CHANGED Viewed

@@ -5,13 +5,15 @@ import logging
 import ast
 import openai
 import os
-import random
 import re
-# 로깅 설정
 logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
 def parse_api_response(response):
     try:
         if isinstance(response, str):
@@ -24,32 +26,8 @@ def parse_api_response(response):
     except Exception as e:
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
-# 문장 구분 함수 (한국어)
-def split_sentences(text):
-    sentences = re.split(r"(니다|에요|구나|해요|군요|겠어요|시오|해라|예요|아요|데요|대요|세요|어요|게요|구요|고요|나요|하죠)(?![\w])", text)
-    combined_sentences = []
-    current_sentence = ""
-    for i in range(0, len(sentences), 2):
-        if i + 1 < len(sentences):
-            sentence = sentences[i] + sentences[i + 1]
-        else:
-            sentence = sentences[i]
-        if len(current_sentence) + len(sentence) > 100:  # 100자를 초과할 경우
-            combined_sentences.append(current_sentence.strip())
-            current_sentence = sentence.strip()
-        else:
-            current_sentence += sentence
-        if sentence.endswith(('.', '?', '!')):
-            combined_sentences.append(current_sentence.strip())
-            current_sentence = ""
-    if current_sentence:
-        combined_sentences.append(current_sentence.strip())
-    return combined_sentences
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
-    # 엔드포인트를 새로운 것으로 변경
     client = Client("whispersound/YT_Ts_R")
     try:
@@ -57,108 +35,136 @@ def get_youtube_script(url):
         result = client.predict(youtube_url=url, api_name="/predict")
         logging.debug("API 호출 완료")
-        # 응답 파싱
         parsed_result = parse_api_response(result)
         title = parsed_result["data"][0]["title"]
         transcription_text = parsed_result["data"][0]["transcriptionAsText"]
-        logging.info("스크립트 추출 완료")
-        return title, transcription_text
     except Exception as e:
         error_msg = f"스크립트 추출 중 오류 발생: {str(e)}"
         logging.exception(error_msg)
-        return "", ""
-# OpenAI API 키 설정
-openai.api_key = os.getenv("OPENAI_API_KEY")
-# LLM API 호출 함수
-def call_api(prompt, max_tokens, temperature, top_p):
     try:
         response = openai.ChatCompletion.create(
-            model="gpt-4o-mini",  # 모델을 gpt-4o-mini로 변경
             messages=[{"role": "user", "content": prompt}],
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p
         )
         return response['choices'][0]['message']['content']
-    except Exception as e:
-        logging.exception("LLM API 호출 중 오류 발생")
-        return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
-# 텍스트 요약 함수
-def summarize_text(text):
-    prompt = text  # 프롬프트를 원본 텍스트로 설정하여 self-discover 가능하도록 함
-    try:
-        return call_api(prompt, max_tokens=2000, temperature=0.3, top_p=0.9)
     except Exception as e:
         logging.exception("요약 생성 중 오류 발생")
-        return "요약을 생성하는 동안 오류가 발생했습니다. 나중에 다시 시도해 주세요."
-# Gradio 인터페이스 설정
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
-    script_output = gr.HTML(label="스크립트")
-    summary_output = gr.HTML(label="요약")
-    # 캐시를 위한 상태 변수
-    cached_data = gr.State({"url": "", "title": "", "script": ""})
-    def extract_and_cache(url, cache):
-        if url == cache["url"]:
-            return cache["title"], cache["script"], cache
-        title, script = get_youtube_script(url)
-        new_cache = {"url": url, "title": title, "script": script}
-        return title, script, new_cache
-    def display_script(title, script):
-        formatted_script = "\n".join(split_sentences(script))
-        script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
-        <details>
-            <summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
-            <div style="white-space: pre-wrap;">{formatted_script}</div>
-        </details>"""
-        return script_html
-    def generate_summary(script):
-        summary = summarize_text(script)
-        # 요약 결과를 잘 표시하기 위해 div 태그와 CSS 스타일 적용
-        summary_html = f"""
-        <h3>요약:</h3>
-        <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
-            {summary}
-        </div>
-        """
-        return summary_html
     def analyze(url, cache):
-        title, script, new_cache = extract_and_cache(url, cache)
-        script_html = display_script(title, script)
-        return script_html, new_cache
-    def update_summary(cache):
-        if not cache["script"]:
-            return "스크립트가 없습니다. 먼저 YouTube URL을 입력하고 분석을 실행해주세요."
-        return generate_summary(cache["script"])
-    # 버튼 클릭 시 스크립트 추출
     analyze_button.click(
         analyze,
         inputs=[youtube_url_input, cached_data],
-        outputs=[script_output, cached_data]
-    ).then(
-        update_summary,
-        inputs=[cached_data],
-        outputs=summary_output
     )
-# 인터페이스 실행
 demo.launch(share=True)

 import ast
 import openai
 import os
 import re
+from sklearn.feature_extraction.text import TfidfVectorizer
+from multiprocessing import Pool, cpu_count
 logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                     format='%(asctime)s - %(levelname)s - %(message)s')
+openai.api_key = os.getenv("OPENAI_API_KEY")
 def parse_api_response(response):
     try:
         if isinstance(response, str):
     except Exception as e:
         raise ValueError(f"API 응답 파싱 실패: {str(e)}")
 def get_youtube_script(url):
     logging.info(f"스크립트 추출 시작: URL = {url}")
     client = Client("whispersound/YT_Ts_R")
     try:
         result = client.predict(youtube_url=url, api_name="/predict")
         logging.debug("API 호출 완료")
         parsed_result = parse_api_response(result)
         title = parsed_result["data"][0]["title"]
         transcription_text = parsed_result["data"][0]["transcriptionAsText"]
+        original_sections = parsed_result["data"][0]["sections"]
+        merged_sections = merge_sections(original_sections)
+        processed_sections = process_merged_sections_parallel(merged_sections)
+        logging.info("스크립트 추출 및 처리 완료")
+        return title, transcription_text, processed_sections
     except Exception as e:
         error_msg = f"스크립트 추출 중 오류 발생: {str(e)}"
         logging.exception(error_msg)
+        return "", "", []
+def is_same_topic_tfidf(text1, text2, threshold=0.3):
+    vectorizer = TfidfVectorizer().fit([text1, text2])
+    vectors = vectorizer.transform([text1, text2])
+    similarity = (vectors[0] * vectors[1].T).A[0][0]
+    return similarity > threshold
+def merge_sections(sections, min_duration=60, max_duration=300):
+    merged_sections = []
+    current_section = sections[0].copy()
+    for section in sections[1:]:
+        duration = current_section['end_time'] - current_section['start_time']
+        if duration < min_duration:
+            current_section['end_time'] = section['end_time']
+            current_section['text'] += ' ' + section['text']
+        elif duration >= max_duration:
+            merged_sections.append(current_section)
+            current_section = section.copy()
+        else:
+            if is_same_topic_tfidf(current_section['text'], section['text']):
+                current_section['end_time'] = section['end_time']
+                current_section['text'] += ' ' + section['text']
+            else:
+                merged_sections.append(current_section)
+                current_section = section.copy()
+    merged_sections.append(current_section)
+    return merged_sections
+def summarize_section(section_text):
+    prompt = f"""
+다음 유튜브 대본 섹션의 핵심 내용을 간결하게 요약하세요:
+1. 한글로 작성하세요.
+2. 주요 논점과 중요한 세부사항을 포함하세요.
+3. 요약은 2-3문장으로 제한하세요.
+섹션 내용:
+{section_text}
+"""
     try:
         response = openai.ChatCompletion.create(
+            model="gpt-4o-mini",
             messages=[{"role": "user", "content": prompt}],
+            max_tokens=150,
+            temperature=0.3,
+            top_p=0.9
         )
         return response['choices'][0]['message']['content']
     except Exception as e:
         logging.exception("요약 생성 중 오류 발생")
+        return "요약을 생성하는 동안 오류가 발생했습니다."
+def process_section(section):
+    summary = summarize_section(section['text'])
+    return {
+        'start_time': section['start_time'],
+        'end_time': section['end_time'],
+        'summary': summary
+    }
+def process_merged_sections_parallel(merged_sections):
+    with Pool(processes=cpu_count()) as pool:
+        return pool.map(process_section, merged_sections)
+def format_time(seconds):
+    minutes, seconds = divmod(seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
+def generate_timeline_summary(processed_sections):
+    timeline_summary = ""
+    for i, section in enumerate(processed_sections, 1):
+        start_time = format_time(section['start_time'])
+        end_time = format_time(section['end_time'])
+        timeline_summary += f"{start_time} - {end_time} {i}. {section['summary']}\n\n"
+    return timeline_summary
+def display_script_and_summary(title, script, processed_sections):
+    timeline_summary = generate_timeline_summary(processed_sections)
+    script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
+    <h3>타임라인 요약:</h3>
+    <div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
+        {timeline_summary}
+    </div>
+    <details>
+        <summary><h3>원문 스크립트 (클릭하여 펼치기)</h3></summary>
+        <div style="white-space: pre-wrap;">{script}</div>
+    </details>"""
+    return script_html
 with gr.Blocks() as demo:
     gr.Markdown("## YouTube 스크립트 추출 및 요약 도구")
     youtube_url_input = gr.Textbox(label="YouTube URL 입력")
     analyze_button = gr.Button("분석하기")
+    output = gr.HTML(label="결과")
+    cached_data = gr.State({"url": "", "title": "", "script": "", "processed_sections": []})
     def analyze(url, cache):
+        if url == cache["url"]:
+            return display_script_and_summary(cache["title"], cache["script"], cache["processed_sections"]), cache
+        title, script, processed_sections = get_youtube_script(url)
+        new_cache = {"url": url, "title": title, "script": script, "processed_sections": processed_sections}
+        return display_script_and_summary(title, script, processed_sections), new_cache
     analyze_button.click(
         analyze,
         inputs=[youtube_url_input, cached_data],
+        outputs=[output, cached_data]
     )
 demo.launch(share=True)