import gradio as gr
from gradio_client import Client
import json
import logging
import ast
import openai
import os
import random
import re

logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

openai.api_key = os.getenv("OPENAI_API_KEY")
def parse_api_response(response):
    try:
        if isinstance(response, str):
            response = ast.literal_eval(response)
        if isinstance(response, list) and len(response) > 0:
            response = response[0]
        if not isinstance(response, dict):
            raise ValueError(f"Unexpected response format. Received data type: {type(response)}")
        return response
    except Exception as e:
        raise ValueError(f"API response parsing failed: {str(e)}")
def split_sentences(text):
    # Split on common Korean sentence-final endings; the capturing group keeps each
    # ending in the result so it can be re-attached to its sentence below.
    sentences = re.split(r"(니다|에요|구나|해요|군요|겠어요|시오|해라|예요|아요|데요|대요|세요|어요|게요|구요|고요|나요|하죠)(?![\w])", text)
    combined_sentences = []
    current_sentence = ""
    for i in range(0, len(sentences), 2):
        if i + 1 < len(sentences):
            sentence = sentences[i] + sentences[i + 1]
        else:
            sentence = sentences[i]
        if len(current_sentence) + len(sentence) > 100:
            combined_sentences.append(current_sentence.strip())
            current_sentence = sentence.strip()
        else:
            current_sentence += sentence
        if sentence.endswith(('.', '?', '!')):
            combined_sentences.append(current_sentence.strip())
            current_sentence = ""
    if current_sentence:
        combined_sentences.append(current_sentence.strip())
    return combined_sentences
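# Note: re.split with a capturing group interleaves the matched delimiters with the
# surrounding text, e.g. re.split(r"(b)", "abc") == ['a', 'b', 'c']; the step-2 loop
# above relies on this to glue each ending back onto the text that precedes it.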
def get_youtube_script(url):
    logging.info(f"Script extraction started: URL = {url}")
    client = Client("whispersound/YT_Ts_R")
    try:
        logging.debug("API call started")
        result = client.predict(youtube_url=url, api_name="/predict")
        logging.debug("API call finished")
        parsed_result = parse_api_response(result)
        title = parsed_result["data"][0]["title"]
        transcription_text = parsed_result["data"][0]["transcriptionAsText"]
        sections = parsed_result["data"][0]["sections"]
        logging.info("Script extraction finished")
        return title, transcription_text, sections
    except Exception as e:
        error_msg = f"Error during script extraction: {str(e)}"
        logging.exception(error_msg)
        return "", "", []
def call_api(prompt, max_tokens, temperature, top_p):
    try:
        # Uses the legacy (pre-1.0) OpenAI Python SDK chat-completion interface.
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        logging.exception("Error during LLM API call")
        return "An error occurred while generating the summary. Please try again later."
def summarize_section(section_text):
    prompt = f"""
Concisely summarize the key points of the following YouTube transcript section:
1. Write in Korean.
2. Include the main arguments and important details.
3. Limit the summary to 2-3 sentences.

Section content:
{section_text}
"""
    return call_api(prompt, max_tokens=150, temperature=0.3, top_p=0.9)
def format_time(seconds):
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
def generate_timeline_summary(sections):
    timeline_summary = ""
    for i, section in enumerate(sections, 1):
        start_time = format_time(section['start_time'])
        summary = summarize_section(section['text'])
        timeline_summary += f"{start_time} {i}. {summary}\n\n"
    return timeline_summary
def summarize_text(text):
    prompt = f"""
1. Summarize in detail the core topic and all main points of the following YouTube transcript.
2. Always write in Korean.
3. Write in enough detail that the summary alone conveys the content as well as watching the video directly.
4. Do not over-compress or abbreviate; include all important content and details.
5. Preserve the flow and logical structure of the transcript.
6. Clearly reflect the chronological order and the progression of events.
7. Accurately describe key elements such as people, places, and events.
8. Include the emotions and atmosphere conveyed in the transcript.
9. If technical or specialized terms appear, use them accurately.
10. Identify the purpose or intent of the transcript and reflect it in the summary.
11. Review the entire text.
---

{text}"""
    try:
        return call_api(prompt, max_tokens=10000, temperature=0.3, top_p=0.9)
    except Exception as e:
        logging.exception("Error while generating summary")
        return "An error occurred while generating the summary. Please try again later."
with gr.Blocks() as demo:
    gr.Markdown("## YouTube Script Extraction and Summarization Tool")
    youtube_url_input = gr.Textbox(label="Enter YouTube URL")
    analyze_button = gr.Button("Analyze")
    script_output = gr.HTML(label="Script")
    timeline_output = gr.HTML(label="Timeline Summary")
    summary_output = gr.HTML(label="Full Summary")
    cached_data = gr.State({"url": "", "title": "", "script": "", "sections": []})
    def extract_and_cache(url, cache):
        # Reuse cached results when the same URL is analyzed again.
        if url == cache["url"]:
            return cache["title"], cache["script"], cache["sections"], cache
        title, script, sections = get_youtube_script(url)
        new_cache = {"url": url, "title": title, "script": script, "sections": sections}
        return title, script, sections, new_cache
    def display_script(title, script):
        if not script:
            return "<p>Failed to extract the script. Please check the URL and try again.</p>"
        formatted_script = "\n".join(split_sentences(script))
        script_html = f"""<h2 style='font-size:24px;'>{title}</h2>
<details>
<summary><h3>Original script (click to expand)</h3></summary>
<div style="white-space: pre-wrap;">{formatted_script}</div>
</details>"""
        return script_html
    def display_timeline(sections):
        if not sections:
            return "<p>Failed to generate the timeline. Script extraction may have failed.</p>"
        timeline_summary = generate_timeline_summary(sections)
        timeline_html = f"""
<h3>Timeline Summary:</h3>
<div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
{timeline_summary}
</div>
"""
        return timeline_html
    def generate_summary(script):
        if not script:
            return "<p>Failed to generate the full summary. Script extraction may have failed.</p>"
        summary = summarize_text(script)
        summary_html = f"""
<h3>Full Summary:</h3>
<div style="white-space: pre-wrap; max-height: 400px; overflow-y: auto; border: 1px solid #ccc; padding: 10px;">
{summary}
</div>
"""
        return summary_html
    def analyze(url, cache):
        title, script, sections, new_cache = extract_and_cache(url, cache)
        script_html = display_script(title, script)
        timeline_html = display_timeline(sections)
        summary_html = generate_summary(script)
        return script_html, timeline_html, summary_html, new_cache

    analyze_button.click(
        analyze,
        inputs=[youtube_url_input, cached_data],
        outputs=[script_output, timeline_output, summary_output, cached_data]
    )

demo.launch(share=True)
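# Assumed runtime requirements (a sketch, not verified against the original Space):
# the OPENAI_API_KEY environment variable must be set, and the gradio, gradio_client,
# and openai packages installed. Since openai.ChatCompletion.create is the legacy
# interface removed in openai>=1.0, a pre-1.0 openai SDK is assumed here.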