import gradio as gr
import os

from youtube_transcript_api import YouTubeTranscriptApi

import utils

from openai import OpenAI
from groq import Groq

from dotenv import load_dotenv
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Uncomment to hot-reload utils during development:
# import importlib
# importlib.reload(utils)

def get_llm_client_and_model(llm_model):
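    """Map the UI model choice to an initialized API client and a concrete model name."""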
    if llm_model == "llama3-8b":
        llm_client = Groq(api_key=GROQ_API_KEY)
        llm_model = 'llama3-8b-8192'

    elif llm_model == "gpt-4o-mini":
        llm_client = OpenAI(api_key=OPENAI_API_KEY)
        llm_model = 'gpt-4o-mini-2024-07-18'

    else:
        raise ValueError(f"Unknown model choice: {llm_model}")

    return llm_client, llm_model
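
# Minimal usage sketch (hypothetical prompt), assuming the matching API key is set
# in .env. Both SDKs expose the same OpenAI-style chat interface:
#
#   client, model = get_llm_client_and_model("llama3-8b")
#   response = client.chat.completions.create(
#       model=model,
#       messages=[{"role": "user", "content": "Summarize this transcript chunk."}],
#   )
#   print(response.choices[0].message.content)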


def gradio_process_video(video_id,
                         model_format_transcript, model_toc,
                         chunk_size_format_transcript, chunk_size_toc,
                         progress=gr.Progress()):
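    """Turn a YouTube video ID into a chaptered, timestamped HTML transcript.

    Pipeline: fetch transcript -> rewrite into paragraphs -> add timestamps ->
    derive a table of contents -> group paragraphs into chapters -> render HTML.
    """
    # Serve precomputed chapters from local JSON for the demo video, avoiding live LLM calls.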
    if video_id in ["ErnWZxJovaM"]:
        chapters = utils.load_json_chapters(video_id)

    else:

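        # Step 1: fetch the raw English transcript from YouTube.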
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

        chunk_size_format_transcript = int(chunk_size_format_transcript)

        llm_client_format_transcript, llm_model_format_transcript = \
            get_llm_client_and_model(model_format_transcript)

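        # Step 2: rewrite the raw transcript into readable paragraphs with the chosen LLM.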
        paragraphs, nb_input_tokens, nb_output_tokens, price = \
            utils.transcript_to_paragraphs(transcript, \
                                           llm_client_format_transcript, llm_model_format_transcript, \
                                           chunk_size=chunk_size_format_transcript, progress=progress)

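        # Attach start timestamps to each paragraph (matched against the raw transcript).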
        paragraphs = utils.add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50)

        chunk_size_toc = int(chunk_size_toc)

        llm_client_get_toc, llm_model_get_toc = \
            get_llm_client_and_model(model_toc)

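        # Step 3: derive a table of contents (chapter titles) from the paragraphs.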
        json_toc, nb_input_tokens, nb_output_tokens, price = \
            utils.paragraphs_to_toc(paragraphs, \
                                    llm_client_get_toc, llm_model_get_toc, \
                                    chunk_size=chunk_size_toc)

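        # Step 4: group paragraphs under their chapter headings.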
        chapters = utils.get_chapters(paragraphs, json_toc)

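    # Render the chaptered transcript as embeddable HTML.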
    output_html = utils.get_result_as_html(chapters, video_id)

    return {output_processing: str(output_html),
            gv_output: output_html}


# %%
css = """
.content {
    padding: 20px;
    max-width: 800px;
    margin: 0 auto;
    background-color: #ffffff;
    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    border-radius: 8px;
}
"""

example_video_id = "ErnWZxJovaM"
example_chapters = utils.load_json_chapters(example_video_id)
example_output_html = utils.get_result_as_html(example_chapters, example_video_id)

with gr.Blocks(css=css) as app:
    gr.HTML("<div align='center'><h1 class='header'>Demo: Automatic video chaptering with LLMs and TF-IDF</h1></div>")
    gr.HTML("<div align='center'><h3 class='header'>From raw transcript to structured document</h3></div>")
    gr.HTML("<hr>")
    gr.Markdown("""This demo relies on:

                - Groq's Llama 3 8B for transcript preprocessing
                - OpenAI's GPT-4o-mini for chaptering

                Note: using GPT-4o-mini for transcript preprocessing as well improves results, but takes longer (around 2-3 minutes for a one-hour video).

                The following YouTube video IDs are already preprocessed (copy and paste an ID in the box below):

                - `ErnWZxJovaM`: [MIT course](https://www.youtube.com/watch?v=ErnWZxJovaM)
                - `EuC1GWhQdKE`: [Anthropic](https://www.youtube.com/watch?v=EuC1GWhQdKE)

                Check the [Medium article]() for more details."""
                )

    gv_transcript = gr.State()

    video_id_input = gr.Textbox(label="Enter YouTube Video ID", value="EuC1GWhQdKE")

    with gr.Accordion("Set parameters", open=False):
        with gr.Row():
            with gr.Column(scale=1):
                model_format_transcript = gr.Dropdown(
                    [("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                    label="Transcript preprocessing", value="llama3-8b", interactive=True)
                chunk_size_format_transcript = gr.Textbox(label="Preprocessing chunk size", value=2000)
            with gr.Column(scale=1):
                model_toc = gr.Dropdown([("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                                        label="Chaptering", value="gpt-4o-mini", interactive=True)
                chunk_size_toc = gr.Textbox(label="Chaptering chunk size", value=30)
            with gr.Column(scale=1):
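                # NOTE: these fields are currently display-only; the keys actually used
                # are read from the .env file at startup (see load_dotenv above).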
                api_key_openai = gr.Textbox(label="OpenAI API Key", value="xxx")
                api_key_groq = gr.Textbox(label="Groq API Key", value="xxx")

    processing_button = gr.Button("Process transcript")

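    # Holds the generated HTML in session state so later callbacks could reuse it.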
    gv_output = gr.State()

    gr.HTML("<hr>")

    output_processing = gr.HTML(label="Output processing", value=example_output_html)

    processing_button.click(gradio_process_video,
                            inputs=[video_id_input,
                                    model_format_transcript, model_toc,
                                    chunk_size_format_transcript, chunk_size_toc],
                            outputs=[output_processing, gv_output])

    # gr.HTML(result_as_html)

app.launch(debug=True, width="100%")