import gradio as gr
import os

from youtube_transcript_api import YouTubeTranscriptApi

import utils

from openai import OpenAI
from groq import Groq

from dotenv import load_dotenv
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Uncomment to hot-reload utils during development:
# import importlib
# importlib.reload(utils)

def get_llm_client_and_model(llm_model):
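    """Map the UI model choice to an initialized API client and a concrete model name."""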
    if llm_model == "llama3-8b":
        llm_client = Groq(api_key=GROQ_API_KEY)
        llm_model = 'llama3-8b-8192'

    elif llm_model == "gpt-4o-mini":
        llm_client = OpenAI(api_key=OPENAI_API_KEY)
        llm_model = 'gpt-4o-mini-2024-07-18'

    else:
        raise ValueError(f"Unknown model choice: {llm_model}")

    return llm_client, llm_model
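
# Minimal usage sketch (hypothetical prompt), assuming the matching API key is set
# in .env. Both SDKs expose the same OpenAI-style chat interface:
#
#   client, model = get_llm_client_and_model("llama3-8b")
#   response = client.chat.completions.create(
#       model=model,
#       messages=[{"role": "user", "content": "Summarize this transcript chunk."}],
#   )
#   print(response.choices[0].message.content)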


def gradio_process_video(video_id,
                         model_format_transcript, model_toc,
                         chunk_size_format_transcript, chunk_size_toc,
                         progress=gr.Progress()):
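    """Turn a YouTube video ID into a chaptered, timestamped HTML transcript.

    Pipeline: fetch transcript -> rewrite into paragraphs -> add timestamps ->
    derive a table of contents -> group paragraphs into chapters -> render HTML.
    """
    # Serve precomputed chapters from local JSON for the demo video, avoiding live LLM calls.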
    if video_id in ["ErnWZxJovaM"]:
        chapters = utils.load_json_chapters(video_id)

    else:

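        # Step 1: fetch the raw English transcript from YouTube.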
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

        chunk_size_format_transcript = int(chunk_size_format_transcript)

        llm_client_format_transcript, llm_model_format_transcript = \
            get_llm_client_and_model(model_format_transcript)

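        # Step 2: rewrite the raw transcript into readable paragraphs with the chosen LLM.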
        paragraphs, nb_input_tokens, nb_output_tokens, price = \
            utils.transcript_to_paragraphs(transcript, \
                                           llm_client_format_transcript, llm_model_format_transcript, \
                                           chunk_size=chunk_size_format_transcript, progress=progress)

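        # Attach start timestamps to each paragraph (matched against the raw transcript).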
        paragraphs = utils.add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50)

        chunk_size_toc = int(chunk_size_toc)

        llm_client_get_toc, llm_model_get_toc = \
            get_llm_client_and_model(model_toc)

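        # Step 3: derive a table of contents (chapter titles) from the paragraphs.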
        json_toc, nb_input_tokens, nb_output_tokens, price = \
            utils.paragraphs_to_toc(paragraphs, \
                                    llm_client_get_toc, llm_model_get_toc, \
                                    chunk_size=chunk_size_toc)

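        # Step 4: group paragraphs under their chapter headings.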
        chapters = utils.get_chapters(paragraphs, json_toc)

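    # Render the chaptered transcript as embeddable HTML.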
    output_html = utils.get_result_as_html(chapters, video_id)

    return {output_processing: str(output_html),
            gv_output: output_html}


# %%
css = """
.content {
    padding: 20px;
    max-width: 800px;
    margin: 0 auto;
    background-color: #ffffff;
    box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    border-radius: 8px;
}
"""

example_video_id = "ErnWZxJovaM"
example_chapters = utils.load_json_chapters(example_video_id)
example_output_html = utils.get_result_as_html(example_chapters, example_video_id)

with gr.Blocks(css=css) as app:
    gr.HTML("<div align='center'><h1 class='header'>Demo: Automatic video chaptering with LLMs and TF-IDF</h1></div>")
    gr.HTML("<div align='center'><h3 class='header'>From raw transcript to structured document</h3></div>")
    gr.HTML("<hr>")
    gr.Markdown("""This demo relies on:

                - Groq's Llama 3 8B for transcript preprocessing
                - OpenAI's GPT-4o-mini for chaptering

                Note: using GPT-4o-mini for transcript preprocessing as well improves results, but takes longer (around 2-3 minutes for a one-hour video).

                The following YouTube video IDs are already preprocessed (copy and paste an ID in the box below):

                - `ErnWZxJovaM`: [MIT course](https://www.youtube.com/watch?v=ErnWZxJovaM)
                - `EuC1GWhQdKE`: [Anthropic](https://www.youtube.com/watch?v=EuC1GWhQdKE)

                Check the [Medium article]() for more details."""
                )

    gv_transcript = gr.State()

    video_id_input = gr.Textbox(label="Enter YouTube Video ID", value="EuC1GWhQdKE")

    with gr.Accordion("Set parameters", open=False):
        with gr.Row():
            with gr.Column(scale=1):
                model_format_transcript = gr.Dropdown(
                    [("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                    label="Transcript preprocessing", value="llama3-8b", interactive=True)
                chunk_size_format_transcript = gr.Textbox(label="Preprocessing chunk size", value=2000)
            with gr.Column(scale=1):
                model_toc = gr.Dropdown([("LLama 3 8B (Groq)", "llama3-8b"), ("GPT-4o-mini (OpenAI)", "gpt-4o-mini")],
                                        label="Chaptering", value="gpt-4o-mini", interactive=True)
                chunk_size_toc = gr.Textbox(label="Chaptering chunk size", value=30)
            with gr.Column(scale=1):
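                # NOTE: these fields are currently display-only; the keys actually used
                # are read from the .env file at startup (see load_dotenv above).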
                api_key_openai = gr.Textbox(label="OpenAI API Key", value="xxx")
                api_key_groq = gr.Textbox(label="Groq API Key", value="xxx")

    processing_button = gr.Button("Process transcript")

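    # Holds the generated HTML in session state so later callbacks could reuse it.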
    gv_output = gr.State()

    gr.HTML("<hr>")

    output_processing = gr.HTML(label="Output processing", value=example_output_html)

    processing_button.click(gradio_process_video,
                            inputs=[video_id_input,
                                    model_format_transcript, model_toc,
                                    chunk_size_format_transcript, chunk_size_toc],
                            outputs=[output_processing, gv_output])

    # gr.HTML(result_as_html)

app.launch(debug=True, width="100%")