File size: 15,454 Bytes
3730a63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
import json
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


########################### LLM call ###########################

# USD cost per single token, derived from provider list prices per 1M tokens.
price_token={'gpt-4o': {'input': 5/1000000, 'output': 15/1000000},
             'gpt-4o-2024-08-06': {'input': 2.5/1000000, 'output': 10/1000000},
             'gpt-4o-mini-2024-07-18': {'input': 0.15/1000000, 'output': 0.6/1000000},
             'llama3-8b-8192' : {'input': 0.05 / 1000000, 'output': 0.08 / 1000000},
             'llama3-70b-8192' : {'input': 0.59 / 1000000, 'output': 0.79 / 1000000},
             'claude-3-5-sonnet-20240620': {'input': 3/1000000, 'output': 15/1000000},
             'claude-3-haiku-20240307': {'input': 0.25/1000000, 'output': 1.25/1000000},
             }
def call_llm(client, model, system_prompt, prompt,
             temperature=0, seed=42, response_format=None, max_tokens=5000):
    """Send one system+user exchange to an OpenAI-compatible chat client.

    Parameters:
        client: object exposing client.chat.completions.create(...) (OpenAI-style).
        model: model name; also used to look up per-token pricing in price_token.
        system_prompt, prompt: system and user message contents.
        temperature, seed, response_format, max_tokens: forwarded to the API call.

    Returns:
        (response_text, nb_input_tokens, nb_output_tokens, price_usd).
    """
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        model=model,
        temperature=temperature,
        seed=seed,
        response_format=response_format,
        max_tokens=max_tokens
    )

    nb_input_tokens = response.usage.prompt_tokens
    nb_output_tokens = response.usage.completion_tokens

    # Robustness fix: an unlisted model used to raise KeyError here; fall back
    # to zero-cost pricing instead of crashing the whole pipeline.
    rates = price_token.get(model, {'input': 0, 'output': 0})
    price = nb_input_tokens * rates['input'] + nb_output_tokens * rates['output']

    print(f"input tokens: {nb_input_tokens}; output tokens: {nb_output_tokens}, price: {price}")

    response_content = response.choices[0].message.content

    return response_content, nb_input_tokens, nb_output_tokens, price

########################### Step 2: Transcript to paragraph ###########################

# System prompt for Step 2: clean up a raw transcript chunk (punctuation,
# verbal tics, grammar) and insert paragraph breaks, wrapping the result in
# <answer></answer> tags so transcript_to_paragraphs can extract it reliably.
# Fixed: dropped the needless f-string prefix (the string has no placeholders).
# NOTE(review): '\n\n' below is an escape, so the prompt actually shows the LLM
# two literal newlines inside quotes; if the literal characters backslash-n
# were intended, it would need '\\n\\n' — confirm against prompt intent.
system_prompt_transcript_to_paragraphs = """

You are a helpful assistant.

Your task is to improve the user input's readability: add punctuation if needed, remove verbal tics, correct grammatical errors, and add appropriate line breaks with '\n\n'.

Put your answer within <answer></answer> tags.

"""



def transcript_to_paragraphs(transcript, llm_client, llm_model, chunk_size=5000, progress=None):
    """Rewrite a raw transcript into readable paragraphs via chunked LLM calls.

    Parameters:
        transcript: list of dicts each carrying a 'text' key (transcript segments).
        llm_client, llm_model: forwarded to call_llm.
        chunk_size: number of characters of transcript text per LLM request.
        progress: optional callable(fraction, desc=...) for UI progress updates.

    Returns:
        (paragraphs, total_input_tokens, total_output_tokens, total_price) where
        paragraphs is a list of {'paragraph_number': int, 'paragraph_text': str}.
    """
    transcript_as_text = ' '.join([s['text'] for s in transcript])

    paragraphs = []
    # The last paragraph of each chunk is re-sent with the next chunk so that a
    # paragraph split across a chunk boundary gets stitched back together.
    last_paragraph = ""

    total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0

    # Ceil division. Fixed: the original int(len/size) + 1 over-counted by one
    # chunk whenever the text length was an exact multiple of chunk_size,
    # skewing the progress fraction.
    nb_chunks = max(1, -(-len(transcript_as_text) // chunk_size))
    progress_i = 0
    print(f"Number of chunks: {nb_chunks}")

    for i in range(0, len(transcript_as_text), chunk_size):

        print("i is: " + str(i))

        chunk = last_paragraph + " " + transcript_as_text[i:i + chunk_size]

        if progress is not None:
            progress_i += 1
            progress(progress_i / nb_chunks, desc="Processing")

        found_edited_transcript = False

        # Retry until the reply contains an <answer> block.
        # NOTE(review): no retry cap — a persistently malformed reply makes
        # this spin (and bill) forever; consider bounding the attempts.
        while not found_edited_transcript:

            response_content, nb_input_tokens, nb_output_tokens, price = \
                call_llm(llm_client, llm_model,
                         system_prompt=system_prompt_transcript_to_paragraphs, prompt=chunk,
                         temperature=0.2, seed=42, response_format=None)

            # Tolerate replies truncated before the closing tag.
            if "</answer>" not in response_content:
                response_content += "</answer>"

            # Extract the edited text from the <answer> tags.
            match = re.search(r'<answer>(.*?)</answer>', response_content, re.DOTALL)

            if match is not None:
                found_edited_transcript = True
                response_content_edited = match.group(1)
            else:
                print("No edited transcript found. Trying again.")
                print(response_content[0:100])
                print(response_content[-100:])

        total_nb_input_tokens += nb_input_tokens
        total_nb_output_tokens += nb_output_tokens
        total_price += price

        paragraphs_chunk = response_content_edited.strip().split('\n\n')

        print('Found paragraphs:', len(paragraphs_chunk))
        # Hold back the (possibly incomplete) final paragraph for the next chunk.
        last_paragraph = paragraphs_chunk[-1]

        paragraphs += paragraphs_chunk[:-1]

    paragraphs += [last_paragraph]

    paragraphs_dict = [{'paragraph_number': i, 'paragraph_text': paragraph} for i, paragraph in enumerate(paragraphs)]

    return paragraphs_dict, total_nb_input_tokens, total_nb_output_tokens, total_price

########################### Step 3: Infer timestamps ###########################

def transform_text_segments(text_segments, num_words=50):
    """For each segment, build a text window of about num_words words that
    starts at that segment, borrowing words from the following segments when
    the segment itself is too short.

    Parameters:
        text_segments: list of dicts each carrying a 'text' key.
        num_words: target window length in words.

    Returns:
        A list of strings, one window per input segment.

    Note: when a segment alone exceeds num_words, its text is truncated to
    num_words words (whitespace-normalized via split/join).
    """
    transformed_segments = []
    num_segments = len(text_segments)

    for i in range(num_segments):

        # Start the window with (at most) the first num_words words of this segment.
        current_words = text_segments[i]['text'].split()
        combined_text = " ".join(current_words[:num_words])
        # Intentionally the full word count (not capped): if it already exceeds
        # num_words the borrow loop below simply never runs.
        number_words_collected = len(current_words)

        # Borrow words from subsequent segments until the window is full.
        j = i
        while number_words_collected < num_words and (j + 1) < num_segments:
            j += 1
            next_text = text_segments[j]['text']
            next_words = next_text.split()

            if number_words_collected + len(next_words) <= num_words:
                # The whole next segment fits; append its raw text.
                combined_text += ' ' + next_text
                number_words_collected += len(next_words)
            else:
                # Append only enough words to reach the num_words limit.
                words_needed = num_words - number_words_collected
                combined_text += ' ' + ' '.join(next_words[:words_needed])
                number_words_collected = num_words

        transformed_segments.append(combined_text)

    return transformed_segments


def add_timestamps_to_paragraphs(transcript, paragraphs, num_words=50):
    """Attach start timestamps to LLM-edited paragraphs by matching each
    paragraph back to the most similar transcript position via TF-IDF cosine
    similarity over fixed-length word windows.

    Parameters:
        transcript: list of dicts with 'text' and 'start' (seconds) keys.
        paragraphs: list of dicts with 'paragraph_number' and 'paragraph_text'.
        num_words: window length used for the similarity comparison.

    Returns:
        The same paragraphs list, mutated in place: each entry gains
        'matched_index', 'matched_text' and 'start_time' keys.
    """
    transcript_num_words = transform_text_segments(transcript, num_words=num_words)

    paragraphs_start_text = [{"start": p['paragraph_number'], "text": p['paragraph_text']} for p in paragraphs]
    paragraphs_num_words = transform_text_segments(paragraphs_start_text, num_words=num_words)

    # Fit one shared vocabulary over transcript windows AND paragraph windows
    # so their vectors live in the same space.
    vectorizer = TfidfVectorizer().fit_transform(transcript_num_words + paragraphs_num_words)
    vectors = vectorizer.toarray()

    n_transcript = len(transcript_num_words)

    for i in range(len(paragraphs_num_words)):

        # Paragraph vectors come after the transcript vectors in the matrix.
        paragraph_vector = vectors[n_transcript + i]

        # Cosine similarity of this paragraph against every transcript window.
        similarities = cosine_similarity(vectors[:n_transcript], paragraph_vector.reshape(1, -1))
        best_match_index = int(np.argmax(similarities))

        paragraphs[i]['matched_index'] = best_match_index
        paragraphs[i]['matched_text'] = transcript[best_match_index]['text']
        # Rewind 2s so playback starts slightly before the paragraph; clamp at 0.
        paragraphs[i]['start_time'] = max(0, int(transcript[best_match_index]['start']) - 2)

    return paragraphs

########################### Step 4: Generate table of content ###########################


# System prompt for Step 4: given the paragraphs as JSON, ask the LLM to group
# consecutive paragraphs into chapters and return a JSON object of the form
# {"chapters": [{"start_paragraph_number": int, "title": str}, ...]}.
# Consumed with response_format={"type": "json_object"} in paragraphs_to_toc.
system_prompt_paragraphs_to_toc = """

	You are a helpful assistant.

	You are given a transcript of a course in JSON format as a list of paragraphs, each containing 'paragraph_number' and 'paragraph_text' keys.

	Your task is to group consecutive paragraphs in chapters for the course and identify meaningful chapter titles.

	Here are the steps to follow:

1. Read the transcript carefully to understand its general structure and the main topics covered.
2. Look for clues that a new chapter is about to start. This could be a change of topic, a change of time or setting, the introduction of new themes or topics, or the speaker's explicit mention of a new part.
3. For each chapter, keep track of the paragraph number that starts the chapter and identify a meaningful chapter title.
4. Chapters should ideally be equally spaced throughout the transcript, and discuss a specific topic.
5. A chapter MUST have more than 4 paragraphs.

	Format your result in JSON, with a list dictionaries for chapters, with 'start_paragraph_number':integer and 'title':string as key:value.

	Example: 
    {"chapters": 
        [{"start_paragraph_number": 0, "title": "Introduction"}, 
         {"start_paragraph_number": 10, "title": "Chapter 1"}
        ]
    }

"""


def paragraphs_to_toc(paragraphs, llm_client, llm_model, chunk_size=100):
    """Build a table of contents (chapter list) from paragraphs via the LLM.

    Paragraphs are sent in chunks of chunk_size; the last chapter of each LLM
    response is treated as tentative and re-submitted with the next chunk so
    chapter boundaries near a chunk edge can be revised.

    Parameters:
        paragraphs: list of dicts with 'paragraph_number' and 'paragraph_text'.
        llm_client, llm_model: forwarded to call_llm.
        chunk_size: number of paragraphs per LLM request.

    Returns:
        (chapters, total_input_tokens, total_output_tokens, total_price) where
        chapters is a list of {'start_paragraph_number': int, 'title': str}.
    """
    chapters = []
    # Last batch returned by the LLM. Fixed: initialized here so an empty
    # paragraphs list no longer raises UnboundLocalError after the loop.
    chapters_chunk = []
    number_last_chapter = 0

    total_nb_input_tokens, total_nb_output_tokens, total_price = 0, 0, 0

    while number_last_chapter < len(paragraphs):

        print(number_last_chapter)

        chunk = paragraphs[number_last_chapter:(number_last_chapter + chunk_size)]
        chunk = [{'paragraph_number': p['paragraph_number'], 'paragraph_text': p['paragraph_text']} for p in chunk]

        chunk_json_dump = json.dumps(chunk)

        content, nb_input_tokens, nb_output_tokens, price = call_llm(
            llm_client, llm_model,
            system_prompt_paragraphs_to_toc, chunk_json_dump,
            temperature=0, seed=42, response_format={"type": "json_object"})

        total_nb_input_tokens += nb_input_tokens
        total_nb_output_tokens += nb_output_tokens

        chapters_chunk = json.loads(content)['chapters']

        # Fixed: guard against an empty chapter list from the LLM, which used
        # to raise IndexError on chapters_chunk[-1] below.
        if not chapters_chunk:
            break

        # No progress since the previous round: stop to avoid looping forever.
        if number_last_chapter == chapters_chunk[-1]['start_paragraph_number']:
            break

        # Keep all but the tentative last chapter; resume from its start.
        chapters += chapters_chunk[:-1]

        number_last_chapter = chapters_chunk[-1]['start_paragraph_number']
        if number_last_chapter >= len(paragraphs) - 5:
            break

    # Price is recomputed from the cumulative token counts.
    total_price = (total_nb_input_tokens * price_token[llm_model]['input'] +
                   total_nb_output_tokens * price_token[llm_model]['output'])

    if chapters_chunk:
        chapters += [chapters_chunk[-1]]

    return chapters, total_nb_input_tokens, total_nb_output_tokens, total_price


########################### Step 5: Chapter rendering functions ###########################

def get_chapters(paragraphs, table_of_content):
    """Combine timestamped paragraphs with the table of contents into chapter
    dicts ready for rendering.

    Parameters:
        paragraphs: list of dicts with 'paragraph_text' and 'start_time' keys.
        table_of_content: list of dicts with 'start_paragraph_number' and 'title'.

    Returns:
        A list of chapter dicts with num_chapter, title, paragraph range,
        start/end times, and the chapter's paragraph texts and timestamps.
    """
    chapters = []

    for i, toc_entry in enumerate(table_of_content):

        start_paragraph = toc_entry['start_paragraph_number']

        # A chapter ends where the next one starts; the last chapter runs to
        # the end of the transcript (its end_time is the final paragraph's
        # start_time — the actual video end is unknown here).
        if i < len(table_of_content) - 1:
            end_paragraph = table_of_content[i + 1]['start_paragraph_number']
            end_time = paragraphs[end_paragraph]['start_time']
        else:
            end_paragraph = len(paragraphs)
            end_time = paragraphs[-1]['start_time']

        chapter = {'num_chapter': i,
                   'title': toc_entry['title'],
                   'start_paragraph_number': start_paragraph,
                   'end_paragraph_number': end_paragraph,
                   'start_time': paragraphs[start_paragraph]['start_time'],
                   'end_time': end_time,
                   }

        chapter['paragraphs'] = [paragraphs[j]['paragraph_text']
                                 for j in range(start_paragraph, end_paragraph)]
        chapter['paragraph_timestamps'] = [paragraphs[j]['start_time']
                                           for j in range(start_paragraph, end_paragraph)]

        chapters.append(chapter)

    return chapters

def convert_seconds_to_hms(seconds):
    """Format a duration given in whole seconds as an HH:MM:SS string."""
    hours, remainder = divmod(seconds, 3600)
    minutes, remaining_seconds = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{remaining_seconds:02}"

def toc_to_html(chapters):
    """Render the chapter list as an HTML table of contents: one timestamped
    anchor link per chapter, pointing at the matching section id."""
    parts = ["<h1>Video chapters</h1><p>\n"]

    for chapter in chapters:
        num_chapter = chapter['num_chapter']
        title = chapter['title']
        from_to = convert_seconds_to_hms(int(chapter['start_time'])) + " - "
        parts.append(f"""{from_to}<a href = "#{num_chapter}" >{num_chapter+1} - {title}</a><br>\n""")

    return "".join(parts)


def section_to_html(section_json_data):
    """Render one chapter dict (as produced by get_chapters) into an HTML
    fragment: a linked title, the time range, and one timestamped row per
    paragraph (Bootstrap grid classes)."""
    formatted_section = ""

    paragraphs = section_json_data['paragraphs']
    paragraphs_timestamp_hms = [convert_seconds_to_hms(int(t))
                                for t in section_json_data['paragraph_timestamps']]

    # Fixed: dropped the unused enumerate index from the original loop.
    for paragraph, paragraph_timestamp_hms in zip(paragraphs, paragraphs_timestamp_hms):

        formatted_section += f"""
        <div class="row mb-4">
            <div class="col-md-1">
                {paragraph_timestamp_hms}
            </div>
            <div class="col-md-11">
                <p>{paragraph}</p>
            </div>
        </div>"""

    num_section = section_json_data['num_chapter']

    from_to = "From "+convert_seconds_to_hms(int(section_json_data['start_time'])) + " to " + convert_seconds_to_hms(
        int(section_json_data['end_time']))

    title = section_json_data['title']

    # Fixed: the original emitted a stray '" ' inside the div tag
    # (<div class="..." " id="...">), producing invalid HTML.
    title_link = f"""<div class="transcript-title-icon" id="{num_section}">{num_section+1} - {title}</div>"""

    summary_section = f"""
            <h2>{title_link}</h2>
            {from_to}
            <p>
            <div class="summary-section">
                <div class="summary-text" >
                    {formatted_section}
                </div>
            </div>
            """

    return summary_section


def get_result_as_html(chapters, video_id):
    """Assemble the full result page: embedded YouTube player, table of
    contents, and the structured transcript (one section per chapter).

    Parameters:
        chapters: chapter dicts as produced by get_chapters.
        video_id: YouTube video id used in the embed URL.

    Returns:
        A single HTML string (Bootstrap CSS pulled from a CDN).
    """
    video_embed = f"""
<iframe width="100%" height="400" src="https://www.youtube.com/embed/{video_id}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
"""

    toc = toc_to_html(chapters)

    # Plain string literal: nothing to interpolate (the original used a
    # needless f-string here).
    edited_transcript = """
<h1>Structured transcript</h1>
<p>
"""

    # Iterate chapters directly instead of indexing by range(len(...)).
    for chapter_json_data in chapters:
        edited_transcript += section_to_html(chapter_json_data)

    result_as_html = f"""
<link href="https://cdn.jsdelivr.net/npm/[email protected]/dist/css/bootstrap.min.css" rel="stylesheet">
<div class="container mt-4">
    <div class="content">
        {video_embed}
    </div>
    <p>
    <div class="content">
        {toc}
    </div>
    <p>
    <div class="content">
        {edited_transcript}
    </div>
</div>"""

    return result_as_html

def load_json_chapters(video_id):
    """Load previously saved chapters for a video from '<video_id>.json'.

    Raises FileNotFoundError if the file does not exist, and
    json.JSONDecodeError if its contents are not valid JSON.
    """
    file_name = f"{video_id}.json"
    # Explicit encoding: relying on the platform default breaks on Windows
    # when transcripts contain non-ASCII characters.
    with open(file_name, 'r', encoding='utf-8') as file:
        chapters = json.load(file)

    return chapters