Yannael_LB
Update
ea1af87
raw
history blame
4.78 kB
import gradio as gr
import os
import json
from youtube_transcript_api import YouTubeTranscriptApi
from openai import OpenAI
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def gradio_video_id_to_transcript(video_id):
    """Fetch the English transcript of a YouTube video for the Gradio UI.

    Args:
        video_id: YouTube video ID as typed into the "Video ID" textbox.
            Surrounding whitespace (e.g. from copy-paste) is removed
            before the lookup.

    Returns:
        A dict keyed by Gradio components: ``output_transcript`` receives a
        JSON preview of the first 10 entries (``start`` and ``text`` only,
        followed by ``'...'``) and ``gv_transcript`` receives the full
        transcript as returned by ``YouTubeTranscriptApi.get_transcript``.
    """
    # The textbox is free-form input; a stray space or newline would
    # otherwise be sent to the API as part of the ID.
    video_id = (video_id or "").strip()
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # Only a short preview is rendered; the complete transcript goes to state.
    preview_entries = [
        {'start': entry['start'], 'text': entry['text']}
        for entry in transcript[0:10]
    ]
    transcript_formatted_str = json.dumps(preview_entries, indent=2) + '...'
    return {
        output_transcript: transcript_formatted_str,
        gv_transcript: transcript,
    }
def gradio_transcript_to_paragraphs(gv_transcript_value):
    """Turn the stored transcript into paragraphs via ``transcript_to_paragraphs``.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs`` receives a
        JSON preview of the first 4 paragraphs followed by ``'...'`` and
        ``gv_paragraphs`` receives the full paragraph list.
    """
    # The helper returns four values; only the first one is used here.
    paragraphs, _nb_input_tokens, _nb_output_tokens, _price = transcript_to_paragraphs(
        gv_transcript_value,
        openai_client,
        openai_model,
        chunk_size=5000,
    )
    preview = json.dumps(paragraphs[:4], indent=2) + '...'
    return {
        output_paragraphs: preview,
        gv_paragraphs: paragraphs,
    }
def gradio_paragraphs_to_toc(gv_paragraphs_value):
    """Build a table of contents from the stored paragraphs via ``paragraphs_to_toc``.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.

    Returns:
        A dict keyed by Gradio components: ``output_toc`` receives a JSON
        preview of the first 4 table-of-contents entries followed by
        ``'...'`` and ``gv_toc`` receives the full table of contents.
    """
    # The helper returns four values; only the first one is used here.
    toc, _nb_input_tokens, _nb_output_tokens, _price = paragraphs_to_toc(
        gv_paragraphs_value,
        openai_client,
        openai_model,
        chunk_size=100,
    )
    preview = json.dumps(toc[:4], indent=2) + '...'
    return {
        output_toc: preview,
        gv_toc: toc,
    }
def gradio_get_paragraphs_timestamps(gv_transcript_value, gv_paragraphs_value):
    """Attach timestamps to the stored paragraphs via ``add_timestamps_to_paragraphs``.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state.
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs_timestamps``
        receives a JSON preview of the first 4 paragraphs followed by
        ``'...'`` and ``gv_paragraphs`` is overwritten with the helper's
        result.
    """
    timestamped = add_timestamps_to_paragraphs(
        gv_transcript_value,
        gv_paragraphs_value,
        num_words=50,
    )
    preview = json.dumps(timestamped[:4], indent=2) + '...'
    return {
        output_paragraphs_timestamps: preview,
        gv_paragraphs: timestamped,
    }
def gradio_get_chapters(gv_paragraphs_value, gv_toc_value):
    """Combine the stored paragraphs and table of contents via ``get_chapters``.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.
        gv_toc_value: Current value of the ``gv_toc`` state.

    Returns:
        A dict keyed by Gradio components: ``output_chapters`` receives a
        JSON preview of the first 4 chapters followed by ``'...'`` and
        ``gv_chapters`` receives the full chapter list.
    """
    chapter_list = get_chapters(gv_paragraphs_value, gv_toc_value)
    preview = json.dumps(chapter_list[:4], indent=2) + '...'
    return {
        output_chapters: preview,
        gv_chapters: chapter_list,
    }
def gradio_get_markdown(gv_chapters_value):
    """Render the stored chapters with ``chapters_to_markdown``.

    Args:
        gv_chapters_value: Current value of the ``gv_chapters`` state.

    Returns:
        Whatever ``chapters_to_markdown`` returns; the UI feeds it to the
        ``output_markdown`` component.
    """
    return chapters_to_markdown(gv_chapters_value)
# UI layout: six sections, each with a button that runs one handler.
# Intermediate results are passed between handlers through gr.State
# components (gv_*).
with gr.Blocks() as app:
    # --- Step 1: video ID -> transcript ---
    gr.Markdown("## Get transcript")
    gv_transcript = gr.State()
    video_id_input = gr.Textbox(label="Video ID", value="ErnWZxJovaM")
    get_transcript_button = gr.Button("Get transcript")
    output_transcript = gr.Textbox(label="Transcript (JSON format - start, text)")
    get_transcript_button.click(
        fn=gradio_video_id_to_transcript,
        inputs=[video_id_input],
        outputs=[output_transcript, gv_transcript],
    )

    # --- Step 2: transcript -> paragraphs ---
    gr.Markdown("## Transcript to paragraphs")
    gv_paragraphs = gr.State()
    get_paragraphs_button = gr.Button("Get paragraphs")
    output_paragraphs = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text)"
    )
    get_paragraphs_button.click(
        fn=gradio_transcript_to_paragraphs,
        inputs=[gv_transcript],
        outputs=[output_paragraphs, gv_paragraphs],
    )

    # --- Step 3: paragraphs -> table of contents ---
    gr.Markdown("## Get table of content")
    gv_toc = gr.State()
    get_toc_button = gr.Button("Get table of contents")
    output_toc = gr.Textbox(
        label="Table of content (JSON format - paragraph_number, title)"
    )
    get_toc_button.click(
        fn=gradio_paragraphs_to_toc,
        inputs=[gv_paragraphs],
        outputs=[output_toc, gv_toc],
    )

    # --- Step 4: add timestamps to paragraphs ---
    # This step writes its result back into gv_paragraphs.
    gr.Markdown("## Infer paragraph timestamps with TF-IDF")
    get_timestamps_button = gr.Button("Infer paragraph timestamps")
    output_paragraphs_timestamps = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text, start)"
    )
    get_timestamps_button.click(
        fn=gradio_get_paragraphs_timestamps,
        inputs=[gv_transcript, gv_paragraphs],
        outputs=[output_paragraphs_timestamps, gv_paragraphs],
    )

    # --- Step 5: paragraphs + table of contents -> chapters ---
    gr.Markdown("## Get chapters")
    gv_chapters = gr.State()
    get_chapters_button = gr.Button("Get chapters")
    output_chapters = gr.Textbox(label="Chapters (JSON format)")
    get_chapters_button.click(
        fn=gradio_get_chapters,
        inputs=[gv_paragraphs, gv_toc],
        outputs=[output_chapters, gv_chapters],
    )

    # --- Step 6: chapters -> rendered Markdown ---
    gr.Markdown("## Markdown formatting")
    get_markdown_button = gr.Button("Markdown formatting")
    output_markdown = gr.Markdown(label="Chapters (Markdown format)")
    get_markdown_button.click(
        fn=gradio_get_markdown,
        inputs=[gv_chapters],
        outputs=[output_markdown],
    )

# Start the app with Gradio's debug flag enabled.
app.launch(debug=True)