Spaces:
Running
Running
import gradio as gr | |
import os | |
import json | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from openai import OpenAI | |
import numpy as np | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
def gradio_video_id_to_transcript(video_id):
    """Fetch the English transcript of a video and build a short preview.

    Args:
        video_id: Identifier forwarded to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        A dict keyed by Gradio components: ``output_transcript`` receives a
        JSON string holding the ``start`` and ``text`` fields of the first
        10 transcript entries, followed by ``'...'``; ``gv_transcript``
        receives the complete transcript exactly as returned by the API.
    """
    full_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # Only a short excerpt is displayed; the state keeps every entry.
    preview_entries = []
    for segment in full_transcript[0:10]:
        preview_entries.append({'start': segment['start'], 'text': segment['text']})
    preview_json = json.dumps(preview_entries, indent=2) + '...'

    return {
        output_transcript: preview_json,
        gv_transcript: full_transcript,
    }
def gradio_transcript_to_paragraphs(gv_transcript_value):
    """Turn the stored transcript into paragraphs and build a short preview.

    Delegates to ``transcript_to_paragraphs`` with the module-level
    ``openai_client`` / ``openai_model`` and ``chunk_size=5000``. The token
    counts and price that the helper also returns are not used here.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs`` receives a
        JSON string of the first 4 paragraphs followed by ``'...'``;
        ``gv_paragraphs`` receives the full paragraphs result.
    """
    helper_result = transcript_to_paragraphs(
        gv_transcript_value, openai_client, openai_model, chunk_size=5000
    )
    # The helper yields four values; only the paragraphs are needed here.
    all_paragraphs, _input_tokens, _output_tokens, _cost = helper_result

    preview_json = json.dumps(all_paragraphs[0:4], indent=2) + '...'

    return {
        output_paragraphs: preview_json,
        gv_paragraphs: all_paragraphs,
    }
def gradio_paragraphs_to_toc(gv_paragraphs_value):
    """Derive a table of contents from the stored paragraphs.

    Delegates to ``paragraphs_to_toc`` with the module-level
    ``openai_client`` / ``openai_model`` and ``chunk_size=100``. The token
    counts and price that the helper also returns are not used here.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.

    Returns:
        A dict keyed by Gradio components: ``output_toc`` receives a JSON
        string of the first 4 table-of-contents entries followed by
        ``'...'``; ``gv_toc`` receives the full table of contents.
    """
    helper_result = paragraphs_to_toc(
        gv_paragraphs_value, openai_client, openai_model, chunk_size=100
    )
    # The helper yields four values; only the table of contents is needed.
    toc_entries, _input_tokens, _output_tokens, _cost = helper_result

    preview_json = json.dumps(toc_entries[0:4], indent=2) + '...'

    return {
        output_toc: preview_json,
        gv_toc: toc_entries,
    }
def gradio_get_paragraphs_timestamps(gv_transcript_value, gv_paragraphs_value):
    """Attach timestamps to the stored paragraphs and build a short preview.

    Delegates to ``add_timestamps_to_paragraphs`` with ``num_words=50``.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state.
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs_timestamps``
        receives a JSON string of the first 4 resulting paragraphs followed
        by ``'...'``; ``gv_paragraphs`` is overwritten with the helper's
        result so later steps see the timestamped paragraphs.
    """
    timestamped = add_timestamps_to_paragraphs(
        gv_transcript_value, gv_paragraphs_value, num_words=50
    )
    preview_json = json.dumps(timestamped[0:4], indent=2) + '...'

    return {
        output_paragraphs_timestamps: preview_json,
        gv_paragraphs: timestamped,
    }
def gradio_get_chapters(gv_paragraphs_value, gv_toc_value):
    """Build chapters from the stored paragraphs and table of contents.

    Delegates to ``get_chapters``.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state.
        gv_toc_value: Current value of the ``gv_toc`` state.

    Returns:
        A dict keyed by Gradio components: ``output_chapters`` receives a
        JSON string of the first 4 chapters followed by ``'...'``;
        ``gv_chapters`` receives the full chapters result.
    """
    all_chapters = get_chapters(gv_paragraphs_value, gv_toc_value)
    preview_json = json.dumps(all_chapters[0:4], indent=2) + '...'

    return {
        output_chapters: preview_json,
        gv_chapters: all_chapters,
    }
def gradio_get_markdown(gv_chapters_value):
    """Render the stored chapters through ``chapters_to_markdown``.

    Args:
        gv_chapters_value: Current value of the ``gv_chapters`` state.

    Returns:
        Whatever ``chapters_to_markdown`` returns; the UI wiring sends it
        to the ``output_markdown`` component.
    """
    return chapters_to_markdown(gv_chapters_value)
# UI layout: one section per pipeline step. Each step has a button, a preview
# component, and (where needed) a gr.State that carries the full result on to
# the following steps. Components are created in display order.
with gr.Blocks() as app:
    # Step 1: video ID -> transcript.
    gr.Markdown("## Get transcript")
    gv_transcript = gr.State()
    video_id_input = gr.Textbox(label="Video ID", value="ErnWZxJovaM")
    get_transcript_button = gr.Button("Get transcript")
    output_transcript = gr.Textbox(label="Transcript (JSON format - start, text)")
    get_transcript_button.click(
        fn=gradio_video_id_to_transcript,
        inputs=[video_id_input],
        outputs=[output_transcript, gv_transcript],
    )

    # Step 2: transcript -> paragraphs.
    gr.Markdown("## Transcript to paragraphs")
    gv_paragraphs = gr.State()
    get_paragraphs_button = gr.Button("Get paragraphs")
    output_paragraphs = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text)"
    )
    get_paragraphs_button.click(
        fn=gradio_transcript_to_paragraphs,
        inputs=[gv_transcript],
        outputs=[output_paragraphs, gv_paragraphs],
    )

    # Step 3: paragraphs -> table of contents.
    gr.Markdown("## Get table of content")
    gv_toc = gr.State()
    get_toc_button = gr.Button("Get table of contents")
    output_toc = gr.Textbox(
        label="Table of content (JSON format - paragraph_number, title)"
    )
    get_toc_button.click(
        fn=gradio_paragraphs_to_toc,
        inputs=[gv_paragraphs],
        outputs=[output_toc, gv_toc],
    )

    # Step 4: add timestamps to paragraphs; the result replaces gv_paragraphs.
    gr.Markdown("## Infer paragraph timestamps with TF-IDF")
    get_timestamps_button = gr.Button("Infer paragraph timestamps")
    output_paragraphs_timestamps = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text, start)"
    )
    get_timestamps_button.click(
        fn=gradio_get_paragraphs_timestamps,
        inputs=[gv_transcript, gv_paragraphs],
        outputs=[output_paragraphs_timestamps, gv_paragraphs],
    )

    # Step 5: paragraphs + table of contents -> chapters.
    gr.Markdown("## Get chapters")
    gv_chapters = gr.State()
    get_chapters_button = gr.Button("Get chapters")
    output_chapters = gr.Textbox(label="Chapters (JSON format)")
    get_chapters_button.click(
        fn=gradio_get_chapters,
        inputs=[gv_paragraphs, gv_toc],
        outputs=[output_chapters, gv_chapters],
    )

    # Step 6: chapters -> rendered Markdown.
    gr.Markdown("## Markdown formatting")
    get_markdown_button = gr.Button("Markdown formatting")
    output_markdown = gr.Markdown(label="Chapters (Markdown format)")
    get_markdown_button.click(
        fn=gradio_get_markdown,
        inputs=[gv_chapters],
        outputs=[output_markdown],
    )

# NOTE(review): the original indentation was lost, so it is unclear whether
# launch() sat inside the `with` block; it is placed after it here - confirm.
app.launch(debug=True)