Spaces:
Running
Running
File size: 4,775 Bytes
e29f761 ea1af87 e29f761 ea1af87 e29f761 ea1af87 e29f761 ea1af87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import gradio as gr
import os
import json
from youtube_transcript_api import YouTubeTranscriptApi
from openai import OpenAI
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def gradio_video_id_to_transcript(video_id):
    """Fetch the English transcript of a YouTube video for the Gradio UI.

    Args:
        video_id: YouTube video identifier forwarded to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        A dict keyed by Gradio components: ``output_transcript`` receives a
        JSON preview of the first ten entries (``start`` and ``text`` only)
        followed by ``'...'``; ``gv_transcript`` receives the transcript
        exactly as returned by the API.
    """
    full_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # The preview is limited to ten entries and to the two displayed fields.
    preview_entries = [
        dict(start=entry['start'], text=entry['text'])
        for entry in full_transcript[:10]
    ]
    preview_text = json.dumps(preview_entries, indent=2) + '...'

    return {
        output_transcript: preview_text,
        gv_transcript: full_transcript,
    }
def gradio_transcript_to_paragraphs(gv_transcript_value):
    """Convert the stored transcript into paragraphs and preview them.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state;
            ``None`` until the transcript step has been run.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs`` receives a
        JSON preview of the first four paragraphs followed by ``'...'``;
        ``gv_paragraphs`` receives the full paragraph list.

    Raises:
        gr.Error: If no transcript has been stored yet, so the UI shows a
            readable message instead of an opaque failure inside the helper.
    """
    if gv_transcript_value is None:
        raise gr.Error("No transcript available. Click 'Get transcript' first.")

    # The helper also reports token counts and price; they are not used here.
    paragraphs, _nb_input_tokens, _nb_output_tokens, _price = transcript_to_paragraphs(
        gv_transcript_value, openai_client, openai_model, chunk_size=5000
    )
    paragraphs_formatted_str = json.dumps(paragraphs[0:4], indent=2) + '...'
    return {
        output_paragraphs: paragraphs_formatted_str,
        gv_paragraphs: paragraphs,
    }
def gradio_paragraphs_to_toc(gv_paragraphs_value):
    """Build a table of contents from the stored paragraphs and preview it.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state;
            ``None`` until the paragraphs step has been run.

    Returns:
        A dict keyed by Gradio components: ``output_toc`` receives a JSON
        preview of the first four table-of-contents entries followed by
        ``'...'``; ``gv_toc`` receives the full table of contents.

    Raises:
        gr.Error: If no paragraphs have been stored yet, so the UI shows a
            readable message instead of an opaque failure inside the helper.
    """
    if gv_paragraphs_value is None:
        raise gr.Error("No paragraphs available. Click 'Get paragraphs' first.")

    # The helper also reports token counts and price; they are not used here.
    json_toc, _nb_input_tokens, _nb_output_tokens, _price = paragraphs_to_toc(
        gv_paragraphs_value, openai_client, openai_model, chunk_size=100
    )
    json_toc_formatted_str = json.dumps(json_toc[0:4], indent=2) + '...'
    return {
        output_toc: json_toc_formatted_str,
        gv_toc: json_toc,
    }
def gradio_get_paragraphs_timestamps(gv_transcript_value, gv_paragraphs_value):
    """Add timestamps to the stored paragraphs and preview the result.

    Args:
        gv_transcript_value: Current value of the ``gv_transcript`` state;
            ``None`` until the transcript step has been run.
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state;
            ``None`` until the paragraphs step has been run.

    Returns:
        A dict keyed by Gradio components: ``output_paragraphs_timestamps``
        receives a JSON preview of the first four paragraphs followed by
        ``'...'``; ``gv_paragraphs`` is overwritten with the paragraphs
        returned by ``add_timestamps_to_paragraphs``.

    Raises:
        gr.Error: If the transcript or the paragraphs have not been produced
            yet, so the UI shows a readable message instead of an opaque
            failure inside the helper.
    """
    if gv_transcript_value is None:
        raise gr.Error("No transcript available. Click 'Get transcript' first.")
    if gv_paragraphs_value is None:
        raise gr.Error("No paragraphs available. Click 'Get paragraphs' first.")

    paragraphs = add_timestamps_to_paragraphs(
        gv_transcript_value, gv_paragraphs_value, num_words=50
    )
    paragraphs_formatted_str = json.dumps(paragraphs[0:4], indent=2) + '...'
    return {
        output_paragraphs_timestamps: paragraphs_formatted_str,
        gv_paragraphs: paragraphs,
    }
def gradio_get_chapters(gv_paragraphs_value, gv_toc_value):
    """Combine the stored paragraphs and table of contents into chapters.

    Args:
        gv_paragraphs_value: Current value of the ``gv_paragraphs`` state;
            ``None`` until the paragraphs step has been run.
        gv_toc_value: Current value of the ``gv_toc`` state; ``None`` until
            the table-of-contents step has been run.

    Returns:
        A dict keyed by Gradio components: ``output_chapters`` receives a
        JSON preview of the first four chapters followed by ``'...'``;
        ``gv_chapters`` receives the full chapter list.

    Raises:
        gr.Error: If the paragraphs or the table of contents have not been
            produced yet, so the UI shows a readable message instead of an
            opaque failure inside the helper.
    """
    if gv_paragraphs_value is None:
        raise gr.Error("No paragraphs available. Click 'Get paragraphs' first.")
    if gv_toc_value is None:
        raise gr.Error("No table of contents available. Click 'Get table of contents' first.")

    chapters = get_chapters(gv_paragraphs_value, gv_toc_value)
    chapters_formatted_str = json.dumps(chapters[0:4], indent=2) + '...'
    return {
        output_chapters: chapters_formatted_str,
        gv_chapters: chapters,
    }
def gradio_get_markdown(gv_chapters_value):
    """Render the stored chapters as Markdown for display.

    Args:
        gv_chapters_value: Current value of the ``gv_chapters`` state;
            ``None`` until the chapters step has been run.

    Returns:
        Whatever ``chapters_to_markdown`` returns for the stored chapters.

    Raises:
        gr.Error: If no chapters have been stored yet, so the UI shows a
            readable message instead of an opaque failure inside the helper.
    """
    if gv_chapters_value is None:
        raise gr.Error("No chapters available. Click 'Get chapters' first.")
    return chapters_to_markdown(gv_chapters_value)
# Gradio UI: one section per pipeline step. Each gr.State component holds a
# step's full result so that a later handler can receive it as an input.
with gr.Blocks() as app:
    # Step 1: fetch the transcript for the given video ID.
    gr.Markdown("## Get transcript")
    gv_transcript = gr.State()
    video_id_input = gr.Textbox(label="Video ID", value="ErnWZxJovaM")
    get_transcript_button = gr.Button("Get transcript")
    output_transcript = gr.Textbox(label="Transcript (JSON format - start, text)")
    get_transcript_button.click(
        fn=gradio_video_id_to_transcript,
        inputs=[video_id_input],
        outputs=[output_transcript, gv_transcript],
    )

    # Step 2: turn the transcript into paragraphs.
    gr.Markdown("## Transcript to paragraphs")
    gv_paragraphs = gr.State()
    get_paragraphs_button = gr.Button("Get paragraphs")
    output_paragraphs = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text)"
    )
    get_paragraphs_button.click(
        fn=gradio_transcript_to_paragraphs,
        inputs=[gv_transcript],
        outputs=[output_paragraphs, gv_paragraphs],
    )

    # Step 3: derive a table of contents from the paragraphs.
    gr.Markdown("## Get table of content")
    gv_toc = gr.State()
    get_toc_button = gr.Button("Get table of contents")
    output_toc = gr.Textbox(
        label="Table of content (JSON format - paragraph_number, title)"
    )
    get_toc_button.click(
        fn=gradio_paragraphs_to_toc,
        inputs=[gv_paragraphs],
        outputs=[output_toc, gv_toc],
    )

    # Step 4: add timestamps to the paragraphs. The handler's result is
    # written back into gv_paragraphs, replacing the value from step 2.
    gr.Markdown("## Infer paragraph timestamps with TF-IDF")
    get_timestamps_button = gr.Button("Infer paragraph timestamps")
    output_paragraphs_timestamps = gr.Textbox(
        label="Paragraphs (JSON format - paragraph_number, paragraph_text, start)"
    )
    get_timestamps_button.click(
        fn=gradio_get_paragraphs_timestamps,
        inputs=[gv_transcript, gv_paragraphs],
        outputs=[output_paragraphs_timestamps, gv_paragraphs],
    )

    # Step 5: build chapters from the paragraphs and the table of contents.
    gr.Markdown("## Get chapters")
    gv_chapters = gr.State()
    get_chapters_button = gr.Button("Get chapters")
    output_chapters = gr.Textbox(label="Chapters (JSON format)")
    get_chapters_button.click(
        fn=gradio_get_chapters,
        inputs=[gv_paragraphs, gv_toc],
        outputs=[output_chapters, gv_chapters],
    )

    # Step 6: render the chapters as Markdown.
    gr.Markdown("## Markdown formatting")
    get_markdown_button = gr.Button("Markdown formatting")
    output_markdown = gr.Markdown(label="Chapters (Markdown format)")
    get_markdown_button.click(
        fn=gradio_get_markdown,
        inputs=[gv_chapters],
        outputs=[output_markdown],
    )

app.launch(debug=True)
|