import gradio as gr
from gradio_client import Client
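
# Extra imports for the sketch implementations below. OpenCV, moviepy and
# huggingface_hub are assumed to be available in the Space (add them to
# requirements.txt if they are not).
import os
import tempfile

import cv2
from moviepy.editor import VideoFileClip
from huggingface_hub import InferenceClient
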
# Pipeline:
# 1. Extract and store one frame out of every five from the input video.
# 2. Extract the audio track.
# 3. For each extracted frame, get a caption from the captioning model and collect the captions into a list.
# 4. For the audio, ask the audio question-answering model to describe the sound/scene.
# 5. Give everything to an LLM and ask it to summarize the video from the image captions combined with the audio caption.

def extract_image(video_path, every_n_frames=5):
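    # Step 1 sketch: sample one frame out of every `every_n_frames` with OpenCV.
    # cv2 availability, the temp directory, and the JPEG naming are assumptions,
    # not taken from the original code.
    frames_dir = tempfile.mkdtemp()
    cap = cv2.VideoCapture(video_path)
    saved_frames = []
    frame_idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if frame_idx % every_n_frames == 0:
            frame_path = os.path.join(frames_dir, f"frame_{frame_idx:05d}.jpg")
            cv2.imwrite(frame_path, frame)
            saved_frames.append(frame_path)
        frame_idx += 1
    cap.release()
    return saved_frames
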
def get_moondream(image_path):
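    # Step 3 sketch: caption a single frame by calling a hosted moondream Space
    # through gradio_client. The Space id, prompt, and api_name are placeholders;
    # check the target Space's "Use via API" page for the real endpoint
    # (newer gradio_client versions may also need gradio_client.handle_file).
    moondream_client = Client("fffiloni/moondream1")
    caption = moondream_client.predict(
        image_path,
        "Describe this image.",
        api_name="/predict"
    )
    return caption
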
def get_salmonn(audio_path):
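    # Step 4 sketch: ask a hosted SALMONN Space to describe the audio track.
    # The Space id, prompt, and api_name are placeholders; check the Space's API page.
    salmonn_client = Client("fffiloni/SALMONN-7B-gradio")
    audio_description = salmonn_client.predict(
        audio_path,
        "Please describe the sounds and the scene in this audio clip.",
        api_name="/predict"
    )
    return audio_description
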
def llm_process(image_captions, audio_caption):
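    # Step 5 sketch: combine the frame captions and the audio description into one
    # prompt and ask an LLM to summarize. huggingface_hub's InferenceClient is an
    # assumption here; the model id and prompt wording are placeholder choices.
    llm_client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
    prompt = (
        "Here are captions of frames sampled from a video:\n"
        + "\n".join(f"- {caption}" for caption in image_captions)
        + "\n\nHere is a description of the audio track:\n"
        + f"{audio_caption}\n\n"
        + "Using both, write a short, coherent description of the video."
    )
    return llm_client.text_generation(prompt, max_new_tokens=256)
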
def infer(video_in):
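    # Wire the steps together: sample frames (step 1), caption each one (step 3),
    # extract and describe the audio (steps 2 and 4), then summarize with the LLM
    # (step 5). Audio extraction with moviepy and the temporary .wav path are
    # assumptions, not original code.
    frame_paths = extract_image(video_in)
    image_captions = [get_moondream(frame_path) for frame_path in frame_paths]
    clip = VideoFileClip(video_in)
    audio_caption = "No audio track."
    if clip.audio is not None:
        audio_path = os.path.join(tempfile.mkdtemp(), "audio.wav")
        clip.audio.write_audiofile(audio_path)
        audio_caption = get_salmonn(audio_path)
    video_description = llm_process(image_captions, audio_caption)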
    return video_description

with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">Video description</h2>
        """)
        video_in = gr.Video(label="Video input")
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")
        submit_btn.click(
            fn=infer,
            inputs=[video_in],
            outputs=[video_description]
        )
demo.queue().launch()