import gradio as gr
from gradio_client import Client
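
# Extra imports for the sketch implementations below. OpenCV, moviepy and
# huggingface_hub are assumed to be available in the Space (add them to
# requirements.txt if they are not).
import os
import tempfile

import cv2
from moviepy.editor import VideoFileClip
from huggingface_hub import InferenceClient
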
# Pipeline:
# 1. Extract and store one frame out of every five from the input video.
# 2. Extract the audio track.
# 3. For each extracted frame, get a caption from the captioning model and collect the captions into a list.
# 4. For the audio, ask the audio question-answering model to describe the sound/scene.
# 5. Give everything to an LLM and ask it to summarize the video from the image captions combined with the audio caption.

def extract_image(video_path, every_n_frames=5):
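    # Step 1 sketch: sample one frame out of every `every_n_frames` with OpenCV.
    # cv2 availability, the temp directory, and the JPEG naming are assumptions,
    # not taken from the original code.
    frames_dir = tempfile.mkdtemp()
    cap = cv2.VideoCapture(video_path)
    saved_frames = []
    frame_idx = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if frame_idx % every_n_frames == 0:
            frame_path = os.path.join(frames_dir, f"frame_{frame_idx:05d}.jpg")
            cv2.imwrite(frame_path, frame)
            saved_frames.append(frame_path)
        frame_idx += 1
    cap.release()
    return saved_frames
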
def get_moondream(image_path):
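    # Step 3 sketch: caption a single frame by calling a hosted moondream Space
    # through gradio_client. The Space id, prompt, and api_name are placeholders;
    # check the target Space's "Use via API" page for the real endpoint
    # (newer gradio_client versions may also need gradio_client.handle_file).
    moondream_client = Client("fffiloni/moondream1")
    caption = moondream_client.predict(
        image_path,
        "Describe this image.",
        api_name="/predict"
    )
    return caption
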
def get_salmonn(audio_path):
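    # Step 4 sketch: ask a hosted SALMONN Space to describe the audio track.
    # The Space id, prompt, and api_name are placeholders; check the Space's API page.
    salmonn_client = Client("fffiloni/SALMONN-7B-gradio")
    audio_description = salmonn_client.predict(
        audio_path,
        "Please describe the sounds and the scene in this audio clip.",
        api_name="/predict"
    )
    return audio_description
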
def llm_process(image_captions, audio_caption):
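    # Step 5 sketch: combine the frame captions and the audio description into one
    # prompt and ask an LLM to summarize. huggingface_hub's InferenceClient is an
    # assumption here; the model id and prompt wording are placeholder choices.
    llm_client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
    prompt = (
        "Here are captions of frames sampled from a video:\n"
        + "\n".join(f"- {caption}" for caption in image_captions)
        + "\n\nHere is a description of the audio track:\n"
        + f"{audio_caption}\n\n"
        + "Using both, write a short, coherent description of the video."
    )
    return llm_client.text_generation(prompt, max_new_tokens=256)
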
def infer(video_in):
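    # Wire the steps together: sample frames (step 1), caption each one (step 3),
    # extract and describe the audio (steps 2 and 4), then summarize with the LLM
    # (step 5). Audio extraction with moviepy and the temporary .wav path are
    # assumptions, not original code.
    frame_paths = extract_image(video_in)
    image_captions = [get_moondream(frame_path) for frame_path in frame_paths]
    clip = VideoFileClip(video_in)
    audio_caption = "No audio track."
    if clip.audio is not None:
        audio_path = os.path.join(tempfile.mkdtemp(), "audio.wav")
        clip.audio.write_audiofile(audio_path)
        audio_caption = get_salmonn(audio_path)
    video_description = llm_process(image_captions, audio_caption)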
    return video_description

with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
        <h2 style="text-align: center;">Video description</h2>
        """)
        video_in = gr.Video(label="Video input")
        submit_btn = gr.Button("Submit")
        video_description = gr.Textbox(label="Video description")
        submit_btn.click(
            fn=infer,
            inputs=[video_in],
            outputs=[video_description]
        )
demo.queue().launch()