Spaces:

Pixeltable
/

Text-image-similarity-search-on-video-frames-embedding-indexes

Running

Update app.py

214ca3f verified 6 days ago

7.35 kB

	import gradio as gr
	import pixeltable as pxt
	from pixeltable.functions.huggingface import clip_image, clip_text
	from pixeltable.iterators import FrameIterator
	import os
	import logging

	# Configure logging once at import time: INFO and above, with a timestamped,
	# logger-name-tagged record format.
	logging.basicConfig(
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)

	# Pixeltable paths: a single project directory that holds the table of
	# uploaded videos and the view of frames derived from it.
	PROJECT_DIR = 'video_search'
	VIDEOS_TABLE = f'{PROJECT_DIR}.videos'
	FRAMES_VIEW = f'{PROJECT_DIR}.frames'

	# Process video and create index
	def process_video(video_file, progress=gr.Progress()):
	    """Rebuild the Pixeltable project for one video and index its frames.

	    The project directory is dropped and recreated on every call, so only
	    the most recently processed video is searchable afterwards.

	    Args:
	        video_file: The uploaded file from the Gradio ``File`` input. Either
	            an object exposing the local path as ``.name`` or a plain path
	            string is accepted; ``None`` means nothing was uploaded.
	        progress: Gradio progress tracker used to report the current stage.

	    Returns:
	        A human-readable status string (success or error message). Errors
	        are logged and reported in the string rather than raised.
	    """
	    if video_file is None:
	        return "Please upload a video file first."

	    # Depending on the Gradio version/configuration the upload arrives either
	    # as a temp-file wrapper (path in `.name`) or as a bare path string.
	    video_path = getattr(video_file, 'name', video_file)

	    try:
	        progress(0, desc="Initializing...")
	        logger.info("Processing video: %s", video_path)

	        # Start from a clean slate: remove any previous run's tables/views.
	        pxt.drop_dir(PROJECT_DIR, force=True)
	        pxt.create_dir(PROJECT_DIR)

	        # Base table with a single video column.
	        video_table = pxt.create_table(VIDEOS_TABLE, {'video': pxt.Video})

	        # View over the table that extracts frames from each video (fps=1).
	        frames_view = pxt.create_view(
	            FRAMES_VIEW,
	            video_table,
	            iterator=FrameIterator.create(video=video_table.video, fps=1),
	        )

	        progress(0.2, desc="Inserting video...")
	        video_table.insert([{'video': video_path}])

	        progress(0.4, desc="Creating embedding index...")
	        # The same CLIP checkpoint embeds both text and images, so text and
	        # image queries can be compared against the frame embeddings.
	        clip_model = 'openai/clip-vit-base-patch32'
	        frames_view.add_embedding_index(
	            'frame',
	            string_embed=clip_text.using(model_id=clip_model),
	            image_embed=clip_image.using(model_id=clip_model),
	        )

	        progress(1.0, desc="Processing complete")
	        return "✅ Video processed successfully! You can now search for specific moments using text or images."

	    except Exception as e:
	        # UI boundary: keep the app alive, but record the full traceback.
	        logger.exception("Error processing video: %s", e)
	        return f"Error processing video: {str(e)}"

	# Perform similarity search
	def similarity_search(query, search_type, num_results, progress=gr.Progress()):
	    """Return the indexed frames most similar to a text or image query.

	    Args:
	        query: The text string or image to compare against the frame
	            embedding index. A falsy query yields no results.
	        search_type: The UI's "Text"/"Image" selection. Not used here; kept
	            so existing callers can keep passing it.
	        num_results: Maximum number of frames to return.
	        progress: Gradio progress tracker used to report the current stage.

	    Returns:
	        A list of frames ordered from most to least similar, or an empty
	        list when there is no query, no frames view, or the search fails.
	    """
	    try:
	        if not query:
	            return []

	        frames_view = pxt.get_table(FRAMES_VIEW)
	        if frames_view is None:
	            return []

	        progress(0.5, desc="Performing search...")
	        sim = frames_view.frame.similarity(query)

	        # Coerce the UI-supplied count to int before using it as a row limit.
	        top_matches = (
	            frames_view.order_by(sim, asc=False)
	            .limit(int(num_results))
	            .select(frames_view.frame, similarity=sim)
	            .collect()
	        )

	        progress(1.0, desc="Search complete")
	        return [row['frame'] for row in top_matches]

	    except Exception as e:
	        # UI boundary: an empty gallery is shown, but keep the traceback in
	        # the logs so failures (e.g. no video processed yet) are diagnosable.
	        logger.exception("Error during search: %s", e)
	        return []

	# Custom stylesheet handed to gr.Blocks(css=...). The `header` and
	# `step-header` classes are referenced by the gr.HTML snippets in the UI
	# below; `container` and `examples-section` are defined here but not
	# referenced by any markup visible in this file.
	css = """
	.container {
	max-width: 1200px;
	margin: 0 auto;
	}
	.header {
	display: flex;
	align-items: center;
	margin-bottom: 20px;
	}
	.header img {
	max-width: 120px;
	margin-right: 20px;
	}
	.step-header {
	background-color: #f5f5f5;
	padding: 10px;
	border-radius: 5px;
	margin-bottom: 15px;
	}
	.examples-section {
	margin-top: 30px;
	}
	"""

	# Gradio interface
	with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
	    # Page header: logo plus title/tagline.
	    gr.HTML(
	        """
	        <div class="header">
	            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/resources/pixeltable-logo-large.png" alt="Pixeltable" />
	            <div>
	                <h1>Video Frame Search with AI</h1>
	                <p>Search through video content using natural language or images powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">Pixeltable</a>.</p>
	            </div>
	        </div>
	        """
	    )

	    with gr.Row():
	        # Left column: upload/processing controls and the search form.
	        with gr.Column(scale=1):
	            gr.HTML('<div class="step-header"><h3>1. Insert video</h3></div>')

	            video_file = gr.File(label="Upload Video", file_types=["video"])
	            process_button = gr.Button("Process Video", variant="primary")
	            process_output = gr.Textbox(label="Status", lines=2)

	            gr.HTML('<div class="step-header"><h3>2. Search video frames</h3></div>')

	            search_type = gr.Radio(
	                ["Text", "Image"],
	                label="Search Type",
	                value="Text",
	                info="Choose whether to search using text or an image",
	            )
	            text_input = gr.Textbox(
	                label="Text Query",
	                placeholder="Describe what you're looking for...",
	                info="Example: 'person walking' or 'red car'",
	            )
	            # Hidden until the user switches the search type to "Image".
	            image_input = gr.Image(
	                label="Image Query",
	                type="pil",
	                visible=False,
	                info="Upload an image to find similar frames",
	            )
	            num_results = gr.Slider(
	                minimum=1,
	                maximum=20,
	                value=5,
	                step=1,
	                label="Number of Results",
	                info="How many matching frames to display",
	            )
	            search_button = gr.Button("Search", variant="primary")

	        # Right column: gallery of matching frames.
	        with gr.Column(scale=2):
	            gr.HTML('<div class="step-header"><h3>3. Visualize results</h3></div>')

	            results_gallery = gr.Gallery(
	                label="Search Results",
	                columns=3,
	                allow_preview=True,
	                object_fit="contain",
	            )

	    with gr.Accordion("Example Videos", open=False):
	        gr.Markdown("Click one of the examples below to get started")
	        gr.Examples(
	            examples=[
	                ["bangkok.mp4"],
	                ["lotr.mp4"],
	                ["mi.mp4"],
	            ],
	            inputs=[video_file],
	            outputs=[process_output],
	            fn=process_video,
	            # process_video works through side effects (it rebuilds the
	            # Pixeltable index). Cached examples would only replay the stored
	            # status text without re-indexing the clicked video, so run the
	            # function on every click instead.
	            cache_examples=False,
	            run_on_click=True,
	        )

	    # Handle UI interactions
	    def update_search_input(choice):
	        """Show the text box for "Text" searches and the image box for "Image"."""
	        return gr.update(visible=choice == "Text"), gr.update(visible=choice == "Image")

	    search_type.change(update_search_input, search_type, [text_input, image_input])

	    process_button.click(
	        process_video,
	        inputs=[video_file],
	        outputs=[process_output],
	    )

	    def perform_search(search_type, text_query, image_query, num_results):
	        """Pick the active query for the chosen search type and run the search.

	        Returns a gallery update carrying a hint label when the active query
	        is missing or blank; otherwise the list of matching frames.
	        """
	        query = text_query if search_type == "Text" else image_query
	        if query is None or (isinstance(query, str) and not query.strip()):
	            return gr.Gallery(label="Please enter a valid search query")

	        return similarity_search(query, search_type, num_results)

	    search_inputs = [search_type, text_input, image_input, num_results]

	    search_button.click(
	        perform_search,
	        inputs=search_inputs,
	        outputs=[results_gallery],
	    )

	    # Add keyboard shortcuts
	    # Blur the focused element when the search type changes.
	    # NOTE(review): newer Gradio releases spell this keyword `js` rather than
	    # `_js` — confirm against the Gradio version this app is pinned to.
	    search_type.change(lambda: None, None, None, _js="() => {document.activeElement.blur();}")
	    # Pressing Enter in the text box triggers the same search as the button.
	    text_input.submit(
	        perform_search,
	        inputs=search_inputs,
	        outputs=[results_gallery],
	    )

	# Launch the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()