import gradio as gr
import pixeltable as pxt
from pixeltable.functions.huggingface import clip_image, clip_text
from pixeltable.iterators import FrameIterator
import logging

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Pixeltable directory and table/view names
PROJECT_DIR = 'video_search'
VIDEOS_TABLE = f'{PROJECT_DIR}.videos'
FRAMES_VIEW = f'{PROJECT_DIR}.frames'


# Process the uploaded video and build the frame embedding index
def process_video(video_file, progress=gr.Progress()):
    if video_file is None:
        return "Please upload a video file first."
    try:
        progress(0, desc="Initializing...")
        logger.info(f"Processing video: {video_file.name}")

        # Pixeltable setup: start from a clean directory on each run
        pxt.drop_dir(PROJECT_DIR, force=True)
        pxt.create_dir(PROJECT_DIR)

        # Create the video table
        video_table = pxt.create_table(VIDEOS_TABLE, {'video': pxt.Video})

        # Create a view that extracts one frame per second from each video
        frames_view = pxt.create_view(
            FRAMES_VIEW,
            video_table,
            iterator=FrameIterator.create(video=video_table.video, fps=1)
        )

        progress(0.2, desc="Inserting video...")
        video_table.insert([{'video': video_file.name}])

        progress(0.4, desc="Creating embedding index...")
        # Use the same CLIP model for both text and image embeddings so that
        # text queries and image queries share a single vector space
        clip_model = 'openai/clip-vit-base-patch32'
        frames_view.add_embedding_index(
            'frame',
            string_embed=clip_text.using(model_id=clip_model),
            image_embed=clip_image.using(model_id=clip_model)
        )

        progress(1.0, desc="Processing complete")
        return "✅ Video processed successfully! You can now search for specific moments using text or images."
    except Exception as e:
        logger.error(f"Error processing video: {str(e)}")
        return f"Error processing video: {str(e)}"


# Perform a similarity search over the indexed frames
def similarity_search(query, search_type, num_results, progress=gr.Progress()):
    try:
        if not query:
            return []

        frames_view = pxt.get_table(FRAMES_VIEW)
        if frames_view is None:
            return []

        progress(0.5, desc="Performing search...")
        # The embedding index accepts either a text string or a PIL image,
        # so the same call serves both search types
        sim = frames_view.frame.similarity(query)
        results = (
            frames_view.order_by(sim, asc=False)
            .limit(num_results)
            .select(frames_view.frame, similarity=sim)
            .collect()
        )
        progress(1.0, desc="Search complete")
        return [row['frame'] for row in results]
    except Exception as e:
        logger.error(f"Error during search: {str(e)}")
        return []


# CSS for better styling
css = """
.container { max-width: 1200px; margin: 0 auto; }
.header { display: flex; align-items: center; margin-bottom: 20px; }
.header img { max-width: 120px; margin-right: 20px; }
.step-header { background-color: #f5f5f5; padding: 10px; border-radius: 5px; margin-bottom: 15px; }
.examples-section { margin-top: 30px; }
"""

# Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    gr.HTML(
        """
        <div class="header">
            <div>
                <h1>Video Frame Search with AI</h1>
                <p>Search through video content using natural language or images, powered by Pixeltable.</p>
            </div>
        </div>
""" ) with gr.Row(): with gr.Column(scale=1): gr.HTML('

1. Insert video

') video_file = gr.File(label="Upload Video", file_types=["video"]) process_button = gr.Button("Process Video", variant="primary") process_output = gr.Textbox(label="Status", lines=2) gr.HTML('

            search_type = gr.Radio(
                ["Text", "Image"],
                label="Search Type",
                value="Text",
                info="Choose whether to search using text or an image"
            )
            text_input = gr.Textbox(
                label="Text Query",
                placeholder="Describe what you're looking for...",
                info="Example: 'person walking' or 'red car'"
            )
            # Hidden until "Image" search is selected; gr.Image does not take
            # an `info` kwarg, so the upload hint is folded into the label
            image_input = gr.Image(
                label="Image Query (upload an image to find similar frames)",
                type="pil",
                visible=False
            )
            num_results = gr.Slider(
                minimum=1,
                maximum=20,
                value=5,
                step=1,
                label="Number of Results",
                info="How many matching frames to display"
            )
            search_button = gr.Button("Search", variant="primary")

        with gr.Column(scale=2):
            gr.HTML('<div class="step-header"><h3>3. Visualize results</h3></div>')
            results_gallery = gr.Gallery(
                label="Search Results",
                columns=3,
                allow_preview=True,
                object_fit="contain"
            )

    with gr.Accordion("Example Videos", open=False):
        gr.Markdown("Click one of the examples below to get started")
        gr.Examples(
            examples=[
                ["bangkok.mp4"],
                ["lotr.mp4"],
                ["mi.mp4"],
            ],
            inputs=[video_file],
            outputs=[process_output],
            fn=process_video,
            cache_examples=True
        )

    # Handle UI interactions
    def update_search_input(choice):
        # Show the text box for text search, the image upload for image search
        return gr.update(visible=choice == "Text"), gr.update(visible=choice == "Image")

    search_type.change(update_search_input, search_type, [text_input, image_input])

    process_button.click(
        process_video,
        inputs=[video_file],
        outputs=[process_output]
    )

    def perform_search(search_type, text_query, image_query, num_results):
        query = text_query if search_type == "Text" else image_query
        if query is None or (isinstance(query, str) and query.strip() == ""):
            # Returning a component instance updates only the given props
            return gr.Gallery(label="Please enter a valid search query")
        return similarity_search(query, search_type, num_results)

    search_button.click(
        perform_search,
        inputs=[search_type, text_input, image_input, num_results],
        outputs=[results_gallery]
    )

    # JS-only handler: blur the active control when the search type changes
    search_type.change(None, None, None, js="() => {document.activeElement.blur();}")

    text_input.submit(
        perform_search,
        inputs=[search_type, text_input, image_input, num_results],
        outputs=[results_gallery]
    )

if __name__ == "__main__":
    demo.launch()
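
# A minimal usage sketch, assuming this file is saved as app.py and that
# gradio and pixeltable are installed (the CLIP functions additionally pull in
# Hugging Face model dependencies such as transformers/torch):
#
#   python app.py
#
# demo.launch() serves the UI at http://127.0.0.1:7860 by default.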