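# app.py - Gradio demo: text and image similarity search over video frames,
# backed by a Pixeltable CLIP embedding index.
#
# Rough setup sketch (dependency list assumed, not pinned by this file):
#   pip install pixeltable gradio transformers torch
#   python app.py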
import gradio as gr
import pixeltable as pxt
from pixeltable.functions.huggingface import clip
from pixeltable.iterators import FrameIterator

# Process video and create index
def process_video(video_file, progress=gr.Progress()):
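    """Set up a fresh Pixeltable directory, extract frames from the video, and build a CLIP embedding index."""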
    progress(0, desc="Initializing...")

    # Pixeltable setup
    pxt.drop_dir('video_search', force=True)
    pxt.create_dir('video_search')

    # Base table that stores the uploaded video
    video_table = pxt.create_table('video_search.videos', {'video': pxt.Video})

    # View that extracts one frame per second from the video via FrameIterator
    frames_view = pxt.create_view(
        'video_search.frames', 
        video_table, 
        iterator=FrameIterator.create(video=video_table.video, fps=1)
    )

    progress(0.2, desc="Inserting video...")
    # gr.File may pass a tempfile wrapper or a plain path string depending on the Gradio version
    video_path = video_file.name if hasattr(video_file, 'name') else video_file
    video_table.insert([{'video': video_path}])
    
    progress(0.4, desc="Creating embedding index...")
    # Index frames with CLIP; the same model embeds both images and text, so one index serves both query types
    frames_view.add_embedding_index(
        'frame',
        image_embed=clip.using(model_id='openai/clip-vit-base-patch32'),
        string_embed=clip.using(model_id='openai/clip-vit-base-patch32')
    )

    progress(1.0, desc="Processing complete")
    return "Good news! Your video has been processed. Easily find the moments you need by searching with text or images."

# Perform similarity search
def similarity_search(query, search_type, num_results, progress=gr.Progress()):
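    """Return the top num_results frames whose CLIP embeddings best match the text or image query."""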
    frames_view = pxt.get_table('video_search.frames')
    
    progress(0.5, desc="Performing search...")
    # The CLIP index accepts either a text string or a PIL image, so both search types use the same call
    sim = frames_view.frame.similarity(query)

    # Rank frames by similarity (highest first) and return the top matches
    results = (
        frames_view.order_by(sim, asc=False)
        .limit(num_results)
        .select(frames_view.frame, sim=sim)
        .collect()
    )
    
    progress(1.0, desc="Search complete")
    return [row['frame'] for row in results]
    
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <div style="margin-bottom: 20px;">
            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/resources/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 150px;" />
            <h2>Text and Image similarity search on video frames with embedding indexes</h2>
        </div>
        """
    )
    gr.HTML(
    """
    <p>
        <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none; font-weight: bold;">Pixeltable</a> is a declarative interface for working with text, images, embeddings, and even video, enabling you to store, transform, index, and iterate on data.
    </p>
    """
    )

    
    with gr.Row():
        with gr.Column(scale=1):

            gr.Markdown(
            """
            <h3>1. Insert video</h3>
            """)
            
            video_file = gr.File(label="Upload Video")
            process_button = gr.Button("Process Video")
            process_output = gr.Textbox(label="Status", lines=2)
            
            gr.Markdown(
            """
            <h3>2. Search video frames</h3>
            """)
            
            search_type = gr.Radio(["Text", "Image"], label="Search Type", value="Text")
            text_input = gr.Textbox(label="Text Query")
            image_input = gr.Image(label="Image Query", type="pil", visible=False)
            num_results = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Results")
            search_button = gr.Button("Search")
        
        with gr.Column(scale=2):

            gr.Markdown(
            """
            <h3>3. Visualize results</h3>
            """)
            
            results_gallery = gr.Gallery(label="Search Results", columns=3)
       
            gr.Examples(
            examples=[
                ["bangkok.mp4"],
                ["lotr.mp4"],
                ["mi.mp4"],
            ],
            label="Click one of the examples below to get started",
            inputs=[video_file],
            fn=process_video
            )
    
    def update_search_input(choice):
        return gr.update(visible=choice=="Text"), gr.update(visible=choice=="Image")

    search_type.change(update_search_input, search_type, [text_input, image_input])
    
    process_button.click(
        process_video,
        inputs=[video_file],
        outputs=[process_output]
    )
    
    def perform_search(search_type, text_query, image_query, num_results):
        query = text_query if search_type == "Text" else image_query
        return similarity_search(query, search_type, num_results)

    search_button.click(
        perform_search,
        inputs=[search_type, text_input, image_input, num_results],
        outputs=[results_gallery]
    )

if __name__ == "__main__":
    demo.launch()