import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu

MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
CLIP_LEN = 32
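# Note: this zero-shot X-CLIP checkpoint samples 32 frames per clip, so
# CLIP_LEN should stay in sync with the grid layout in concatenate_frames().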

# Run on the GPU when available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the model and processor once at startup; move the model to the selected device
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

def sample_uniform_frame_indices(clip_len, seg_len):
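    """Return clip_len frame indices spread uniformly across a video of
    seg_len frames. If the video is shorter than clip_len, its frames are
    repeated in order until the clip is full."""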
    if seg_len < clip_len:
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.arange(seg_len).tolist() * repeat_factor
        indices = indices[:clip_len]
    else:
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)

def read_video_decord(file_path, indices):
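    """Decode the frames at the given indices with decord and return them
    as a (num_frames, height, width, 3) uint8 numpy array."""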
    # Decode on the CPU; decord's gpu() context requires a CUDA-enabled build
    vr_ctx = cpu(0)
    vr = VideoReader(file_path, num_threads=1, ctx=vr_ctx)
    video = vr.get_batch(indices).asnumpy()
    return video

def concatenate_frames(frames, clip_len):
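    """Paste the sampled frames into a single rows x cols contact sheet so the
    user can see exactly which frames the model scored."""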
    layout = {32: (4, 8)}
    if clip_len not in layout:
        raise ValueError(f"No grid layout defined for clip_len={clip_len}")
    rows, cols = layout[clip_len]
    combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
    frame_iter = iter(frames)
    y_offset = 0
    for i in range(rows):
        x_offset = 0
        for j in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frames[0].shape[1]
        y_offset += frames[0].shape[0]
    return combined_image

def model_interface(uploaded_video, activity):
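    """Gradio handler: sample frames from the uploaded video and score the
    user-supplied activity label against a generic "other" label."""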
    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_decord(uploaded_video, indices)
    concatenated_image = concatenate_frames(video, CLIP_LEN)

    activities_list = [activity, "other"]
    
    # The processor expects each video as a list of frames (H x W x C arrays)
    video_frames = list(video)

    inputs = processor(
        text=activities_list,
        videos=video_frames,
        return_tensors="pt",
        padding=True,
    )
    
    # Move all input tensors to the same device as the model
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
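    # Softmax over the candidate labels converts raw similarity logits into probabilities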
    probs = logits_per_video.softmax(dim=1)

    results_probs = []
    results_logits = []
    max_prob_index = torch.argmax(probs[0]).item()
    for i, current_activity in enumerate(activities_list):
        prob = float(probs[0][i].cpu())
        logit = float(logits_per_video[0][i].cpu())
        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))

    likely_label = activities_list[max_prob_index]
    likely_probability = float(probs[0][max_prob_index].cpu()) * 100

    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]

iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Textbox(value="dancing", label="Desired Activity to Recognize"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Top Prediction")
    ],
    live=False
)

iface.launch()