import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu
# Use the GPU if available; cuDNN autotuning helps with fixed-size inputs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
def sample_uniform_frame_indices(clip_len, seg_len):
    """Return clip_len frame indices sampled uniformly from a video with seg_len frames."""
    if seg_len < clip_len:
        # Video is shorter than the clip: repeat the available frames until
        # there are enough, then truncate to exactly clip_len indices.
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.arange(seg_len).tolist() * repeat_factor
        indices = indices[:clip_len]
    else:
        # Video is long enough: take every (seg_len // clip_len)-th frame.
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)
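# Worked example (illustrative numbers, not from the app): for a 100-frame
# video and clip_len=8, spacing is 100 // 8 = 12, giving indices
# [0, 12, 24, 36, 48, 60, 72, 84]; for a 5-frame video the frames repeat:
# [0, 1, 2, 3, 4, 0, 1, 2].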
def read_video_decord(file_path, indices):
    """Decode only the requested frame indices; returns a (T, H, W, 3) uint8 array."""
    vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
    video = vr.get_batch(indices).asnumpy()
    return video
def concatenate_frames(frames, clip_len):
    """Tile the sampled frames into a single grid image for display."""
    # Grid shape (rows, cols) for each supported clip length.
    layout = {
        32: (4, 8),
        16: (4, 4),
        8: (2, 4)
    }
    rows, cols = layout[clip_len]
    combined_image = Image.new('RGB', (frames[0].shape[1] * cols, frames[0].shape[0] * rows))
    frame_iter = iter(frames)
    y_offset = 0
    for i in range(rows):
        x_offset = 0
        for j in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frames[0].shape[1]
        y_offset += frames[0].shape[0]
    return combined_image
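# For instance, with clip_len=32 and 224x224 frames the layout is 4 rows by
# 8 columns, so the returned preview image is 1792x896 pixels.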
def model_interface(uploaded_video, model_choice, activity):
    # Each checkpoint expects a fixed number of frames per clip.
    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8
    }.get(model_choice, 32)
    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_decord(uploaded_video, indices)
    concatenated_image = concatenate_frames(video, clip_len)
    # Zero-shot setup: score the requested activity against a generic "other" label.
    activities_list = [activity, "other"]
    # The processor runs on CPU, so only the model is moved to the device.
    # (Reloading both on every call is simple but slow; caching them at module
    # level would avoid repeated loads.)
    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice).to(device)
    # decord returns the clip as a single (T, H, W, 3) uint8 array; the X-CLIP
    # processor expects `videos` as a list of frames, so unpack it.
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )
    # Move the input tensors to the same device as the model before inference.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)
    results_probs = []
    results_logits = []
    max_prob_index = torch.argmax(probs[0]).item()
    for i, current_activity in enumerate(activities_list):
        prob = float(probs[0][i])
        logit = float(logits_per_video[0][i])
        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))
    likely_label = activities_list[max_prob_index]
    likely_probability = float(probs[0][max_prob_index]) * 100
    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(choices=[
            "microsoft/xclip-base-patch16-zero-shot",
            #"microsoft/xclip-base-patch32-16-frames",
            #"microsoft/xclip-base-patch32"
        ], label="Model Choice"),
        # Gradio 3.x takes the initial contents via `value`, not `default`.
        gr.components.Textbox(value="dancing", label="Desired Activity to Recognize"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Top Prediction")
    ],
    live=False
)
iface.launch()
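# Optional tweak (not part of the original Space): when running outside
# Spaces, launch(share=True) would also expose a temporary public URL, e.g.:
# iface.launch(share=True)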