IbrahimHasani's picture
Update app.py
1bc2256
raw
history blame
4.04 kB
import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu, gpu
import cv2
def cuda_device_label():
    """Return the name of the current CUDA device, or a placeholder on CPU-only hosts.

    Querying the current device without a usable CUDA runtime raises, so the
    availability check must come first.
    """
    if not torch.cuda.is_available():
        return "unavailable (running on CPU)"
    return torch.cuda.get_device_name(torch.cuda.current_device())


# Startup diagnostics: report which accelerator (if any) the app can use.
print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {cuda_device_label()}")
def sample_uniform_frame_indices(clip_len, seg_len):
    """Choose ``clip_len`` frame indices from a video that has ``seg_len`` frames.

    Args:
        clip_len: Number of frame indices to return.
        seg_len: Number of frames available in the video.

    Returns:
        A 1-D ``np.int64`` array of length ``clip_len`` with values in
        ``[0, seg_len)``.

    Raises:
        ValueError: If ``clip_len`` or ``seg_len`` is not positive (for
            example, an empty video).
    """
    if clip_len <= 0 or seg_len <= 0:
        raise ValueError(
            f"clip_len and seg_len must be positive, got clip_len={clip_len}, seg_len={seg_len}"
        )

    positions = np.arange(clip_len, dtype=np.int64)

    if seg_len < clip_len:
        # Fewer frames than requested: loop the video from its start
        # (0, 1, ..., seg_len - 1, 0, 1, ...) until the clip is full.
        return positions % seg_len

    # floor(i * seg_len / clip_len) spreads the samples over the whole video.
    # A single floored stride (seg_len // clip_len) would ignore the tail
    # whenever seg_len is not a multiple of clip_len.
    return (positions * seg_len) // clip_len
def read_video_decord(file_path, indices):
    """Decode the frames at ``indices`` from the video at ``file_path``.

    The decord reader is opened on GPU 0 when torch reports CUDA as
    available, otherwise on CPU 0, and always with a single decoding thread.

    Returns:
        The result of ``get_batch(indices).asnumpy()`` -- presumably a
        (frames, height, width, channels) array; confirm against decord.
    """
    if torch.cuda.is_available():
        device_ctx = gpu(0)
    else:
        device_ctx = cpu(0)

    reader = VideoReader(file_path, num_threads=1, ctx=device_ctx)
    return reader.get_batch(indices).asnumpy()
def concatenate_frames(frames, clip_len):
    """Tile the sampled frames into one contact-sheet image.

    Args:
        frames: Indexable collection of frame arrays. The first frame's
            ``shape[0]`` / ``shape[1]`` are used as the tile height / width
            for every cell.
        clip_len: Number of frames; selects the grid shape (rows, cols):
            32 -> (4, 8), 16 -> (4, 4), 8 -> (2, 4).

    Returns:
        An RGB ``PIL.Image`` with the frames pasted row by row.

    Raises:
        KeyError: If ``clip_len`` is not one of 8, 16 or 32.
    """
    grid_shapes = {32: (4, 8), 16: (4, 4), 8: (2, 4)}
    n_rows, n_cols = grid_shapes[clip_len]

    tile_h = frames[0].shape[0]
    tile_w = frames[0].shape[1]
    canvas = Image.new('RGB', (tile_w * n_cols, tile_h * n_rows))

    remaining = iter(frames)
    for cell in range(n_rows * n_cols):
        # Cells are filled left-to-right, top-to-bottom.
        row, col = divmod(cell, n_cols)
        tile = Image.fromarray(next(remaining))
        canvas.paste(tile, (col * tile_w, row * tile_h))

    return canvas
def model_interface(uploaded_video, model_choice, activity):
    """Score how well an uploaded video matches a described activity.

    Args:
        uploaded_video: Path of the video file, as accepted by decord's
            ``VideoReader``.
        model_choice: Model id passed to ``from_pretrained``. It also selects
            how many frames are sampled (32, 16 or 8); unknown ids use 32.
        activity: Text label to compare against the fixed alternative
            ``"other"``.

    Returns:
        A 4-tuple of:
        - the contact-sheet image of the sampled frames,
        - a list of ``(label, "Probability: xx.xx%")`` pairs,
        - a list of ``(label, "Raw Score: x.xx")`` pairs,
        - ``[most_likely_label, its_probability_in_percent]``.

    Raises:
        ValueError: If no video was supplied.
    """
    if not uploaded_video:
        raise ValueError("No video was provided; please upload a video file.")

    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8,
    }.get(model_choice, 32)

    # The reader is opened once here only to learn the frame count.
    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_decord(uploaded_video, indices)
    concatenated_image = concatenate_frames(video, clip_len)

    # Hand the processor a single ndarray holding all sampled frames.
    video_np = np.array(video)
    activities_list = [activity, "other"]

    # Fall back to CPU when CUDA is missing instead of failing on .to('cuda').
    device = "cuda" if torch.cuda.is_available() else "cpu"

    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice).to(device)

    inputs = processor(
        text=activities_list,
        videos=video_np,
        return_tensors="pt",
        padding=True,
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video.cpu()
    probs = logits_per_video.softmax(dim=1)

    # Only one video is submitted, so row 0 holds its scores for every label.
    video_probs = probs[0]
    video_logits = logits_per_video[0]

    results_probs = [
        (label, f"Probability: {float(prob) * 100:.2f}%")
        for label, prob in zip(activities_list, video_probs)
    ]
    results_logits = [
        (label, f"Raw Score: {float(logit):.2f}")
        for label, logit in zip(activities_list, video_logits)
    ]

    max_prob_index = torch.argmax(video_probs).item()
    likely_label = activities_list[max_prob_index]
    likely_probability = float(video_probs[max_prob_index]) * 100

    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
# Gradio UI: a video, a model id and an activity label go in; the sampled-frame
# contact sheet and the per-label scores come out.
iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(
            choices=[
                "microsoft/xclip-base-patch16-zero-shot",
                "microsoft/xclip-base-patch32-16-frames",
                "microsoft/xclip-base-patch32",
            ],
            # Pre-select a model so model_interface never receives None.
            value="microsoft/xclip-base-patch16-zero-shot",
            label="Model Choice",
        ),
        # `value` is the initial-content keyword; `default` is the pre-3.0 name.
        gr.components.Textbox(value="dancing", label="Desired Activity to Recognize"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Top Prediction"),
    ],
    live=False,
)

iface.launch()