Spaces:

IbrahimHasani
/

ActionDetectionVideo

Runtime error

App Files Files Community

ActionDetectionVideo / app.py

IbrahimHasani

Update app.py

1bc2256 over 1 year ago

raw

history blame

4.04 kB

	import gradio as gr
	import torch
	import numpy as np
	from transformers import AutoProcessor, AutoModel
	from PIL import Image
	from decord import VideoReader, cpu, gpu
	import cv2

	# Startup diagnostics: report whether a CUDA device is usable.
	print(f"Is CUDA available: {torch.cuda.is_available()}")
	# torch.cuda.current_device() / get_device_name() raise when no CUDA device is
	# present, so only query the device name when CUDA is actually available;
	# otherwise the whole app would crash at import time on a CPU-only host.
	if torch.cuda.is_available():
	    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
	else:
	    print("CUDA device: none (running on CPU)")

	def sample_uniform_frame_indices(clip_len, seg_len):
	    """Return ``clip_len`` frame indices spread over a video of ``seg_len`` frames.

	    When the video is shorter than the requested clip, the frame indices
	    ``0..seg_len-1`` are cycled until ``clip_len`` indices have been produced.
	    Otherwise index ``i`` is ``floor(i * seg_len / clip_len)``, which covers the
	    whole video evenly and reduces to ``i * (seg_len // clip_len)`` whenever
	    ``seg_len`` is an exact multiple of ``clip_len``.

	    Args:
	        clip_len: Number of frame indices to produce (must be positive).
	        seg_len: Total number of frames in the video (must be positive).

	    Returns:
	        A 1-D ``np.int64`` array of length ``clip_len`` with values in
	        ``[0, seg_len)``.

	    Raises:
	        ValueError: If ``clip_len`` or ``seg_len`` is not positive.
	    """
	    # Fail with a clear message instead of an opaque ZeroDivisionError.
	    if clip_len <= 0:
	        raise ValueError(f"clip_len must be positive, got {clip_len}")
	    if seg_len <= 0:
	        raise ValueError(f"seg_len must be positive, got {seg_len}")

	    positions = np.arange(clip_len, dtype=np.int64)
	    if seg_len < clip_len:
	        # Not enough frames: loop over the video until the clip is filled.
	        indices = positions % seg_len
	    else:
	        # Scale before flooring so the tail of the video is sampled too;
	        # flooring the spacing first would only ever look at the first
	        # clip_len * (seg_len // clip_len) frames.
	        indices = (positions * seg_len) // clip_len
	    return indices.astype(np.int64)

	def read_video_decord(file_path, indices):
	    """Decode the frames at ``indices`` from the video at ``file_path``.

	    The decord reader is opened with a single decoding thread, on GPU 0 when
	    torch reports CUDA as available and on the CPU otherwise. The selected
	    frames are returned as a NumPy array via ``asnumpy()``.
	    """
	    if torch.cuda.is_available():
	        device_ctx = gpu(0)
	    else:
	        device_ctx = cpu(0)
	    reader = VideoReader(file_path, num_threads=1, ctx=device_ctx)
	    frames = reader.get_batch(indices)
	    return frames.asnumpy()

	def concatenate_frames(frames, clip_len):
	    """Tile the sampled frames into a single grid image, filled row by row.

	    Args:
	        frames: Indexable sequence of frame arrays accepted by
	            ``Image.fromarray``. Every frame is assumed to have the same size
	            as ``frames[0]``, whose ``shape[0]`` is used as the tile height
	            and ``shape[1]`` as the tile width.
	        clip_len: Number of frames in the clip; selects the grid layout
	            (32 -> 4x8, 16 -> 4x4, 8 -> 2x4, as rows x cols).

	    Returns:
	        An RGB ``PIL.Image`` containing ``rows * cols`` tiles.

	    Raises:
	        KeyError: If ``clip_len`` is not one of 8, 16 or 32.
	        StopIteration: If ``frames`` holds fewer than ``rows * cols`` frames.
	    """
	    layout = {
	        32: (4, 8),
	        16: (4, 4),
	        8: (2, 4),
	    }
	    rows, cols = layout[clip_len]
	    frame_height, frame_width = frames[0].shape[:2]
	    # Image.new takes (width, height); the original line had lost its "*"
	    # operators and did not parse.
	    combined_image = Image.new('RGB', (frame_width * cols, frame_height * rows))
	    frame_iter = iter(frames)
	    for row in range(rows):
	        for col in range(cols):
	            img = Image.fromarray(next(frame_iter))
	            combined_image.paste(img, (col * frame_width, row * frame_height))
	    return combined_image

	def model_interface(uploaded_video, model_choice, activity):
	    """Score an uploaded video against ``activity`` versus the label "other".

	    Frames are sampled from the video, tiled into a preview image, and passed
	    together with the two text labels to the selected pretrained model.

	    Args:
	        uploaded_video: Path of the video file, as handed to ``VideoReader``.
	        model_choice: Hugging Face model id. It determines how many frames are
	            sampled (32, 16 or 8; 32 for an unrecognised id) and which
	            processor/model weights are loaded.
	        activity: Free-text activity label to test for.

	    Returns:
	        A 4-tuple ``(image, probabilities, raw_scores, top_prediction)``:
	        the tiled preview image, a list of ``(label, "Probability: ..%")``
	        tuples, a list of ``(label, "Raw Score: ..")`` tuples, and
	        ``[most_likely_label, probability_in_percent]``.
	    """
	    clip_len = {
	        "microsoft/xclip-base-patch16-zero-shot": 32,
	        "microsoft/xclip-base-patch32-16-frames": 16,
	        "microsoft/xclip-base-patch32": 8
	    }.get(model_choice, 32)

	    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
	    video = read_video_decord(uploaded_video, indices)
	    concatenated_image = concatenate_frames(video, clip_len)

	    # Convert list of numpy arrays to a single numpy ndarray
	    video_np = np.array(video)

	    # Pick the device at call time: hard-coding 'cuda' crashes on CPU-only hosts.
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'

	    activities_list = [activity, "other"]
	    processor = AutoProcessor.from_pretrained(model_choice)
	    model = AutoModel.from_pretrained(model_choice).to(device)

	    inputs = processor(
	        text=activities_list,
	        videos=video_np,  # Use the ndarray instead of the list
	        return_tensors="pt",
	        padding=True,
	    )
	    # Inputs must live on the same device as the model.
	    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

	    with torch.no_grad():
	        outputs = model(**inputs)

	    # Move scores back to the CPU before turning them into Python floats.
	    logits_per_video = outputs.logits_per_video.cpu()
	    probs = logits_per_video.softmax(dim=1)

	    results_probs = []
	    results_logits = []
	    # Row 0 is the only video in the batch; one column per text label.
	    for current_activity, prob, logit in zip(
	        activities_list, probs[0].tolist(), logits_per_video[0].tolist()
	    ):
	        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
	        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))

	    max_prob_index = torch.argmax(probs[0]).item()
	    likely_label = activities_list[max_prob_index]
	    likely_probability = float(probs[0][max_prob_index]) * 100

	    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]

	# Gradio UI: a video, a model id and an activity label go in; the tiled frame
	# preview and three text summaries come out of model_interface.
	iface = gr.Interface(
	    fn=model_interface,
	    inputs=[
	        gr.components.Video(label="Upload a video file"),
	        gr.components.Dropdown(
	            choices=[
	                "microsoft/xclip-base-patch16-zero-shot",
	                "microsoft/xclip-base-patch32-16-frames",
	                "microsoft/xclip-base-patch32",
	            ],
	            # Pre-select a model so the callback never receives None.
	            value="microsoft/xclip-base-patch16-zero-shot",
	            label="Model Choice",
	        ),
	        # The components API takes the initial content as `value`; `default`
	        # is the keyword of the older gr.inputs API.
	        gr.components.Textbox(value="dancing", label="Desired Activity to Recognize"),
	    ],
	    outputs=[
	        gr.components.Image(type="pil", label="Sampled Frames"),
	        gr.components.Textbox(type="text", label="Probabilities"),
	        gr.components.Textbox(type="text", label="Raw Scores"),
	        gr.components.Textbox(type="text", label="Top Prediction"),
	    ],
	    live=False,
	)

	iface.launch()