Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
from pytorchvideo.data.encoded_video import EncodedVideo | |
from torchvision.transforms import Resize | |
from pytorchvideo.transforms import UniformTemporalSubsample | |
from transformers import VideoMAEForVideoClassification | |
# Location handed to ``from_pretrained`` when loading the classifier.
model_path = "model"

# Run on CUDA when PyTorch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the classifier once at start-up, place it on the chosen device and
# put it in evaluation mode.
loaded_model = VideoMAEForVideoClassification.from_pretrained(model_path).to(device)
loaded_model.eval()
# Human-readable class names; predict_video looks them up by the class index
# taken from the model output, so the order here is significant.
# NOTE(review): presumably this order mirrors the checkpoint's label ids —
# confirm against the model config.
label_names = [
    'Archery',
    'BalanceBeam',
    'BenchPress',
    'ApplyEyeMakeup',
    'BasketballDunk',
    'BandMarching',
    'BabyCrawling',
    'ApplyLipstick',
    'BaseballPitch',
    'Basketball',
]
def load_video(video_path):
    """Decode a whole video file and return its frames.

    Args:
        video_path: Path of the video file, passed to
            ``EncodedVideo.from_path``.

    Returns:
        The ``'video'`` entry of the clip that spans second 0 up to the
        video's reported duration.
        NOTE(review): presumably a (C, T, H, W) tensor — confirm against
        the pytorchvideo version in use.

    Raises:
        ValueError: If opening or decoding fails. The original exception is
            attached as ``__cause__`` so the real traceback is preserved.
    """
    try:
        video = EncodedVideo.from_path(video_path)
        try:
            clip = video.get_clip(start_sec=0, end_sec=video.duration)
        finally:
            # Release the decoder's handle whether or not decoding succeeded.
            video.close()
        return clip['video']
    except Exception as e:
        raise ValueError(f"Error loading video: {e}") from e
def preprocess_video(video_frames):
    """Convert decoded frames into a normalised, batched model input.

    Steps, in order: uniformly subsample 16 frames, divide by 255, swap the
    first two axes when the leading axis has size 3, normalise per channel
    with the mean/std constants below, resize to 224x224 and prepend a batch
    axis of size 1.

    Args:
        video_frames: Frame tensor.
            NOTE(review): assumed to be (C, T, H, W) with values in 0-255,
            as produced by ``load_video`` — confirm.

    Returns:
        The processed tensor with a leading batch axis of size 1.

    Raises:
        ValueError: If any step fails. The original exception is attached
            as ``__cause__``.
    """
    try:
        frames = UniformTemporalSubsample(16)(video_frames)
        frames = frames / 255.0

        # A leading axis of size 3 is taken to be the channel axis; move it
        # behind the time axis so the layout becomes (T, C, H, W).
        if frames.shape[0] == 3:
            frames = frames.permute(1, 0, 2, 3)

        # Build the constants on the same device/dtype as the frames so the
        # arithmetic also works for non-CPU or non-float32 inputs.
        mean = torch.tensor(
            [0.485, 0.456, 0.406], dtype=frames.dtype, device=frames.device
        )
        std = torch.tensor(
            [0.229, 0.224, 0.225], dtype=frames.dtype, device=frames.device
        )
        # (3, 1, 1) broadcasts against (T, 3, H, W), normalising every frame
        # at once instead of looping over the time axis in Python.
        frames = (frames - mean[:, None, None]) / std[:, None, None]

        frames = Resize((224, 224))(frames)
        return frames.unsqueeze(0)
    except Exception as e:
        raise ValueError(f"Error preprocessing video: {e}") from e
def predict_video(video):
    """Classify a video and describe the most likely actions.

    Args:
        video: Value supplied by the Gradio video input. A ``str`` is used
            directly as the file path (this is what ``gr.Video`` passes);
            any other object is expected to expose the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        Up to three lines of the form ``"<label>: <probability>%"`` joined
        by newlines, ordered from most to least probable. On any failure a
        single line starting with ``"Error processing video: "`` is returned
        instead of raising, so the message shows up in the output textbox.
    """
    try:
        if video is None:
            raise ValueError("no video was provided")

        # gr.Video delivers a plain path string; file-like wrappers carry
        # the path in ``.name``. Accept both.
        video_path = video if isinstance(video, str) else video.name

        batch = preprocess_video(load_video(video_path)).to(device)
        with torch.no_grad():
            logits = loaded_model(batch).logits

        # Index 0 selects the only item of the size-1 batch.
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]

        # topk raises when asked for more entries than there are classes.
        k = min(3, probabilities.numel())
        top = torch.topk(probabilities, k)

        return "\n".join(
            f"{label_names[idx]}: {prob * 100:.2f}%"
            for prob, idx in zip(top.values.tolist(), top.indices.tolist())
        )
    except Exception as e:
        return f"Error processing video: {e}"
# Text shown under the title of the web page.
_description = "Upload a video to classify the action being performed. The model will return the top 3 predictions with their probabilities."

# Example inputs offered in the UI; each row holds the value for the single
# video input.
_examples = [
    ["test_video_1.avi"],
    ["test_video_2.avi"],
    ["test_video_3.avi"],
]

# Wire predict_video to one video input and one text output.
_video_input = gr.Video(label="Upload Video")
_prediction_output = gr.Textbox(label="Top 3 Predictions")

iface = gr.Interface(
    fn=predict_video,
    inputs=_video_input,
    outputs=_prediction_output,
    title="Video Action Recognition",
    description=_description,
    examples=_examples,
)

# Start the app with debug mode and a share link enabled.
iface.launch(debug=True, share=True)