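# Gradio demo: video action recognition with a fine-tuned VideoMAE model.
# Upload a video; the app returns the top-3 predicted actions with probabilities.
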
import gradio as gr
import torch
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import UniformTemporalSubsample
from torchvision.transforms import Resize
from transformers import VideoMAEForVideoClassification

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned VideoMAE checkpoint from the local "model" directory.
model_path = "model"
loaded_model = VideoMAEForVideoClassification.from_pretrained(model_path)
loaded_model = loaded_model.to(device)
loaded_model.eval()

# The ten action classes (a subset of UCF101) the model predicts; the order
# must match the model's output indices.
label_names = [
    'Archery', 'BalanceBeam', 'BenchPress', 'ApplyEyeMakeup', 'BasketballDunk',
    'BandMarching', 'BabyCrawling', 'ApplyLipstick', 'BaseballPitch', 'Basketball'
]

def load_video(video_path):
    """Decode a video file and return its frames as a (C, T, H, W) tensor."""
    try:
        video = EncodedVideo.from_path(video_path)
        # Grab the full clip, from the first frame to the end of the video.
        video_data = video.get_clip(start_sec=0, end_sec=video.duration)
        return video_data['video']
    except Exception as e:
        raise ValueError(f"Error loading video: {str(e)}") from e

def preprocess_video(video_frames):
    """Subsample, scale, normalize, and resize frames for VideoMAE."""
    try:
        # Uniformly sample 16 frames across the clip (VideoMAE's expected length).
        transform_temporal = UniformTemporalSubsample(16)
        video_frames = transform_temporal(video_frames)
        # Scale pixel values from [0, 255] to [0, 1].
        video_frames = video_frames / 255.0
        # Convert channel-first (C, T, H, W) to frame-first (T, C, H, W).
        if video_frames.shape[0] == 3:
            video_frames = video_frames.permute(1, 0, 2, 3)
        # Normalize each frame with the ImageNet mean and std.
        mean = torch.tensor([0.485, 0.456, 0.406])
        std = torch.tensor([0.229, 0.224, 0.225])
        for t in range(video_frames.shape[0]):
            video_frames[t] = (video_frames[t] - mean[:, None, None]) / std[:, None, None]
        # Resize every frame to 224x224 and add a batch dimension,
        # giving the (1, 16, 3, 224, 224) input VideoMAE expects.
        resize_transform = Resize((224, 224))
        video_frames = resize_transform(video_frames)
        video_frames = video_frames.unsqueeze(0)
        return video_frames
    except Exception as e:
        raise ValueError(f"Error preprocessing video: {str(e)}") from e

def predict_video(video):
    """Classify an uploaded video and return the top-3 labels with probabilities."""
    try:
        # Gradio may pass either a file path (str) or a tempfile-like object,
        # depending on the Gradio version; handle both.
        video_path = video if isinstance(video, str) else video.name
        video_data = load_video(video_path)
        processed_video = preprocess_video(video_data)
        processed_video = processed_video.to(device)
        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = loaded_model(processed_video)
            logits = outputs.logits
        # Convert logits to probabilities and keep the three most likely classes.
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
        top_3 = torch.topk(probabilities, 3)
        results = []
        for i in range(3):
            idx = top_3.indices[i].item()
            prob = top_3.values[i].item()
            results.append(f"{label_names[idx]}: {prob * 100:.2f}%")
        return "\n".join(results)
    except Exception as e:
        return f"Error processing video: {str(e)}"

# Build the Gradio UI: one video input, one text output.
iface = gr.Interface(
    fn=predict_video,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Textbox(label="Top 3 Predictions"),
    title="Video Action Recognition",
    description=(
        "Upload a video to classify the action being performed. "
        "The model will return the top 3 predictions with their probabilities."
    ),
    examples=[
        ["test_video_1.avi"],
        ["test_video_2.avi"],
        ["test_video_3.avi"],
    ],
)

# share=True exposes a temporary public URL; debug=True surfaces errors in the console.
iface.launch(debug=True, share=True)