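# Gradio demo: video action recognition with a fine-tuned VideoMAE model.
# Upload a video; the app returns the top-3 predicted actions with probabilities.
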
import gradio as gr
import torch
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import UniformTemporalSubsample
from torchvision.transforms import Resize
from transformers import VideoMAEForVideoClassification

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned VideoMAE checkpoint from the local "model" directory.
model_path = "model"
loaded_model = VideoMAEForVideoClassification.from_pretrained(model_path)
loaded_model = loaded_model.to(device)
loaded_model.eval()

# The ten action classes (a subset of UCF101) the model predicts; the order
# must match the model's output indices.
label_names = [
    'Archery', 'BalanceBeam', 'BenchPress', 'ApplyEyeMakeup', 'BasketballDunk',
    'BandMarching', 'BabyCrawling', 'ApplyLipstick', 'BaseballPitch', 'Basketball'
]

def load_video(video_path):
    """Decode a video file and return its frames as a (C, T, H, W) tensor."""
    try:
        video = EncodedVideo.from_path(video_path)
        # Grab the full clip, from the first frame to the end of the video.
        video_data = video.get_clip(start_sec=0, end_sec=video.duration)
        return video_data['video']
    except Exception as e:
        raise ValueError(f"Error loading video: {str(e)}") from e

def preprocess_video(video_frames):
    """Subsample, scale, normalize, and resize frames for VideoMAE."""
    try:
        # Uniformly sample 16 frames across the clip (VideoMAE's expected length).
        transform_temporal = UniformTemporalSubsample(16)
        video_frames = transform_temporal(video_frames)
        # Scale pixel values from [0, 255] to [0, 1].
        video_frames = video_frames / 255.0
        # Convert channel-first (C, T, H, W) to frame-first (T, C, H, W).
        if video_frames.shape[0] == 3:
            video_frames = video_frames.permute(1, 0, 2, 3)
        # Normalize each frame with the ImageNet mean and std.
        mean = torch.tensor([0.485, 0.456, 0.406])
        std = torch.tensor([0.229, 0.224, 0.225])
        for t in range(video_frames.shape[0]):
            video_frames[t] = (video_frames[t] - mean[:, None, None]) / std[:, None, None]
        # Resize every frame to 224x224 and add a batch dimension,
        # giving the (1, 16, 3, 224, 224) input VideoMAE expects.
        resize_transform = Resize((224, 224))
        video_frames = resize_transform(video_frames)
        video_frames = video_frames.unsqueeze(0)
        return video_frames
    except Exception as e:
        raise ValueError(f"Error preprocessing video: {str(e)}") from e

def predict_video(video):
    """Classify an uploaded video and return the top-3 labels with probabilities."""
    try:
        # Gradio may pass either a file path (str) or a tempfile-like object,
        # depending on the Gradio version; handle both.
        video_path = video if isinstance(video, str) else video.name
        video_data = load_video(video_path)
        processed_video = preprocess_video(video_data)
        processed_video = processed_video.to(device)
        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = loaded_model(processed_video)
            logits = outputs.logits
        # Convert logits to probabilities and keep the three most likely classes.
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
        top_3 = torch.topk(probabilities, 3)
        results = []
        for i in range(3):
            idx = top_3.indices[i].item()
            prob = top_3.values[i].item()
            results.append(f"{label_names[idx]}: {prob * 100:.2f}%")
        return "\n".join(results)
    except Exception as e:
        return f"Error processing video: {str(e)}"

# Build the Gradio UI: one video input, one text output.
iface = gr.Interface(
    fn=predict_video,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Textbox(label="Top 3 Predictions"),
    title="Video Action Recognition",
    description=(
        "Upload a video to classify the action being performed. "
        "The model will return the top 3 predictions with their probabilities."
    ),
    examples=[
        ["test_video_1.avi"],
        ["test_video_2.avi"],
        ["test_video_3.avi"],
    ],
)

# share=True exposes a temporary public URL; debug=True surfaces errors in the console.
iface.launch(debug=True, share=True)