IbrahimHasani committed on
Commit 53189f9 · 1 Parent(s): d6e2134

Update app.py

Files changed (1)
  1. app.py +11 -32
app.py CHANGED
@@ -5,9 +5,12 @@ from transformers import AutoProcessor, AutoModel
 from PIL import Image
 from decord import VideoReader, cpu
 
-# Use GPU if available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-torch.backends.cudnn.benchmark = True
+MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
+CLIP_LEN = 32
+
+# Load model and processor once
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = AutoModel.from_pretrained(MODEL_NAME)
 
 def sample_uniform_frame_indices(clip_len, seg_len):
     if seg_len < clip_len:
@@ -25,11 +28,7 @@ def read_video_decord(file_path, indices):
     return video
 
 def concatenate_frames(frames, clip_len):
-    layout = {
-        32: (4, 8),
-        16: (4, 4),
-        8: (2, 4)
-    }
+    layout = { 32: (4, 8) }
     rows, cols = layout[clip_len]
     combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
     frame_iter = iter(frames)
@@ -43,34 +42,19 @@ def concatenate_frames(frames, clip_len):
         y_offset += frames[0].shape[0]
     return combined_image
 
-def model_interface(uploaded_video, model_choice, activity):
-    clip_len = {
-        "microsoft/xclip-base-patch16-zero-shot": 32,
-        "microsoft/xclip-base-patch32-16-frames": 16,
-        "microsoft/xclip-base-patch32": 8
-    }.get(model_choice, 32)
-
-    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
+def model_interface(uploaded_video, activity):
+    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=len(VideoReader(uploaded_video)))
     video = read_video_decord(uploaded_video, indices)
-    concatenated_image = concatenate_frames(video, clip_len)
+    concatenated_image = concatenate_frames(video, CLIP_LEN)
 
     activities_list = [activity, "other"]
-
-    processor = AutoProcessor.from_pretrained(model_choice) # No .to(device) for the processor
-    model = AutoModel.from_pretrained(model_choice).to(device)
-
-    # Convert the list of frames to a single numpy array for efficient conversion to a tensor
-    video_np_array = np.array(video)
-
     inputs = processor(
         text=activities_list,
-        videos=video_np_array,
+        videos=list(video),
         return_tensors="pt",
         padding=True,
     )
 
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
     with torch.no_grad():
         outputs = model(**inputs)
 
@@ -96,11 +80,6 @@ iface = gr.Interface(
     fn=model_interface,
     inputs=[
        gr.components.Video(label="Upload a video file"),
-       gr.components.Dropdown(choices=[
-           "microsoft/xclip-base-patch16-zero-shot",
-           #"microsoft/xclip-base-patch32-16-frames",
-           #"microsoft/xclip-base-patch32"
-       ], label="Model Choice"),
        gr.components.Textbox(default="dancing", label="Desired Activity to Recognize"),
     ],
     outputs=[
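
For reference, the pipeline after this commit can be exercised outside Gradio with a standalone sketch like the one below. It is not part of the commit: the function name classify and the path sample.mp4 are hypothetical, the frame sampling and decord reading are re-implemented inline rather than imported from app.py, and the softmax over logits_per_video is simply the standard way to turn X-CLIP's video-text logits into per-label probabilities.

# Standalone sketch (hypothetical names/paths), assuming the same X-CLIP checkpoint as app.py.
import numpy as np
import torch
from decord import VideoReader, cpu
from transformers import AutoProcessor, AutoModel

MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
CLIP_LEN = 32  # this checkpoint expects 32 frames per clip

processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def classify(video_path, activity):
    vr = VideoReader(video_path, ctx=cpu(0))
    # Spread CLIP_LEN indices uniformly over the video (repeats frames if it is short).
    indices = np.linspace(0, len(vr) - 1, num=CLIP_LEN).astype(int)
    frames = vr.get_batch(indices).asnumpy()  # (CLIP_LEN, H, W, 3) uint8

    inputs = processor(
        text=[activity, "other"],
        videos=list(frames),
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_video has shape (1, num_labels); softmax gives a probability per text label.
    probs = outputs.logits_per_video.softmax(dim=1)[0]
    return {label: float(p) for label, p in zip([activity, "other"], probs)}

print(classify("sample.mp4", "dancing"))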