Spaces:

IbrahimHasani
/

ActionDetectionVideo

Runtime error

App Files Community

IbrahimHasani committed on Aug 16, 2023

Commit

2dc6183

1 Parent(s): 8d1f721

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -18

app.py CHANGED Viewed

@@ -3,18 +3,15 @@ import torch
 import numpy as np
 from transformers import AutoProcessor, AutoModel
 from PIL import Image
-import cv2
-# Constants
 MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
 CLIP_LEN = 32
-# Check for GPU and set device
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Load model and processor
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
-model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()
 def get_video_length(file_path):
     cap = cv2.VideoCapture(file_path)
@@ -25,8 +22,8 @@ def get_video_length(file_path):
 def read_video_opencv(file_path, indices):
     cap = cv2.VideoCapture(file_path)
     frames = []
-    for idx in indices:
-        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
         ret, frame = cap.read()
         if ret:
             frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
@@ -43,20 +40,22 @@ def sample_uniform_frame_indices(clip_len, seg_len):
         indices = [i * spacing for i in range(clip_len)]
     return np.array(indices).astype(np.int64)
-def get_concatenation_layout(clip_len):
-    # Modify as needed for other clip lengths
-    if clip_len == 32:
-        return 4, 8
 def concatenate_frames(frames, clip_len):
-    rows, cols = get_concatenation_layout(clip_len)
     combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
     frame_iter = iter(frames)
     y_offset = 0
     for i in range(rows):
         x_offset = 0
         for j in range(cols):
-            img = Image.fromarray(next(frame_iter))
             combined_image.paste(img, (x_offset, y_offset))
             x_offset += frames[0].shape[1]
         y_offset += frames[0].shape[0]
@@ -74,7 +73,7 @@ def model_interface(uploaded_video, activity):
         videos=list(video),
         return_tensors="pt",
         padding=True,
-    ).to(device)  # Move inputs to GPU if available
     with torch.no_grad():
         outputs = model(**inputs)
@@ -95,7 +94,7 @@ def model_interface(uploaded_video, activity):
     likely_label = activities_list[max_prob_index]
     likely_probability = float(probs[0][max_prob_index]) * 100
-    return concatenated_image, results_probs, results_logits, [ likely_label , likely_probability ]
 iface = gr.Interface(
     fn=model_interface,

 import numpy as np
 from transformers import AutoProcessor, AutoModel
 from PIL import Image
+import cv2
 MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
 CLIP_LEN = 32
+# Load model and processor once
 processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = AutoModel.from_pretrained(MODEL_NAME)
 def get_video_length(file_path):
     cap = cv2.VideoCapture(file_path)
 def read_video_opencv(file_path, indices):
     cap = cv2.VideoCapture(file_path)
     frames = []
+    for i in indices:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
         ret, frame = cap.read()
         if ret:
             frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
         indices = [i * spacing for i in range(clip_len)]
     return np.array(indices).astype(np.int64)
 def concatenate_frames(frames, clip_len):
+    layout = { 32: (4, 8) }
+    rows, cols = layout[clip_len]
     combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
     frame_iter = iter(frames)
     y_offset = 0
     for i in range(rows):
         x_offset = 0
         for j in range(cols):
+            img_array = next(frame_iter)
+            # Handling rank-4 tensor
+            if len(img_array.shape) == 4:
+                img_array = img_array[0]
+            img = Image.fromarray(img_array)
             combined_image.paste(img, (x_offset, y_offset))
             x_offset += frames[0].shape[1]
         y_offset += frames[0].shape[0]
         videos=list(video),
         return_tensors="pt",
         padding=True,
+    )
     with torch.no_grad():
         outputs = model(**inputs)
     likely_label = activities_list[max_prob_index]
     likely_probability = float(probs[0][max_prob_index]) * 100
+    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
 iface = gr.Interface(
     fn=model_interface,