Spaces:
Runtime error
Runtime error
Commit
·
53189f9
1
Parent(s):
d6e2134
Update app.py
Browse files
app.py
CHANGED
@@ -5,9 +5,12 @@ from transformers import AutoProcessor, AutoModel
|
|
5 |
from PIL import Image
|
6 |
from decord import VideoReader, cpu
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
11 |
|
12 |
def sample_uniform_frame_indices(clip_len, seg_len):
|
13 |
if seg_len < clip_len:
|
@@ -25,11 +28,7 @@ def read_video_decord(file_path, indices):
|
|
25 |
return video
|
26 |
|
27 |
def concatenate_frames(frames, clip_len):
|
28 |
-
layout = {
|
29 |
-
32: (4, 8),
|
30 |
-
16: (4, 4),
|
31 |
-
8: (2, 4)
|
32 |
-
}
|
33 |
rows, cols = layout[clip_len]
|
34 |
combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
|
35 |
frame_iter = iter(frames)
|
@@ -43,34 +42,19 @@ def concatenate_frames(frames, clip_len):
|
|
43 |
y_offset += frames[0].shape[0]
|
44 |
return combined_image
|
45 |
|
46 |
-
def model_interface(uploaded_video,
|
47 |
-
|
48 |
-
"microsoft/xclip-base-patch16-zero-shot": 32,
|
49 |
-
"microsoft/xclip-base-patch32-16-frames": 16,
|
50 |
-
"microsoft/xclip-base-patch32": 8
|
51 |
-
}.get(model_choice, 32)
|
52 |
-
|
53 |
-
indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
|
54 |
video = read_video_decord(uploaded_video, indices)
|
55 |
-
concatenated_image = concatenate_frames(video,
|
56 |
|
57 |
activities_list = [activity, "other"]
|
58 |
-
|
59 |
-
processor = AutoProcessor.from_pretrained(model_choice) # No .to(device) for the processor
|
60 |
-
model = AutoModel.from_pretrained(model_choice).to(device)
|
61 |
-
|
62 |
-
# Convert the list of frames to a single numpy array for efficient conversion to a tensor
|
63 |
-
video_np_array = np.array(video)
|
64 |
-
|
65 |
inputs = processor(
|
66 |
text=activities_list,
|
67 |
-
videos=
|
68 |
return_tensors="pt",
|
69 |
padding=True,
|
70 |
)
|
71 |
|
72 |
-
inputs = {k: v.to(device) for k, v in inputs.items()}
|
73 |
-
|
74 |
with torch.no_grad():
|
75 |
outputs = model(**inputs)
|
76 |
|
@@ -96,11 +80,6 @@ iface = gr.Interface(
|
|
96 |
fn=model_interface,
|
97 |
inputs=[
|
98 |
gr.components.Video(label="Upload a video file"),
|
99 |
-
gr.components.Dropdown(choices=[
|
100 |
-
"microsoft/xclip-base-patch16-zero-shot",
|
101 |
-
#"microsoft/xclip-base-patch32-16-frames",
|
102 |
-
#"microsoft/xclip-base-patch32"
|
103 |
-
], label="Model Choice"),
|
104 |
gr.components.Textbox(default="dancing", label="Desired Activity to Recognize"),
|
105 |
],
|
106 |
outputs=[
|
|
|
5 |
from PIL import Image
|
6 |
from decord import VideoReader, cpu
|
7 |
|
8 |
+
MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
|
9 |
+
CLIP_LEN = 32
|
10 |
+
|
11 |
+
# Load model and processor once
|
12 |
+
processor = AutoProcessor.from_pretrained(MODEL_NAME)
|
13 |
+
model = AutoModel.from_pretrained(MODEL_NAME)
|
14 |
|
15 |
def sample_uniform_frame_indices(clip_len, seg_len):
|
16 |
if seg_len < clip_len:
|
|
|
28 |
return video
|
29 |
|
30 |
def concatenate_frames(frames, clip_len):
|
31 |
+
layout = { 32: (4, 8) }
|
|
|
|
|
|
|
|
|
32 |
rows, cols = layout[clip_len]
|
33 |
combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
|
34 |
frame_iter = iter(frames)
|
|
|
42 |
y_offset += frames[0].shape[0]
|
43 |
return combined_image
|
44 |
|
45 |
+
def model_interface(uploaded_video, activity):
|
46 |
+
indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=len(VideoReader(uploaded_video)))
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
video = read_video_decord(uploaded_video, indices)
|
48 |
+
concatenated_image = concatenate_frames(video, CLIP_LEN)
|
49 |
|
50 |
activities_list = [activity, "other"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
inputs = processor(
|
52 |
text=activities_list,
|
53 |
+
videos=list(video),
|
54 |
return_tensors="pt",
|
55 |
padding=True,
|
56 |
)
|
57 |
|
|
|
|
|
58 |
with torch.no_grad():
|
59 |
outputs = model(**inputs)
|
60 |
|
|
|
80 |
fn=model_interface,
|
81 |
inputs=[
|
82 |
gr.components.Video(label="Upload a video file"),
|
|
|
|
|
|
|
|
|
|
|
83 |
gr.components.Textbox(default="dancing", label="Desired Activity to Recognize"),
|
84 |
],
|
85 |
outputs=[
|