IbrahimHasani committed on
Commit 53189f9 · 1 Parent(s): d6e2134

Update app.py

Files changed (1)
  1. app.py +11 -32
app.py CHANGED
@@ -5,9 +5,12 @@ from transformers import AutoProcessor, AutoModel
 from PIL import Image
 from decord import VideoReader, cpu
 
-# Use GPU if available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-torch.backends.cudnn.benchmark = True
+MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
+CLIP_LEN = 32
+
+# Load model and processor once
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = AutoModel.from_pretrained(MODEL_NAME)
 
 def sample_uniform_frame_indices(clip_len, seg_len):
     if seg_len < clip_len:
@@ -25,11 +28,7 @@ def read_video_decord(file_path, indices):
     return video
 
 def concatenate_frames(frames, clip_len):
-    layout = {
-        32: (4, 8),
-        16: (4, 4),
-        8: (2, 4)
-    }
+    layout = { 32: (4, 8) }
     rows, cols = layout[clip_len]
     combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
     frame_iter = iter(frames)
@@ -43,34 +42,19 @@ def concatenate_frames(frames, clip_len):
         y_offset += frames[0].shape[0]
     return combined_image
 
-def model_interface(uploaded_video, model_choice, activity):
-    clip_len = {
-        "microsoft/xclip-base-patch16-zero-shot": 32,
-        "microsoft/xclip-base-patch32-16-frames": 16,
-        "microsoft/xclip-base-patch32": 8
-    }.get(model_choice, 32)
-
-    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
+def model_interface(uploaded_video, activity):
+    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=len(VideoReader(uploaded_video)))
     video = read_video_decord(uploaded_video, indices)
-    concatenated_image = concatenate_frames(video, clip_len)
+    concatenated_image = concatenate_frames(video, CLIP_LEN)
 
     activities_list = [activity, "other"]
-
-    processor = AutoProcessor.from_pretrained(model_choice) # No .to(device) for the processor
-    model = AutoModel.from_pretrained(model_choice).to(device)
-
-    # Convert the list of frames to a single numpy array for efficient conversion to a tensor
-    video_np_array = np.array(video)
-
     inputs = processor(
         text=activities_list,
-        videos=video_np_array,
+        videos=list(video),
         return_tensors="pt",
         padding=True,
     )
 
-    inputs = {k: v.to(device) for k, v in inputs.items()}
-
     with torch.no_grad():
         outputs = model(**inputs)
 
@@ -96,11 +80,6 @@ iface = gr.Interface(
     fn=model_interface,
     inputs=[
        gr.components.Video(label="Upload a video file"),
-       gr.components.Dropdown(choices=[
-           "microsoft/xclip-base-patch16-zero-shot",
-           #"microsoft/xclip-base-patch32-16-frames",
-           #"microsoft/xclip-base-patch32"
-       ], label="Model Choice"),
        gr.components.Textbox(default="dancing", label="Desired Activity to Recognize"),
     ],
     outputs=[
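
For reference, the pipeline after this commit can be exercised outside Gradio with a standalone sketch like the one below. It is not part of the commit: the function name classify and the path sample.mp4 are hypothetical, the frame sampling and decord reading are re-implemented inline rather than imported from app.py, and the softmax over logits_per_video is simply the standard way to turn X-CLIP's video-text logits into per-label probabilities.

# Standalone sketch (hypothetical names/paths), assuming the same X-CLIP checkpoint as app.py.
import numpy as np
import torch
from decord import VideoReader, cpu
from transformers import AutoProcessor, AutoModel

MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
CLIP_LEN = 32  # this checkpoint expects 32 frames per clip

processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def classify(video_path, activity):
    vr = VideoReader(video_path, ctx=cpu(0))
    # Spread CLIP_LEN indices uniformly over the video (repeats frames if it is short).
    indices = np.linspace(0, len(vr) - 1, num=CLIP_LEN).astype(int)
    frames = vr.get_batch(indices).asnumpy()  # (CLIP_LEN, H, W, 3) uint8

    inputs = processor(
        text=[activity, "other"],
        videos=list(frames),
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    # logits_per_video has shape (1, num_labels); softmax gives a probability per text label.
    probs = outputs.logits_per_video.softmax(dim=1)[0]
    return {label: float(p) for label, p in zip([activity, "other"], probs)}

print(classify("sample.mp4", "dancing"))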