IbrahimHasani committed
Commit c09b2c5 · Parent(s): 0fcf96b

Update app.py

Files changed (1):
  app.py +13 -47
app.py CHANGED
@@ -3,13 +3,11 @@ import torch
 import numpy as np
 from transformers import AutoProcessor, AutoModel
 from PIL import Image
-from decord import VideoReader, cpu
+from decord import VideoReader, cpu, gpu
 import cv2
 
 print(f"Is CUDA available: {torch.cuda.is_available()}")
-# True
 print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
-# Tesla T4
 
 def sample_uniform_frame_indices(clip_len, seg_len):
     if seg_len < clip_len:
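
Note: the diff elides the middle of sample_uniform_frame_indices; only its opening lines (above) and its final return (in the next hunk) are visible. A minimal uniform-sampling body consistent with those edges might look like the sketch below; the two branch bodies are an assumption, not part of the commit.

import numpy as np

def sample_uniform_frame_indices(clip_len, seg_len):
    if seg_len < clip_len:
        # Video shorter than the clip: repeat frames cyclically (assumed).
        indices = [i % seg_len for i in range(clip_len)]
    else:
        # Evenly spaced indices across the full video (assumed).
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)
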
@@ -22,22 +20,10 @@ def sample_uniform_frame_indices(clip_len, seg_len):
     return np.array(indices).astype(np.int64)
 
 def read_video_decord(file_path, indices):
-    vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
+    vr = VideoReader(file_path, num_threads=1, ctx=gpu(0) if torch.cuda.is_available() else cpu(0))
     video = vr.get_batch(indices).asnumpy()
     return video
 
-def read_video_opencv(file_path, indices):
-    vidcap = cv2.VideoCapture(file_path)
-    frames = []
-    for idx in indices:
-        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
-        success, image = vidcap.read()
-        if success:
-            # Convert BGR to RGB
-            frames.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
-    return frames
-
-
 def concatenate_frames(frames, clip_len):
     layout = {
         32: (4, 8),
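
Note: torch.cuda.is_available() reports PyTorch's CUDA support, not decord's, and the common pip wheels of decord are CPU-only, so the new gpu(0) context can fail even on a CUDA host. A defensive variant (a sketch, not the committed code) would fall back to the CPU context when the GPU reader cannot be constructed:

import torch
from decord import VideoReader, cpu, gpu

def open_reader(file_path):
    # decord typically raises at construction if it was built without CUDA.
    if torch.cuda.is_available():
        try:
            return VideoReader(file_path, num_threads=1, ctx=gpu(0))
        except Exception:
            pass  # fall back to CPU decoding
    return VideoReader(file_path, num_threads=1, ctx=cpu(0))

The body of concatenate_frames is likewise elided; from the visible layout entry 32: (4, 8) it evidently tiles the sampled frames into a rows-by-columns grid. One plausible completion (assumed, and using only the 32-frame layout that is visible in the diff):

from PIL import Image

def concatenate_frames(frames, clip_len):
    layout = {32: (4, 8)}  # entries for 16 and 8 frames are elided in the diff
    rows, cols = layout[clip_len]
    h, w, _ = frames[0].shape
    grid = Image.new('RGB', (cols * w, rows * h))
    for i, frame in enumerate(frames):
        grid.paste(Image.fromarray(frame), ((i % cols) * w, (i // cols) * h))
    return grid
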
@@ -63,26 +49,30 @@ def model_interface(uploaded_video, model_choice, activity):
         "microsoft/xclip-base-patch32-16-frames": 16,
         "microsoft/xclip-base-patch32": 8
     }.get(model_choice, 32)
+
     indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
-    video = read_video_opencv(uploaded_video, indices)
+    video = read_video_decord(uploaded_video, indices)
     concatenated_image = concatenate_frames(video, clip_len)
 
-    # Appending "other" to the list of activities
+    # Convert list of numpy arrays to a single numpy ndarray
+    video_np = np.array(video)
+
     activities_list = [activity, "other"]
     processor = AutoProcessor.from_pretrained(model_choice)
-    model = AutoModel.from_pretrained(model_choice)
-
+    model = AutoModel.from_pretrained(model_choice).to('cuda')
+
     inputs = processor(
         text=activities_list,
-        videos=list(video),
+        videos=video_np,  # Use the ndarray instead of the list
         return_tensors="pt",
         padding=True,
     )
+    inputs = {name: tensor.to('cuda') for name, tensor in inputs.items()}
 
     with torch.no_grad():
         outputs = model(**inputs)
 
-    logits_per_video = outputs.logits_per_video
+    logits_per_video = outputs.logits_per_video.cpu()
     probs = logits_per_video.softmax(dim=1)
 
     results_probs = []
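
Note: the hunk above hard-codes .to('cuda') for both the model and the processor output, which raises on a CPU-only host even though the decord change guards on torch.cuda.is_available(). A device-agnostic variant (a self-contained sketch, not the committed code; the stand-in clip and label list are fabricated for illustration):

import numpy as np
import torch
from transformers import AutoProcessor, AutoModel

model_choice = "microsoft/xclip-base-patch32"          # the 8-frame variant
device = 'cuda' if torch.cuda.is_available() else 'cpu'

processor = AutoProcessor.from_pretrained(model_choice)
model = AutoModel.from_pretrained(model_choice).to(device)

activities_list = ["dancing", "other"]
video_np = np.random.randint(0, 255, (8, 224, 224, 3), dtype=np.uint8)  # stand-in clip

inputs = processor(text=activities_list, videos=list(video_np),
                   return_tensors="pt", padding=True)
# Move every tensor in the processor output to the model's device.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
probs = outputs.logits_per_video.cpu().softmax(dim=1)
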
@@ -98,28 +88,4 @@ def model_interface(uploaded_video, model_choice, activity):
     likely_label = activities_list[max_prob_index]
     likely_probability = float(probs[0][max_prob_index]) * 100
 
-    return concatenated_image, results_probs, results_logits, [ likely_label , likely_probability ]
-
-iface = gr.Interface(
-    fn=model_interface,
-    inputs=[
-        gr.components.Video(label="Upload a video file"),
-        gr.components.Dropdown(choices=[
-            "microsoft/xclip-base-patch16-zero-shot",
-            "microsoft/xclip-base-patch32-16-frames",
-            "microsoft/xclip-base-patch32"
-        ], label="Model Choice"),
-        gr.components.Textbox(default="dancing", label="Desired Activity to Recognize"),
-    ],
-    outputs=[
-        gr.components.Image(type="pil", label="Sampled Frames"),
-        gr.components.Textbox(type="text", label="Probabilities"),
-        gr.components.Textbox(type="text", label="Raw Scores"),
-        gr.components.Textbox(type="text", label="Top Prediction")
-    ],
-    live=False
-)
-
-
-
-iface.launch()
+    return concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
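
Note: logits_per_video has shape (num_videos, num_text_labels), so with activities_list = [activity, "other"] the softmax yields one probability per label, and the percentage returned as the top prediction is the larger of the two. A toy run with fabricated logits:

import torch

logits_per_video = torch.tensor([[4.2, 1.1]])   # made-up scores for ["dancing", "other"]
probs = logits_per_video.softmax(dim=1)
print(probs)                                    # tensor([[0.9569, 0.0431]])
max_prob_index = probs[0].argmax().item()
print(float(probs[0][max_prob_index]) * 100)    # ~95.7, the top-prediction percentage
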
 