JulianPhillips committed on
Commit b42f738 · verified · 1 Parent(s): e4107bd

Update app.py

Files changed (1):
  1. app.py +24 -21
app.py CHANGED
@@ -23,14 +23,6 @@ if not os.path.exists(movenet_model_path):
 else:
     movenet_model = tf.saved_model.load(movenet_model_path)
 
-# Load BLIP model
-blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')
-blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
-
-# Load CLIP model
-clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
-clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
-
 # Keypoint dictionary for reference
 KEYPOINT_DICT = {
     'nose': 0,
@@ -60,13 +52,13 @@ def process_video():
     # Clear previous cache
     gc.collect()
     torch.cuda.empty_cache()
+
     # Get the video URL from the request
     video_url = request.json.get('videoURL')
     height = request.json.get('height')
     weight = request.json.get('weight')
     wingspan = request.json.get('wingspan')
 
-
     if not video_url:
         return jsonify({"error": "No video URL provided"}), 400
 
@@ -99,16 +91,7 @@ def process_video():
     # Process each frame with MoveNet (to get 3D keypoints and detect stance)
     movenet_results = []
     stances = []
-    hip_rotations = []
-    arm_extensions = []
-    stepping_jabs = []
     guard_up = []
-    hand_returned = []
-    hips_width_apart = []
-    leg_angle_correct = []
-    punch_started = False
-    initial_left_wrist = None
-    initial_right_wrist = None
 
     for frame_index, frame in enumerate(frames):
         input_tensor = tf.image.resize_with_pad(tf.convert_to_tensor(frame, dtype=tf.uint8), 256, 256)
@@ -139,26 +122,46 @@ def process_video():
         right_hand_near_head = abs(right_wrist[1] - nose[1]) < guard_threshold
         guard_up.append(left_hand_near_head and right_hand_near_head)
 
+    # Free up memory used by MoveNet
+    del movenet_model
+    gc.collect()
+
     # Generate captions for all 60 frames using BLIP
     captions = []
+    blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base').to('cuda')
+    blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
+
     for frame in frames:
-        inputs = blip_processor(images=frame, return_tensors="pt")
+        inputs = blip_processor(images=frame, return_tensors="pt").to('cuda')
         with torch.no_grad():
             caption = blip_model.generate(**inputs)
         captions.append(blip_processor.decode(caption[0], skip_special_tokens=True))
+
+    # Free up memory used by BLIP
+    del blip_model, blip_processor
+    torch.cuda.empty_cache()
+    gc.collect()
 
     # Use CLIP to assess the similarity of frames to a Muay Thai jab prompt, including stance
     clip_results = []
+    clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to('cuda')
+    clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
+
     for i, frame in enumerate(frames):
         stance = stances[i]
         prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm."
-        text_inputs = clip_processor(text=[prompt], return_tensors="pt")
-        image_inputs = clip_processor(images=frame, return_tensors="pt")
+        text_inputs = clip_processor(text=[prompt], return_tensors="pt").to('cuda')
+        image_inputs = clip_processor(images=frame, return_tensors="pt").to('cuda')
        with torch.no_grad():
             image_features = clip_model.get_image_features(**image_inputs)
             text_features = clip_model.get_text_features(**text_inputs)
             similarity = torch.nn.functional.cosine_similarity(image_features, text_features)
         clip_results.append(similarity.item())
+
+    # Free up memory used by CLIP
+    del clip_model, clip_processor
+    torch.cuda.empty_cache()
+    gc.collect()
 
     # Calculate score based on CLIP results and BLIP captions
     avg_clip_similarity = sum(clip_results) / len(clip_results) if clip_results else 0
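
A minimal standalone sketch of the staged load-and-free pattern this commit applies to the BLIP and CLIP stages is below. The helper name caption_and_score and the single shared prompt are illustrative and not part of app.py; the sketch assumes frames is a list of RGB frames (numpy arrays or PIL images) and that a CUDA device is available, matching the .to('cuda') calls in the diff.

# Hypothetical helper (not in app.py): mirrors the load -> use -> free flow
# that this commit introduces inside process_video().
import gc
import torch
from transformers import (BlipForConditionalGeneration, BlipProcessor,
                          CLIPModel, CLIPProcessor)

def caption_and_score(frames, prompt, device='cuda'):
    # Stage 1: load BLIP only while captioning.
    blip_model = BlipForConditionalGeneration.from_pretrained(
        'Salesforce/blip-image-captioning-base').to(device)
    blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
    captions = []
    for frame in frames:
        inputs = blip_processor(images=frame, return_tensors="pt").to(device)
        with torch.no_grad():
            out = blip_model.generate(**inputs)
        captions.append(blip_processor.decode(out[0], skip_special_tokens=True))
    # Release BLIP before CLIP is loaded so the two models never share GPU memory.
    del blip_model, blip_processor
    torch.cuda.empty_cache()
    gc.collect()

    # Stage 2: load CLIP only while scoring image-text similarity.
    clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to(device)
    clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
    text_inputs = clip_processor(text=[prompt], return_tensors="pt").to(device)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**text_inputs)
    similarities = []
    for frame in frames:
        image_inputs = clip_processor(images=frame, return_tensors="pt").to(device)
        with torch.no_grad():
            image_features = clip_model.get_image_features(**image_inputs)
        similarities.append(torch.nn.functional.cosine_similarity(
            image_features, text_features).item())
    # Release CLIP as well before any later processing.
    del clip_model, clip_processor
    torch.cuda.empty_cache()
    gc.collect()

    return captions, similarities

app.py itself rebuilds the CLIP prompt for every frame from the detected stance plus the user's height, weight, and wingspan; the single prompt here only keeps the sketch short.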