JulianPhillips committed
Commit e4107bd · verified · 1 Parent(s): 9dcf517

Update app.py

Files changed (1)
  1. app.py  +8 -112
app.py CHANGED
@@ -8,8 +8,6 @@ import os
 import requests
 from tempfile import NamedTemporaryFile
 import gc
-
-
 import tensorflow_hub as hub
 
 # Ensure that Hugging Face uses the appropriate cache directory
@@ -26,12 +24,12 @@ else:
 movenet_model = tf.saved_model.load(movenet_model_path)
 
 # Load BLIP model
-blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-large')
-blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-large')
+blip_model = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-base')
+blip_processor = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-base')
 
 # Load CLIP model
-clip_model = CLIPModel.from_pretrained('openai/clip-vit-large-patch14')
-clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-large-patch14')
+clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
+clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
 
 # Keypoint dictionary for reference
 KEYPOINT_DICT = {
@@ -68,15 +66,13 @@ def process_video():
     weight = request.json.get('weight')
     wingspan = request.json.get('wingspan')
 
-
+
     if not video_url:
         return jsonify({"error": "No video URL provided"}), 400
 
-
     if not all([height, weight, wingspan]):
         return jsonify({"error": "Height, weight, and wingspan are required"}), 400
 
-
     # Download the video from the S3 URL
     with NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video_file:
         response = requests.get(video_url)
@@ -122,7 +118,6 @@ def process_video():
         keypoints_3d = keypoints['output_0'][0].numpy().tolist()  # Assuming the model returns 3D keypoints
         movenet_results.append(keypoints_3d)
 
-
         # Detect stance based on keypoints (using ankles and wrists)
         left_ankle = keypoints_3d[KEYPOINT_DICT['left_ankle']]
         right_ankle = keypoints_3d[KEYPOINT_DICT['right_ankle']]
@@ -144,91 +139,6 @@ def process_video():
         right_hand_near_head = abs(right_wrist[1] - nose[1]) < guard_threshold
         guard_up.append(left_hand_near_head and right_hand_near_head)
 
-        # Determine if the punch has started (based on wrist movement)
-        if frame_index > 0:
-            previous_left_wrist = movenet_results[frame_index - 1][KEYPOINT_DICT['left_wrist']]
-            previous_right_wrist = movenet_results[frame_index - 1][KEYPOINT_DICT['right_wrist']]
-
-            if stance == "orthodox" and (left_wrist[0] - previous_left_wrist[0]) > 0.05:
-                punch_started = True
-                if initial_left_wrist is None:
-                    initial_left_wrist = left_wrist
-            elif stance == "southpaw" and (right_wrist[0] - previous_right_wrist[0]) > 0.05:
-                punch_started = True
-                if initial_right_wrist is None:
-                    initial_right_wrist = right_wrist
-
-        # Detect hip rotation (based on left and right hips, considering stance and punch start)
-        left_hip = keypoints_3d[KEYPOINT_DICT['left_hip']]
-        right_hip = keypoints_3d[KEYPOINT_DICT['right_hip']]
-        if punch_started:
-            if stance == "orthodox":
-                hip_rotation = right_hip[0] - left_hip[0]  # Right hip should move forward
-            elif stance == "southpaw":
-                hip_rotation = left_hip[0] - right_hip[0]  # Left hip should move forward
-            else:
-                hip_rotation = 0
-        else:
-            hip_rotation = 0
-        hip_rotations.append(hip_rotation)
-
-        # Detect full arm extension (based on shoulder, elbow, and wrist, considering stance)
-        left_shoulder = keypoints_3d[KEYPOINT_DICT['left_shoulder']]
-        left_elbow = keypoints_3d[KEYPOINT_DICT['left_elbow']]
-        right_shoulder = keypoints_3d[KEYPOINT_DICT['right_shoulder']]
-        right_elbow = keypoints_3d[KEYPOINT_DICT['right_elbow']]
-
-        if stance == "orthodox":
-            lead_arm_extension = np.linalg.norm(np.array(left_wrist) - np.array(left_shoulder))
-        elif stance == "southpaw":
-            lead_arm_extension = np.linalg.norm(np.array(right_wrist) - np.array(right_shoulder))
-        else:
-            lead_arm_extension = 0
-        arm_extensions.append(lead_arm_extension)
-
-        # Detect stepping with the jab and coming back (based on ankles, considering stance and punch start)
-        if punch_started and frame_index > 0:
-            previous_left_ankle = movenet_results[frame_index - 1][KEYPOINT_DICT['left_ankle']]
-            previous_right_ankle = movenet_results[frame_index - 1][KEYPOINT_DICT['right_ankle']]
-
-            if stance == "orthodox":
-                step_movement = (left_ankle[0] - previous_left_ankle[0]) > 0.05  # Lead foot is left
-            elif stance == "southpaw":
-                step_movement = (right_ankle[0] - previous_right_ankle[0]) > 0.05  # Lead foot is right
-            else:
-                step_movement = False
-            stepping_jabs.append(step_movement)
-        else:
-            stepping_jabs.append(False)
-
-        # Detect if the hand returns to the initial position after the punch
-        if punch_started:
-            if stance == "orthodox" and initial_left_wrist is not None:
-                hand_returned.append(np.linalg.norm(np.array(left_wrist) - np.array(initial_left_wrist)) < 0.05)
-            elif stance == "southpaw" and initial_right_wrist is not None:
-                hand_returned.append(np.linalg.norm(np.array(right_wrist) - np.array(initial_right_wrist)) < 0.05)
-            else:
-                hand_returned.append(False)
-        else:
-            hand_returned.append(False)
-
-        # Detect if hips are shoulder width apart
-        left_shoulder = keypoints_3d[KEYPOINT_DICT['left_shoulder']]
-        right_shoulder = keypoints_3d[KEYPOINT_DICT['right_shoulder']]
-        shoulder_width = abs(left_shoulder[0] - right_shoulder[0])
-        hips_width = abs(left_hip[0] - right_hip[0])
-        hips_width_apart.append(hips_width > 0.9 * shoulder_width and hips_width < 1.1 * shoulder_width)
-
-        # Detect if the back leg is at a 45 degree angle outward (for orthodox and southpaw)
-        if stance == "orthodox":
-            right_leg_angle = np.arctan2(right_ankle[1] - right_hip[1], right_ankle[0] - right_hip[0]) * 180 / np.pi
-            leg_angle_correct.append(40 <= right_leg_angle <= 50)
-        elif stance == "southpaw":
-            left_leg_angle = np.arctan2(left_ankle[1] - left_hip[1], left_ankle[0] - left_hip[0]) * 180 / np.pi
-            leg_angle_correct.append(40 <= left_leg_angle <= 50)
-        else:
-            leg_angle_correct.append(False)
-
     # Generate captions for all 60 frames using BLIP
     captions = []
     for frame in frames:
@@ -241,7 +151,7 @@ def process_video():
     clip_results = []
     for i, frame in enumerate(frames):
         stance = stances[i]
-        prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm, with hip rotation of {hip_rotations[i]:.2f}, arm extension of {arm_extensions[i]:.2f}, {'stepping forward' if stepping_jabs[i] else 'not stepping'}, {'guard up' if guard_up[i] else 'guard down'}, {'hand returned to initial position' if hand_returned[i] else 'hand not returned'}, {'hips shoulder width apart' if hips_width_apart[i] else 'hips not shoulder width apart'}, and {'correct leg angle' if leg_angle_correct[i] else 'incorrect leg angle'}"
+        prompt = f"A person performing a Muay Thai jab in {stance} stance at {height} in in height, {weight} lbs in weight, and a wingspan of {wingspan} cm."
         text_inputs = clip_processor(text=[prompt], return_tensors="pt")
         image_inputs = clip_processor(images=frame, return_tensors="pt")
         with torch.no_grad():
@@ -253,10 +163,7 @@ def process_video():
     # Calculate score based on CLIP results and BLIP captions
    avg_clip_similarity = sum(clip_results) / len(clip_results) if clip_results else 0
    guard_score = sum(guard_up) / len(guard_up) if guard_up else 0
-    hand_return_score = sum(hand_returned) / len(hand_returned) if hand_returned else 0
-    hips_width_score = sum(hips_width_apart) / len(hips_width_apart) if hips_width_apart else 0
-    leg_angle_score = sum(leg_angle_correct) / len(leg_angle_correct) if leg_angle_correct else 0
-    overall_score = (avg_clip_similarity + guard_score + hand_return_score + hips_width_score + leg_angle_score) / 5
+    overall_score = (avg_clip_similarity + guard_score) / 2
 
     # Scale the overall score to a range of 0 - 10
     overall_score = max(0, min(overall_score * 10, 10))
@@ -267,24 +174,13 @@ def process_video():
         "blip_captions": captions,
         "clip_similarities": clip_results,
         "stances": stances,
-        "hip_rotations": hip_rotations,
-        "arm_extensions": arm_extensions,
-        "stepping_jabs": stepping_jabs,
-        "hips_width_apart": hips_width_apart,
-        "leg_angle_correct": leg_angle_correct,
         "overall_score": overall_score,
-        "guard_score": guard_score,
-        "hand_return_score": hand_return_score,
-        "hips_width_score": hips_width_score,
-        "leg_angle_score": leg_angle_score,
+        "guard_score": guard_score
     }
     return jsonify(response)
 except Exception as e:
     return jsonify({"error": str(e)}), 500
 
-# if __name__ == '__main__':
-#     app.run(host='0.0.0.0', port=7860)
-
 if __name__ == '__main__':
     # Clear any cache before starting the Flask server
     gc.collect()
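
This commit simplifies the prompt passed to CLIP, but the similarity computation itself sits in lines the diff does not touch, so it is not visible above. The sketch below shows one plausible way a frame could be scored against the new prompt, assuming cosine similarity between CLIP text and image embeddings; the helper name `clip_frame_similarity` is hypothetical and not part of app.py.

```python
import torch
from transformers import CLIPModel, CLIPProcessor

# Hypothetical helper illustrating one way to score a frame against the prompt;
# the actual computation in app.py lies outside the lines shown in this diff.
def clip_frame_similarity(clip_model, clip_processor, frame, prompt):
    text_inputs = clip_processor(text=[prompt], return_tensors="pt")
    image_inputs = clip_processor(images=frame, return_tensors="pt")
    with torch.no_grad():
        text_emb = clip_model.get_text_features(**text_inputs)
        image_emb = clip_model.get_image_features(**image_inputs)
    # Cosine similarity of the two embeddings; higher means a closer prompt/frame match.
    return torch.nn.functional.cosine_similarity(text_emb, image_emb).item()
```

Scores produced this way would populate `clip_results`, one value per frame.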
 
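With the feature-specific scores removed, only two signals remain in the overall score: the average CLIP similarity across frames and the fraction of frames where the guard is up. A minimal sketch of that scoring path, using made-up per-frame values in place of real results:

```python
# Made-up per-frame values standing in for real results from app.py.
clip_results = [0.31, 0.28, 0.35]   # per-frame CLIP prompt/frame similarities
guard_up = [True, True, False]      # per-frame "both hands near the head" flags

avg_clip_similarity = sum(clip_results) / len(clip_results) if clip_results else 0
guard_score = sum(guard_up) / len(guard_up) if guard_up else 0
overall_score = (avg_clip_similarity + guard_score) / 2

# Scale to 0-10 and clamp, as in the updated app.py.
overall_score = max(0, min(overall_score * 10, 10))
print(round(overall_score, 2))  # ≈ 4.9 for these values
```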