David Driscoll committed
Commit: 473b2d5
Parent(s): f3de933

fix emotion, output vector

app.py CHANGED
@@ -6,9 +6,8 @@ from torchvision import models, transforms
 from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
 from PIL import Image
 import mediapipe as mp
-
-
-from transformers import AutoImageProcessor, AutoModelForImageClassification
+from fer import FER  # Facial emotion recognition
+from transformers import AutoFeatureExtractor, AutoModel
 
 # -----------------------------
 # Configuration
@@ -28,6 +27,7 @@ faces_cache = {"boxes": None, "text": "Initializing...", "counter": 0}
 # -----------------------------
 # Initialize Models and Helpers
 # -----------------------------
+# MediaPipe Pose and Face Detection
 mp_pose = mp.solutions.pose
 pose = mp_pose.Pose()
 mp_drawing = mp.solutions.drawing_utils
@@ -35,22 +35,27 @@ mp_drawing = mp.solutions.drawing_utils
 mp_face_detection = mp.solutions.face_detection
 face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5)
 
+# Object Detection using Faster R-CNN
 object_detection_model = models.detection.fasterrcnn_resnet50_fpn(
     weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT
 )
 object_detection_model.eval().to(device)
 obj_transform = transforms.Compose([transforms.ToTensor()])
 
-# Initialize the
-
-emotion_processor = AutoImageProcessor.from_pretrained("nateraw/fer")
-emotion_model = AutoModelForImageClassification.from_pretrained("nateraw/fer")
-emotion_model.to(device)
-emotion_model.eval()
+# Initialize the FER emotion detector (using the FER package)
+emotion_detector = FER(mtcnn=True)
 
 # Retrieve object categories from model weights metadata
 object_categories = FasterRCNN_ResNet50_FPN_Weights.DEFAULT.meta["categories"]
 
+# -----------------------------
+# Facial Recognition Model (DINO-ViT)
+# -----------------------------
+facial_recognition_extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vitb16")
+facial_recognition_model = AutoModel.from_pretrained("facebook/dino-vitb16")
+facial_recognition_model.to(device)
+facial_recognition_model.eval()
+
 # -----------------------------
 # Overlay Drawing Functions
 # -----------------------------
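The hunk above wires up the two new models: the FER detector (with MTCNN face detection) and the facebook/dino-vitb16 backbone used later for face embeddings. A minimal standalone smoke test of both, assuming `fer`, `transformers`, and `torch` are installed; the blank frame is only a stand-in, not a real face image:

import numpy as np
import torch
from fer import FER
from transformers import AutoFeatureExtractor, AutoModel

detector = FER(mtcnn=True)
extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vitb16")
model = AutoModel.from_pretrained("facebook/dino-vitb16").eval()

frame = np.zeros((224, 224, 3), dtype=np.uint8)  # stand-in RGB frame, contains no face
print(detector.detect_emotions(frame))           # [] -> no face found in the blank frame

inputs = extractor(images=frame, return_tensors="pt")
with torch.no_grad():
    embedding = model(**inputs).last_hidden_state.mean(dim=1)
print(embedding.shape)  # torch.Size([1, 768]) for dino-vitb16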
@@ -96,36 +101,14 @@ def compute_posture_overlay(image):
     return landmarks, text
 
 def compute_emotion_overlay(image):
-    """
-    This function mimics the original FER-based expression recognition,
-    but uses a Hugging Face emotion model instead.
-    """
+    # Use the FER package (exactly as in your provided code)
     frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
     frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
     frame_rgb_small = cv2.cvtColor(frame_bgr_small, cv2.COLOR_BGR2RGB)
-
-    face_results = face_detection.process(frame_rgb_small)
-
-    if face_results.detections:
-        detection = face_results.detections[0]
-        bbox = detection.location_data.relative_bounding_box
-        h, w, _ = frame_rgb_small.shape
-        x = int(bbox.xmin * w)
-        y = int(bbox.ymin * h)
-        box_w = int(bbox.width * w)
-        box_h = int(bbox.height * h)
-        face_crop = frame_rgb_small[y:y+box_h, x:x+box_w]
-        face_image = Image.fromarray(face_crop)
-
-        # Process face crop with the Hugging Face emotion model
-        inputs = emotion_processor(face_image, return_tensors="pt").to(device)
-        with torch.no_grad():
-            outputs = emotion_model(**inputs)
-        logits = outputs.logits
-        probs = torch.softmax(logits, dim=-1)
-        score, pred = torch.max(probs, dim=-1)
-        label = emotion_model.config.id2label[pred.item()]
-        text = f"{label} ({score.item():.2f})"
+    emotions = emotion_detector.detect_emotions(frame_rgb_small)
+    if emotions:
+        top_emotion, score = max(emotions[0]["emotions"].items(), key=lambda x: x[1])
+        text = f"{top_emotion} ({score:.2f})"
     else:
         text = "No face detected"
     return text
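For reference, `FER.detect_emotions` returns one dict per detected face with a bounding `box` and an `emotions` score dict; the `max(...)` call above reduces that dict to a single label. An illustrative reduction with made-up scores:

emotions = [{
    "box": [64, 48, 128, 128],
    "emotions": {"angry": 0.02, "disgust": 0.01, "fear": 0.03, "happy": 0.81,
                 "sad": 0.04, "surprise": 0.05, "neutral": 0.04},
}]
top_emotion, score = max(emotions[0]["emotions"].items(), key=lambda x: x[1])
print(f"{top_emotion} ({score:.2f})")  # happy (0.81)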
@@ -172,6 +155,37 @@ def compute_faces_overlay(image):
         text = "No faces detected"
     return boxes, text
 
+def compute_facial_recognition_vector(image):
+    """
+    Detects a face using MediaPipe, crops it, and computes its embedding vector
+    using facebook/dino-vitb16. The raw vector is returned as a string.
+    """
+    frame_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+    frame_bgr_small = cv2.resize(frame_bgr, DESIRED_SIZE)
+    frame_rgb_small = cv2.cvtColor(frame_bgr_small, cv2.COLOR_BGR2RGB)
+    face_results = face_detection.process(frame_rgb_small)
+    if face_results.detections:
+        detection = face_results.detections[0]
+        bbox = detection.location_data.relative_bounding_box
+        h, w, _ = frame_rgb_small.shape
+        x = int(bbox.xmin * w)
+        y = int(bbox.ymin * h)
+        box_w = int(bbox.width * w)
+        box_h = int(bbox.height * h)
+        face_crop = frame_rgb_small[y:y+box_h, x:x+box_w]
+        face_image = Image.fromarray(face_crop)
+        inputs = facial_recognition_extractor(face_image, return_tensors="pt").to(device)
+        with torch.no_grad():
+            outputs = facial_recognition_model(**inputs)
+        # Mean pooling of the last hidden state to obtain a vector representation
+        vector = outputs.last_hidden_state.mean(dim=1).squeeze()
+        vector_np = vector.cpu().numpy()
+        # Format vector as a string with limited decimal places
+        vector_str = np.array2string(vector_np, precision=2, separator=',')
+        return face_crop, vector_str
+    else:
+        return np.array(image), "No face detected"
+
 # -----------------------------
 # Main Analysis Functions for Single Image
 # -----------------------------
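`compute_facial_recognition_vector` stringifies the embedding for display only; any numeric use would start from the underlying array. A sketch (not part of this commit) of how two such DINO-ViT embeddings could be compared with cosine similarity, where `vec_a` and `vec_b` stand for hypothetical 768-dimensional NumPy arrays taken before the `np.array2string` step:

import numpy as np

def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    # Values near 1.0 suggest the two face crops are close in embedding space.
    return float(np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-8))

vec_a = np.random.rand(768)  # placeholder embeddings for illustration only
vec_b = np.random.rand(768)
print(cosine_similarity(vec_a, vec_b))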
@@ -225,6 +239,11 @@ def analyze_faces_current(image):
     output = draw_boxes_overlay(output, faces_cache["boxes"], (0, 0, 255))
     return output, f"<div style='color: lime !important;'>Face Detection: {faces_cache['text']}</div>"
 
+def analyze_facial_recognition(image):
+    # Compute and return the facial vector (and the cropped face)
+    face_crop, vector_str = compute_facial_recognition_vector(image)
+    return face_crop, f"<div style='color: lime !important;'>Facial Vector: {vector_str}</div>"
+
 def analyze_all(image):
     current_frame = np.array(image).copy()
     # Posture Analysis
@@ -304,7 +323,7 @@ emotion_interface = gr.Interface(
     inputs=gr.Image(label="Upload an Image for Emotion Analysis"),
     outputs=[gr.Image(type="numpy", label="Annotated Output"), gr.HTML(label="Emotion Analysis")],
     title="Emotion",
-    description="Detects facial emotions using
+    description="Detects facial emotions using FER.",
     live=False
 )
 
@@ -326,6 +345,15 @@ faces_interface = gr.Interface(
     live=False
 )
 
+facial_recognition_interface = gr.Interface(
+    fn=analyze_facial_recognition,
+    inputs=gr.Image(label="Upload a Face Image for Facial Recognition"),
+    outputs=[gr.Image(type="numpy", label="Cropped Face"), gr.HTML(label="Facial Recognition")],
+    title="Facial Recognition",
+    description="Extracts and outputs the facial vector using facebook/dino-vitb16.",
+    live=False
+)
+
 all_interface = gr.Interface(
     fn=analyze_all,
     inputs=gr.Image(label="Upload an Image for All Inferences"),
@@ -336,8 +364,22 @@ all_interface = gr.Interface(
 )
 
 tabbed_interface = gr.TabbedInterface(
-    interface_list=[
-
+    interface_list=[
+        posture_interface,
+        emotion_interface,
+        objects_interface,
+        faces_interface,
+        facial_recognition_interface,
+        all_interface
+    ],
+    tab_names=[
+        "Posture",
+        "Emotion",
+        "Objects",
+        "Faces",
+        "Facial Recognition",
+        "All Inferences"
+    ]
 )
 
 # -----------------------------
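In `gr.TabbedInterface`, `interface_list` and `tab_names` pair up by position, so the new Facial Recognition tab sits fifth in both lists. A minimal, self-contained sketch of that pairing, assuming only that `gradio` is installed:

import gradio as gr

upper = gr.Interface(fn=lambda s: s.upper(), inputs="text", outputs="text")
lower = gr.Interface(fn=lambda s: s.lower(), inputs="text", outputs="text")

tabs = gr.TabbedInterface(
    interface_list=[upper, lower],  # tab i renders interface_list[i]
    tab_names=["Upper", "Lower"],   # and is labelled tab_names[i]
)

if __name__ == "__main__":
    tabs.launch()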
@@ -346,7 +388,7 @@ tabbed_interface = gr.TabbedInterface(
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.Markdown("<h1 class='gradio-title' style='color: #32CD32;'>Multi-Analysis Image App</h1>")
-    gr.Markdown("<p class='gradio-description' style='color: #32CD32;'>Upload an image to run high-tech analysis for posture, emotions, objects, and
+    gr.Markdown("<p class='gradio-description' style='color: #32CD32;'>Upload an image to run high-tech analysis for posture, emotions, objects, faces, and facial embeddings.</p>")
     tabbed_interface.render()
 
 if __name__ == "__main__":