# Hugging Face Space: ddriscoll/SOC3242-01_Group_3_Interactive
# Space status when this file was captured: Sleeping
# File size: 13,012 bytes

import gradio as gr
import cv2
import numpy as np
import torch
from PIL import Image
import mediapipe as mp

from transformers import (
    AutoFeatureExtractor,
    AutoModel,
    AutoImageProcessor,
    AutoModelForImageClassification,
    AutoModelForSemanticSegmentation
)

# -----------------------------
# Configuration & Device Setup
# -----------------------------
# Run inference on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Size handed to cv2.resize (its dsize argument, i.e. width then height)
# before face detection is run on a frame.
DESIRED_SIZE = (640, 480)

# -----------------------------
# Initialize Mediapipe Face Detection
# -----------------------------
# Handle on MediaPipe's face-detection solution module.
mp_face_detection = mp.solutions.face_detection

# One detector instance, created at import time and reused by the
# face-cropping helpers below.
face_detection = mp_face_detection.FaceDetection(
    min_detection_confidence=0.5,
)

# -----------------------------
# Load New Models from Hugging Face
# -----------------------------

# NOTE(review): every checkpoint below is downloaded from the Hugging Face Hub
# at import time. Confirm that each repo id exists and ships a config that the
# corresponding Auto* class can load -- a missing repo makes the whole app
# fail on start-up.

# 1. Facial Recognition & Identification (facebook/dino-vitb16)
# AutoImageProcessor is used instead of the deprecated vision
# AutoFeatureExtractor; it is called the same way by the inference helper.
facial_recognition_extractor = AutoImageProcessor.from_pretrained("facebook/dino-vitb16")
facial_recognition_model = AutoModel.from_pretrained("facebook/dino-vitb16")
facial_recognition_model.to(device)
facial_recognition_model.eval()

# Dummy database for demonstration only: the "enrolled" embeddings are random
# vectors, so any match is meaningless. Their length is taken from the model
# config so it always equals the size of the mean-pooled hidden state.
_embedding_dim = facial_recognition_model.config.hidden_size
dummy_database = {
    "Alice": torch.randn(_embedding_dim, device=device),
    "Bob": torch.randn(_embedding_dim, device=device),
}

# 2. Emotion Detection (nateraw/facial-expression-recognition)
emotion_processor = AutoImageProcessor.from_pretrained("nateraw/facial-expression-recognition")
emotion_model = AutoModelForImageClassification.from_pretrained("nateraw/facial-expression-recognition")
emotion_model.to(device)
emotion_model.eval()

# 3. Age & Gender Prediction (oayu/age-gender-estimation)
age_gender_processor = AutoImageProcessor.from_pretrained("oayu/age-gender-estimation")
age_gender_model = AutoModelForImageClassification.from_pretrained("oayu/age-gender-estimation")
age_gender_model.to(device)
age_gender_model.eval()

# 4. Face Parsing (hila-chefer/face-parsing)
face_parsing_processor = AutoImageProcessor.from_pretrained("hila-chefer/face-parsing")
face_parsing_model = AutoModelForSemanticSegmentation.from_pretrained("hila-chefer/face-parsing")
face_parsing_model.to(device)
face_parsing_model.eval()

# 5. Deepfake Detection (microsoft/FaceForensics)
deepfake_processor = AutoImageProcessor.from_pretrained("microsoft/FaceForensics")
deepfake_model = AutoModelForImageClassification.from_pretrained("microsoft/FaceForensics")
deepfake_model.to(device)
deepfake_model.eval()

# -----------------------------
# Helper Functions for New Inferences
# -----------------------------

def _crop_first_face(frame):
    """Resize *frame* to DESIRED_SIZE and crop the first face MediaPipe finds.

    Args:
        frame: Image array as delivered by the Gradio image input.

    Returns:
        The cropped face as an RGB array, or None when no face is detected
        or the detected box has no area inside the frame.
    """
    # The detector is fed a 3-channel RGB frame; convert other layouts first.
    if frame.ndim == 2:
        frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
    elif frame.shape[2] == 4:
        frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
    frame_rgb = cv2.resize(frame, DESIRED_SIZE)

    face_results = face_detection.process(frame_rgb)
    if not face_results.detections:
        return None

    bbox = face_results.detections[0].location_data.relative_bounding_box
    h, w = frame_rgb.shape[:2]
    left = int(bbox.xmin * w)
    top = int(bbox.ymin * h)
    right = left + int(bbox.width * w)
    bottom = top + int(bbox.height * h)

    # The relative box can extend past the frame for faces near an edge.
    # A negative start index would wrap around when slicing, so clamp first.
    left, top = max(left, 0), max(top, 0)
    right, bottom = min(right, w), min(bottom, h)
    if right <= left or bottom <= top:
        return None
    return frame_rgb[top:bottom, left:right]


def _classify_face(face_crop, processor, model):
    """Return the ``id2label`` name of the top-scoring class for *face_crop*."""
    inputs = processor(Image.fromarray(face_crop), return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return model.config.id2label[logits.argmax(-1).item()]


def compute_facial_recognition(image, threshold=0.7):
    """
    Detects a face using MediaPipe, crops it, and computes its embedding with DINO-ViT.
    Compares the embedding against a dummy database to "identify" the person.

    Args:
        image: Input image (array-like), or None when nothing was uploaded.
        threshold: Cosine similarity a database entry must exceed to count
            as an identification.

    Returns:
        ``(image_array_or_None, message)``: the cropped face and the match
        summary, or the unmodified input and "No face detected", or
        ``(None, "No image provided")`` for a missing input.
    """
    if image is None:
        return None, "No image provided"

    frame = np.array(image)
    face_crop = _crop_first_face(frame)
    if face_crop is None:
        return frame, "No face detected"

    inputs = facial_recognition_extractor(
        Image.fromarray(face_crop), return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        outputs = facial_recognition_model(**inputs)
    # Use mean pooling over the last hidden state to get an embedding vector
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()

    # Compare against dummy database using cosine similarity
    best_score = -1.0
    best_name = "Unknown"
    for name, db_emb in dummy_database.items():
        cos_sim = torch.nn.functional.cosine_similarity(embeddings, db_emb, dim=0).item()
        if cos_sim > best_score:
            best_score = cos_sim
            best_name = name

    if best_score > threshold:
        result = f"Identified as {best_name} (sim: {best_score:.2f})"
    else:
        result = f"No match found (best: {best_name}, sim: {best_score:.2f})"
    return face_crop, result


def compute_emotion_detection(image):
    """
    Detects a face, crops it, and classifies the facial expression.

    Returns:
        ``(image_array_or_None, message)``: the cropped face and
        "Emotion: <label>", or the unmodified input and "No face detected",
        or ``(None, "No image provided")`` for a missing input.
    """
    if image is None:
        return None, "No image provided"

    frame = np.array(image)
    face_crop = _crop_first_face(frame)
    if face_crop is None:
        return frame, "No face detected"

    label = _classify_face(face_crop, emotion_processor, emotion_model)
    return face_crop, f"Emotion: {label}"


def compute_age_gender(image):
    """
    Detects a face, crops it, and predicts the age & gender.

    Returns:
        ``(image_array_or_None, message)``: the cropped face and
        "Age & Gender: <label>", or the unmodified input and
        "No face detected", or ``(None, "No image provided")`` for a
        missing input.
    """
    if image is None:
        return None, "No image provided"

    frame = np.array(image)
    face_crop = _crop_first_face(frame)
    if face_crop is None:
        return frame, "No face detected"

    label = _classify_face(face_crop, age_gender_processor, age_gender_model)
    return face_crop, f"Age & Gender: {label}"

def compute_face_parsing(image):
    """
    Runs face parsing (segmentation) on the provided image.

    Args:
        image: Input image (array-like), or None when nothing was uploaded.

    Returns:
        ``(mask_or_None, message)``: an RGB colour-mapped class mask at the
        resolution of the model's logits and "Face Parsing completed", or
        ``(None, "No image provided")`` for a missing input.
    """
    if image is None:
        return None, "No image provided"

    image_pil = Image.fromarray(np.array(image))
    inputs = face_parsing_processor(image_pil, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = face_parsing_model(**inputs)
    logits = outputs.logits  # shape: (batch, num_labels, H, W)
    segmentation = logits.argmax(dim=1)[0].cpu().numpy()

    # For visualization, scale the class ids to 0-255 and apply a color map.
    # The epsilon keeps the division defined when every pixel is class 0.
    segmentation_norm = np.uint8(255 * segmentation / (segmentation.max() + 1e-5))
    segmentation_bgr = cv2.applyColorMap(segmentation_norm, cv2.COLORMAP_JET)
    # applyColorMap produces BGR, while the rest of this app works in RGB
    # (see the COLOR_BGR2RGB conversions above); convert so red and blue
    # are not swapped on screen.
    segmentation_color = cv2.cvtColor(segmentation_bgr, cv2.COLOR_BGR2RGB)
    return segmentation_color, "Face Parsing completed"

def compute_deepfake_detection(image):
    """
    Runs deepfake detection on the image.

    Args:
        image: Input image (array-like), or None when nothing was uploaded.

    Returns:
        ``(image_array_or_None, message)``: the input as an array and
        "Deepfake Detection: <label>", where the label is the classifier's
        top-scoring ``id2label`` entry, or ``(None, "No image provided")``
        for a missing input.
    """
    if image is None:
        return None, "No image provided"

    # Convert once and reuse the array for both inference and the return value.
    frame = np.array(image)
    inputs = deepfake_processor(Image.fromarray(frame), return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = deepfake_model(**inputs)
    pred = outputs.logits.argmax(-1).item()
    label = deepfake_model.config.id2label[pred]
    return frame, f"Deepfake Detection: {label}"

# -----------------------------
# Analysis Functions (Wrapping Inference & Green Text)
# -----------------------------

def analyze_facial_recognition(image):
    """Run facial recognition and wrap its message in lime-coloured HTML."""
    face_output, message = compute_facial_recognition(image)
    html = f"<div style='color: lime !important;'>Facial Recognition: {message}</div>"
    return face_output, html

def analyze_emotion_detection(image):
    """Run emotion detection and wrap its message in lime-coloured HTML."""
    face_output, message = compute_emotion_detection(image)
    html = f"<div style='color: lime !important;'>{message}</div>"
    return face_output, html

def analyze_age_gender(image):
    """Run age & gender prediction and wrap its message in lime-coloured HTML."""
    face_output, message = compute_age_gender(image)
    html = f"<div style='color: lime !important;'>{message}</div>"
    return face_output, html

def analyze_face_parsing(image):
    """Run face parsing and wrap its status message in lime-coloured HTML."""
    mask, message = compute_face_parsing(image)
    html = f"<div style='color: lime !important;'>{message}</div>"
    return mask, html

def analyze_deepfake_detection(image):
    """Run deepfake detection and wrap its message in lime-coloured HTML."""
    shown_image, message = compute_deepfake_detection(image)
    html = f"<div style='color: lime !important;'>{message}</div>"
    return shown_image, html

# -----------------------------
# Custom CSS (All Text in Green)
# -----------------------------
# Stylesheet handed to gr.Blocks(css=...) when the page is assembled below.
# It imports the Orbitron font from Google Fonts, sets a dark page background,
# and colours text, borders and glows lime green (#32CD32). The
# .gradio-title / .gradio-description selectors match the classes put on the
# heading and paragraph Markdown elements of the page.
# The string is sent to the browser verbatim, so it is left untouched here.
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@400;700&display=swap');
body {
    background-color: #0e0e0e;
    font-family: 'Orbitron', sans-serif;
    margin: 0;
    padding: 0;
    color: #32CD32;
}
.gradio-container {
    background: linear-gradient(135deg, #1a1a1a, #333333);
    border: 2px solid #32CD32;
    box-shadow: 0 0 15px #32CD32;
    border-radius: 10px;
    padding: 20px;
    max-width: 1200px;
    margin: auto;
}
.gradio-title, .gradio-description, .tab-item, .tab-item * {
    color: #32CD32 !important;
    text-shadow: 0 0 10px #32CD32;
}
input, button, .output {
    border: 1px solid #32CD32;
    box-shadow: 0 0 8px #32CD32;
    color: #32CD32;
}
"""

# -----------------------------
# Create Gradio Interfaces for New Models
# -----------------------------
def _image_analysis_interface(fn, input_label, image_label, html_label, title, description):
    """Build a non-live Interface: one image in, an image plus an HTML result out."""
    return gr.Interface(
        fn=fn,
        inputs=gr.Image(label=input_label),
        outputs=[
            gr.Image(type="numpy", label=image_label),
            gr.HTML(label=html_label),
        ],
        title=title,
        description=description,
        live=False,
    )


# One interface per analysis; each becomes a tab further down.
facial_recognition_interface = _image_analysis_interface(
    analyze_facial_recognition,
    "Upload a Face Image for Facial Recognition",
    "Cropped Face / Embedding Visualization",
    "Facial Recognition Result",
    "Facial Recognition & Identification",
    "Extracts facial embeddings using facebook/dino-vitb16 and identifies the face by comparing against a dummy database.",
)

emotion_interface = _image_analysis_interface(
    analyze_emotion_detection,
    "Upload a Face Image for Emotion Detection",
    "Cropped Face",
    "Emotion Detection",
    "Emotion Detection",
    "Classifies the facial expression using nateraw/facial-expression-recognition.",
)

age_gender_interface = _image_analysis_interface(
    analyze_age_gender,
    "Upload a Face Image for Age & Gender Prediction",
    "Cropped Face",
    "Age & Gender Prediction",
    "Age & Gender Prediction",
    "Predicts age and gender from the face using oayu/age-gender-estimation.",
)

face_parsing_interface = _image_analysis_interface(
    analyze_face_parsing,
    "Upload a Face Image for Face Parsing",
    "Segmentation Overlay",
    "Face Parsing",
    "Face Parsing",
    "Segments face regions (eyes, nose, lips, hair, etc.) using hila-chefer/face-parsing.",
)

deepfake_interface = _image_analysis_interface(
    analyze_deepfake_detection,
    "Upload an Image for Deepfake Detection",
    "Input Image",
    "Deepfake Detection",
    "Deepfake Detection",
    "Detects manipulated or deepfake images using microsoft/FaceForensics.",
)

# -----------------------------
# Create a Tabbed Interface
# -----------------------------
# Tab label paired with the interface shown under it, in display order.
_tabs = [
    ("Facial Recognition", facial_recognition_interface),
    ("Emotion Detection", emotion_interface),
    ("Age & Gender", age_gender_interface),
    ("Face Parsing", face_parsing_interface),
    ("Deepfake Detection", deepfake_interface),
]

tabbed_interface = gr.TabbedInterface(
    interface_list=[interface for _, interface in _tabs],
    tab_names=[tab_name for tab_name, _ in _tabs],
)

# -----------------------------
# Wrap in a Blocks Layout & Launch
# -----------------------------
# Page layout: a heading and a short description above the tabbed analyses,
# all styled by custom_css.
_heading_html = "<h1 class='gradio-title' style='color: #32CD32;'>Multi-Analysis Face App</h1>"
_intro_html = "<p class='gradio-description' style='color: #32CD32;'>Upload an image to run advanced face analysis using state-of-the-art Hugging Face models.</p>"

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown(_heading_html)
    gr.Markdown(_intro_html)
    tabbed_interface.render()

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()