import gradio as gr
import cv2
import numpy as np
import torch
from PIL import Image
import mediapipe as mp
from transformers import (
    AutoFeatureExtractor,
    AutoModel,
    AutoImageProcessor,
    AutoModelForImageClassification,
    AutoModelForSemanticSegmentation
)

# -----------------------------
# Configuration & Device Setup
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DESIRED_SIZE = (640, 480)

# -----------------------------
# Initialize Mediapipe Face Detection
# -----------------------------
mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5)

# -----------------------------
# Load New Models from Hugging Face
# -----------------------------
# 1. Facial Recognition & Identification (facebook/dino-vitb16)
facial_recognition_extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vitb16")
facial_recognition_model = AutoModel.from_pretrained("facebook/dino-vitb16")
facial_recognition_model.to(device)
facial_recognition_model.eval()

# Create a dummy database for demonstration (embeddings of dimension 768 assumed)
dummy_database = {
    "Alice": torch.randn(768).to(device),
    "Bob": torch.randn(768).to(device)
}

# 2. Emotion Detection (nateraw/facial-expression-recognition)
emotion_processor = AutoImageProcessor.from_pretrained("nateraw/facial-expression-recognition")
emotion_model = AutoModelForImageClassification.from_pretrained("nateraw/facial-expression-recognition")
emotion_model.to(device)
emotion_model.eval()

# 3. Age & Gender Prediction (oayu/age-gender-estimation)
age_gender_processor = AutoImageProcessor.from_pretrained("oayu/age-gender-estimation")
age_gender_model = AutoModelForImageClassification.from_pretrained("oayu/age-gender-estimation")
age_gender_model.to(device)
age_gender_model.eval()

# 4. Face Parsing (hila-chefer/face-parsing)
face_parsing_processor = AutoImageProcessor.from_pretrained("hila-chefer/face-parsing")
face_parsing_model = AutoModelForSemanticSegmentation.from_pretrained("hila-chefer/face-parsing")
face_parsing_model.to(device)
face_parsing_model.eval()

# 5. Deepfake Detection (microsoft/FaceForensics)
deepfake_processor = AutoImageProcessor.from_pretrained("microsoft/FaceForensics")
deepfake_model = AutoModelForImageClassification.from_pretrained("microsoft/FaceForensics")
deepfake_model.to(device)
deepfake_model.eval()
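# The dummy database above holds random vectors purely for demonstration. A minimal
# sketch of how a real identity could be enrolled with the same DINO embedding used
# below, assuming an already-cropped PIL face image; the helper name `enroll_identity`
# is illustrative and not part of the original script:
def enroll_identity(name, face_image):
    """Store the mean-pooled DINO embedding of a cropped face under `name`."""
    inputs = facial_recognition_extractor(face_image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = facial_recognition_model(**inputs)
    dummy_database[name] = outputs.last_hidden_state.mean(dim=1).squeeze()

# Example usage (hypothetical image path):
# enroll_identity("Carol", Image.open("carol_face.jpg"))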
""" frame = np.array(image) frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) frame_resized = cv2.resize(frame_bgr, DESIRED_SIZE) frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB) face_results = face_detection.process(frame_rgb) if face_results.detections: detection = face_results.detections[0] bbox = detection.location_data.relative_bounding_box h, w, _ = frame_rgb.shape x = int(bbox.xmin * w) y = int(bbox.ymin * h) box_w = int(bbox.width * w) box_h = int(bbox.height * h) face_crop = frame_rgb[y:y+box_h, x:x+box_w] face_image = Image.fromarray(face_crop) inputs = facial_recognition_extractor(face_image, return_tensors="pt").to(device) with torch.no_grad(): outputs = facial_recognition_model(**inputs) # Use mean pooling over the last hidden state to get an embedding vector embeddings = outputs.last_hidden_state.mean(dim=1).squeeze() # Compare against dummy database using cosine similarity best_score = -1 best_name = "Unknown" for name, db_emb in dummy_database.items(): cos_sim = torch.nn.functional.cosine_similarity(embeddings, db_emb, dim=0) if cos_sim > best_score: best_score = cos_sim best_name = name threshold = 0.7 # dummy threshold for identification if best_score > threshold: result = f"Identified as {best_name} (sim: {best_score:.2f})" else: result = f"No match found (best: {best_name}, sim: {best_score:.2f})" return face_crop, result else: return frame, "No face detected" def compute_emotion_detection(image): """ Detects a face, crops it, and classifies the facial expression. """ frame = np.array(image) frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) frame_resized = cv2.resize(frame_bgr, DESIRED_SIZE) frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB) face_results = face_detection.process(frame_rgb) if face_results.detections: detection = face_results.detections[0] bbox = detection.location_data.relative_bounding_box h, w, _ = frame_rgb.shape x = int(bbox.xmin * w) y = int(bbox.ymin * h) box_w = int(bbox.width * w) box_h = int(bbox.height * h) face_crop = frame_rgb[y:y+box_h, x:x+box_w] face_image = Image.fromarray(face_crop) inputs = emotion_processor(face_image, return_tensors="pt").to(device) with torch.no_grad(): outputs = emotion_model(**inputs) logits = outputs.logits pred = logits.argmax(-1).item() label = emotion_model.config.id2label[pred] return face_crop, f"Emotion: {label}" else: return frame, "No face detected" def compute_age_gender(image): """ Detects a face, crops it, and predicts the age & gender. """ frame = np.array(image) frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) frame_resized = cv2.resize(frame_bgr, DESIRED_SIZE) frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB) face_results = face_detection.process(frame_rgb) if face_results.detections: detection = face_results.detections[0] bbox = detection.location_data.relative_bounding_box h, w, _ = frame_rgb.shape x = int(bbox.xmin * w) y = int(bbox.ymin * h) box_w = int(bbox.width * w) box_h = int(bbox.height * h) face_crop = frame_rgb[y:y+box_h, x:x+box_w] face_image = Image.fromarray(face_crop) inputs = age_gender_processor(face_image, return_tensors="pt").to(device) with torch.no_grad(): outputs = age_gender_model(**inputs) logits = outputs.logits pred = logits.argmax(-1).item() label = age_gender_model.config.id2label[pred] return face_crop, f"Age & Gender: {label}" else: return frame, "No face detected" def compute_face_parsing(image): """ Runs face parsing (segmentation) on the provided image. 
""" image_pil = Image.fromarray(np.array(image)) inputs = face_parsing_processor(image_pil, return_tensors="pt").to(device) with torch.no_grad(): outputs = face_parsing_model(**inputs) logits = outputs.logits # shape: (batch, num_labels, H, W) segmentation = logits.argmax(dim=1)[0].cpu().numpy() # For visualization, we apply a color map to the segmentation mask. segmentation_norm = np.uint8(255 * segmentation / (segmentation.max() + 1e-5)) segmentation_color = cv2.applyColorMap(segmentation_norm, cv2.COLORMAP_JET) return segmentation_color, "Face Parsing completed" def compute_deepfake_detection(image): """ Runs deepfake detection on the image. """ image_pil = Image.fromarray(np.array(image)) inputs = deepfake_processor(image_pil, return_tensors="pt").to(device) with torch.no_grad(): outputs = deepfake_model(**inputs) logits = outputs.logits pred = logits.argmax(-1).item() label = deepfake_model.config.id2label[pred] return np.array(image), f"Deepfake Detection: {label}" # ----------------------------- # Analysis Functions (Wrapping Inference & Green Text) # ----------------------------- def analyze_facial_recognition(image): annotated_face, result = compute_facial_recognition(image) return annotated_face, f"
# -----------------------------
# Gradio Interface
# -----------------------------
analysis_functions = [
    (analyze_facial_recognition, "Facial Recognition"),
    (analyze_emotion_detection, "Emotion Detection"),
    (analyze_age_gender, "Age & Gender"),
    (analyze_face_parsing, "Face Parsing"),
    (analyze_deepfake_detection, "Deepfake Detection"),
]
# One sub-interface per analysis; each returns an image plus green HTML text.
interfaces = [
    gr.Interface(
        fn=fn,
        inputs=gr.Image(type="pil"),
        outputs=[gr.Image(label="Result"), gr.HTML(label="Analysis")],
    )
    for fn, _ in analysis_functions
]
tabbed_interface = gr.TabbedInterface(interfaces, [name for _, name in analysis_functions])

demo = gr.Blocks()
with demo:
    gr.Markdown(
        "Upload an image to run advanced face analysis using state-of-the-art Hugging Face models."
    )
    tabbed_interface.render()

if __name__ == "__main__":
    demo.launch()