import gradio as gr
import cv2
import numpy as np
import torch
from PIL import Image
import mediapipe as mp
from transformers import (
    AutoFeatureExtractor,
    AutoModel,
    AutoImageProcessor,
    AutoModelForImageClassification,
    AutoModelForSemanticSegmentation
)

# -----------------------------
# Configuration & Device Setup
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DESIRED_SIZE = (640, 480)  # (width, height), the order expected by cv2.resize

# -----------------------------
# Initialize MediaPipe Face Detection
# -----------------------------
mp_face_detection = mp.solutions.face_detection
face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5)

# -----------------------------
# Load New Models from Hugging Face
# -----------------------------
# 1. Facial Recognition & Identification (facebook/dino-vitb16)
facial_recognition_extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vitb16")
facial_recognition_model = AutoModel.from_pretrained("facebook/dino-vitb16")
facial_recognition_model.to(device)
facial_recognition_model.eval()

# Create a dummy database for demonstration (DINO ViT-B/16 embeddings have dimension 768)
dummy_database = {
    "Alice": torch.randn(768).to(device),
    "Bob": torch.randn(768).to(device)
}

# 2. Emotion Detection (nateraw/facial-expression-recognition)
emotion_processor = AutoImageProcessor.from_pretrained("nateraw/facial-expression-recognition")
emotion_model = AutoModelForImageClassification.from_pretrained("nateraw/facial-expression-recognition")
emotion_model.to(device)
emotion_model.eval()

# 3. Age & Gender Prediction (oayu/age-gender-estimation)
age_gender_processor = AutoImageProcessor.from_pretrained("oayu/age-gender-estimation")
age_gender_model = AutoModelForImageClassification.from_pretrained("oayu/age-gender-estimation")
age_gender_model.to(device)
age_gender_model.eval()

# 4. Face Parsing (hila-chefer/face-parsing)
face_parsing_processor = AutoImageProcessor.from_pretrained("hila-chefer/face-parsing")
face_parsing_model = AutoModelForSemanticSegmentation.from_pretrained("hila-chefer/face-parsing")
face_parsing_model.to(device)
face_parsing_model.eval()

# 5. Deepfake Detection (microsoft/FaceForensics)
deepfake_processor = AutoImageProcessor.from_pretrained("microsoft/FaceForensics")
deepfake_model = AutoModelForImageClassification.from_pretrained("microsoft/FaceForensics")
deepfake_model.to(device)
deepfake_model.eval()

# -----------------------------
# Helper Functions for New Inferences
# -----------------------------
def compute_facial_recognition(image):
    """
    Detects a face using MediaPipe, crops it, and computes its embedding with DINO-ViT.
    Compares the embedding against a dummy database to "identify" the person.
    """
    frame = np.array(image)
    frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    frame_resized = cv2.resize(frame_bgr, DESIRED_SIZE)
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    face_results = face_detection.process(frame_rgb)
    if face_results.detections:
        detection = face_results.detections[0]
        bbox = detection.location_data.relative_bounding_box
        h, w, _ = frame_rgb.shape
        # Clamp to the image bounds; MediaPipe can return slightly negative coordinates.
        x = max(0, int(bbox.xmin * w))
        y = max(0, int(bbox.ymin * h))
        box_w = int(bbox.width * w)
        box_h = int(bbox.height * h)
        face_crop = frame_rgb[y:y + box_h, x:x + box_w]
        face_image = Image.fromarray(face_crop)
        inputs = facial_recognition_extractor(face_image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = facial_recognition_model(**inputs)
        # Use mean pooling over the last hidden state to get a single embedding vector.
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
        # Compare against the dummy database using cosine similarity.
        best_score = -1.0
        best_name = "Unknown"
        for name, db_emb in dummy_database.items():
            cos_sim = torch.nn.functional.cosine_similarity(embeddings, db_emb, dim=0).item()
            if cos_sim > best_score:
                best_score = cos_sim
                best_name = name
        threshold = 0.7  # dummy threshold for identification
        if best_score > threshold:
            result = f"Identified as {best_name} (sim: {best_score:.2f})"
        else:
            result = f"No match found (best: {best_name}, sim: {best_score:.2f})"
        return face_crop, result
    else:
        return frame, "No face detected"


def compute_emotion_detection(image):
    """
    Detects a face, crops it, and classifies the facial expression.
    """
    frame = np.array(image)
    frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    frame_resized = cv2.resize(frame_bgr, DESIRED_SIZE)
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    face_results = face_detection.process(frame_rgb)
    if face_results.detections:
        detection = face_results.detections[0]
        bbox = detection.location_data.relative_bounding_box
        h, w, _ = frame_rgb.shape
        x = max(0, int(bbox.xmin * w))
        y = max(0, int(bbox.ymin * h))
        box_w = int(bbox.width * w)
        box_h = int(bbox.height * h)
        face_crop = frame_rgb[y:y + box_h, x:x + box_w]
        face_image = Image.fromarray(face_crop)
        inputs = emotion_processor(face_image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = emotion_model(**inputs)
        logits = outputs.logits
        pred = logits.argmax(-1).item()
        label = emotion_model.config.id2label[pred]
        return face_crop, f"Emotion: {label}"
    else:
        return frame, "No face detected"


def compute_age_gender(image):
    """
    Detects a face, crops it, and predicts age & gender.
    """
    frame = np.array(image)
    frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    frame_resized = cv2.resize(frame_bgr, DESIRED_SIZE)
    frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
    face_results = face_detection.process(frame_rgb)
    if face_results.detections:
        detection = face_results.detections[0]
        bbox = detection.location_data.relative_bounding_box
        h, w, _ = frame_rgb.shape
        x = max(0, int(bbox.xmin * w))
        y = max(0, int(bbox.ymin * h))
        box_w = int(bbox.width * w)
        box_h = int(bbox.height * h)
        face_crop = frame_rgb[y:y + box_h, x:x + box_w]
        face_image = Image.fromarray(face_crop)
        inputs = age_gender_processor(face_image, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = age_gender_model(**inputs)
        logits = outputs.logits
        pred = logits.argmax(-1).item()
        label = age_gender_model.config.id2label[pred]
        return face_crop, f"Age & Gender: {label}"
    else:
        return frame, "No face detected"


def compute_face_parsing(image):
    """
    Runs face parsing (segmentation) on the provided image.
    """
    image_pil = Image.fromarray(np.array(image))
    inputs = face_parsing_processor(image_pil, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = face_parsing_model(**inputs)
    logits = outputs.logits  # shape: (batch, num_labels, H, W)
    segmentation = logits.argmax(dim=1)[0].cpu().numpy()
    # For visualization, apply a color map to the segmentation mask.
    segmentation_norm = np.uint8(255 * segmentation / (segmentation.max() + 1e-5))
    segmentation_color = cv2.applyColorMap(segmentation_norm, cv2.COLORMAP_JET)
    # applyColorMap returns BGR; convert to RGB so Gradio displays the colors correctly.
    segmentation_color = cv2.cvtColor(segmentation_color, cv2.COLOR_BGR2RGB)
    return segmentation_color, "Face Parsing completed"


def compute_deepfake_detection(image):
    """
    Runs deepfake detection on the image.
    """
    image_pil = Image.fromarray(np.array(image))
    inputs = deepfake_processor(image_pil, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = deepfake_model(**inputs)
    logits = outputs.logits
    pred = logits.argmax(-1).item()
    label = deepfake_model.config.id2label[pred]
    return np.array(image), f"Deepfake Detection: {label}"
# -----------------------------
# Analysis Functions (Wrapping Inference & Green Text)
# -----------------------------
def analyze_facial_recognition(image):
    annotated_face, result = compute_facial_recognition(image)
    return annotated_face, f"<div style='color: #32CD32;'>Facial Recognition: {result}</div>"


def analyze_emotion_detection(image):
    face_crop, result = compute_emotion_detection(image)
    return face_crop, f"<div style='color: #32CD32;'>{result}</div>"


def analyze_age_gender(image):
    face_crop, result = compute_age_gender(image)
    return face_crop, f"<div style='color: #32CD32;'>{result}</div>"


def analyze_face_parsing(image):
    segmentation, result = compute_face_parsing(image)
    return segmentation, f"<div style='color: #32CD32;'>{result}</div>"


def analyze_deepfake_detection(image):
    output, result = compute_deepfake_detection(image)
    return output, f"<div style='color: #32CD32;'>{result}</div>"

# -----------------------------
# Custom CSS (All Text in Green)
# -----------------------------
custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@400;700&display=swap');

body {
    background-color: #0e0e0e;
    font-family: 'Orbitron', sans-serif;
    margin: 0;
    padding: 0;
    color: #32CD32;
}
.gradio-container {
    background: linear-gradient(135deg, #1a1a1a, #333333);
    border: 2px solid #32CD32;
    box-shadow: 0 0 15px #32CD32;
    border-radius: 10px;
    padding: 20px;
    max-width: 1200px;
    margin: auto;
}
.gradio-title, .gradio-description, .tab-item, .tab-item * {
    color: #32CD32 !important;
    text-shadow: 0 0 10px #32CD32;
}
input, button, .output {
    border: 1px solid #32CD32;
    box-shadow: 0 0 8px #32CD32;
    color: #32CD32;
}
"""

# -----------------------------
# Create Gradio Interfaces for New Models
# -----------------------------
facial_recognition_interface = gr.Interface(
    fn=analyze_facial_recognition,
    inputs=gr.Image(label="Upload a Face Image for Facial Recognition"),
    outputs=[
        gr.Image(type="numpy", label="Cropped Face / Embedding Visualization"),
        gr.HTML(label="Facial Recognition Result")
    ],
    title="Facial Recognition & Identification",
    description="Extracts facial embeddings using facebook/dino-vitb16 and identifies the face by comparing against a dummy database.",
    live=False
)

emotion_interface = gr.Interface(
    fn=analyze_emotion_detection,
    inputs=gr.Image(label="Upload a Face Image for Emotion Detection"),
    outputs=[
        gr.Image(type="numpy", label="Cropped Face"),
        gr.HTML(label="Emotion Detection")
    ],
    title="Emotion Detection",
    description="Classifies the facial expression using nateraw/facial-expression-recognition.",
    live=False
)

age_gender_interface = gr.Interface(
    fn=analyze_age_gender,
    inputs=gr.Image(label="Upload a Face Image for Age & Gender Prediction"),
    outputs=[
        gr.Image(type="numpy", label="Cropped Face"),
        gr.HTML(label="Age & Gender Prediction")
    ],
    title="Age & Gender Prediction",
    description="Predicts age and gender from the face using oayu/age-gender-estimation.",
    live=False
)

face_parsing_interface = gr.Interface(
    fn=analyze_face_parsing,
    inputs=gr.Image(label="Upload a Face Image for Face Parsing"),
    outputs=[
        gr.Image(type="numpy", label="Segmentation Overlay"),
        gr.HTML(label="Face Parsing")
    ],
    title="Face Parsing",
    description="Segments face regions (eyes, nose, lips, hair, etc.) using hila-chefer/face-parsing.",
    live=False
)

deepfake_interface = gr.Interface(
    fn=analyze_deepfake_detection,
    inputs=gr.Image(label="Upload an Image for Deepfake Detection"),
    outputs=[
        gr.Image(type="numpy", label="Input Image"),
        gr.HTML(label="Deepfake Detection")
    ],
    title="Deepfake Detection",
    description="Detects manipulated or deepfake images using microsoft/FaceForensics.",
    live=False
)

# -----------------------------
# Create a Tabbed Interface
# -----------------------------
tabbed_interface = gr.TabbedInterface(
    interface_list=[
        facial_recognition_interface,
        emotion_interface,
        age_gender_interface,
        face_parsing_interface,
        deepfake_interface
    ],
    tab_names=[
        "Facial Recognition",
        "Emotion Detection",
        "Age & Gender",
        "Face Parsing",
        "Deepfake Detection"
    ]
)
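# -----------------------------
# (Optional) Enrolling Real Identities
# -----------------------------
# The dummy_database defined above holds random vectors, so any identification
# result is illustrative only. Before launching, one could replace those entries
# with embeddings of real faces using the same DINO-ViT pipeline. The helper
# below is a minimal sketch added for illustration (its name and the image paths
# are assumptions, not part of the original app).
def compute_embedding(pil_image):
    """Return a mean-pooled DINO-ViT embedding for a PIL image."""
    inputs = facial_recognition_extractor(pil_image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = facial_recognition_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Example usage (hypothetical file paths):
# dummy_database = {
#     "Alice": compute_embedding(Image.open("known_faces/alice.jpg").convert("RGB")),
#     "Bob": compute_embedding(Image.open("known_faces/bob.jpg").convert("RGB")),
# }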
# -----------------------------
# Wrap in a Blocks Layout & Launch
# -----------------------------
demo = gr.Blocks(css=custom_css)
with demo:
    gr.Markdown("<h1 style='text-align: center; color: #32CD32;'>Multi-Analysis Face App</h1>")
    gr.Markdown(
        "<p style='text-align: center; color: #32CD32;'>"
        "Upload an image to run advanced face analysis using state-of-the-art Hugging Face models."
        "</p>"
    )
    tabbed_interface.render()

if __name__ == "__main__":
    demo.launch()