Spaces:

ddriscoll
/

SOC3242-01_Group_3_Interactive

Sleeping

David Driscoll

Model overhaul

107dab2 5 months ago

13 kB

	import gradio as gr
	import cv2
	import numpy as np
	import torch
	from PIL import Image
	import mediapipe as mp

	from transformers import (
	AutoFeatureExtractor,
	AutoModel,
	AutoImageProcessor,
	AutoModelForImageClassification,
	AutoModelForSemanticSegmentation
	)

	# -----------------------------
	# Configuration & Device Setup
	# -----------------------------
	# Prefer the GPU when CUDA is usable; every model below is moved to this device.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	# Target size handed to cv2.resize (which takes (width, height)) before face detection.
	DESIRED_SIZE = (640, 480)

	# -----------------------------
	# Initialize Mediapipe Face Detection
	# -----------------------------
	mp_face_detection = mp.solutions.face_detection
	# One shared detector instance, reused by every face-cropping function in this file.
	face_detection = mp_face_detection.FaceDetection(min_detection_confidence=0.5)

	# -----------------------------
	# Load New Models from Hugging Face
	# -----------------------------


	def _load_pretrained(processor_cls, model_cls, repo_id):
	    """Load the preprocessor and the model stored under ``repo_id``.

	    Args:
	        processor_cls: Auto class used for the preprocessor (``from_pretrained`` is called on it).
	        model_cls: Auto class used for the model (``from_pretrained`` is called on it).
	        repo_id: Hugging Face Hub repository id passed to both ``from_pretrained`` calls.

	    Returns:
	        ``(processor, model)``; the model has been moved to ``device`` and put in eval mode.
	    """
	    processor = processor_cls.from_pretrained(repo_id)
	    model = model_cls.from_pretrained(repo_id)
	    model.to(device)
	    model.eval()
	    return processor, model


	# 1. Facial Recognition & Identification (facebook/dino-vitb16)
	facial_recognition_extractor, facial_recognition_model = _load_pretrained(
	    AutoFeatureExtractor, AutoModel, "facebook/dino-vitb16"
	)

	# Dummy database for demonstration: random vectors, not real face embeddings
	# (embedding dimension 768 assumed).
	dummy_database = {
	    person: torch.randn(768).to(device) for person in ("Alice", "Bob")
	}

	# 2. Emotion Detection (nateraw/facial-expression-recognition)
	emotion_processor, emotion_model = _load_pretrained(
	    AutoImageProcessor,
	    AutoModelForImageClassification,
	    "nateraw/facial-expression-recognition",
	)

	# 3. Age & Gender Prediction (oayu/age-gender-estimation)
	age_gender_processor, age_gender_model = _load_pretrained(
	    AutoImageProcessor,
	    AutoModelForImageClassification,
	    "oayu/age-gender-estimation",
	)

	# 4. Face Parsing (hila-chefer/face-parsing)
	face_parsing_processor, face_parsing_model = _load_pretrained(
	    AutoImageProcessor,
	    AutoModelForSemanticSegmentation,
	    "hila-chefer/face-parsing",
	)

	# 5. Deepfake Detection (microsoft/FaceForensics)
	deepfake_processor, deepfake_model = _load_pretrained(
	    AutoImageProcessor,
	    AutoModelForImageClassification,
	    "microsoft/FaceForensics",
	)

	# -----------------------------
	# Helper Functions for New Inferences
	# -----------------------------

	def _crop_first_face(image):
	    """Detect faces with MediaPipe and crop the first detection.

	    The image is resized to ``DESIRED_SIZE`` before detection, so the crop is
	    taken from the resized frame, not from the original pixels.

	    Args:
	        image: RGB (or RGBA) image in any form accepted by ``np.array``.

	    Returns:
	        ``(face_crop, frame)``. ``frame`` is the input as a numpy array at its
	        original size. ``face_crop`` is the RGB crop, or ``None`` when nothing
	        was detected or the clamped box is empty.
	    """
	    frame = np.array(image)
	    rgb = frame
	    if rgb.ndim == 3 and rgb.shape[2] == 4:
	        # The detector needs 3-channel RGB; drop a possible alpha channel.
	        rgb = cv2.cvtColor(rgb, cv2.COLOR_RGBA2RGB)
	    # Resizing does not depend on channel order, so no RGB->BGR->RGB round
	    # trip is needed around it.
	    frame_rgb = cv2.resize(rgb, DESIRED_SIZE)

	    detections = face_detection.process(frame_rgb).detections
	    if not detections:
	        return None, frame

	    bbox = detections[0].location_data.relative_bounding_box
	    h, w = frame_rgb.shape[:2]
	    left = int(bbox.xmin * w)
	    top = int(bbox.ymin * h)
	    right = left + int(bbox.width * w)
	    bottom = top + int(bbox.height * h)
	    # Relative box coordinates are not guaranteed to lie inside [0, 1] (faces
	    # at the image border). A negative start index would make numpy slice
	    # from the opposite edge, so clamp the box to the frame first.
	    left, top = max(left, 0), max(top, 0)
	    right, bottom = min(right, w), min(bottom, h)
	    if right <= left or bottom <= top:
	        # Degenerate box: treat it like a missed detection rather than
	        # feeding an empty array to PIL.
	        return None, frame
	    return frame_rgb[top:bottom, left:right], frame


	def _classify_first_face(image, processor, model, caption):
	    """Crop the first detected face and label it with an image classifier.

	    Args:
	        image: Input image (must not be ``None``).
	        processor: Preprocessor matching ``model``.
	        model: Image-classification model exposing ``logits`` and ``config.id2label``.
	        caption: Text placed before the predicted label in the result string.

	    Returns:
	        ``(face_crop, "<caption>: <label>")`` or ``(frame, "No face detected")``.
	    """
	    face_crop, frame = _crop_first_face(image)
	    if face_crop is None:
	        return frame, "No face detected"

	    inputs = processor(Image.fromarray(face_crop), return_tensors="pt").to(device)
	    with torch.no_grad():
	        logits = model(**inputs).logits
	    label = model.config.id2label[logits.argmax(-1).item()]
	    return face_crop, f"{caption}: {label}"


	def compute_facial_recognition(image):
	    """Embed the first detected face and match it against ``dummy_database``.

	    The embedding is the mean of the model's last hidden state over the token
	    axis; candidates are ranked by cosine similarity and accepted above 0.7.

	    Args:
	        image: Input image, or ``None`` when nothing was uploaded.

	    Returns:
	        ``(image_array_or_None, message)``: the face crop and the match
	        message; the unmodified input frame and "No face detected"; or
	        ``(None, "No image provided")``.
	    """
	    if image is None:
	        return None, "No image provided"

	    face_crop, frame = _crop_first_face(image)
	    if face_crop is None:
	        return frame, "No face detected"

	    inputs = facial_recognition_extractor(
	        Image.fromarray(face_crop), return_tensors="pt"
	    ).to(device)
	    with torch.no_grad():
	        outputs = facial_recognition_model(**inputs)
	    # Mean pooling over the last hidden state gives one embedding vector.
	    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze(0)

	    # Compare against the dummy database using cosine similarity.
	    best_name, best_score = "Unknown", -1.0
	    for name, db_emb in dummy_database.items():
	        score = torch.nn.functional.cosine_similarity(embeddings, db_emb, dim=0).item()
	        if score > best_score:
	            best_name, best_score = name, score

	    threshold = 0.7  # dummy threshold for identification
	    if best_score > threshold:
	        result = f"Identified as {best_name} (sim: {best_score:.2f})"
	    else:
	        result = f"No match found (best: {best_name}, sim: {best_score:.2f})"
	    return face_crop, result


	def compute_emotion_detection(image):
	    """Detect a face, crop it, and classify the facial expression.

	    Args:
	        image: Input image, or ``None`` when nothing was uploaded.

	    Returns:
	        ``(face_crop, "Emotion: <label>")``, ``(frame, "No face detected")``,
	        or ``(None, "No image provided")``.
	    """
	    if image is None:
	        return None, "No image provided"
	    return _classify_first_face(image, emotion_processor, emotion_model, "Emotion")


	def compute_age_gender(image):
	    """Detect a face, crop it, and predict the age & gender label.

	    Args:
	        image: Input image, or ``None`` when nothing was uploaded.

	    Returns:
	        ``(face_crop, "Age & Gender: <label>")``, ``(frame, "No face detected")``,
	        or ``(None, "No image provided")``.
	    """
	    if image is None:
	        return None, "No image provided"
	    return _classify_first_face(
	        image, age_gender_processor, age_gender_model, "Age & Gender"
	    )

	def compute_face_parsing(image):
	    """Run face parsing (semantic segmentation) and return a colorized mask.

	    Args:
	        image: Input RGB image, or ``None`` when nothing was uploaded.

	    Returns:
	        ``(mask_rgb, "Face Parsing completed")`` where ``mask_rgb`` is an RGB
	        uint8 array with the same height and width as the input, or
	        ``(None, "No image provided")``.
	    """
	    if image is None:
	        return None, "No image provided"

	    image_pil = Image.fromarray(np.array(image))
	    inputs = face_parsing_processor(image_pil, return_tensors="pt").to(device)
	    with torch.no_grad():
	        logits = face_parsing_model(**inputs).logits  # shape: (batch, num_labels, H, W)

	    # NOTE(review): segmentation heads commonly emit logits at a reduced
	    # resolution (e.g. SegFormer) - confirm for this checkpoint. Upsampling to
	    # the input size keeps the mask aligned with the uploaded image either way.
	    logits = torch.nn.functional.interpolate(
	        logits,
	        size=image_pil.size[::-1],  # PIL reports (width, height); torch wants (H, W)
	        mode="bilinear",
	        align_corners=False,
	    )
	    segmentation = logits.argmax(dim=1)[0].cpu().numpy()

	    # For visualization, spread the label ids over 0..255 and apply a color map.
	    segmentation_norm = np.uint8(255 * segmentation / (segmentation.max() + 1e-5))
	    colored_bgr = cv2.applyColorMap(segmentation_norm, cv2.COLORMAP_JET)
	    # applyColorMap produces BGR, while the Gradio image output is RGB.
	    segmentation_color = cv2.cvtColor(colored_bgr, cv2.COLOR_BGR2RGB)
	    return segmentation_color, "Face Parsing completed"

	def compute_deepfake_detection(image):
	    """Classify the whole image with the deepfake-detection model.

	    No face cropping is done here; the full image is passed to the processor.

	    Args:
	        image: Input image, or ``None`` when nothing was uploaded.

	    Returns:
	        ``(image_array, "Deepfake Detection: <label>")`` with the input echoed
	        back as a numpy array, or ``(None, "No image provided")``.
	    """
	    if image is None:
	        # Submitting without an upload would otherwise crash in Image.fromarray.
	        return None, "No image provided"

	    image_array = np.array(image)
	    inputs = deepfake_processor(
	        Image.fromarray(image_array), return_tensors="pt"
	    ).to(device)
	    with torch.no_grad():
	        logits = deepfake_model(**inputs).logits
	    label = deepfake_model.config.id2label[logits.argmax(-1).item()]
	    return image_array, f"Deepfake Detection: {label}"

	# -----------------------------
	# Analysis Functions (Wrapping Inference & Green Text)
	# -----------------------------

	def _lime_html(text):
	    """Wrap ``text`` in the lime-colored <div> shown in the HTML output panes."""
	    return f"<div style='color: lime !important;'>{text}</div>"


	def analyze_facial_recognition(image):
	    """Run facial recognition; return (image, lime HTML prefixed with the task name)."""
	    picture, message = compute_facial_recognition(image)
	    return picture, _lime_html(f"Facial Recognition: {message}")


	def analyze_emotion_detection(image):
	    """Run emotion detection; return (image, lime HTML message)."""
	    picture, message = compute_emotion_detection(image)
	    return picture, _lime_html(message)


	def analyze_age_gender(image):
	    """Run age & gender prediction; return (image, lime HTML message)."""
	    picture, message = compute_age_gender(image)
	    return picture, _lime_html(message)


	def analyze_face_parsing(image):
	    """Run face parsing; return (segmentation image, lime HTML message)."""
	    picture, message = compute_face_parsing(image)
	    return picture, _lime_html(message)


	def analyze_deepfake_detection(image):
	    """Run deepfake detection; return (image, lime HTML message)."""
	    picture, message = compute_deepfake_detection(image)
	    return picture, _lime_html(message)

	# -----------------------------
	# Custom CSS (All Text in Green)
	# -----------------------------
	# Stylesheet passed to gr.Blocks(css=...) where the app layout is created.
	# It imports the Orbitron web font, sets a dark page background, and applies
	# the lime-green (#32CD32) text, border, and glow used throughout the UI.
	custom_css = """
	@import url('https://fonts.googleapis.com/css2?family=Orbitron:wght@400;700&display=swap');
	body {
	background-color: #0e0e0e;
	font-family: 'Orbitron', sans-serif;
	margin: 0;
	padding: 0;
	color: #32CD32;
	}
	.gradio-container {
	background: linear-gradient(135deg, #1a1a1a, #333333);
	border: 2px solid #32CD32;
	box-shadow: 0 0 15px #32CD32;
	border-radius: 10px;
	padding: 20px;
	max-width: 1200px;
	margin: auto;
	}
	.gradio-title, .gradio-description, .tab-item, .tab-item * {
	color: #32CD32 !important;
	text-shadow: 0 0 10px #32CD32;
	}
	input, button, .output {
	border: 1px solid #32CD32;
	box-shadow: 0 0 8px #32CD32;
	color: #32CD32;
	}
	"""

	# -----------------------------
	# Create Gradio Interfaces for New Models
	# -----------------------------
	def _build_interface(fn, upload_label, image_label, html_label, title, description):
	    """Create one image-in, (image, HTML)-out ``gr.Interface`` for a single analysis tab.

	    Args:
	        fn: Callback taking the uploaded image and returning ``(image, html)``.
	        upload_label: Label of the input image component.
	        image_label: Label of the output image component.
	        html_label: Label of the output HTML component.
	        title: Interface title.
	        description: Interface description text.
	    """
	    return gr.Interface(
	        fn=fn,
	        inputs=gr.Image(label=upload_label),
	        outputs=[
	            gr.Image(type="numpy", label=image_label),
	            gr.HTML(label=html_label),
	        ],
	        title=title,
	        description=description,
	        live=False,
	    )


	facial_recognition_interface = _build_interface(
	    analyze_facial_recognition,
	    "Upload a Face Image for Facial Recognition",
	    "Cropped Face / Embedding Visualization",
	    "Facial Recognition Result",
	    "Facial Recognition & Identification",
	    "Extracts facial embeddings using facebook/dino-vitb16 and identifies the face by comparing against a dummy database.",
	)

	emotion_interface = _build_interface(
	    analyze_emotion_detection,
	    "Upload a Face Image for Emotion Detection",
	    "Cropped Face",
	    "Emotion Detection",
	    "Emotion Detection",
	    "Classifies the facial expression using nateraw/facial-expression-recognition.",
	)

	age_gender_interface = _build_interface(
	    analyze_age_gender,
	    "Upload a Face Image for Age & Gender Prediction",
	    "Cropped Face",
	    "Age & Gender Prediction",
	    "Age & Gender Prediction",
	    "Predicts age and gender from the face using oayu/age-gender-estimation.",
	)

	face_parsing_interface = _build_interface(
	    analyze_face_parsing,
	    "Upload a Face Image for Face Parsing",
	    "Segmentation Overlay",
	    "Face Parsing",
	    "Face Parsing",
	    "Segments face regions (eyes, nose, lips, hair, etc.) using hila-chefer/face-parsing.",
	)

	deepfake_interface = _build_interface(
	    analyze_deepfake_detection,
	    "Upload an Image for Deepfake Detection",
	    "Input Image",
	    "Deepfake Detection",
	    "Deepfake Detection",
	    "Detects manipulated or deepfake images using microsoft/FaceForensics.",
	)

	# -----------------------------
	# Create a Tabbed Interface
	# -----------------------------
	# Each tab name is kept next to its interface so the two lists handed to
	# gr.TabbedInterface cannot drift out of order.
	_TABS = [
	    ("Facial Recognition", facial_recognition_interface),
	    ("Emotion Detection", emotion_interface),
	    ("Age & Gender", age_gender_interface),
	    ("Face Parsing", face_parsing_interface),
	    ("Deepfake Detection", deepfake_interface),
	]

	tabbed_interface = gr.TabbedInterface(
	    interface_list=[interface for _, interface in _TABS],
	    tab_names=[tab_name for tab_name, _ in _TABS],
	)

	# -----------------------------
	# Wrap in a Blocks Layout & Launch
	# -----------------------------
	# Page heading and tagline rendered above the tabs.
	_TITLE_HTML = "<h1 class='gradio-title' style='color: #32CD32;'>Multi-Analysis Face App</h1>"
	_TAGLINE_HTML = "<p class='gradio-description' style='color: #32CD32;'>Upload an image to run advanced face analysis using state-of-the-art Hugging Face models.</p>"

	# Top-level layout: the custom stylesheet, the two headings, then the tabbed
	# interface rendered inside this Blocks context.
	with gr.Blocks(css=custom_css) as demo:
	    gr.Markdown(_TITLE_HTML)
	    gr.Markdown(_TAGLINE_HTML)
	    tabbed_interface.render()

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()