""" | |
Ultra Supreme Analyzer - Complete Multi-Model Analysis | |
Integrates multiple specialized models for comprehensive image analysis | |
""" | |
import re
import logging

import spaces
import torch
import cv2
import numpy as np
from typing import Dict, List, Any, Tuple, Optional
from PIL import Image
# Deep learning models for specialized analysis
try:
    from deepface import DeepFace
    DEEPFACE_AVAILABLE = True
except Exception:
    DEEPFACE_AVAILABLE = False

try:
    import mediapipe as mp
    MEDIAPIPE_AVAILABLE = True
except Exception:
    MEDIAPIPE_AVAILABLE = False

try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False
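
# The availability flags above let the analyzer degrade gracefully: each
# optional model is only wired in when its package imported successfully,
# so the module still loads on minimal environments.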
from constants import (
    FORBIDDEN_ELEMENTS, MICRO_AGE_INDICATORS, ULTRA_FACIAL_ANALYSIS,
    EMOTION_MICRO_EXPRESSIONS, CULTURAL_RELIGIOUS_ULTRA, CLOTHING_ACCESSORIES_ULTRA,
    ENVIRONMENTAL_ULTRA_ANALYSIS, POSE_BODY_LANGUAGE_ULTRA, COMPOSITION_PHOTOGRAPHY_ULTRA,
    TECHNICAL_PHOTOGRAPHY_ULTRA, QUALITY_DESCRIPTORS_ULTRA, GENDER_INDICATORS
)

logger = logging.getLogger(__name__)

class UltraSupremeAnalyzer:
    """Complete analyzer with multiple specialized models"""

    def __init__(self):
        self.face_cascade = None
        self.pose_detector = None
        self.emotion_classifier = None
        self.scene_classifier = None
        self.models_initialized = False

    def _initialize_models(self):
        """Lazy initialization of models"""
        if self.models_initialized:
            return

        try:
            # OpenCV face detector (lightweight)
            self.face_cascade = cv2.CascadeClassifier(
                cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            )

            # MediaPipe pose detector
            if MEDIAPIPE_AVAILABLE:
                self.mp_pose = mp.solutions.pose
                self.pose_detector = self.mp_pose.Pose(
                    static_image_mode=True,
                    min_detection_confidence=0.5
                )

            # Emotion classifier from transformers
            if TRANSFORMERS_AVAILABLE:
                self.emotion_classifier = pipeline(
                    "image-classification",
                    model="dima806/facial_emotions_image_detection"
                )

            self.models_initialized = True
            logger.info("Additional analysis models initialized")
        except Exception as e:
            logger.error(f"Error initializing models: {e}")
            self.models_initialized = False
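
    # Lazy initialization keeps the heavy model loading out of __init__, so
    # constructing the analyzer stays cheap; models are only loaded on the
    # first call to ultra_supreme_analysis.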

    def ultra_supreme_analysis(self, image: Any, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]:
        """Complete analysis using all available models"""
        # Initialize models if needed
        self._initialize_models()

        # Start with CLIP analysis
        clip_analysis = self._parse_clip_results(clip_fast, clip_classic, clip_best)

        # Convert image for processing: keep a PIL (RGB) copy and a BGR array for OpenCV
        if isinstance(image, Image.Image):
            img_array = np.array(image)
            img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
        else:
            img_bgr = image
            image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # Initialize complete analysis structure
        analysis = {
            "clip_fast": clip_fast,
            "clip_classic": clip_classic,
            "clip_best": clip_best,
            "full_description": f"{clip_fast} {clip_classic} {clip_best}",
            "demographic": {
                "age_category": None,
                "age_confidence": 0,
                "gender": None,
                "gender_confidence": 0,
                "cultural_religious": []
            },
            "facial_ultra": {
                "eyes": [],
                "eyebrows": [],
                "nose": [],
                "mouth": [],
                "facial_hair": [],
                "skin": [],
                "structure": [],
                "face_count": 0,
                "face_locations": []
            },
            "emotional_state": {
                "primary_emotion": None,
                "emotion_confidence": 0,
                "emotion_distribution": {},
                "micro_expressions": [],
                "overall_demeanor": []
            },
            "clothing_accessories": {
                "headwear": [],
                "eyewear": [],
                "clothing": [],
                "accessories": [],
                "style": []
            },
            "environmental": {
                "setting_type": None,
                "specific_location": None,
                "lighting_analysis": [],
                "atmosphere": [],
                "objects": []
            },
            "pose_composition": {
                "body_language": [],
                "head_position": [],
                "eye_contact": [],
                "posture": [],
                "gesture": [],
                "pose_confidence": 0
            },
            "technical_analysis": {
                "shot_type": None,
                "angle": None,
                "lighting_setup": None,
                "composition": [],
                "suggested_equipment": {}
            },
            "intelligence_metrics": {
                "total_features_detected": 0,
                "analysis_depth_score": 0,
                "cultural_awareness_score": 0,
                "technical_optimization_score": 0,
                "model_confidence_average": 0
            }
        }

        # Merge CLIP analysis
        analysis = self._merge_analysis(analysis, clip_analysis)

        # Face detection and analysis
        face_analysis = self._analyze_faces(img_bgr, image)
        analysis = self._merge_analysis(analysis, face_analysis)

        # Pose analysis
        if MEDIAPIPE_AVAILABLE:
            pose_analysis = self._analyze_pose(image)
            analysis = self._merge_analysis(analysis, pose_analysis)

        # Emotion analysis
        if TRANSFORMERS_AVAILABLE and analysis["facial_ultra"]["face_count"] > 0:
            emotion_analysis = self._analyze_emotions(image)
            analysis = self._merge_analysis(analysis, emotion_analysis)

        # Scene and environment analysis
        scene_analysis = self._analyze_scene(clip_analysis)
        analysis = self._merge_analysis(analysis, scene_analysis)

        # Calculate intelligence metrics
        analysis = self._calculate_intelligence_metrics(analysis)

        return analysis

    def _parse_clip_results(self, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]:
        """Parse CLIP results for structured information"""
        combined_text = f"{clip_fast} {clip_classic} {clip_best}".lower()

        analysis = {
            # Keep the combined text so _analyze_scene can reuse it for lighting/atmosphere keywords
            "combined_text": combined_text,
            "demographic": {},
            "facial_ultra": {},
            "emotional_state": {},
            "clothing_accessories": {},
            "environmental": {},
            "pose_composition": {},
            "technical_analysis": {}
        }

        # Gender detection
        for gender, indicators in GENDER_INDICATORS.items():
            if any(indicator in combined_text for indicator in indicators):
                analysis["demographic"]["gender"] = gender
                analysis["demographic"]["gender_confidence"] = 0.8
                break

        # Age detection
        for age_category, indicators in MICRO_AGE_INDICATORS.items():
            if any(indicator in combined_text for indicator in indicators):
                analysis["demographic"]["age_category"] = age_category
                analysis["demographic"]["age_confidence"] = 0.7
                break

        # Facial features
        for feature_type, features in ULTRA_FACIAL_ANALYSIS.items():
            if isinstance(features, dict):
                # Accumulate matches across all sub-categories of this feature type
                for sub_type, sub_features in features.items():
                    found = [f for f in sub_features if f in combined_text]
                    if found:
                        analysis["facial_ultra"].setdefault(feature_type, []).extend(found)
            else:
                found = [f for f in features if f in combined_text]
                if found:
                    analysis["facial_ultra"][feature_type] = found

        # Emotions
        all_emotions = EMOTION_MICRO_EXPRESSIONS["primary_emotions"] + EMOTION_MICRO_EXPRESSIONS["complex_emotions"]
        found_emotions = [e for e in all_emotions if e in combined_text]
        if found_emotions:
            analysis["emotional_state"]["primary_emotion"] = found_emotions[0]
            analysis["emotional_state"]["micro_expressions"] = found_emotions

        # Environment
        for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["indoor_settings"].items():
            if any(s in combined_text for s in settings):
                analysis["environmental"]["setting_type"] = f"indoor_{setting_type}"
                break
        for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["outdoor_settings"].items():
            if any(s in combined_text for s in settings):
                analysis["environmental"]["setting_type"] = f"outdoor_{setting_type}"
                break

        # Technical analysis
        for shot_type in COMPOSITION_PHOTOGRAPHY_ULTRA["shot_types"]:
            if shot_type in combined_text:
                analysis["technical_analysis"]["shot_type"] = shot_type
                break

        return analysis

    def _analyze_faces(self, img_bgr: np.ndarray, img_pil: Image.Image) -> Dict[str, Any]:
        """Analyze faces using OpenCV and DeepFace"""
        analysis = {"facial_ultra": {}, "demographic": {}, "emotional_state": {}}

        # OpenCV face detection (guard against a cascade that failed to load)
        faces = []
        if self.face_cascade is not None and not self.face_cascade.empty():
            gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
            faces = self.face_cascade.detectMultiScale(gray, 1.1, 4)
        analysis["facial_ultra"]["face_count"] = len(faces)
        analysis["facial_ultra"]["face_locations"] = faces.tolist() if len(faces) > 0 else []

        # DeepFace analysis for the first detected face
        if DEEPFACE_AVAILABLE and len(faces) > 0:
            try:
                # Analyze with DeepFace
                results = DeepFace.analyze(
                    img_path=np.array(img_pil),
                    actions=['age', 'gender', 'emotion', 'race'],
                    enforce_detection=False,
                    silent=True
                )
                if isinstance(results, list):
                    results = results[0]

                # Extract demographics
                analysis["demographic"]["age_category"] = self._age_to_category(results.get('age', 0))
                analysis["demographic"]["age_confidence"] = 0.85
                analysis["demographic"]["gender"] = results.get('dominant_gender', '').lower()
                analysis["demographic"]["gender_confidence"] = results.get('gender', {}).get(
                    results.get('dominant_gender', ''), 0
                ) / 100.0

                # Extract emotions
                emotions = results.get('emotion', {})
                if emotions:
                    sorted_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)
                    analysis["emotional_state"]["primary_emotion"] = sorted_emotions[0][0]
                    analysis["emotional_state"]["emotion_confidence"] = sorted_emotions[0][1] / 100.0
                    analysis["emotional_state"]["emotion_distribution"] = {
                        k: v / 100.0 for k, v in emotions.items()
                    }
            except Exception as e:
                logger.warning(f"DeepFace analysis failed: {e}")

        return analysis
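
    # Note: recent DeepFace versions return a list with one result dict per
    # detected face, and the scores under 'gender' and 'emotion' are
    # percentages, which is why the values above are divided by 100. Exact
    # keys can vary between DeepFace versions.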

    def _analyze_pose(self, image: Image.Image) -> Dict[str, Any]:
        """Analyze body pose using MediaPipe"""
        analysis = {"pose_composition": {}}

        if not MEDIAPIPE_AVAILABLE or not self.pose_detector:
            return analysis

        try:
            # Convert PIL to RGB array
            image_rgb = np.array(image)

            # Process the image
            results = self.pose_detector.process(image_rgb)

            if results.pose_landmarks:
                landmarks = results.pose_landmarks.landmark

                # Analyze head position
                nose = landmarks[self.mp_pose.PoseLandmark.NOSE]
                left_eye = landmarks[self.mp_pose.PoseLandmark.LEFT_EYE]
                right_eye = landmarks[self.mp_pose.PoseLandmark.RIGHT_EYE]

                # Calculate head tilt
                eye_diff_y = abs(left_eye.y - right_eye.y)
                if eye_diff_y > 0.02:
                    analysis["pose_composition"]["head_position"] = ["head tilted"]
                else:
                    analysis["pose_composition"]["head_position"] = ["head straight"]

                # Analyze posture
                left_shoulder = landmarks[self.mp_pose.PoseLandmark.LEFT_SHOULDER]
                right_shoulder = landmarks[self.mp_pose.PoseLandmark.RIGHT_SHOULDER]
                shoulder_diff_y = abs(left_shoulder.y - right_shoulder.y)
                if shoulder_diff_y < 0.02:
                    analysis["pose_composition"]["posture"] = ["upright posture", "balanced stance"]
                else:
                    analysis["pose_composition"]["posture"] = ["asymmetric posture"]

                # Confidence based on visibility
                visibility_scores = [l.visibility for l in landmarks]
                analysis["pose_composition"]["pose_confidence"] = np.mean(visibility_scores)

                # Body language interpretation
                if nose.y < 0.3:
                    analysis["pose_composition"].setdefault("body_language", []).append("confident stance")
        except Exception as e:
            logger.warning(f"Pose analysis failed: {e}")

        return analysis
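
    # MediaPipe pose landmarks are normalized to [0, 1] relative to image width
    # and height and carry a per-landmark visibility score, which is why the
    # head-tilt and shoulder checks above compare coordinate differences against
    # small thresholds such as 0.02.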

    def _analyze_emotions(self, image: Image.Image) -> Dict[str, Any]:
        """Analyze emotions using transformer model"""
        analysis = {"emotional_state": {}}

        if not TRANSFORMERS_AVAILABLE or not self.emotion_classifier:
            return analysis

        try:
            # Run emotion classification
            predictions = self.emotion_classifier(image)

            if predictions:
                # Sort by confidence
                predictions.sort(key=lambda x: x['score'], reverse=True)

                # Primary emotion
                analysis["emotional_state"]["primary_emotion"] = predictions[0]['label'].lower()
                analysis["emotional_state"]["emotion_confidence"] = predictions[0]['score']

                # Emotion distribution
                analysis["emotional_state"]["emotion_distribution"] = {
                    pred['label'].lower(): pred['score'] for pred in predictions[:5]
                }

                # Map to micro-expressions
                primary = predictions[0]['label'].lower()
                if primary in ['happy', 'joy']:
                    analysis["emotional_state"]["micro_expressions"] = ["smile", "positive expression"]
                elif primary in ['sad', 'sorrow']:
                    analysis["emotional_state"]["micro_expressions"] = ["downturned mouth", "melancholic"]
                elif primary in ['angry', 'disgust']:
                    analysis["emotional_state"]["micro_expressions"] = ["furrowed brow", "tense jaw"]
                elif primary in ['surprise', 'fear']:
                    analysis["emotional_state"]["micro_expressions"] = ["raised eyebrows", "wide eyes"]
        except Exception as e:
            logger.warning(f"Emotion analysis failed: {e}")

        return analysis

    def _analyze_scene(self, clip_analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze scene and environment from CLIP results"""
        analysis = {"environmental": clip_analysis.get("environmental", {})}

        # Lighting analysis based on the combined CLIP description
        combined_text = clip_analysis.get("combined_text", "").lower()

        lighting_keywords = {
            "natural light": ["sunlight", "daylight", "outdoor", "sunny"],
            "artificial light": ["indoor", "lamp", "fluorescent", "led"],
            "dramatic lighting": ["dramatic", "moody", "contrast", "shadow"],
            "soft lighting": ["soft", "diffused", "gentle", "even"]
        }
        for light_type, keywords in lighting_keywords.items():
            if any(keyword in combined_text for keyword in keywords):
                analysis["environmental"].setdefault("lighting_analysis", []).append(light_type)

        # Atmosphere
        if any(word in combined_text for word in ["professional", "formal", "business"]):
            analysis["environmental"].setdefault("atmosphere", []).append("professional")
        if any(word in combined_text for word in ["casual", "relaxed", "informal"]):
            analysis["environmental"].setdefault("atmosphere", []).append("casual")
        if any(word in combined_text for word in ["artistic", "creative", "abstract"]):
            analysis["environmental"].setdefault("atmosphere", []).append("artistic")

        return analysis

    def _age_to_category(self, age: int) -> str:
        """Convert numeric age to category"""
        if age < 2:
            return "infant"
        elif age < 12:
            return "child"
        elif age < 20:
            return "teen"
        elif age < 35:
            return "young_adult"
        elif age < 50:
            return "middle_aged"
        elif age < 65:
            return "senior"
        else:
            return "elderly"

    def _merge_analysis(self, base: Dict[str, Any], new: Dict[str, Any]) -> Dict[str, Any]:
        """Merge analysis results"""
        for key, value in new.items():
            if key in base:
                if isinstance(value, dict) and isinstance(base[key], dict):
                    base[key].update(value)
                elif isinstance(value, list) and isinstance(base[key], list):
                    base[key].extend(value)
                elif value is not None and (not isinstance(base[key], (int, float)) or base[key] == 0):
                    base[key] = value
        return base
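
    # Illustrative sketch of the merge semantics (hypothetical values, not from
    # a real analysis): nested dicts are shallow-updated, lists are extended,
    # non-zero numbers already present in `base` are kept, and other values are
    # replaced by any non-None value from `new`.
    #
    #   base = {"demographic": {"gender": None, "gender_confidence": 0}}
    #   new  = {"demographic": {"gender": "male", "gender_confidence": 0.8}}
    #   _merge_analysis(base, new)
    #   # -> {"demographic": {"gender": "male", "gender_confidence": 0.8}}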

    def _calculate_intelligence_metrics(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate intelligence metrics based on analysis completeness"""
        metrics = analysis["intelligence_metrics"]

        # Count detected features
        total_features = 0
        confidence_scores = []

        # Demographic features
        if analysis["demographic"]["age_category"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["age_confidence"])
        if analysis["demographic"]["gender"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["gender_confidence"])

        # Facial features
        for feature in ["eyes", "eyebrows", "nose", "mouth", "facial_hair", "skin", "structure"]:
            if analysis["facial_ultra"].get(feature):
                total_features += len(analysis["facial_ultra"][feature])

        # Emotional features
        if analysis["emotional_state"]["primary_emotion"]:
            total_features += 1
            confidence_scores.append(analysis["emotional_state"]["emotion_confidence"])

        # Pose features
        if analysis["pose_composition"].get("pose_confidence", 0) > 0:
            total_features += 1
            confidence_scores.append(analysis["pose_composition"]["pose_confidence"])

        # Environmental features
        if analysis["environmental"]["setting_type"]:
            total_features += 1
        total_features += len(analysis["environmental"].get("lighting_analysis", []))

        # Technical features
        if analysis["technical_analysis"]["shot_type"]:
            total_features += 1

        # Calculate scores
        metrics["total_features_detected"] = total_features
        metrics["analysis_depth_score"] = min(100, total_features * 5)

        # Cultural awareness (if religious/cultural indicators found)
        if analysis["demographic"].get("cultural_religious"):
            metrics["cultural_awareness_score"] = 80
        else:
            metrics["cultural_awareness_score"] = 40

        # Technical optimization score
        tech_features = sum([
            1 if analysis["technical_analysis"]["shot_type"] else 0,
            len(analysis["environmental"].get("lighting_analysis", [])),
            len(analysis["pose_composition"].get("posture", []))
        ])
        metrics["technical_optimization_score"] = min(100, tech_features * 25)

        # Average confidence
        if confidence_scores:
            metrics["model_confidence_average"] = sum(confidence_scores) / len(confidence_scores)
        else:
            metrics["model_confidence_average"] = 0.5

        return analysis
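
    # Worked example of the scoring arithmetic (hypothetical counts): with 12
    # detected features the depth score is min(100, 12 * 5) = 60; with one shot
    # type, two lighting entries and one posture entry, the technical
    # optimization score is min(100, (1 + 2 + 1) * 25) = 100.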

    def build_ultra_supreme_prompt(self, ultra_analysis: Dict[str, Any], clip_results: List[str]) -> str:
        """Build enhanced prompt based on comprehensive analysis"""
        prompt_parts = []

        # Start with the best CLIP result
        if clip_results:
            prompt_parts.append(clip_results[0])

        # Add demographic details if confident
        if ultra_analysis["demographic"]["age_category"] and ultra_analysis["demographic"]["age_confidence"] > 0.7:
            age_descriptors = QUALITY_DESCRIPTORS_ULTRA["based_on_age"].get(
                ultra_analysis["demographic"]["age_category"], []
            )
            if age_descriptors:
                prompt_parts.append(age_descriptors[0])

        # Add emotional context
        if ultra_analysis["emotional_state"]["primary_emotion"]:
            emotion = ultra_analysis["emotional_state"]["primary_emotion"]
            emotion_descriptors = QUALITY_DESCRIPTORS_ULTRA["based_on_emotion"].get(emotion, [])
            if emotion_descriptors:
                prompt_parts.append(f"{emotion_descriptors[0]} expression")

        # Add technical details
        if ultra_analysis["technical_analysis"]["shot_type"]:
            prompt_parts.append(ultra_analysis["technical_analysis"]["shot_type"])

        # Add lighting
        lighting = ultra_analysis["environmental"].get("lighting_analysis", [])
        if lighting:
            prompt_parts.append(f"with {lighting[0]}")

        # Combine parts
        enhanced_prompt = ", ".join(prompt_parts)

        # Clean up
        enhanced_prompt = re.sub(r'\s+', ' ', enhanced_prompt)
        enhanced_prompt = re.sub(r',\s*,+', ',', enhanced_prompt)

        return enhanced_prompt

    def calculate_ultra_supreme_score(self, prompt: str, ultra_analysis: Dict[str, Any]) -> Tuple[int, Dict[str, int]]:
        """Calculate comprehensive score based on multi-model analysis"""
        breakdown = {}

        # Base score from prompt quality
        breakdown["prompt_quality"] = min(25, len(prompt) // 10)

        # Analysis depth score
        breakdown["analysis_depth"] = min(25, ultra_analysis["intelligence_metrics"]["analysis_depth_score"] // 4)

        # Model confidence score
        avg_confidence = ultra_analysis["intelligence_metrics"]["model_confidence_average"]
        breakdown["model_confidence"] = int(avg_confidence * 25)

        # Feature richness score
        total_features = ultra_analysis["intelligence_metrics"]["total_features_detected"]
        breakdown["feature_richness"] = min(25, total_features * 2)

        total_score = sum(breakdown.values())
        return total_score, breakdown
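

# Minimal usage sketch (an illustrative assumption, not part of the original
# module): the image path and the three CLIP caption strings are placeholders;
# in the Space they would come from the upstream CLIP interrogation step.
if __name__ == "__main__":
    analyzer = UltraSupremeAnalyzer()
    example_image = Image.open("example.jpg")  # hypothetical local image

    result = analyzer.ultra_supreme_analysis(
        example_image,
        clip_fast="a man in a suit",
        clip_classic="a portrait of a man in a dark suit, studio lighting",
        clip_best="a close-up portrait of a middle aged man in a dark suit, soft dramatic lighting",
    )
    prompt = analyzer.build_ultra_supreme_prompt(
        result, [result["clip_best"], result["clip_classic"], result["clip_fast"]]
    )
    score, breakdown = analyzer.calculate_ultra_supreme_score(prompt, result)

    print(prompt)
    print(score, breakdown)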