""" Ultra Supreme Analyzer - Complete Multi-Model Analysis Integrates multiple specialized models for comprehensive image analysis """ import re import logging import spaces import torch import cv2 import numpy as np from typing import Dict, List, Any, Tuple, Optional from PIL import Image # Deep learning models for specialized analysis try: from deepface import DeepFace DEEPFACE_AVAILABLE = True except: DEEPFACE_AVAILABLE = False try: import mediapipe as mp MEDIAPIPE_AVAILABLE = True except: MEDIAPIPE_AVAILABLE = False try: from transformers import pipeline TRANSFORMERS_AVAILABLE = True except: TRANSFORMERS_AVAILABLE = False from constants import ( FORBIDDEN_ELEMENTS, MICRO_AGE_INDICATORS, ULTRA_FACIAL_ANALYSIS, EMOTION_MICRO_EXPRESSIONS, CULTURAL_RELIGIOUS_ULTRA, CLOTHING_ACCESSORIES_ULTRA, ENVIRONMENTAL_ULTRA_ANALYSIS, POSE_BODY_LANGUAGE_ULTRA, COMPOSITION_PHOTOGRAPHY_ULTRA, TECHNICAL_PHOTOGRAPHY_ULTRA, QUALITY_DESCRIPTORS_ULTRA, GENDER_INDICATORS ) logger = logging.getLogger(__name__) class UltraSupremeAnalyzer: """Complete analyzer with multiple specialized models""" def __init__(self): self.face_cascade = None self.pose_detector = None self.emotion_classifier = None self.scene_classifier = None self.models_initialized = False def _initialize_models(self): """Lazy initialization of models""" if self.models_initialized: return try: # OpenCV face detector (lightweight) self.face_cascade = cv2.CascadeClassifier( cv2.data.haarcascades + 'haarcascade_frontalface_default.xml' ) # MediaPipe pose detector if MEDIAPIPE_AVAILABLE: self.mp_pose = mp.solutions.pose self.pose_detector = self.mp_pose.Pose( static_image_mode=True, min_detection_confidence=0.5 ) # Emotion classifier from transformers if TRANSFORMERS_AVAILABLE: self.emotion_classifier = pipeline( "image-classification", model="dima806/facial_emotions_image_detection" ) self.models_initialized = True logger.info("Additional analysis models initialized") except Exception as e: logger.error(f"Error initializing models: {e}") self.models_initialized = False @spaces.GPU(duration=30) def ultra_supreme_analysis(self, image: Any, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]: """Complete analysis using all available models""" # Initialize models if needed self._initialize_models() # Start with CLIP analysis clip_analysis = self._parse_clip_results(clip_fast, clip_classic, clip_best) # Convert image for processing if isinstance(image, Image.Image): img_array = np.array(image) img_rgb = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR) else: img_rgb = image image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # Initialize complete analysis structure analysis = { "clip_fast": clip_fast, "clip_classic": clip_classic, "clip_best": clip_best, "full_description": f"{clip_fast} {clip_classic} {clip_best}", "demographic": { "age_category": None, "age_confidence": 0, "gender": None, "gender_confidence": 0, "cultural_religious": [] }, "facial_ultra": { "eyes": [], "eyebrows": [], "nose": [], "mouth": [], "facial_hair": [], "skin": [], "structure": [], "face_count": 0, "face_locations": [] }, "emotional_state": { "primary_emotion": None, "emotion_confidence": 0, "emotion_distribution": {}, "micro_expressions": [], "overall_demeanor": [] }, "clothing_accessories": { "headwear": [], "eyewear": [], "clothing": [], "accessories": [], "style": [] }, "environmental": { "setting_type": None, "specific_location": None, "lighting_analysis": [], "atmosphere": [], "objects": [] }, "pose_composition": { "body_language": [], 
"head_position": [], "eye_contact": [], "posture": [], "gesture": [], "pose_confidence": 0 }, "technical_analysis": { "shot_type": None, "angle": None, "lighting_setup": None, "composition": [], "suggested_equipment": {} }, "intelligence_metrics": { "total_features_detected": 0, "analysis_depth_score": 0, "cultural_awareness_score": 0, "technical_optimization_score": 0, "model_confidence_average": 0 } } # Merge CLIP analysis analysis = self._merge_analysis(analysis, clip_analysis) # Face detection and analysis face_analysis = self._analyze_faces(img_rgb, image) analysis = self._merge_analysis(analysis, face_analysis) # Pose analysis if MEDIAPIPE_AVAILABLE: pose_analysis = self._analyze_pose(image) analysis = self._merge_analysis(analysis, pose_analysis) # Emotion analysis if TRANSFORMERS_AVAILABLE and analysis["facial_ultra"]["face_count"] > 0: emotion_analysis = self._analyze_emotions(image) analysis = self._merge_analysis(analysis, emotion_analysis) # Scene and environment analysis scene_analysis = self._analyze_scene(clip_analysis) analysis = self._merge_analysis(analysis, scene_analysis) # Calculate intelligence metrics analysis = self._calculate_intelligence_metrics(analysis) return analysis def _parse_clip_results(self, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]: """Parse CLIP results for structured information""" combined_text = f"{clip_fast} {clip_classic} {clip_best}".lower() analysis = { "demographic": {}, "facial_ultra": {}, "emotional_state": {}, "clothing_accessories": {}, "environmental": {}, "pose_composition": {}, "technical_analysis": {} } # Gender detection for gender, indicators in GENDER_INDICATORS.items(): if any(indicator in combined_text for indicator in indicators): analysis["demographic"]["gender"] = gender analysis["demographic"]["gender_confidence"] = 0.8 break # Age detection for age_category, indicators in MICRO_AGE_INDICATORS.items(): if any(indicator in combined_text for indicator in indicators): analysis["demographic"]["age_category"] = age_category analysis["demographic"]["age_confidence"] = 0.7 break # Facial features for feature_type, features in ULTRA_FACIAL_ANALYSIS.items(): if isinstance(features, dict): for sub_type, sub_features in features.items(): found = [f for f in sub_features if f in combined_text] if found and feature_type in analysis["facial_ultra"]: analysis["facial_ultra"][feature_type] = found else: found = [f for f in features if f in combined_text] if found: analysis["facial_ultra"][feature_type] = found # Emotions all_emotions = EMOTION_MICRO_EXPRESSIONS["primary_emotions"] + EMOTION_MICRO_EXPRESSIONS["complex_emotions"] found_emotions = [e for e in all_emotions if e in combined_text] if found_emotions: analysis["emotional_state"]["primary_emotion"] = found_emotions[0] analysis["emotional_state"]["micro_expressions"] = found_emotions # Environment for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["indoor_settings"].items(): if any(s in combined_text for s in settings): analysis["environmental"]["setting_type"] = f"indoor_{setting_type}" break for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["outdoor_settings"].items(): if any(s in combined_text for s in settings): analysis["environmental"]["setting_type"] = f"outdoor_{setting_type}" break # Technical analysis for shot_type in COMPOSITION_PHOTOGRAPHY_ULTRA["shot_types"]: if shot_type in combined_text: analysis["technical_analysis"]["shot_type"] = shot_type break return analysis def _analyze_faces(self, img_bgr: np.ndarray, img_pil: 
        """Analyze faces using OpenCV and DeepFace"""

        analysis = {"facial_ultra": {}, "demographic": {}, "emotional_state": {}}

        # Skip gracefully if the cascade failed to load during initialization
        if self.face_cascade is None or self.face_cascade.empty():
            analysis["facial_ultra"]["face_count"] = 0
            analysis["facial_ultra"]["face_locations"] = []
            return analysis

        # OpenCV face detection
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        faces = self.face_cascade.detectMultiScale(gray, 1.1, 4)

        analysis["facial_ultra"]["face_count"] = len(faces)
        analysis["facial_ultra"]["face_locations"] = faces.tolist() if len(faces) > 0 else []

        # DeepFace analysis for the first detected face
        if DEEPFACE_AVAILABLE and len(faces) > 0:
            try:
                # Analyze with DeepFace
                results = DeepFace.analyze(
                    img_path=np.array(img_pil),
                    actions=['age', 'gender', 'emotion', 'race'],
                    enforce_detection=False,
                    silent=True
                )

                if isinstance(results, list):
                    results = results[0]

                # Extract demographics
                analysis["demographic"]["age_category"] = self._age_to_category(results.get('age', 0))
                analysis["demographic"]["age_confidence"] = 0.85
                analysis["demographic"]["gender"] = results.get('dominant_gender', '').lower()
                analysis["demographic"]["gender_confidence"] = results.get('gender', {}).get(
                    results.get('dominant_gender', ''), 0
                ) / 100.0

                # Extract emotions
                emotions = results.get('emotion', {})
                if emotions:
                    sorted_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)
                    analysis["emotional_state"]["primary_emotion"] = sorted_emotions[0][0]
                    analysis["emotional_state"]["emotion_confidence"] = sorted_emotions[0][1] / 100.0
                    analysis["emotional_state"]["emotion_distribution"] = {
                        k: v / 100.0 for k, v in emotions.items()
                    }

            except Exception as e:
                logger.warning(f"DeepFace analysis failed: {e}")

        return analysis

    def _analyze_pose(self, image: Image.Image) -> Dict[str, Any]:
        """Analyze body pose using MediaPipe"""

        analysis = {"pose_composition": {}}

        if not MEDIAPIPE_AVAILABLE or not self.pose_detector:
            return analysis

        try:
            # Convert PIL to RGB array
            image_rgb = np.array(image)

            # Process the image
            results = self.pose_detector.process(image_rgb)

            if results.pose_landmarks:
                landmarks = results.pose_landmarks.landmark

                # Analyze head position
                nose = landmarks[self.mp_pose.PoseLandmark.NOSE]
                left_eye = landmarks[self.mp_pose.PoseLandmark.LEFT_EYE]
                right_eye = landmarks[self.mp_pose.PoseLandmark.RIGHT_EYE]

                # Calculate head tilt
                eye_diff_y = abs(left_eye.y - right_eye.y)
                if eye_diff_y > 0.02:
                    analysis["pose_composition"]["head_position"] = ["head tilted"]
                else:
                    analysis["pose_composition"]["head_position"] = ["head straight"]

                # Analyze posture
                left_shoulder = landmarks[self.mp_pose.PoseLandmark.LEFT_SHOULDER]
                right_shoulder = landmarks[self.mp_pose.PoseLandmark.RIGHT_SHOULDER]

                shoulder_diff_y = abs(left_shoulder.y - right_shoulder.y)
                if shoulder_diff_y < 0.02:
                    analysis["pose_composition"]["posture"] = ["upright posture", "balanced stance"]
                else:
                    analysis["pose_composition"]["posture"] = ["asymmetric posture"]

                # Confidence based on visibility
                visibility_scores = [l.visibility for l in landmarks]
                analysis["pose_composition"]["pose_confidence"] = np.mean(visibility_scores)

                # Body language interpretation
                if nose.y < 0.3:
                    analysis["pose_composition"].setdefault("body_language", []).append("confident stance")

        except Exception as e:
            logger.warning(f"Pose analysis failed: {e}")

        return analysis

    def _analyze_emotions(self, image: Image.Image) -> Dict[str, Any]:
        """Analyze emotions using transformer model"""

        analysis = {"emotional_state": {}}

        if not TRANSFORMERS_AVAILABLE or not self.emotion_classifier:
            return analysis

        try:
            # Run emotion classification
            predictions = self.emotion_classifier(image)

            if predictions:
                # Sort by confidence
                predictions.sort(key=lambda x: x['score'], reverse=True)

                # Primary emotion
                analysis["emotional_state"]["primary_emotion"] = predictions[0]['label'].lower()
                analysis["emotional_state"]["emotion_confidence"] = predictions[0]['score']

                # Emotion distribution
                analysis["emotional_state"]["emotion_distribution"] = {
                    pred['label'].lower(): pred['score'] for pred in predictions[:5]
                }

                # Map to micro-expressions
                primary = predictions[0]['label'].lower()
                if primary in ['happy', 'joy']:
                    analysis["emotional_state"]["micro_expressions"] = ["smile", "positive expression"]
                elif primary in ['sad', 'sorrow']:
                    analysis["emotional_state"]["micro_expressions"] = ["downturned mouth", "melancholic"]
                elif primary in ['angry', 'disgust']:
                    analysis["emotional_state"]["micro_expressions"] = ["furrowed brow", "tense jaw"]
                elif primary in ['surprise', 'fear']:
                    analysis["emotional_state"]["micro_expressions"] = ["raised eyebrows", "wide eyes"]

        except Exception as e:
            logger.warning(f"Emotion analysis failed: {e}")

        return analysis

    def _analyze_scene(self, clip_analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze scene and environment from CLIP results"""

        analysis = {"environmental": clip_analysis.get("environmental", {})}

        # Lighting analysis based on CLIP description
        combined_text = clip_analysis.get("full_description", "").lower()

        lighting_keywords = {
            "natural light": ["sunlight", "daylight", "outdoor", "sunny"],
            "artificial light": ["indoor", "lamp", "fluorescent", "led"],
            "dramatic lighting": ["dramatic", "moody", "contrast", "shadow"],
            "soft lighting": ["soft", "diffused", "gentle", "even"]
        }

        for light_type, keywords in lighting_keywords.items():
            if any(keyword in combined_text for keyword in keywords):
                analysis["environmental"].setdefault("lighting_analysis", []).append(light_type)

        # Atmosphere
        if any(word in combined_text for word in ["professional", "formal", "business"]):
            analysis["environmental"].setdefault("atmosphere", []).append("professional")
        if any(word in combined_text for word in ["casual", "relaxed", "informal"]):
            analysis["environmental"].setdefault("atmosphere", []).append("casual")
        if any(word in combined_text for word in ["artistic", "creative", "abstract"]):
            analysis["environmental"].setdefault("atmosphere", []).append("artistic")

        return analysis

    def _age_to_category(self, age: int) -> str:
        """Convert numeric age to category"""
        if age < 2:
            return "infant"
        elif age < 12:
            return "child"
        elif age < 20:
            return "teen"
        elif age < 35:
            return "young_adult"
        elif age < 50:
            return "middle_aged"
        elif age < 65:
            return "senior"
        else:
            return "elderly"

    def _merge_analysis(self, base: Dict[str, Any], new: Dict[str, Any]) -> Dict[str, Any]:
        """Merge analysis results"""
        for key, value in new.items():
            if key in base:
                if isinstance(value, dict) and isinstance(base[key], dict):
                    base[key].update(value)
                elif isinstance(value, list) and isinstance(base[key], list):
                    base[key].extend(value)
                elif value is not None and (not isinstance(base[key], (int, float)) or base[key] == 0):
                    base[key] = value
        return base

    def _calculate_intelligence_metrics(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate intelligence metrics based on analysis completeness"""

        metrics = analysis["intelligence_metrics"]

        # Count detected features
        total_features = 0
        confidence_scores = []

        # Demographic features
        if analysis["demographic"]["age_category"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["age_confidence"])
        if analysis["demographic"]["gender"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["gender_confidence"])
"facial_hair", "skin", "structure"]: if analysis["facial_ultra"].get(feature): total_features += len(analysis["facial_ultra"][feature]) # Emotional features if analysis["emotional_state"]["primary_emotion"]: total_features += 1 confidence_scores.append(analysis["emotional_state"]["emotion_confidence"]) # Pose features if analysis["pose_composition"].get("pose_confidence", 0) > 0: total_features += 1 confidence_scores.append(analysis["pose_composition"]["pose_confidence"]) # Environmental features if analysis["environmental"]["setting_type"]: total_features += 1 total_features += len(analysis["environmental"].get("lighting_analysis", [])) # Technical features if analysis["technical_analysis"]["shot_type"]: total_features += 1 # Calculate scores metrics["total_features_detected"] = total_features metrics["analysis_depth_score"] = min(100, total_features * 5) # Cultural awareness (if religious/cultural indicators found) if analysis["demographic"].get("cultural_religious"): metrics["cultural_awareness_score"] = 80 else: metrics["cultural_awareness_score"] = 40 # Technical optimization score tech_features = sum([ 1 if analysis["technical_analysis"]["shot_type"] else 0, len(analysis["environmental"].get("lighting_analysis", [])), len(analysis["pose_composition"].get("posture", [])) ]) metrics["technical_optimization_score"] = min(100, tech_features * 25) # Average confidence if confidence_scores: metrics["model_confidence_average"] = sum(confidence_scores) / len(confidence_scores) else: metrics["model_confidence_average"] = 0.5 return analysis def build_ultra_supreme_prompt(self, ultra_analysis: Dict[str, Any], clip_results: List[str]) -> str: """Build enhanced prompt based on comprehensive analysis""" prompt_parts = [] # Start with the best CLIP result if clip_results: prompt_parts.append(clip_results[0]) # Add demographic details if confident if ultra_analysis["demographic"]["age_category"] and ultra_analysis["demographic"]["age_confidence"] > 0.7: age_descriptors = QUALITY_DESCRIPTORS_ULTRA["based_on_age"].get( ultra_analysis["demographic"]["age_category"], [] ) if age_descriptors: prompt_parts.append(age_descriptors[0]) # Add emotional context if ultra_analysis["emotional_state"]["primary_emotion"]: emotion = ultra_analysis["emotional_state"]["primary_emotion"] emotion_descriptors = QUALITY_DESCRIPTORS_ULTRA["based_on_emotion"].get(emotion, []) if emotion_descriptors: prompt_parts.append(f"{emotion_descriptors[0]} expression") # Add technical details if ultra_analysis["technical_analysis"]["shot_type"]: prompt_parts.append(ultra_analysis["technical_analysis"]["shot_type"]) # Add lighting lighting = ultra_analysis["environmental"].get("lighting_analysis", []) if lighting: prompt_parts.append(f"with {lighting[0]}") # Combine parts enhanced_prompt = ", ".join(prompt_parts) # Clean up enhanced_prompt = re.sub(r'\s+', ' ', enhanced_prompt) enhanced_prompt = re.sub(r',\s*,+', ',', enhanced_prompt) return enhanced_prompt def calculate_ultra_supreme_score(self, prompt: str, ultra_analysis: Dict[str, Any]) -> Tuple[int, Dict[str, int]]: """Calculate comprehensive score based on multi-model analysis""" breakdown = {} # Base score from prompt quality breakdown["prompt_quality"] = min(25, len(prompt) // 10) # Analysis depth score breakdown["analysis_depth"] = min(25, ultra_analysis["intelligence_metrics"]["analysis_depth_score"] // 4) # Model confidence score avg_confidence = ultra_analysis["intelligence_metrics"]["model_confidence_average"] breakdown["model_confidence"] = int(avg_confidence * 25) # 

        # Feature richness score
        total_features = ultra_analysis["intelligence_metrics"]["total_features_detected"]
        breakdown["feature_richness"] = min(25, total_features * 2)

        total_score = sum(breakdown.values())

        return total_score, breakdown
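

# Minimal usage sketch (an assumption, not part of the original pipeline): the analyzer
# expects three CLIP Interrogator-style captions for the same image. The image path and
# caption strings below are hypothetical placeholders, and the @spaces.GPU decorator
# means this is normally driven from a Hugging Face Space rather than run standalone.
if __name__ == "__main__":
    analyzer = UltraSupremeAnalyzer()
    demo_image = Image.open("example.jpg").convert("RGB")  # hypothetical input image

    # Hypothetical captions standing in for the fast / classic / best CLIP outputs
    fast = "a portrait of a smiling woman indoors"
    classic = "a professional headshot with soft lighting"
    best = "a close-up portrait, natural light, shallow depth of field"

    result = analyzer.ultra_supreme_analysis(demo_image, fast, classic, best)
    prompt = analyzer.build_ultra_supreme_prompt(result, [best, classic, fast])
    score, breakdown = analyzer.calculate_ultra_supreme_score(prompt, result)

    print("Enhanced prompt:", prompt)
    print(f"Score: {score} {breakdown}")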