# Phramer_AI
#
# Running on Zero
#
# File size: 24,378 Bytes

"""
Ultra Supreme Analyzer - Complete Multi-Model Analysis
Integrates multiple specialized models for comprehensive image analysis
"""

import re
import logging
import spaces
import torch
import cv2
import numpy as np
from typing import Dict, List, Any, Tuple, Optional
from PIL import Image

# Optional deep learning back-ends for specialized analysis.
# Each import is attempted independently; the matching *_AVAILABLE flag tells
# the rest of the module whether that back-end can be used.
# ``except Exception`` (rather than a bare ``except:``) is used so that
# KeyboardInterrupt / SystemExit raised during import still propagate, while
# any ordinary import-time failure simply disables the back-end.
try:
    from deepface import DeepFace
    DEEPFACE_AVAILABLE = True
except Exception:
    DEEPFACE_AVAILABLE = False

try:
    import mediapipe as mp
    MEDIAPIPE_AVAILABLE = True
except Exception:
    MEDIAPIPE_AVAILABLE = False

try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except Exception:
    TRANSFORMERS_AVAILABLE = False

from constants import (
    FORBIDDEN_ELEMENTS, MICRO_AGE_INDICATORS, ULTRA_FACIAL_ANALYSIS,
    EMOTION_MICRO_EXPRESSIONS, CULTURAL_RELIGIOUS_ULTRA, CLOTHING_ACCESSORIES_ULTRA,
    ENVIRONMENTAL_ULTRA_ANALYSIS, POSE_BODY_LANGUAGE_ULTRA, COMPOSITION_PHOTOGRAPHY_ULTRA,
    TECHNICAL_PHOTOGRAPHY_ULTRA, QUALITY_DESCRIPTORS_ULTRA, GENDER_INDICATORS
)

# Module-level logger, named after this module; used for all diagnostics below.
logger = logging.getLogger(__name__)


class UltraSupremeAnalyzer:
    """Complete analyzer with multiple specialized models"""
    
    def __init__(self):
        self.face_cascade = None
        self.pose_detector = None
        self.emotion_classifier = None
        self.scene_classifier = None
        self.models_initialized = False
        
    def _initialize_models(self):
        """Lazy initialization of models"""
        if self.models_initialized:
            return
            
        try:
            # OpenCV face detector (lightweight)
            self.face_cascade = cv2.CascadeClassifier(
                cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
            )
            
            # MediaPipe pose detector
            if MEDIAPIPE_AVAILABLE:
                self.mp_pose = mp.solutions.pose
                self.pose_detector = self.mp_pose.Pose(
                    static_image_mode=True,
                    min_detection_confidence=0.5
                )
                
            # Emotion classifier from transformers
            if TRANSFORMERS_AVAILABLE:
                self.emotion_classifier = pipeline(
                    "image-classification", 
                    model="dima806/facial_emotions_image_detection"
                )
                
            self.models_initialized = True
            logger.info("Additional analysis models initialized")
            
        except Exception as e:
            logger.error(f"Error initializing models: {e}")
            self.models_initialized = False
    
    @spaces.GPU(duration=30)
    def ultra_supreme_analysis(self, image: Any, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]:
        """Run every available analyzer on one image and merge the results.

        Args:
            image: A PIL image, or otherwise an array that is treated as BGR
                (it is handed to OpenCV as-is and converted BGR->RGB to build
                the PIL copy used by the other analyzers).
            clip_fast: First CLIP description string.
            clip_classic: Second CLIP description string.
            clip_best: Third CLIP description string.

        Returns:
            The analysis dict built below: the three CLIP strings, their
            concatenation under "full_description", one sub-dict per analysis
            area, and "intelligence_metrics" filled in by
            ``_calculate_intelligence_metrics``.
        """
        # Initialize models if needed
        self._initialize_models()

        # Start with CLIP analysis
        clip_analysis = self._parse_clip_results(clip_fast, clip_classic, clip_best)

        # Convert image for processing
        if isinstance(image, Image.Image):
            # The RGB->BGR conversion needs a colour image, and the later
            # analyzers receive this PIL image directly, so normalise
            # greyscale / palette / RGBA input to plain RGB first.
            if image.mode != "RGB":
                image = image.convert("RGB")
            img_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        else:
            img_bgr = image
            image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # Initialize complete analysis structure
        analysis = {
            "clip_fast": clip_fast,
            "clip_classic": clip_classic,
            "clip_best": clip_best,
            "full_description": f"{clip_fast} {clip_classic} {clip_best}",
            "demographic": {
                "age_category": None,
                "age_confidence": 0,
                "gender": None,
                "gender_confidence": 0,
                "cultural_religious": []
            },
            "facial_ultra": {
                "eyes": [],
                "eyebrows": [],
                "nose": [],
                "mouth": [],
                "facial_hair": [],
                "skin": [],
                "structure": [],
                "face_count": 0,
                "face_locations": []
            },
            "emotional_state": {
                "primary_emotion": None,
                "emotion_confidence": 0,
                "emotion_distribution": {},
                "micro_expressions": [],
                "overall_demeanor": []
            },
            "clothing_accessories": {
                "headwear": [],
                "eyewear": [],
                "clothing": [],
                "accessories": [],
                "style": []
            },
            "environmental": {
                "setting_type": None,
                "specific_location": None,
                "lighting_analysis": [],
                "atmosphere": [],
                "objects": []
            },
            "pose_composition": {
                "body_language": [],
                "head_position": [],
                "eye_contact": [],
                "posture": [],
                "gesture": [],
                "pose_confidence": 0
            },
            "technical_analysis": {
                "shot_type": None,
                "angle": None,
                "lighting_setup": None,
                "composition": [],
                "suggested_equipment": {}
            },
            "intelligence_metrics": {
                "total_features_detected": 0,
                "analysis_depth_score": 0,
                "cultural_awareness_score": 0,
                "technical_optimization_score": 0,
                "model_confidence_average": 0
            }
        }

        # Merge CLIP analysis
        analysis = self._merge_analysis(analysis, clip_analysis)

        # Face detection and analysis
        face_analysis = self._analyze_faces(img_bgr, image)
        analysis = self._merge_analysis(analysis, face_analysis)

        # Pose analysis
        if MEDIAPIPE_AVAILABLE:
            pose_analysis = self._analyze_pose(image)
            analysis = self._merge_analysis(analysis, pose_analysis)

        # Emotion analysis
        if TRANSFORMERS_AVAILABLE and analysis["facial_ultra"]["face_count"] > 0:
            emotion_analysis = self._analyze_emotions(image)
            analysis = self._merge_analysis(analysis, emotion_analysis)

        # Scene and environment analysis.
        # The parsed CLIP dict carries no "full_description" text and its
        # "environmental" dict has no list slots, so the scene analyzer is
        # given the description and empty lists explicitly; without the text
        # no lighting or atmosphere keyword can ever match.
        scene_input = {
            "environmental": {
                "lighting_analysis": [],
                "atmosphere": [],
                **clip_analysis.get("environmental", {}),
            },
            "full_description": analysis["full_description"],
        }
        scene_analysis = self._analyze_scene(scene_input)
        analysis = self._merge_analysis(analysis, scene_analysis)

        # Calculate intelligence metrics
        analysis = self._calculate_intelligence_metrics(analysis)

        return analysis
    
    def _parse_clip_results(self, clip_fast: str, clip_classic: str, clip_best: str) -> Dict[str, Any]:
        """Extract structured hints from the three CLIP descriptions.

        The three strings are joined and lower-cased, then searched by plain
        substring matching against the keyword tables imported from
        ``constants``.

        Returns:
            A dict with one (possibly empty) sub-dict per analysis area:
            "demographic", "facial_ultra", "emotional_state",
            "clothing_accessories", "environmental", "pose_composition" and
            "technical_analysis". Only keys for which a keyword matched are
            set inside those sub-dicts.
        """
        combined_text = f"{clip_fast} {clip_classic} {clip_best}".lower()

        analysis = {
            "demographic": {},
            "facial_ultra": {},
            "emotional_state": {},
            "clothing_accessories": {},
            "environmental": {},
            "pose_composition": {},
            "technical_analysis": {}
        }

        # Gender detection: first table entry with a matching indicator wins
        for gender, indicators in GENDER_INDICATORS.items():
            if any(indicator in combined_text for indicator in indicators):
                analysis["demographic"]["gender"] = gender
                analysis["demographic"]["gender_confidence"] = 0.8
                break

        # Age detection: first table entry with a matching indicator wins
        for age_category, indicators in MICRO_AGE_INDICATORS.items():
            if any(indicator in combined_text for indicator in indicators):
                analysis["demographic"]["age_category"] = age_category
                analysis["demographic"]["age_confidence"] = 0.7
                break

        # Facial features
        facial = analysis["facial_ultra"]
        for feature_type, features in ULTRA_FACIAL_ANALYSIS.items():
            if isinstance(features, dict):
                # Collect matches from every sub-group under the feature type.
                # (Checking for the key in the still-empty result dict, as was
                # done before, meant nested tables never produced a match.)
                found = []
                for sub_features in features.values():
                    for f in sub_features:
                        if f in combined_text and f not in found:
                            found.append(f)
            else:
                found = [f for f in features if f in combined_text]
            if found:
                facial[feature_type] = found

        # Emotions: the first match becomes the primary emotion
        all_emotions = EMOTION_MICRO_EXPRESSIONS["primary_emotions"] + EMOTION_MICRO_EXPRESSIONS["complex_emotions"]
        found_emotions = [e for e in all_emotions if e in combined_text]
        if found_emotions:
            analysis["emotional_state"]["primary_emotion"] = found_emotions[0]
            analysis["emotional_state"]["micro_expressions"] = found_emotions

        # Environment: the outdoor pass runs last, so an outdoor match
        # overrides an indoor one when both are present
        for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["indoor_settings"].items():
            if any(s in combined_text for s in settings):
                analysis["environmental"]["setting_type"] = f"indoor_{setting_type}"
                break

        for setting_type, settings in ENVIRONMENTAL_ULTRA_ANALYSIS["outdoor_settings"].items():
            if any(s in combined_text for s in settings):
                analysis["environmental"]["setting_type"] = f"outdoor_{setting_type}"
                break

        # Technical analysis: first shot type mentioned in the text
        for shot_type in COMPOSITION_PHOTOGRAPHY_ULTRA["shot_types"]:
            if shot_type in combined_text:
                analysis["technical_analysis"]["shot_type"] = shot_type
                break

        return analysis
    
    def _analyze_faces(self, img_bgr: np.ndarray, img_pil: Image.Image) -> Dict[str, Any]:
        """Analyze faces using OpenCV and DeepFace"""
        analysis = {"facial_ultra": {}, "demographic": {}, "emotional_state": {}}
        
        # OpenCV face detection
        gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
        faces = self.face_cascade.detectMultiScale(gray, 1.1, 4)
        
        analysis["facial_ultra"]["face_count"] = len(faces)
        analysis["facial_ultra"]["face_locations"] = faces.tolist() if len(faces) > 0 else []
        
        # DeepFace analysis for the first detected face
        if DEEPFACE_AVAILABLE and len(faces) > 0:
            try:
                # Analyze with DeepFace
                results = DeepFace.analyze(
                    img_path=np.array(img_pil),
                    actions=['age', 'gender', 'emotion', 'race'],
                    enforce_detection=False,
                    silent=True
                )
                
                if isinstance(results, list):
                    results = results[0]
                
                # Extract demographics
                analysis["demographic"]["age_category"] = self._age_to_category(results.get('age', 0))
                analysis["demographic"]["age_confidence"] = 0.85
                analysis["demographic"]["gender"] = results.get('dominant_gender', '').lower()
                analysis["demographic"]["gender_confidence"] = results.get('gender', {}).get(
                    results.get('dominant_gender', ''), 0
                ) / 100.0
                
                # Extract emotions
                emotions = results.get('emotion', {})
                if emotions:
                    sorted_emotions = sorted(emotions.items(), key=lambda x: x[1], reverse=True)
                    analysis["emotional_state"]["primary_emotion"] = sorted_emotions[0][0]
                    analysis["emotional_state"]["emotion_confidence"] = sorted_emotions[0][1] / 100.0
                    analysis["emotional_state"]["emotion_distribution"] = {
                        k: v/100.0 for k, v in emotions.items()
                    }
                    
            except Exception as e:
                logger.warning(f"DeepFace analysis failed: {e}")
        
        return analysis
    
    def _analyze_pose(self, image: Image.Image) -> Dict[str, Any]:
        """Derive head position, posture and body-language hints from MediaPipe.

        Args:
            image: PIL image; it is converted to an array and passed to the
                pose detector.

        Returns:
            ``{"pose_composition": {...}}``. The inner dict is empty when
            MediaPipe is unavailable, no detector is loaded or no landmarks
            are found. Any exception is logged as a warning and whatever was
            collected up to that point is returned.
        """
        analysis = {"pose_composition": {}}

        if not MEDIAPIPE_AVAILABLE or not self.pose_detector:
            return analysis

        try:
            # Convert PIL to RGB array
            image_rgb = np.array(image)

            # Process the image
            results = self.pose_detector.process(image_rgb)

            if results.pose_landmarks:
                pose = analysis["pose_composition"]
                landmarks = results.pose_landmarks.landmark
                landmark_ids = self.mp_pose.PoseLandmark

                # Analyze head position
                nose = landmarks[landmark_ids.NOSE]
                left_eye = landmarks[landmark_ids.LEFT_EYE]
                right_eye = landmarks[landmark_ids.RIGHT_EYE]

                # Head tilt: vertical offset between the eyes
                # (presumably normalised image coordinates - confirm against
                # the MediaPipe documentation)
                eye_diff_y = abs(left_eye.y - right_eye.y)
                if eye_diff_y > 0.02:
                    pose["head_position"] = ["head tilted"]
                else:
                    pose["head_position"] = ["head straight"]

                # Posture: vertical offset between the shoulders
                left_shoulder = landmarks[landmark_ids.LEFT_SHOULDER]
                right_shoulder = landmarks[landmark_ids.RIGHT_SHOULDER]
                shoulder_diff_y = abs(left_shoulder.y - right_shoulder.y)

                if shoulder_diff_y < 0.02:
                    pose["posture"] = ["upright posture", "balanced stance"]
                else:
                    pose["posture"] = ["asymmetric posture"]

                # Confidence based on visibility
                visibility_scores = [l.visibility for l in landmarks]
                pose["pose_confidence"] = np.mean(visibility_scores)

                # Body language interpretation.
                # setdefault creates the list on first use; appending to a
                # missing "body_language" key raised KeyError, which the
                # handler below reported as a failed pose analysis.
                if nose.y < 0.3:
                    pose.setdefault("body_language", []).append("confident stance")

        except Exception as e:
            logger.warning(f"Pose analysis failed: {e}")

        return analysis
    
    def _analyze_emotions(self, image: Image.Image) -> Dict[str, Any]:
        """Classify the facial emotion in ``image`` with the transformers pipeline.

        Returns ``{"emotional_state": {...}}``. The inner dict is empty when
        transformers is unavailable, no classifier is loaded or the
        classifier returns nothing. Otherwise it holds the top label
        (lower-cased), its score, the scores of up to five top labels and,
        for recognised labels, a pair of micro-expression descriptors. Any
        exception is logged as a warning and whatever was collected so far is
        returned.
        """
        state: Dict[str, Any] = {}
        result = {"emotional_state": state}

        classifier_ready = TRANSFORMERS_AVAILABLE and self.emotion_classifier
        if not classifier_ready:
            return result

        try:
            ranked = self.emotion_classifier(image)

            if ranked:
                # Highest score first (sorted in place)
                ranked.sort(key=lambda item: item['score'], reverse=True)
                best = ranked[0]

                state["primary_emotion"] = best['label'].lower()
                state["emotion_confidence"] = best['score']
                state["emotion_distribution"] = {
                    item['label'].lower(): item['score'] for item in ranked[:5]
                }

                # Translate the winning label into micro-expression wording
                top_label = best['label'].lower()
                expression_table = (
                    (('happy', 'joy'), ["smile", "positive expression"]),
                    (('sad', 'sorrow'), ["downturned mouth", "melancholic"]),
                    (('angry', 'disgust'), ["furrowed brow", "tense jaw"]),
                    (('surprise', 'fear'), ["raised eyebrows", "wide eyes"]),
                )
                for labels, expressions in expression_table:
                    if top_label in labels:
                        state["micro_expressions"] = expressions
                        break

        except Exception as e:
            logger.warning(f"Emotion analysis failed: {e}")

        return result
    
    def _analyze_scene(self, clip_analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Analyze scene and environment from CLIP results"""
        analysis = {"environmental": clip_analysis.get("environmental", {})}
        
        # Lighting analysis based on CLIP description
        combined_text = clip_analysis.get("full_description", "").lower()
        
        lighting_keywords = {
            "natural light": ["sunlight", "daylight", "outdoor", "sunny"],
            "artificial light": ["indoor", "lamp", "fluorescent", "led"],
            "dramatic lighting": ["dramatic", "moody", "contrast", "shadow"],
            "soft lighting": ["soft", "diffused", "gentle", "even"]
        }
        
        for light_type, keywords in lighting_keywords.items():
            if any(keyword in combined_text for keyword in keywords):
                analysis["environmental"]["lighting_analysis"].append(light_type)
        
        # Atmosphere
        if any(word in combined_text for word in ["professional", "formal", "business"]):
            analysis["environmental"]["atmosphere"].append("professional")
        if any(word in combined_text for word in ["casual", "relaxed", "informal"]):
            analysis["environmental"]["atmosphere"].append("casual")
        if any(word in combined_text for word in ["artistic", "creative", "abstract"]):
            analysis["environmental"]["atmosphere"].append("artistic")
        
        return analysis
    
    def _age_to_category(self, age: int) -> str:
        """Convert numeric age to category"""
        if age < 2:
            return "infant"
        elif age < 12:
            return "child"
        elif age < 20:
            return "teen"
        elif age < 35:
            return "young_adult"
        elif age < 50:
            return "middle_aged"
        elif age < 65:
            return "senior"
        else:
            return "elderly"
    
    def _merge_analysis(self, base: Dict[str, Any], new: Dict[str, Any]) -> Dict[str, Any]:
        """Merge analysis results"""
        for key, value in new.items():
            if key in base:
                if isinstance(value, dict) and isinstance(base[key], dict):
                    base[key].update(value)
                elif isinstance(value, list) and isinstance(base[key], list):
                    base[key].extend(value)
                elif value is not None and (not isinstance(base[key], (int, float)) or base[key] == 0):
                    base[key] = value
        return base
    
    def _calculate_intelligence_metrics(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Calculate intelligence metrics based on analysis completeness"""
        metrics = analysis["intelligence_metrics"]
        
        # Count detected features
        total_features = 0
        confidence_scores = []
        
        # Demographic features
        if analysis["demographic"]["age_category"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["age_confidence"])
        if analysis["demographic"]["gender"]:
            total_features += 1
            confidence_scores.append(analysis["demographic"]["gender_confidence"])
        
        # Facial features
        for feature in ["eyes", "eyebrows", "nose", "mouth", "facial_hair", "skin", "structure"]:
            if analysis["facial_ultra"].get(feature):
                total_features += len(analysis["facial_ultra"][feature])
        
        # Emotional features
        if analysis["emotional_state"]["primary_emotion"]:
            total_features += 1
            confidence_scores.append(analysis["emotional_state"]["emotion_confidence"])
        
        # Pose features
        if analysis["pose_composition"].get("pose_confidence", 0) > 0:
            total_features += 1
            confidence_scores.append(analysis["pose_composition"]["pose_confidence"])
        
        # Environmental features
        if analysis["environmental"]["setting_type"]:
            total_features += 1
        total_features += len(analysis["environmental"].get("lighting_analysis", []))
        
        # Technical features
        if analysis["technical_analysis"]["shot_type"]:
            total_features += 1
        
        # Calculate scores
        metrics["total_features_detected"] = total_features
        metrics["analysis_depth_score"] = min(100, total_features * 5)
        
        # Cultural awareness (if religious/cultural indicators found)
        if analysis["demographic"].get("cultural_religious"):
            metrics["cultural_awareness_score"] = 80
        else:
            metrics["cultural_awareness_score"] = 40
        
        # Technical optimization score
        tech_features = sum([
            1 if analysis["technical_analysis"]["shot_type"] else 0,
            len(analysis["environmental"].get("lighting_analysis", [])),
            len(analysis["pose_composition"].get("posture", []))
        ])
        metrics["technical_optimization_score"] = min(100, tech_features * 25)
        
        # Average confidence
        if confidence_scores:
            metrics["model_confidence_average"] = sum(confidence_scores) / len(confidence_scores)
        else:
            metrics["model_confidence_average"] = 0.5
        
        return analysis
    
    def build_ultra_supreme_prompt(self, ultra_analysis: Dict[str, Any], clip_results: List[str]) -> str:
        """Assemble a comma-separated prompt from CLIP text and analysis results.

        The pieces are added in this order, each only when available: the
        first CLIP result, an age descriptor (only when the age confidence
        is above 0.7), an emotion descriptor followed by " expression", the
        shot type, and "with <first lighting tag>". Whitespace runs are then
        collapsed to a single space and repeated commas to a single comma.
        """
        pieces = [clip_results[0]] if clip_results else []

        # Age descriptor, only for confident detections
        demographic = ultra_analysis["demographic"]
        age_category = demographic["age_category"]
        if age_category and demographic["age_confidence"] > 0.7:
            by_age = QUALITY_DESCRIPTORS_ULTRA["based_on_age"].get(age_category, [])
            if by_age:
                pieces.append(by_age[0])

        # Emotion descriptor
        emotion = ultra_analysis["emotional_state"]["primary_emotion"]
        if emotion:
            by_emotion = QUALITY_DESCRIPTORS_ULTRA["based_on_emotion"].get(emotion, [])
            if by_emotion:
                pieces.append(f"{by_emotion[0]} expression")

        # Shot type
        shot_type = ultra_analysis["technical_analysis"]["shot_type"]
        if shot_type:
            pieces.append(shot_type)

        # Lighting
        lighting_tags = ultra_analysis["environmental"].get("lighting_analysis", [])
        if lighting_tags:
            pieces.append(f"with {lighting_tags[0]}")

        # Join, then tidy whitespace and doubled commas
        prompt = ", ".join(pieces)
        for pattern, replacement in ((r'\s+', ' '), (r',\s*,+', ',')):
            prompt = re.sub(pattern, replacement, prompt)

        return prompt
    
    def calculate_ultra_supreme_score(self, prompt: str, ultra_analysis: Dict[str, Any]) -> Tuple[int, Dict[str, int]]:
        """Score a prompt and its analysis, returning ``(total, breakdown)``.

        The breakdown has four components: "prompt_quality" (one point per
        ten prompt characters), "analysis_depth" (a quarter of the depth
        score), "model_confidence" (average confidence times 25, truncated)
        and "feature_richness" (two points per detected feature). All but
        "model_confidence" are capped at 25. The total is their sum.
        """
        prompt_quality = min(25, len(prompt) // 10)

        metrics = ultra_analysis["intelligence_metrics"]
        breakdown = {
            "prompt_quality": prompt_quality,
            "analysis_depth": min(25, metrics["analysis_depth_score"] // 4),
            "model_confidence": int(metrics["model_confidence_average"] * 25),
            "feature_richness": min(25, metrics["total_features_detected"] * 2),
        }

        return sum(breakdown.values()), breakdown