""" | |
Ultra Supreme Optimizer - Main optimization engine for image analysis | |
VERSIΓN FLORENCE-2 - Usa Florence-2 en lugar de CLIP Interrogator | |
""" | |
# IMPORTANT: spaces must be imported BEFORE torch or any CUDA-using library
import spaces

import gc
import logging
import re
from datetime import datetime
from typing import Tuple, Dict, Any, Optional

import torch
import numpy as np
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

from analyzer import UltraSupremeAnalyzer

logger = logging.getLogger(__name__)

class UltraSupremeOptimizer:
    """Main optimizer class for ultra supreme image analysis"""

    def __init__(self):
        self.processor = None
        self.model = None
        self.analyzer = UltraSupremeAnalyzer()
        self.usage_count = 0
        self.device = self._get_device()
        self.is_initialized = False

    @staticmethod
    def _get_device() -> str:
        """Determine the best available device for computation"""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    def initialize_model(self) -> bool:
        """Initialize Florence-2 model"""
        if self.is_initialized:
            return True
        try:
            logger.info("Loading Florence-2 model...")
            # Load Florence-2 base model (you can also use 'microsoft/Florence-2-large' for better quality)
            model_id = "microsoft/Florence-2-base"
            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                trust_remote_code=True,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
            )
            # Keep model on CPU initially
            self.model = self.model.to("cpu")
            self.model.eval()
            self.is_initialized = True
            # Clean up memory after initialization
            gc.collect()
            logger.info("Florence-2 model initialized successfully")
            return True
        except Exception as e:
            logger.error(f"Model initialization error: {e}")
            return False

    def optimize_image(self, image: Any) -> Optional[Image.Image]:
        """Optimize image for processing"""
        if image is None:
            return None
        try:
            # Convert to PIL Image if necessary
            if isinstance(image, np.ndarray):
                image = Image.fromarray(image)
            elif not isinstance(image, Image.Image):
                image = Image.open(image)
            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')
            # Florence-2 handles various sizes well, but keep dimensions reasonable
            max_size = 1024
            if image.size[0] > max_size or image.size[1] > max_size:
                image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
            return image
        except Exception as e:
            logger.error(f"Image optimization error: {e}")
            return None

    def apply_flux_rules(self, base_prompt: str) -> str:
        """Apply Flux rules to a base prompt"""
        # Strip unwanted elements from the prompt
        cleanup_patterns = [
            r',\s*trending on artstation',
            r',\s*trending on [^,]+',
            r',\s*\d+k\s*',
            r',\s*\d+k resolution',
            r',\s*artstation',
            r',\s*concept art',
            r',\s*digital art',
            r',\s*by greg rutkowski',
        ]
        cleaned_prompt = base_prompt
        for pattern in cleanup_patterns:
            cleaned_prompt = re.sub(pattern, '', cleaned_prompt, flags=re.IGNORECASE)

        # Detect the image type to add an appropriate camera configuration
        if any(word in base_prompt.lower() for word in ['portrait', 'person', 'man', 'woman', 'face']):
            camera_config = ", Shot on Hasselblad X2D 100C, 90mm f/2.5 lens at f/2.8, professional portrait photography"
        elif any(word in base_prompt.lower() for word in ['landscape', 'mountain', 'nature', 'outdoor']):
            camera_config = ", Shot on Phase One XT, 40mm f/4 lens at f/8, epic landscape photography"
        elif any(word in base_prompt.lower() for word in ['street', 'urban', 'city']):
            camera_config = ", Shot on Leica M11, 35mm f/1.4 lens at f/2.8, documentary street photography"
        else:
            camera_config = ", Shot on Phase One XF IQ4, 80mm f/2.8 lens at f/4, professional photography"

        # Add lighting enhancements if they are not already present
        if 'lighting' not in cleaned_prompt.lower():
            if 'dramatic' in cleaned_prompt.lower():
                cleaned_prompt += ", dramatic cinematic lighting"
            elif 'portrait' in cleaned_prompt.lower():
                cleaned_prompt += ", professional studio lighting with subtle rim light"
            else:
                cleaned_prompt += ", masterful natural lighting"

        # Build the final prompt
        final_prompt = cleaned_prompt + camera_config
        # Ensure it starts with a capital letter
        final_prompt = final_prompt[0].upper() + final_prompt[1:] if final_prompt else final_prompt
        # Collapse duplicate whitespace and commas
        final_prompt = re.sub(r'\s+', ' ', final_prompt)
        final_prompt = re.sub(r',\s*,+', ',', final_prompt)
        return final_prompt

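    # Illustrative example of apply_flux_rules (derived from the rules above;
    # the input caption is hypothetical, not a captured model output):
    #   apply_flux_rules("portrait of a man, trending on artstation, 8k")
    # strips the ArtStation/resolution tags, detects a portrait subject, and returns:
    #   "Portrait of a man, professional studio lighting with subtle rim light,
    #    Shot on Hasselblad X2D 100C, 90mm f/2.5 lens at f/2.8, professional portrait photography"
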
    @spaces.GPU  # ZeroGPU: allocate a GPU for the duration of this call (assumes a ZeroGPU Space)
    def run_florence_inference(self, image: Image.Image) -> Tuple[str, str, str]:
        """Run Florence-2 inference on GPU"""
        try:
            # Move model to GPU
            self.model = self.model.to("cuda")
            logger.info("Florence-2 model moved to GPU")

            # Task prompts for different types of analysis
            tasks = {
                "detailed_caption": "<DETAILED_CAPTION>",
                "more_detailed_caption": "<MORE_DETAILED_CAPTION>",
                "caption": "<CAPTION>",
                "dense_region_caption": "<DENSE_REGION_CAPTION>"
            }
            results = {}

            # Run different captioning tasks
            for task_name, task_prompt in tasks.items():
                try:
                    inputs = self.processor(text=task_prompt, images=image, return_tensors="pt")
                    inputs = {k: v.to("cuda") for k, v in inputs.items()}
                    with torch.cuda.amp.autocast(dtype=torch.float16):
                        generated_ids = self.model.generate(
                            input_ids=inputs["input_ids"],
                            pixel_values=inputs["pixel_values"],
                            max_new_tokens=1024,
                            num_beams=3,
                            do_sample=False
                        )
                    generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
                    parsed = self.processor.post_process_generation(
                        generated_text, task=task_prompt, image_size=(image.width, image.height)
                    )
                    # Extract the caption from the parsed result
                    if task_prompt in parsed:
                        results[task_name] = parsed[task_prompt]
                    else:
                        # Sometimes the result is directly in the parsed output
                        results[task_name] = str(parsed) if parsed else ""
                except Exception as e:
                    logger.warning(f"Error in {task_name}: {e}")
                    results[task_name] = ""

            # Extract results
            detailed_caption = results.get("detailed_caption", "")
            more_detailed = results.get("more_detailed_caption", "")
            caption = results.get("caption", "")

            # Pick the most comprehensive description available
            if more_detailed:
                full_prompt = more_detailed
            elif detailed_caption:
                full_prompt = detailed_caption
            else:
                full_prompt = caption

            # Use the different caption levels as our remaining two outputs
            clip_fast = caption if caption else "A photograph"
            clip_classic = detailed_caption if detailed_caption else full_prompt

            logger.info("Florence-2 captions generated successfully")
            return full_prompt, clip_fast, clip_classic
        except Exception as e:
            logger.error(f"Florence-2 inference error: {e}")
            raise
        finally:
            # Always move model back to CPU after inference to free GPU memory
            self.model = self.model.to("cpu")
            torch.cuda.empty_cache()

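    # Shape of the tuple returned above (values are hypothetical examples,
    # not captured model output):
    #   full_prompt  -> richest caption available, e.g. "A man in a dark coat stands on a rainy city street at night..."
    #   clip_fast    -> terse <CAPTION> result, e.g. "A man standing on a street"
    #   clip_classic -> <DETAILED_CAPTION> result, e.g. "A man in a coat on a wet street at night"
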
    def generate_ultra_supreme_prompt(self, image: Any) -> Tuple[str, str, int, Dict[str, int]]:
        """
        Generate ultra supreme prompt from image using Florence-2

        Returns:
            Tuple of (prompt, analysis_info, score, breakdown)
        """
        try:
            # Initialize the model if it has not been initialized yet
            if not self.is_initialized:
                if not self.initialize_model():
                    return "❌ Model initialization failed.", "Please refresh and try again.", 0, {}

            # Validate input
            if image is None:
                return "❌ Please upload an image.", "No image provided.", 0, {}

            self.usage_count += 1

            # Optimize image
            image = self.optimize_image(image)
            if image is None:
                return "❌ Image processing failed.", "Invalid image format.", 0, {}

            start_time = datetime.now()
            logger.info("ULTRA SUPREME ANALYSIS - Starting with Florence-2")

            # Run Florence-2 inference
            try:
                full_prompt, caption_fast, caption_detailed = self.run_florence_inference(image)
            except Exception as e:
                logger.error(f"Florence-2 failed: {e}")
                # Basic fallback
                full_prompt = "A photograph"
                caption_fast = "image"
                caption_detailed = "detailed image"

            logger.info(f"Florence-2 caption: {full_prompt[:100]}...")

            # Run the ultra supreme analysis with multiple models
            logger.info("Running multi-model ultra supreme analysis...")
            ultra_analysis = self.analyzer.ultra_supreme_analysis(
                image, caption_fast, caption_detailed, full_prompt
            )

            # Build an enhanced prompt based on the full analysis
            enhanced_prompt_parts = []

            # Florence base prompt
            enhanced_prompt_parts.append(full_prompt)

            # Add demographic information if available
            if ultra_analysis["demographic"]["gender"] and ultra_analysis["demographic"]["gender_confidence"] > 0.7:
                gender = ultra_analysis["demographic"]["gender"]
                age_cat = ultra_analysis["demographic"]["age_category"]
                if age_cat:
                    enhanced_prompt_parts.append(f"{age_cat} {gender}")

            # Add the primary emotional state
            if ultra_analysis["emotional_state"]["primary_emotion"] and ultra_analysis["emotional_state"]["emotion_confidence"] > 0.6:
                emotion = ultra_analysis["emotional_state"]["primary_emotion"]
                enhanced_prompt_parts.append(f"{emotion} expression")

            # Add pose information if available
            if ultra_analysis["pose_composition"]["posture"]:
                enhanced_prompt_parts.append(ultra_analysis["pose_composition"]["posture"][0])

            # Combine and apply Flux rules
            combined_prompt = ", ".join(enhanced_prompt_parts)
            optimized_prompt = self.apply_flux_rules(combined_prompt)

            # If the analyzer enriched the prompt, use its version
            analyzer_prompt = self.analyzer.build_ultra_supreme_prompt(ultra_analysis, [full_prompt])
            if len(analyzer_prompt) > len(optimized_prompt):
                optimized_prompt = self.apply_flux_rules(analyzer_prompt)

            # Calculate the score using the analyzer
            score, breakdown = self.analyzer.calculate_ultra_supreme_score(optimized_prompt, ultra_analysis)

            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()

            # Memory cleanup
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Generate the enhanced analysis report with multi-model data
            analysis_info = self._generate_ultra_analysis_report(
                ultra_analysis, score, breakdown, duration, "Florence-2"
            )

            return optimized_prompt, analysis_info, score, breakdown
        except Exception as e:
            logger.error(f"Ultra supreme generation error: {e}", exc_info=True)
            return f"❌ Error: {str(e)}", "Please try with a different image.", 0, {}

    def _generate_ultra_analysis_report(self, analysis: Dict[str, Any],
                                        score: int, breakdown: Dict[str, int],
                                        duration: float, caption_model: str = "Florence-2") -> str:
        """Generate ultra detailed analysis report with multi-model results"""
        device_used = "cuda" if torch.cuda.is_available() else "cpu"
        gpu_status = "⚡ ZeroGPU" if device_used == "cuda" else "💻 CPU"

        # Demographic info
        demo_info = ""
        if analysis["demographic"]["age_category"]:
            age = analysis["demographic"]["age_category"].replace("_", " ").title()
            gender = analysis["demographic"]["gender"] or "person"
            confidence = analysis["demographic"]["age_confidence"]
            demo_info = f"**Detected:** {age} {gender} (confidence: {confidence:.0%})"

        # Emotion info
        emotion_info = ""
        if analysis["emotional_state"]["primary_emotion"]:
            emotion = analysis["emotional_state"]["primary_emotion"]
            confidence = analysis["emotional_state"]["emotion_confidence"]
            emotion_info = f"**Primary Emotion:** {emotion} ({confidence:.0%})"
            # Add emotion distribution if available
            if analysis["emotional_state"]["emotion_distribution"]:
                top_emotions = sorted(
                    analysis["emotional_state"]["emotion_distribution"].items(),
                    key=lambda x: x[1], reverse=True
                )[:3]
                emotion_details = ", ".join([f"{e[0]}: {e[1]:.0%}" for e in top_emotions])
                emotion_info += f"\n**Emotion Distribution:** {emotion_details}"

        # Face analysis info
        face_info = f"**Faces Detected:** {analysis['facial_ultra']['face_count']}"
        if analysis['facial_ultra']['face_count'] > 0:
            features = []
            for feature_type in ['eyes', 'mouth', 'facial_hair', 'skin']:
                if analysis['facial_ultra'].get(feature_type):
                    features.extend(analysis['facial_ultra'][feature_type])
            if features:
                face_info += f"\n**Facial Features:** {', '.join(features[:5])}"

        # Pose info
        pose_info = ""
        if analysis["pose_composition"].get("pose_confidence", 0) > 0:
            confidence = analysis["pose_composition"]["pose_confidence"]
            pose_info = f"**Pose Analysis:** Body detected ({confidence:.0%} confidence)"
            if analysis["pose_composition"]["posture"]:
                pose_info += f"\n**Posture:** {', '.join(analysis['pose_composition']['posture'])}"

        # Environment info
        env_info = ""
        if analysis["environmental"]["setting_type"]:
            env_info = f"**Setting:** {analysis['environmental']['setting_type'].replace('_', ' ').title()}"
            if analysis["environmental"]["lighting_analysis"]:
                env_info += f"\n**Lighting:** {', '.join(analysis['environmental']['lighting_analysis'])}"

        # Intelligence metrics
        metrics = analysis["intelligence_metrics"]

        # Caption info (truncate long captions)
        clip_best = analysis.get("clip_best", "")
        caption_info = clip_best[:150] + "..." if len(clip_best) > 150 else clip_best

        analysis_info = f"""**🚀 ULTRA SUPREME MULTI-MODEL ANALYSIS COMPLETE**

**Processing:** {gpu_status} • {duration:.1f}s • {caption_model} + Multi-Model Pipeline
**Ultra Score:** {score}/100 • Models: {caption_model} + DeepFace + MediaPipe + Transformers

**📊 BREAKDOWN:**
• Prompt Quality: {breakdown.get('prompt_quality', 0)}/25
• Analysis Depth: {breakdown.get('analysis_depth', 0)}/25
• Model Confidence: {breakdown.get('model_confidence', 0)}/25
• Feature Richness: {breakdown.get('feature_richness', 0)}/25

**🔍 VISION-LANGUAGE ANALYSIS:**
**{caption_model} Caption:** {caption_info}

**🧠 DEEP ANALYSIS RESULTS:**

**👤 DEMOGRAPHICS & IDENTITY:**
{demo_info or "No face detected for demographic analysis"}

**😊 EMOTIONAL ANALYSIS:**
{emotion_info or "No emotional data available"}

**👁️ FACIAL ANALYSIS:**
{face_info}

**🚶 POSE & BODY LANGUAGE:**
{pose_info or "No pose data available"}

**🏞️ ENVIRONMENT & SCENE:**
{env_info or "No environmental data detected"}

**📈 INTELLIGENCE METRICS:**
• **Total Features Detected:** {metrics['total_features_detected']}
• **Analysis Depth Score:** {metrics['analysis_depth_score']}/100
• **Model Confidence Average:** {metrics['model_confidence_average']:.0%}
• **Technical Optimization:** {metrics['technical_optimization_score']}/100

**✨ MULTI-MODEL ADVANTAGES:**
✅ {caption_model}: State-of-the-art vision-language understanding
✅ DeepFace: Accurate age, gender, emotion detection
✅ MediaPipe: Body pose and gesture analysis
✅ Transformers: Advanced emotion classification
✅ OpenCV: Robust face detection

**🔬 Powered by Pariente AI Research • Ultra Supreme Intelligence Engine**"""

        return analysis_info
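

# Minimal usage sketch (an illustration, not the Space's Gradio app code).
# Assumptions: "example.jpg" is a hypothetical local image path and the
# analyzer module is importable; on a ZeroGPU Space the GPU is attached
# automatically inside run_florence_inference via @spaces.GPU.
if __name__ == "__main__":
    optimizer = UltraSupremeOptimizer()
    prompt, report, score, breakdown = optimizer.generate_ultra_supreme_prompt("example.jpg")
    print(f"Ultra Score: {score}/100")
    print(prompt)
    print(report)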