Spaces:
Running
on
Zero
Running
on
Zero
Update optimizer.py
Browse files- optimizer.py +132 -61
optimizer.py
CHANGED
@@ -30,9 +30,8 @@ class UltraSupremeOptimizer:
|
|
30 |
self.usage_count = 0
|
31 |
self.device = self._get_device()
|
32 |
self.is_initialized = False
|
33 |
-
#
|
34 |
-
|
35 |
-
|
36 |
@staticmethod
|
37 |
def _get_device() -> str:
|
38 |
"""Determine the best available device for computation"""
|
@@ -49,13 +48,13 @@ class UltraSupremeOptimizer:
|
|
49 |
return True
|
50 |
|
51 |
try:
|
52 |
-
# Configuración
|
53 |
config = Config(
|
54 |
clip_model_name="ViT-L-14/openai",
|
55 |
download_cache=True,
|
56 |
chunk_size=2048,
|
57 |
quiet=True,
|
58 |
-
device="cpu" #
|
59 |
)
|
60 |
|
61 |
self.interrogator = Interrogator(config)
|
@@ -63,7 +62,8 @@ class UltraSupremeOptimizer:
|
|
63 |
|
64 |
# Clean up memory after initialization
|
65 |
gc.collect()
|
66 |
-
|
|
|
67 |
return True
|
68 |
|
69 |
except Exception as e:
|
@@ -86,8 +86,8 @@ class UltraSupremeOptimizer:
|
|
86 |
if image.mode != 'RGB':
|
87 |
image = image.convert('RGB')
|
88 |
|
89 |
-
# Resize if too large
|
90 |
-
max_size =
|
91 |
if image.size[0] > max_size or image.size[1] > max_size:
|
92 |
image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
|
93 |
|
@@ -109,7 +109,7 @@ class UltraSupremeOptimizer:
|
|
109 |
r',\s*artstation',
|
110 |
r',\s*concept art',
|
111 |
r',\s*digital art',
|
112 |
-
r',\s*by greg rutkowski',
|
113 |
]
|
114 |
|
115 |
cleaned_prompt = base_prompt
|
@@ -148,25 +148,101 @@ class UltraSupremeOptimizer:
|
|
148 |
|
149 |
return final_prompt
|
150 |
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
def run_clip_inference(self, image: Image.Image) -> Tuple[str, str, str]:
|
153 |
"""Solo la inferencia CLIP usa GPU"""
|
154 |
try:
|
155 |
-
#
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
-
#
|
161 |
-
|
162 |
-
self.interrogator.clip_model = self.interrogator.clip_model.to("cuda")
|
163 |
-
logger.info("CLIP model moved to GPU with native precision")
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
168 |
|
169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
full_prompt = self.interrogator.interrogate(image)
|
171 |
clip_fast = self.interrogator.interrogate_fast(image)
|
172 |
clip_classic = self.interrogator.interrogate_classic(image)
|
@@ -174,26 +250,8 @@ class UltraSupremeOptimizer:
|
|
174 |
return full_prompt, clip_fast, clip_classic
|
175 |
|
176 |
except Exception as e:
|
177 |
-
logger.error(f"
|
178 |
-
|
179 |
-
if self.device == "cuda":
|
180 |
-
logger.info("Falling back to CPU inference")
|
181 |
-
self.interrogator.config.device = "cpu"
|
182 |
-
|
183 |
-
if hasattr(self.interrogator, 'clip_model') and self.interrogator.clip_model is not None:
|
184 |
-
self.interrogator.clip_model = self.interrogator.clip_model.to("cpu")
|
185 |
-
|
186 |
-
if hasattr(self.interrogator, 'blip_model') and self.interrogator.blip_model is not None:
|
187 |
-
self.interrogator.blip_model = self.interrogator.blip_model.to("cpu")
|
188 |
-
|
189 |
-
# Reintentar en CPU
|
190 |
-
full_prompt = self.interrogator.interrogate(image)
|
191 |
-
clip_fast = self.interrogator.interrogate_fast(image)
|
192 |
-
clip_classic = self.interrogator.interrogate_classic(image)
|
193 |
-
|
194 |
-
return full_prompt, clip_fast, clip_classic
|
195 |
-
else:
|
196 |
-
raise e
|
197 |
|
198 |
def generate_ultra_supreme_prompt(self, image: Any) -> Tuple[str, str, int, Dict[str, int]]:
|
199 |
"""
|
@@ -203,9 +261,10 @@ class UltraSupremeOptimizer:
|
|
203 |
Tuple of (prompt, analysis_info, score, breakdown)
|
204 |
"""
|
205 |
try:
|
206 |
-
#
|
207 |
if not self.is_initialized:
|
208 |
-
|
|
|
209 |
|
210 |
# Validate input
|
211 |
if image is None:
|
@@ -222,17 +281,24 @@ class UltraSupremeOptimizer:
|
|
222 |
|
223 |
logger.info("ULTRA SUPREME ANALYSIS - Starting pipeline")
|
224 |
|
225 |
-
# Ejecutar inferencia CLIP
|
226 |
full_prompt, clip_fast, clip_classic = self.run_clip_inference(image)
|
227 |
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
231 |
|
232 |
-
|
|
|
|
|
|
|
|
|
233 |
optimized_prompt = self.apply_flux_rules(full_prompt)
|
234 |
|
235 |
-
#
|
236 |
analysis_summary = {
|
237 |
"base_prompt": full_prompt,
|
238 |
"clip_fast": clip_fast,
|
@@ -242,7 +308,7 @@ class UltraSupremeOptimizer:
|
|
242 |
"detected_subject": self._detect_subject(full_prompt)
|
243 |
}
|
244 |
|
245 |
-
#
|
246 |
score = self._calculate_score(optimized_prompt, full_prompt)
|
247 |
breakdown = {
|
248 |
"base_quality": min(len(full_prompt) // 10, 25),
|
@@ -268,7 +334,7 @@ class UltraSupremeOptimizer:
|
|
268 |
return optimized_prompt, analysis_info, score, breakdown
|
269 |
|
270 |
except Exception as e:
|
271 |
-
logger.error(f"Ultra supreme generation error: {e}")
|
272 |
return f"❌ Error: {str(e)}", "Please try with a different image.", 0, {}
|
273 |
|
274 |
def _detect_style(self, prompt: str) -> str:
|
@@ -281,19 +347,23 @@ class UltraSupremeOptimizer:
|
|
281 |
"dramatic": ["dramatic", "cinematic", "moody"]
|
282 |
}
|
283 |
|
|
|
284 |
for style_name, keywords in styles.items():
|
285 |
-
if any(keyword in
|
286 |
return style_name
|
287 |
|
288 |
return "general"
|
289 |
|
290 |
def _detect_subject(self, prompt: str) -> str:
|
291 |
"""Detecta el sujeto principal del prompt"""
|
|
|
|
|
|
|
292 |
# Tomar las primeras palabras significativas
|
293 |
words = prompt.split(',')[0].split()
|
294 |
if len(words) > 3:
|
295 |
return ' '.join(words[:4])
|
296 |
-
return prompt.split(',')[0]
|
297 |
|
298 |
def _calculate_score(self, optimized_prompt: str, base_prompt: str) -> int:
|
299 |
"""Calcula el score basado en la calidad del prompt"""
|
@@ -321,8 +391,9 @@ class UltraSupremeOptimizer:
|
|
321 |
duration: float) -> str:
|
322 |
"""Generate detailed analysis report"""
|
323 |
|
324 |
-
|
325 |
-
|
|
|
326 |
|
327 |
# Extraer información clave
|
328 |
detected_style = analysis.get("detected_style", "general").title()
|
@@ -337,8 +408,8 @@ class UltraSupremeOptimizer:
|
|
337 |
**🧠 INTELLIGENT DETECTION:**
|
338 |
- **Detected Style:** {detected_style}
|
339 |
- **Main Subject:** {detected_subject}
|
340 |
-
- **Precision:** Using
|
341 |
-
- **Quality:** Maximum resolution processing (
|
342 |
|
343 |
**📊 CLIP INTERROGATOR ANALYSIS:**
|
344 |
- **Base Prompt:** {base_prompt_preview}
|
@@ -346,9 +417,9 @@ class UltraSupremeOptimizer:
|
|
346 |
- **Classic Analysis:** {analysis.get('clip_classic', '')[:80]}...
|
347 |
|
348 |
**⚡ OPTIMIZATION APPLIED:**
|
349 |
-
- ✅
|
350 |
-
- ✅ GPU
|
351 |
-
- ✅
|
352 |
- ✅ Added professional camera specifications
|
353 |
- ✅ Enhanced lighting descriptions
|
354 |
- ✅ Applied Flux-specific optimizations
|
|
|
30 |
self.usage_count = 0
|
31 |
self.device = self._get_device()
|
32 |
self.is_initialized = False
|
33 |
+
# NO inicializar modelo aquí - hacerlo lazy
|
34 |
+
|
|
|
35 |
@staticmethod
|
36 |
def _get_device() -> str:
|
37 |
"""Determine the best available device for computation"""
|
|
|
48 |
return True
|
49 |
|
50 |
try:
|
51 |
+
# Configuración para CPU inicialmente
|
52 |
config = Config(
|
53 |
clip_model_name="ViT-L-14/openai",
|
54 |
download_cache=True,
|
55 |
chunk_size=2048,
|
56 |
quiet=True,
|
57 |
+
device="cpu" # Siempre inicializar en CPU
|
58 |
)
|
59 |
|
60 |
self.interrogator = Interrogator(config)
|
|
|
62 |
|
63 |
# Clean up memory after initialization
|
64 |
gc.collect()
|
65 |
+
|
66 |
+
logger.info("Model initialized successfully on CPU")
|
67 |
return True
|
68 |
|
69 |
except Exception as e:
|
|
|
86 |
if image.mode != 'RGB':
|
87 |
image = image.convert('RGB')
|
88 |
|
89 |
+
# Resize if too large
|
90 |
+
max_size = 768 # Reducir tamaño para evitar problemas de memoria
|
91 |
if image.size[0] > max_size or image.size[1] > max_size:
|
92 |
image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
|
93 |
|
|
|
109 |
r',\s*artstation',
|
110 |
r',\s*concept art',
|
111 |
r',\s*digital art',
|
112 |
+
r',\s*by greg rutkowski',
|
113 |
]
|
114 |
|
115 |
cleaned_prompt = base_prompt
|
|
|
148 |
|
149 |
return final_prompt
|
150 |
|
151 |
+
def _prepare_models_for_gpu(self):
    """Move the interrogator's models to CUDA in half precision.

    Each of ``caption_model``, ``clip_model`` and ``blip_model`` that is
    present on ``self.interrogator`` and is not ``None`` is converted with
    ``.half()`` and moved to ``"cuda"``. Afterwards
    ``self.interrogator.config.device`` is set to ``"cuda"``.

    Raises:
        Exception: any error raised while converting or moving a model is
            logged and re-raised unchanged.
    """
    try:
        for attr in ("caption_model", "clip_model", "blip_model"):
            model = getattr(self.interrogator, attr, None)
            # The attribute may exist but hold None; skip it instead of
            # failing on ``None.half()``.
            if model is not None:
                setattr(self.interrogator, attr, model.half().to("cuda"))

        # NOTE(review): only config.device is updated here. Confirm the
        # interrogator does not cache its own device/dtype at construction.
        self.interrogator.config.device = "cuda"
        logger.info("Models prepared for GPU with FP16")

    except Exception as e:
        logger.error(f"Error preparing models for GPU: {e}")
        raise
|
169 |
+
|
170 |
+
def _prepare_models_for_cpu(self):
    """Move the interrogator's models to the CPU in float32.

    Each of ``caption_model``, ``clip_model`` and ``blip_model`` that is
    present on ``self.interrogator`` and is not ``None`` is converted with
    ``.float()`` and moved to ``"cpu"``. Afterwards
    ``self.interrogator.config.device`` is set to ``"cpu"``.

    Raises:
        Exception: any error raised while converting or moving a model is
            logged and re-raised unchanged.
    """
    try:
        for attr in ("caption_model", "clip_model", "blip_model"):
            model = getattr(self.interrogator, attr, None)
            # The attribute may exist but hold None; skip it instead of
            # failing on ``None.float()``.
            if model is not None:
                setattr(self.interrogator, attr, model.float().to("cpu"))

        # NOTE(review): only config.device is updated here. Confirm the
        # interrogator does not cache its own device/dtype at construction.
        self.interrogator.config.device = "cpu"
        logger.info("Models prepared for CPU with FP32")

    except Exception as e:
        logger.error(f"Error preparing models for CPU: {e}")
        raise
|
188 |
+
|
189 |
+
@spaces.GPU(duration=60)
def run_clip_inference(self, image: Image.Image) -> Tuple[str, str, str]:
    """Run the three interrogator passes on GPU, falling back to CPU.

    The models are first moved to CUDA in FP16, then ``interrogate``,
    ``interrogate_fast`` and ``interrogate_classic`` are called on the
    image under float16 autocast.

    The interrogator is called directly rather than through
    ``_safe_interrogate``: that helper turns every exception into an error
    string, which would keep the CPU fallback below from ever running when
    GPU inference itself fails.

    Args:
        image: PIL image to describe.

    Returns:
        Tuple ``(full_prompt, clip_fast, clip_classic)``. If anything in
        the GPU path raises, the result of ``_run_cpu_inference(image)``
        is returned instead.
    """
    try:
        self._prepare_models_for_gpu()

        # The interrogator methods take the PIL image itself, so no manual
        # tensor preprocessing is needed here.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            full_prompt = self.interrogator.interrogate(image)
            clip_fast = self.interrogator.interrogate_fast(image)
            clip_classic = self.interrogator.interrogate_classic(image)

        return full_prompt, clip_fast, clip_classic

    except Exception as e:
        logger.error(f"GPU inference error: {e}")
        # Retry on CPU as a fallback.
        return self._run_cpu_inference(image)
|
221 |
+
|
222 |
+
def _safe_interrogate(self, image: Image.Image, method: str) -> str:
|
223 |
+
"""Ejecuta interrogate de forma segura manejando precisión"""
|
224 |
+
try:
|
225 |
+
# Temporalmente parchear el método de procesamiento de imagen
|
226 |
+
original_method = getattr(self.interrogator, method)
|
227 |
+
|
228 |
+
# Ejecutar el método
|
229 |
+
result = original_method(image)
|
230 |
+
|
231 |
+
return result
|
232 |
+
|
233 |
+
except Exception as e:
|
234 |
+
logger.error(f"Error in {method}: {e}")
|
235 |
+
return f"Error processing with {method}"
|
236 |
+
|
237 |
+
def _run_cpu_inference(self, image: Image.Image) -> Tuple[str, str, str]:
    """Run the three interrogator passes on CPU as a fallback.

    Moves the models to CPU/FP32 via ``_prepare_models_for_cpu`` and calls
    ``interrogate``, ``interrogate_fast`` and ``interrogate_classic``
    without autocast.

    Args:
        image: PIL image to describe.

    Returns:
        Tuple ``(full_prompt, clip_fast, clip_classic)``. On any failure
        the error is logged with its traceback and the placeholder tuple
        ``("Error: Failed to process image", "Error", "Error")`` is
        returned; callers in this file detect failure by looking for
        ``"Error"`` in the first element.
    """
    try:
        logger.info("Running CPU inference as fallback")

        self._prepare_models_for_cpu()

        full_prompt = self.interrogator.interrogate(image)
        clip_fast = self.interrogator.interrogate_fast(image)
        clip_classic = self.interrogator.interrogate_classic(image)

        return full_prompt, clip_fast, clip_classic

    except Exception as e:
        logger.error(f"CPU inference also failed: {e}", exc_info=True)
        return "Error: Failed to process image", "Error", "Error"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
|
256 |
def generate_ultra_supreme_prompt(self, image: Any) -> Tuple[str, str, int, Dict[str, int]]:
|
257 |
"""
|
|
|
261 |
Tuple of (prompt, analysis_info, score, breakdown)
|
262 |
"""
|
263 |
try:
|
264 |
+
# Inicializar modelo si no está inicializado
|
265 |
if not self.is_initialized:
|
266 |
+
if not self.initialize_model():
|
267 |
+
return "❌ Model initialization failed.", "Please refresh and try again.", 0, {}
|
268 |
|
269 |
# Validate input
|
270 |
if image is None:
|
|
|
281 |
|
282 |
logger.info("ULTRA SUPREME ANALYSIS - Starting pipeline")
|
283 |
|
284 |
+
# Ejecutar inferencia CLIP
|
285 |
full_prompt, clip_fast, clip_classic = self.run_clip_inference(image)
|
286 |
|
287 |
+
# Verificar si hubo errores
|
288 |
+
if "Error" in full_prompt:
|
289 |
+
logger.warning("Using fallback prompt due to inference error")
|
290 |
+
full_prompt = "A photograph"
|
291 |
+
clip_fast = "image"
|
292 |
+
clip_classic = "picture"
|
293 |
|
294 |
+
logger.info(f"Prompt completo: {full_prompt[:100]}...")
|
295 |
+
logger.info(f"Fast: {clip_fast[:50]}...")
|
296 |
+
logger.info(f"Classic: {clip_classic[:50]}...")
|
297 |
+
|
298 |
+
# Aplicar reglas de Flux al prompt completo
|
299 |
optimized_prompt = self.apply_flux_rules(full_prompt)
|
300 |
|
301 |
+
# Crear análisis para el reporte
|
302 |
analysis_summary = {
|
303 |
"base_prompt": full_prompt,
|
304 |
"clip_fast": clip_fast,
|
|
|
308 |
"detected_subject": self._detect_subject(full_prompt)
|
309 |
}
|
310 |
|
311 |
+
# Calcular score
|
312 |
score = self._calculate_score(optimized_prompt, full_prompt)
|
313 |
breakdown = {
|
314 |
"base_quality": min(len(full_prompt) // 10, 25),
|
|
|
334 |
return optimized_prompt, analysis_info, score, breakdown
|
335 |
|
336 |
except Exception as e:
|
337 |
+
logger.error(f"Ultra supreme generation error: {e}", exc_info=True)
|
338 |
return f"❌ Error: {str(e)}", "Please try with a different image.", 0, {}
|
339 |
|
340 |
def _detect_style(self, prompt: str) -> str:
|
|
|
347 |
"dramatic": ["dramatic", "cinematic", "moody"]
|
348 |
}
|
349 |
|
350 |
+
prompt_lower = prompt.lower()
|
351 |
for style_name, keywords in styles.items():
|
352 |
+
if any(keyword in prompt_lower for keyword in keywords):
|
353 |
return style_name
|
354 |
|
355 |
return "general"
|
356 |
|
357 |
def _detect_subject(self, prompt: str) -> str:
|
358 |
"""Detecta el sujeto principal del prompt"""
|
359 |
+
if not prompt:
|
360 |
+
return "Unknown"
|
361 |
+
|
362 |
# Tomar las primeras palabras significativas
|
363 |
words = prompt.split(',')[0].split()
|
364 |
if len(words) > 3:
|
365 |
return ' '.join(words[:4])
|
366 |
+
return prompt.split(',')[0] if prompt else "Unknown"
|
367 |
|
368 |
def _calculate_score(self, optimized_prompt: str, base_prompt: str) -> int:
|
369 |
"""Calcula el score basado en la calidad del prompt"""
|
|
|
391 |
duration: float) -> str:
|
392 |
"""Generate detailed analysis report"""
|
393 |
|
394 |
+
device_used = "cuda" if torch.cuda.is_available() else "cpu"
|
395 |
+
gpu_status = "⚡ ZeroGPU" if device_used == "cuda" else "💻 CPU"
|
396 |
+
precision_info = "Half Precision (FP16)" if device_used == "cuda" else "Full Precision (FP32)"
|
397 |
|
398 |
# Extraer información clave
|
399 |
detected_style = analysis.get("detected_style", "general").title()
|
|
|
408 |
**🧠 INTELLIGENT DETECTION:**
|
409 |
- **Detected Style:** {detected_style}
|
410 |
- **Main Subject:** {detected_subject}
|
411 |
+
- **Precision:** Using {precision_info} for optimal performance
|
412 |
+
- **Quality:** Maximum resolution processing (768px)
|
413 |
|
414 |
**📊 CLIP INTERROGATOR ANALYSIS:**
|
415 |
- **Base Prompt:** {base_prompt_preview}
|
|
|
417 |
- **Classic Analysis:** {analysis.get('clip_classic', '')[:80]}...
|
418 |
|
419 |
**⚡ OPTIMIZATION APPLIED:**
|
420 |
+
- ✅ Mixed precision handling for stability
|
421 |
+
- ✅ Automatic GPU/CPU fallback
|
422 |
+
- ✅ Memory-efficient processing
|
423 |
- ✅ Added professional camera specifications
|
424 |
- ✅ Enhanced lighting descriptions
|
425 |
- ✅ Applied Flux-specific optimizations
|