# NOTE: captured from a Spaces page view ("Running on Zero"); page-chrome residue removed.
"""
Model management for Phramer AI
By Pariente AI, for MIA TV Series
BAGEL 7B integration with professional photography knowledge enhancement
"""
import spaces | |
import logging | |
import tempfile | |
import os | |
import re | |
from typing import Optional, Dict, Any, Tuple | |
from PIL import Image | |
from gradio_client import Client, handle_file | |
from config import get_device_config, PROFESSIONAL_PHOTOGRAPHY_CONFIG | |
from utils import clean_memory, safe_execute | |
from professional_photography import ( | |
ProfessionalPhotoAnalyzer, | |
enhance_flux_prompt_with_professional_knowledge, | |
professional_analyzer | |
) | |
logger = logging.getLogger(__name__) | |
class BaseImageAnalyzer:
    """Common interface shared by the image analysis backends in this module.

    Concrete analyzers override ``initialize`` and ``analyze_image``; the
    default ``cleanup`` only delegates to ``clean_memory``.
    """

    def __init__(self):
        # Backends set this to True once ``initialize`` has succeeded.
        self.is_initialized = False
        self.device_config = get_device_config()

    def initialize(self) -> bool:
        """Prepare the backend for use.

        Returns:
            True when the backend is ready. The base implementation always
            raises.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def analyze_image(self, image: Image.Image) -> Tuple[str, Dict[str, Any]]:
        """Describe ``image``.

        Args:
            image: PIL image to analyze.

        Returns:
            A ``(description, metadata)`` tuple.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def cleanup(self) -> None:
        """Release resources held by the analyzer via ``clean_memory``."""
        clean_memory()
class BagelAPIAnalyzer(BaseImageAnalyzer):
    """BAGEL 7B analyzer that calls a remote Space through ``gradio_client``.

    The analyzer sends an image plus a cinematography-oriented prompt to the
    ``/image_understanding`` endpoint of the ``Malaji71/Bagel-7B-Demo`` Space
    and post-processes the text answer into ``(description, metadata)``.
    If the ``HF_TOKEN`` environment variable is set, it is passed to the
    client; otherwise the Space is accessed without a token.
    """

    def __init__(self):
        super().__init__()
        self.client = None
        self.space_url = "Malaji71/Bagel-7B-Demo"
        self.api_endpoint = "/image_understanding"
        self.hf_token = os.getenv("HF_TOKEN")
        self.professional_analyzer = professional_analyzer

    def initialize(self) -> bool:
        """Create the Gradio client, retrying without the token on failure.

        Returns:
            True if a client was created (now or on a previous call),
            False if every attempt raised.
        """
        if self.is_initialized:
            return True
        try:
            logger.info("Initializing BAGEL API client for Phramer AI...")
            # Initialize client with token if available
            if self.hf_token:
                logger.info("Using HF token for enhanced API access")
                self.client = Client(self.space_url, hf_token=self.hf_token)
            else:
                logger.info("Using public API access")
                self.client = Client(self.space_url)
            self.is_initialized = True
            logger.info("BAGEL API client initialized successfully")
            return True
        except Exception as e:
            logger.error(f"BAGEL API client initialization failed: {e}")
            # A token was in play, so one more anonymous attempt is worthwhile.
            if self.hf_token:
                logger.info("Retrying without token...")
                try:
                    self.client = Client(self.space_url)
                    self.is_initialized = True
                    logger.info("BAGEL API client initialized (fallback mode)")
                    return True
                except Exception as e2:
                    logger.error(f"Fallback initialization failed: {e2}")
            return False

    def _create_professional_enhanced_prompt(self, analysis_type: str = "multimodal") -> str:
        """Return the instruction prompt sent to BAGEL for ``analysis_type``.

        Args:
            analysis_type: ``"cinematic"`` or ``"flux_optimized"`` select a
                dedicated prompt; any other value yields the multimodal one.

        Returns:
            Prompt text asking for a DESCRIPTION section and a CAMERA_SETUP
            section (the latter is what ``_extract_professional_camera_setup``
            looks for in the answer).
        """
        if analysis_type == "cinematic":
            return """You are a master cinematographer with 30+ years of experience. Analyze this image with complete professional cinematography knowledge and provide exactly two sections:
1. DESCRIPTION: Analyze what you see using professional cinematography terminology:
First, identify the PHOTOGRAPHIC PLANE:
- EXTREME WIDE SHOT: Subject very small in environment (establishes location)
- WIDE SHOT: Full body visible with environment (subject in context)
- MEDIUM SHOT: From waist up (balance subject/environment)
- CLOSE-UP: Head and shoulders (emotion and expression)
- EXTREME CLOSE-UP: Part of face or detail (intense emotion)
- DETAIL SHOT: Specific small element (highlight aspect)
Second, identify the CAMERA ANGLE:
- EYE LEVEL: Camera at subject's eye level (neutral, natural perspective)
- LOW ANGLE: Camera below looking up (subject appears powerful, heroic)
- HIGH ANGLE: Camera above looking down (subject appears vulnerable, shows context)
- DUTCH ANGLE: Camera tilted (dynamic tension, instability)
Third, analyze the LIGHTING:
- GOLDEN HOUR: Warm, soft, directional light (first/last hour of sun)
- BLUE HOUR: Even blue light, dramatic mood (20-30 min after sunset)
- NATURAL DAYLIGHT: Bright sunny conditions
- SOFT NATURAL: Overcast, diffused, even light
- DRAMATIC: High contrast, moody shadows
- STUDIO: Controlled professional lighting
Fourth, identify COMPOSITION:
- RULE OF THIRDS: Key elements on intersection points
- LEADING LINES: Lines guide viewer's eye to subject
- SYMMETRICAL: Mirror-like balance
- CENTERED: Subject in middle for impact
- DEPTH LAYERS: Foreground, middle ground, background separation
Now describe the scene combining all these professional elements in flowing descriptive language.
2. CAMERA_SETUP: Recommend specific professional equipment based on your analysis:
For PORTRAIT scenes: Canon EOS R5, 85mm f/1.4 lens, f/2.8, ISO 200, single point AF on eyes
For LANDSCAPE scenes: Phase One XT, 24-70mm f/4 lens, f/8-f/11, ISO 100, hyperfocal distance
For STREET scenes: Leica M11, 35mm f/1.4 lens, f/5.6-f/8, ISO 400-1600, zone focusing
For ARCHITECTURE: Canon EOS R5, 24-70mm f/2.8 lens, f/8-f/11, ISO 100, tilt-shift correction
For ACTION: Sony A1, 70-200mm f/2.8 lens, f/2.8-f/4, ISO 800-3200, continuous AF tracking
Apply your complete professional cinematography knowledge to see this image as a master would."""
        elif analysis_type == "flux_optimized":
            return """You are a professional cinematographer analyzing this image for photorealistic prompt generation. Use complete technical knowledge and provide exactly two sections:
1. DESCRIPTION: Technical cinematographic analysis:
PHOTOGRAPHIC PLANE (choose one):
- Wide shot: Full subject visible with environment
- Medium shot: Waist up, balanced composition
- Close-up: Head and shoulders, tight framing
- Extreme close-up: Facial details or specific elements
- Detail shot: Small specific elements highlighted
CAMERA ANGLE (identify):
- Eye level: Natural, relatable perspective
- Low angle: Looking up, subject appears powerful
- High angle: Looking down, shows vulnerability/context
- Dutch angle: Tilted, creates dynamic tension
LIGHTING TYPE (analyze):
- Golden hour: Warm, soft directional light
- Natural daylight: Bright outdoor conditions
- Soft natural: Overcast, even diffusion
- Dramatic: High contrast, moody shadows
- Blue hour: Even twilight, dramatic mood
COMPOSITION TECHNIQUE (apply):
- Rule of thirds: Subject on intersection points
- Leading lines: Elements guide eye to subject
- Symmetrical: Balanced mirror composition
- Centered: Subject middle for impact
- Dynamic: Diagonal elements, movement
Describe the scene using these professional cinematography elements in precise technical language.
2. CAMERA_SETUP: Professional equipment recommendation:
PORTRAIT SETUP: Canon EOS R5 with 85mm f/1.4 lens at f/2.8, ISO 200, rule of thirds composition
LANDSCAPE SETUP: Phase One XT with 24-70mm f/4 lens at f/8, ISO 100, hyperfocal distance focus
STREET SETUP: Leica M11 with 35mm f/1.4 lens at f/5.6, ISO 800, zone focusing technique
ARCHITECTURE SETUP: Canon EOS R5 with 24-70mm f/2.8 lens at f/11, ISO 100, perspective correction
ACTION SETUP: Sony A1 with 70-200mm f/2.8 lens at f/4, ISO 1600, continuous AF tracking
Choose the setup that matches your scene analysis and provide complete technical specifications."""
        else:  # multimodal analysis
            return """You are a master cinematographer with decades of professional experience. Analyze this image using complete cinematography knowledge and provide exactly two sections:
1. DESCRIPTION: Professional cinematographic analysis combining:
PHOTOGRAPHIC PLANES: Identify if this is a wide shot (full subject with environment), medium shot (waist up), close-up (head/shoulders), extreme close-up (facial details), or detail shot (specific elements).
CAMERA ANGLES: Determine if shot from eye level (natural perspective), low angle (looking up, powerful), high angle (looking down, vulnerable), or dutch angle (tilted, dynamic).
LIGHTING ANALYSIS: Analyze if this is golden hour (warm directional), natural daylight (bright outdoor), soft natural (overcast even), dramatic (high contrast), blue hour (twilight mood), or studio (controlled).
COMPOSITION: Identify rule of thirds (key elements on intersections), leading lines (guiding elements), symmetrical (balanced), centered (middle impact), or dynamic (diagonal movement).
Describe the complete scene using professional cinematography terminology in flowing descriptive language that captures all visual and technical elements.
2. CAMERA_SETUP: Professional equipment recommendation based on scene analysis:
Choose from these professional setups:
- PORTRAIT: Canon EOS R5, 85mm f/1.4 lens, f/2.8, ISO 200
- LANDSCAPE: Phase One XT, 24-70mm f/4 lens, f/8, ISO 100
- STREET: Leica M11, 35mm f/1.4 lens, f/5.6, ISO 800
- ARCHITECTURE: Canon EOS R5, 24-70mm f/2.8 lens, f/11, ISO 100
- ACTION: Sony A1, 70-200mm f/2.8 lens, f/4, ISO 1600
Provide complete technical specifications matching your cinematographic analysis."""

    def _extract_professional_camera_setup(self, description: str) -> Optional[str]:
        """Pull the camera recommendation out of a BAGEL answer.

        The text after the first ``CAMERA_SETUP:`` marker (or, failing that,
        after ``2. CAMERA_SETUP``) is scanned line by line and the first line
        longer than 20 characters is taken as the recommendation.

        Args:
            description: Raw text returned by the model.

        Returns:
            The cleaned setup string (see ``_clean_camera_setup``), or None
            when no marker or no substantial line is found, or on any error.
        """
        try:
            if "CAMERA_SETUP:" in description:
                camera_section = description.split("CAMERA_SETUP:")[1]
                # After the colon form, a line starting with "2." is treated
                # as a section heading rather than the recommendation itself.
                skip_numbered_heading = True
            elif "2. CAMERA_SETUP" in description:
                camera_section = description.split("2. CAMERA_SETUP")[1]
                skip_numbered_heading = False
            else:
                return None

            for line in camera_section.strip().split('\n'):
                clean_line = line.strip()
                if len(clean_line) <= 20:
                    continue
                if skip_numbered_heading and clean_line.startswith('2.'):
                    continue
                return self._clean_camera_setup(clean_line)
            return None
        except Exception as e:
            logger.warning(f"Failed to extract professional camera setup: {e}")
            return None

    def _clean_camera_setup(self, raw_setup: str) -> str:
        """Normalize a camera setup line into ``"shot on ..."`` form.

        Strips lead-in phrases and section markers, collapses whitespace and
        prefixes ``shot on`` unless already present (case-insensitive).

        Args:
            raw_setup: Line selected from the model answer.

        Returns:
            The normalized string; ``raw_setup`` unchanged if cleaning raises.
        """
        try:
            # Remove common prefixes
            setup = re.sub(r'^(Based on.*?recommend|I would recommend|For this.*?setup)\s*:?\s*', '', raw_setup, flags=re.IGNORECASE)
            setup = re.sub(r'^(CAMERA_SETUP:|2\.\s*CAMERA_SETUP:?)\s*', '', setup, flags=re.IGNORECASE)
            # Clean up formatting
            setup = re.sub(r'\s+', ' ', setup).strip()
            # Ensure proper format
            if setup and not setup.lower().startswith('shot on'):
                setup = f"shot on {setup}"
            return setup
        except Exception as e:
            logger.warning(f"Camera setup cleaning failed: {e}")
            return raw_setup

    def _save_temp_image(self, image: Image.Image) -> Optional[str]:
        """Write ``image`` as an RGB PNG to a temporary file for the API call.

        Args:
            image: PIL image; converted to RGB when in another mode.

        Returns:
            Path of the temporary file, or None if it could not be written.
            The caller is responsible for deleting a returned path.
        """
        temp_path = None
        try:
            # delete=False: the file must outlive this handle so the API
            # client can read it by path.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
                temp_path = temp_file.name
            if image.mode != 'RGB':
                image = image.convert('RGB')
            image.save(temp_path, 'PNG')
            return temp_path
        except Exception as e:
            logger.error(f"Failed to save temporary image: {e}")
            # The caller only receives None, so remove the orphaned file here.
            self._cleanup_temp_file(temp_path)
            return None

    def _cleanup_temp_file(self, file_path: Optional[str]) -> None:
        """Delete ``file_path`` if it exists; failures are logged, not raised."""
        try:
            if file_path and os.path.exists(file_path):
                os.unlink(file_path)
        except Exception as e:
            logger.warning(f"Failed to cleanup temp file: {e}")

    def analyze_image(self, image: Image.Image, prompt: Optional[str] = None) -> Tuple[str, Dict[str, Any]]:
        """Analyze ``image`` through the BAGEL API.

        Args:
            image: PIL image to analyze.
            prompt: Instruction text; defaults to the multimodal prompt from
                ``_create_professional_enhanced_prompt``.

        Returns:
            ``(description, metadata)``. On failure the metadata carries an
            ``"error"`` key and the description is a short failure message.
        """
        if not self.is_initialized and not self.initialize():
            return "BAGEL API not available", {"error": "API initialization failed"}

        # Resolve the default before building metadata so "prompt_used"
        # records the prompt that is actually sent, never None.
        if prompt is None:
            prompt = self._create_professional_enhanced_prompt("multimodal")

        temp_path = None
        metadata = {
            "model": "BAGEL-7B-Professional",
            "device": "api",
            "confidence": 0.9,
            "api_endpoint": self.api_endpoint,
            "space_url": self.space_url,
            "prompt_used": prompt,
            "has_camera_suggestion": False,
            "professional_enhancement": True
        }
        try:
            # Save image to temporary file
            temp_path = self._save_temp_image(image)
            if not temp_path:
                return "Image processing failed", {"error": "Could not save image"}

            logger.info("Calling BAGEL API with professional cinematography prompt...")
            # Call BAGEL API with enhanced prompt
            result = self.client.predict(
                image=handle_file(temp_path),
                prompt=prompt,
                show_thinking=False,
                do_sample=False,
                text_temperature=0.2,
                max_new_tokens=512,
                api_name=self.api_endpoint
            )

            # For a tuple answer the second element is preferred, falling
            # back to the first when the second is empty.
            if isinstance(result, tuple) and len(result) >= 2:
                description = result[1] if result[1] else result[0]
            else:
                description = str(result)

            if isinstance(description, str) and description.strip():
                description = description.strip()
                # Extract professional camera setup
                camera_setup = self._extract_professional_camera_setup(description)
                if camera_setup:
                    metadata["camera_setup"] = camera_setup
                    metadata["has_camera_suggestion"] = True
                    logger.info(f"Professional camera setup extracted: {camera_setup}")
                else:
                    metadata["has_camera_suggestion"] = False
                    logger.info("No camera setup found in BAGEL response")
                # Mark as cinematography enhanced
                metadata["cinematography_context_applied"] = True
            else:
                description = "Professional cinematographic analysis completed"
                metadata["has_camera_suggestion"] = False

            # Update metadata
            metadata.update({
                "response_length": len(description),
                "analysis_type": "professional_enhanced"
            })
            logger.info(f"BAGEL Professional analysis complete: {len(description)} chars, Camera: {metadata.get('has_camera_suggestion', False)}")
            return description, metadata
        except Exception as e:
            logger.error(f"BAGEL Professional analysis failed: {e}")
            return "Professional analysis failed", {"error": str(e), "model": "BAGEL-7B-Professional"}
        finally:
            if temp_path:
                self._cleanup_temp_file(temp_path)

    def analyze_for_cinematic_prompt(self, image: Image.Image) -> Tuple[str, Dict[str, Any]]:
        """Analyze image specifically for cinematic/MIA TV Series prompt generation"""
        cinematic_prompt = self._create_professional_enhanced_prompt("cinematic")
        return self.analyze_image(image, cinematic_prompt)

    def analyze_for_flux_with_professional_context(self, image: Image.Image) -> Tuple[str, Dict[str, Any]]:
        """Analyze image for FLUX with enhanced professional cinematography context"""
        flux_prompt = self._create_professional_enhanced_prompt("flux_optimized")
        return self.analyze_image(image, flux_prompt)

    def analyze_for_multiengine_prompt(self, image: Image.Image) -> Tuple[str, Dict[str, Any]]:
        """Analyze image for multi-engine compatibility (Flux, Midjourney, etc.)"""
        multiengine_prompt = self._create_professional_enhanced_prompt("multimodal")
        return self.analyze_image(image, multiengine_prompt)

    def cleanup(self) -> None:
        """Drop the API client and run the base-class cleanup.

        Errors are logged as warnings rather than raised.
        """
        try:
            if hasattr(self, 'client'):
                self.client = None
            super().cleanup()
            logger.info("BAGEL Professional API resources cleaned up")
        except Exception as e:
            logger.warning(f"BAGEL Professional API cleanup warning: {e}")
class FallbackAnalyzer(BaseImageAnalyzer):
    """Local stand-in analyzer that works from image geometry only.

    No model is called: the description and camera suggestion are chosen
    from the image's aspect ratio.
    """

    def __init__(self):
        super().__init__()
        self.professional_analyzer = professional_analyzer

    def initialize(self) -> bool:
        """Mark the analyzer ready; there is nothing to load."""
        self.is_initialized = True
        return True

    def analyze_image(self, image: Image.Image) -> Tuple[str, Dict[str, Any]]:
        """Build a generic description and metadata from ``image.size``.

        Aspect ratio (width / height) above 1.5 is treated as landscape,
        below 0.75 as portrait, anything else as "square".

        Args:
            image: PIL image to describe.

        Returns:
            ``(description, metadata)``; if anything raises, a generic
            description with metadata containing ``"error"`` is returned.
        """
        try:
            width, height = image.size
            mode = image.mode
            aspect_ratio = width / height

            # Pick framing vocabulary and a camera suggestion per ratio band.
            if aspect_ratio > 1.5:
                orientation, scene_type, plane = "landscape", "landscape", "Wide shot"
                camera_suggestion = "Phase One XT with 24-70mm f/4 lens, f/8, ISO 100"
            elif aspect_ratio < 0.75:
                orientation, scene_type, plane = "portrait", "portrait_studio", "Close-up"
                camera_suggestion = "Canon EOS R5 with 85mm f/1.4 lens, f/2.8, ISO 200"
            else:
                orientation, scene_type, plane = "square", "general", "Medium shot"
                camera_suggestion = "Canon EOS R6 with 50mm f/1.8 lens, f/4, ISO 400"

            # Only the leading shot type varies between images.
            description = f"{plane} composition with balanced framing and professional execution, natural lighting with good contrast, rule of thirds composition, suitable for high-quality reproduction across multiple generative platforms"

            metadata = {
                "model": "Professional-Fallback",
                "device": "cpu",
                "confidence": 0.7,
                "image_size": f"{width}x{height}",
                "color_mode": mode,
                "orientation": orientation,
                "aspect_ratio": round(aspect_ratio, 2),
                "scene_type": scene_type,
                "has_camera_suggestion": True,
                "camera_setup": f"shot on {camera_suggestion}",
                "professional_enhancement": True,
                "cinematography_context_applied": True
            }
            return description, metadata
        except Exception as e:
            logger.error(f"Professional fallback analysis failed: {e}")
            failure_metadata = {
                "error": str(e),
                "model": "Professional-Fallback"
            }
            return "Professional cinematographic analysis with technical excellence", failure_metadata
class ModelManager:
    """Creates, caches and dispatches to the image analyzers of this module.

    Analyzers are instantiated lazily by name and kept in ``self.analyzers``.
    ``analyze_image`` retries with the fallback analyzer whenever the chosen
    analyzer fails or reports an ``"error"`` in its metadata.
    """

    # analysis_type -> name of the specialised analyzer method to prefer.
    _SPECIALISED_METHODS = {
        "cinematic": "analyze_for_cinematic_prompt",
        "flux": "analyze_for_flux_with_professional_context",
        "multiengine": "analyze_for_multiengine_prompt",
    }

    def __init__(self, preferred_model: str = "bagel-professional"):
        self.preferred_model = preferred_model
        self.analyzers: Dict[str, BaseImageAnalyzer] = {}
        self.current_analyzer = None

    def get_analyzer(self, model_name: Optional[str] = None) -> Optional[BaseImageAnalyzer]:
        """Get or create the analyzer for ``model_name``.

        Args:
            model_name: ``"bagel-api"``, ``"bagel-professional"`` or
                ``"fallback"``; None/empty selects ``self.preferred_model``.
                Any other name logs a warning and resolves to the fallback.

        Returns:
            The cached analyzer instance for the resolved name.
        """
        model_name = model_name or self.preferred_model
        if model_name in self.analyzers:
            return self.analyzers[model_name]

        if model_name in ("bagel-api", "bagel-professional"):
            self.analyzers[model_name] = BagelAPIAnalyzer()
        elif model_name == "fallback":
            self.analyzers[model_name] = FallbackAnalyzer()
        else:
            logger.warning(f"Unknown model: {model_name}, using professional fallback")
            model_name = "fallback"
            # Reuse an existing fallback instead of replacing it on every
            # request that carries an unrecognised model name.
            if model_name not in self.analyzers:
                self.analyzers[model_name] = FallbackAnalyzer()
        return self.analyzers[model_name]

    def analyze_image(self, image: Image.Image, model_name: Optional[str] = None, analysis_type: str = "multiengine") -> Tuple[str, Dict[str, Any]]:
        """Analyze ``image`` with the selected analyzer, falling back on error.

        Args:
            image: PIL image to analyze.
            model_name: Analyzer name as accepted by ``get_analyzer``.
            analysis_type: ``"cinematic"``, ``"flux"`` or ``"multiengine"``
                use the analyzer's matching specialised method when it has
                one; otherwise the analyzer's plain ``analyze_image`` is used.

        Returns:
            ``(description, metadata)`` from the primary analyzer, from the
            fallback analyzer, or a final failure tuple if both fail.
        """
        analyzer = self.get_analyzer(model_name)
        if analyzer is None:
            return "No analyzer available", {"error": "Model not found"}

        # Choose analysis method based on type and analyzer capabilities
        method_name = self._SPECIALISED_METHODS.get(analysis_type)
        if method_name is not None and hasattr(analyzer, method_name):
            handler = getattr(analyzer, method_name)
        else:
            handler = analyzer.analyze_image
        # safe_execute is unpacked as (success flag, call result) here.
        success, result = safe_execute(handler, image)

        if success and result[1].get("error") is None:
            return result

        # Enhanced fallback with cinematography context
        logger.warning(f"Primary model failed, using cinematography-enhanced fallback: {result}")
        fallback_analyzer = self.get_analyzer("fallback")
        fallback_success, fallback_result = safe_execute(fallback_analyzer.analyze_image, image)
        if fallback_success:
            return fallback_result
        return "All cinematography analyzers failed", {"error": "Complete analysis failure"}

    def cleanup_all(self) -> None:
        """Clean up every cached analyzer, empty the cache and free memory."""
        for analyzer in self.analyzers.values():
            analyzer.cleanup()
        self.analyzers.clear()
        clean_memory()
        logger.info("All cinematography analyzers cleaned up")
# Shared module-level manager; prefers the "bagel-professional" analyzer.
model_manager = ModelManager(preferred_model="bagel-professional")
def analyze_image(image: Image.Image, model_name: str = None, analysis_type: str = "multiengine") -> Tuple[str, Dict[str, Any]]:
    """
    Convenience wrapper that forwards to the module-level ``model_manager``.

    Args:
        image: PIL Image to analyze
        model_name: Optional model name ("bagel-professional", "bagel-api",
            "fallback"); None uses the manager's preferred model
        analysis_type: Type of analysis ("multiengine", "cinematic", "flux")

    Returns:
        Tuple of (description, metadata) as produced by
        ``ModelManager.analyze_image``
    """
    outcome = model_manager.analyze_image(
        image,
        model_name=model_name,
        analysis_type=analysis_type,
    )
    return outcome
# Public API of this module (names exposed by ``from <module> import *``).
__all__ = [
    "BaseImageAnalyzer",
    "BagelAPIAnalyzer",
    "FallbackAnalyzer",
    "ModelManager",
    "model_manager",
    "analyze_image",
]