import gradio as gr import torch from PIL import Image import os from transformers import CLIPTokenizer, CLIPTextModel, AutoProcessor, T5EncoderModel, T5TokenizerFast from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler from flux.transformer_flux import FluxTransformer2DModel from flux.pipeline_flux_chameleon import FluxPipeline import torch.nn as nn import math import logging import sys from qwen2_vl.modeling_qwen2_vl import Qwen2VLSimplifiedModel from huggingface_hub import snapshot_download # 设置日志 logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(sys.stdout) ] ) logger = logging.getLogger(__name__) MODEL_ID = "Djrango/Qwen2vl-Flux" MODEL_CACHE_DIR = "model_cache" # 预下载所有模型 def download_models(): logger.info("Starting model download...") try: # 下载完整模型仓库 snapshot_download( repo_id=MODEL_ID, local_dir=MODEL_CACHE_DIR, local_dir_use_symlinks=False ) logger.info("Model download completed successfully") except Exception as e: logger.error(f"Error downloading models: {str(e)}") raise # 在脚本开始时下载模型 if not os.path.exists(MODEL_CACHE_DIR): download_models() # Add aspect ratio options ASPECT_RATIOS = { "1:1": (1024, 1024), "16:9": (1344, 768), "9:16": (768, 1344), "2.4:1": (1536, 640), "3:4": (896, 1152), "4:3": (1152, 896), } class Qwen2Connector(nn.Module): def __init__(self, input_dim=3584, output_dim=4096): super().__init__() self.linear = nn.Linear(input_dim, output_dim) def forward(self, x): return self.linear(x) class FluxInterface: def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"): self.device = device self.dtype = torch.bfloat16 self.models = None self.MODEL_ID = "Djrango/Qwen2vl-Flux" def load_models(self): if self.models is not None: return logger.info("Starting model loading...") # 1. 首先加载较小的模型到GPU tokenizer = CLIPTokenizer.from_pretrained(os.path.join(MODEL_CACHE_DIR, "flux/tokenizer")) text_encoder = CLIPTextModel.from_pretrained( os.path.join(MODEL_CACHE_DIR, "flux/text_encoder") ).to(self.dtype).to(self.device) text_encoder_two = T5EncoderModel.from_pretrained( os.path.join(MODEL_CACHE_DIR, "flux/text_encoder_2") ).to(self.dtype).to(self.device) tokenizer_two = T5TokenizerFast.from_pretrained( os.path.join(MODEL_CACHE_DIR, "flux/tokenizer_2")) # 2. 将大模型加载到CPU,但保持bfloat16精度 vae = AutoencoderKL.from_pretrained( os.path.join(MODEL_CACHE_DIR, "flux/vae") ).to(self.dtype).cpu() transformer = FluxTransformer2DModel.from_pretrained( os.path.join(MODEL_CACHE_DIR, "flux/transformer") ).to(self.dtype).cpu() scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( os.path.join(MODEL_CACHE_DIR, "flux/scheduler"), shift=1 ) # 3. Qwen2VL加载到CPU,保持bfloat16 qwen2vl = Qwen2VLSimplifiedModel.from_pretrained( os.path.join(MODEL_CACHE_DIR, "qwen2-vl") ).to(self.dtype).cpu() # 4. 加载connector和embedder,保持bfloat16 connector = Qwen2Connector().to(self.dtype).cpu() connector_path = os.path.join(MODEL_CACHE_DIR, "qwen2-vl/connector.pt") connector_state = torch.load(connector_path, map_location='cpu') connector_state = {k.replace('module.', ''): v.to(self.dtype) for k, v in connector_state.items()} connector.load_state_dict(connector_state) self.t5_context_embedder = nn.Linear(4096, 3072).to(self.dtype).cpu() t5_embedder_path = os.path.join(MODEL_CACHE_DIR, "qwen2-vl/t5_embedder.pt") t5_embedder_state = torch.load(t5_embedder_path, map_location='cpu') t5_embedder_state = {k: v.to(self.dtype) for k, v in t5_embedder_state.items()} self.t5_context_embedder.load_state_dict(t5_embedder_state) # 5. 设置所有模型为eval模式 for model in [text_encoder, text_encoder_two, vae, transformer, qwen2vl, connector, self.t5_context_embedder]: model.requires_grad_(False) model.eval() logger.info("All models loaded successfully") self.models = { 'tokenizer': tokenizer, 'text_encoder': text_encoder, 'text_encoder_two': text_encoder_two, 'tokenizer_two': tokenizer_two, 'vae': vae, 'transformer': transformer, 'scheduler': scheduler, 'qwen2vl': qwen2vl, 'connector': connector } self.qwen2vl_processor = AutoProcessor.from_pretrained( self.MODEL_ID, subfolder="qwen2-vl", min_pixels=256*28*28, max_pixels=256*28*28 ) self.pipeline = FluxPipeline( transformer=transformer, scheduler=scheduler, vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, ) def move_to_device(self, model, device): """Helper function to move model to specified device""" if hasattr(model, 'to'): return model.to(self.dtype).to(device) return model def process_image(self, image): """Process image with Qwen2VL model""" try: # 1. 将Qwen2VL相关模型移到GPU logger.info("Moving Qwen2VL models to GPU...") self.models['qwen2vl'] = self.models['qwen2vl'].to(self.device) self.models['connector'] = self.models['connector'].to(self.device) logger.info("Qwen2VL models moved to GPU") message = [ { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": "Describe this image."}, ] } ] text = self.qwen2vl_processor.apply_chat_template( message, tokenize=False, add_generation_prompt=True ) with torch.no_grad(): inputs = self.qwen2vl_processor( text=[text], images=[image], padding=True, return_tensors="pt" ).to(self.device) output_hidden_state, image_token_mask, image_grid_thw = self.models['qwen2vl'](**inputs) image_hidden_state = output_hidden_state[image_token_mask].view(1, -1, output_hidden_state.size(-1)) image_hidden_state = self.models['connector'](image_hidden_state) # 保存结果到CPU result = (image_hidden_state.cpu(), image_grid_thw) # 2. 将Qwen2VL相关模型移回CPU logger.info("Moving Qwen2VL models back to CPU...") self.models['qwen2vl'] = self.models['qwen2vl'].cpu() self.models['connector'] = self.models['connector'].cpu() torch.cuda.empty_cache() logger.info("Qwen2VL models moved to CPU and GPU cache cleared") return result except Exception as e: logger.error(f"Error in process_image: {str(e)}") raise def resize_image(self, img, max_pixels=1050000): if not isinstance(img, Image.Image): img = Image.fromarray(img) width, height = img.size num_pixels = width * height if num_pixels > max_pixels: scale = math.sqrt(max_pixels / num_pixels) new_width = int(width * scale) new_height = int(height * scale) new_width = new_width - (new_width % 8) new_height = new_height - (new_height % 8) img = img.resize((new_width, new_height), Image.LANCZOS) return img def compute_t5_text_embeddings(self, prompt): """Compute T5 embeddings for text prompt""" if prompt == "": return None text_inputs = self.models['tokenizer_two']( prompt, padding="max_length", max_length=256, truncation=True, return_tensors="pt" ).to(self.device) prompt_embeds = self.models['text_encoder_two'](text_inputs.input_ids)[0] prompt_embeds = self.t5_context_embedder.to(self.device)(prompt_embeds) self.t5_context_embedder = self.t5_context_embedder.cpu() return prompt_embeds def compute_text_embeddings(self, prompt=""): with torch.no_grad(): text_inputs = self.models['tokenizer']( prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt" ).to(self.device) prompt_embeds = self.models['text_encoder']( text_inputs.input_ids, output_hidden_states=False ) pooled_prompt_embeds = prompt_embeds.pooler_output return pooled_prompt_embeds def generate(self, input_image, prompt="", guidance_scale=3.5, num_inference_steps=28, num_images=2, seed=None, aspect_ratio="1:1"): try: logger.info(f"Starting generation with prompt: {prompt}") if input_image is None: raise ValueError("No input image provided") if seed is not None: torch.manual_seed(seed) logger.info(f"Set random seed to: {seed}") # 1. 使用Qwen2VL处理图像 logger.info("Processing input image with Qwen2VL...") qwen2_hidden_state, image_grid_thw = self.process_image(input_image) logger.info("Image processing completed") # 2. 计算文本嵌入 logger.info("Computing text embeddings...") pooled_prompt_embeds = self.compute_text_embeddings("") t5_prompt_embeds = self.compute_t5_text_embeddings(prompt) logger.info("Text embeddings computed") # 3. 将Transformer和VAE移到GPU logger.info("Moving Transformer and VAE to GPU...") self.models['transformer'] = self.models['transformer'].to(self.device) self.models['vae'] = self.models['vae'].to(self.device) # 更新pipeline中的模型引用 self.pipeline.transformer = self.models['transformer'] self.pipeline.vae = self.models['vae'] logger.info("Models moved to GPU") # 获取维度 width, height = ASPECT_RATIOS[aspect_ratio] logger.info(f"Using dimensions: {width}x{height}") # 4. 生成图像 try: logger.info("Starting image generation...") output_images = self.pipeline( prompt_embeds=qwen2_hidden_state.to(self.device).repeat(num_images, 1, 1), pooled_prompt_embeds=pooled_prompt_embeds, t5_prompt_embeds=t5_prompt_embeds.repeat(num_images, 1, 1) if t5_prompt_embeds is not None else None, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, height=height, width=width, ).images logger.info("Image generation completed") # 5. 将Transformer和VAE移回CPU logger.info("Moving models back to CPU...") self.models['transformer'] = self.models['transformer'].cpu() self.models['vae'] = self.models['vae'].cpu() torch.cuda.empty_cache() logger.info("Models moved to CPU and GPU cache cleared") return output_images except Exception as e: raise RuntimeError(f"Error generating images: {str(e)}") except Exception as e: logger.error(f"Error during generation: {str(e)}") raise gr.Error(f"Generation failed: {str(e)}") # Initialize the interface interface = FluxInterface() def process_request(input_image, prompt="", guidance_scale=3.5, num_inference_steps=28, num_images=2, seed=None, aspect_ratio="1:1"): """主处理函数,直接处理用户请求""" try: if interface.models is None: interface.load_models() return interface.generate( input_image=input_image, prompt=prompt, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, num_images=num_images, seed=seed, aspect_ratio=aspect_ratio ) except Exception as e: logger.error(f"Error during generation: {str(e)}") raise gr.Error(f"Generation failed: {str(e)}") # Create Gradio interface with gr.Blocks( theme=gr.themes.Soft(), css=""" .container { max-width: 1200px; margin: auto; padding: 0 20px; } .header { text-align: center; margin: 20px 0 40px 0; padding: 20px; background: #f7f7f7; border-radius: 12px; } .param-row { padding: 10px 0; } footer { margin-top: 40px; padding: 20px; border-top: 1px solid #eee; } """ ) as demo: with gr.Column(elem_classes="container"): gr.Markdown( """