erwold committed · Commit 9590121 · Parent: f53a34a
ZeroGPU

app.py CHANGED
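This commit reworks model placement in app.py for ZeroGPU-style memory management: the small tokenizers and text encoders stay resident on the GPU, while the large models (VAE, Flux transformer, Qwen2VL) and the two linear adapters are loaded on the CPU and moved to the GPU only for the duration of the call that needs them, then moved back with the CUDA cache emptied.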
@@ -41,27 +41,29 @@ if not os.path.exists(MODEL_CACHE_DIR):
     logger.error(f"Error downloading models: {str(e)}")
     raise
 
-#
-logger.info("Loading models...")
+# Load the small models onto the GPU
+logger.info("Loading small models to GPU...")
 tokenizer = CLIPTokenizer.from_pretrained(os.path.join(MODEL_CACHE_DIR, "flux/tokenizer"))
 text_encoder = CLIPTextModel.from_pretrained(
     os.path.join(MODEL_CACHE_DIR, "flux/text_encoder")
-).to(dtype)
+).to(dtype).to(device)
 
 text_encoder_two = T5EncoderModel.from_pretrained(
     os.path.join(MODEL_CACHE_DIR, "flux/text_encoder_2")
-).to(dtype)
+).to(dtype).to(device)
 
 tokenizer_two = T5TokenizerFast.from_pretrained(
     os.path.join(MODEL_CACHE_DIR, "flux/tokenizer_2"))
 
+# Load the large models onto the CPU initially
+logger.info("Loading large models to CPU...")
 vae = AutoencoderKL.from_pretrained(
     os.path.join(MODEL_CACHE_DIR, "flux/vae")
-).to(dtype)
+).to(dtype).cpu()
 
 transformer = FluxTransformer2DModel.from_pretrained(
     os.path.join(MODEL_CACHE_DIR, "flux/transformer")
-).to(dtype)
+).to(dtype).cpu()
 
 scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
     os.path.join(MODEL_CACHE_DIR, "flux/scheduler"),
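Both sides of this hunk reference module-level `device` and `dtype` names that the diff does not show. A minimal sketch of what app.py presumably defines earlier; the names come from the diff, the values are assumptions:

    import torch

    # Assumed definitions (not shown in this diff)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = torch.bfloat16  # any reduced-precision dtype fits the pattern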
@@ -70,7 +72,7 @@ scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
 
 qwen2vl = Qwen2VLSimplifiedModel.from_pretrained(
     os.path.join(MODEL_CACHE_DIR, "qwen2-vl")
-).to(dtype)
+).to(dtype).cpu()
 
 qwen2vl_processor = AutoProcessor.from_pretrained(
     MODEL_ID,
@@ -79,20 +81,20 @@ qwen2vl_processor = AutoProcessor.from_pretrained(
     max_pixels=256*28*28
 )
 
-# Load the connector and embedder
-connector = nn.Linear(3584, 4096).to(dtype)
+# Load the connector and embedder onto the CPU
+connector = nn.Linear(3584, 4096).to(dtype).cpu()
 connector_path = os.path.join(MODEL_CACHE_DIR, "qwen2-vl/connector.pt")
 connector_state = torch.load(connector_path, map_location='cpu')
 connector_state = {k.replace('module.', ''): v.to(dtype) for k, v in connector_state.items()}
 connector.load_state_dict(connector_state)
 
-t5_context_embedder = nn.Linear(4096, 3072).to(dtype)
+t5_context_embedder = nn.Linear(4096, 3072).to(dtype).cpu()
 t5_embedder_path = os.path.join(MODEL_CACHE_DIR, "qwen2-vl/t5_embedder.pt")
 t5_embedder_state = torch.load(t5_embedder_path, map_location='cpu')
 t5_embedder_state = {k: v.to(dtype) for k, v in t5_embedder_state.items()}
 t5_context_embedder.load_state_dict(t5_embedder_state)
 
-# Create the pipeline
+# Create the pipeline (with the models still on the CPU)
 pipeline = FluxPipeline(
     transformer=transformer,
     scheduler=scheduler,
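The `k.replace('module.', '')` step strips the prefix that torch.nn.DataParallel adds to parameter names when a wrapped model's state dict is saved. The loading pattern used for both adapters could be factored into a helper; a sketch, where `load_linear_checkpoint` is hypothetical and not part of this commit:

    import torch
    import torch.nn as nn

    def load_linear_checkpoint(path, in_features, out_features, dtype):
        """Build an nn.Linear on the CPU, strip any DataParallel
        'module.' prefix from checkpoint keys, cast, and load."""
        layer = nn.Linear(in_features, out_features).to(dtype).cpu()
        state = torch.load(path, map_location="cpu")
        state = {k.replace("module.", ""): v.to(dtype) for k, v in state.items()}
        layer.load_state_dict(state)
        return layer

    # e.g.: connector = load_linear_checkpoint(connector_path, 3584, 4096, dtype)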
@@ -120,6 +122,11 @@ ASPECT_RATIOS = {
 def process_image(image):
     """Process image with Qwen2VL model"""
     try:
+        # Move the Qwen2VL models to the GPU
+        logger.info("Moving Qwen2VL models to GPU...")
+        qwen2vl.to(device)
+        connector.to(device)
+
         message = [
             {
                 "role": "user",
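The move-to-GPU / compute / move-back / `torch.cuda.empty_cache()` sequence introduced here recurs in every hunk below. A hypothetical way to factor it (the `on_gpu` helper is not part of this commit):

    from contextlib import contextmanager
    import torch

    @contextmanager
    def on_gpu(*modules, device="cuda"):
        """Temporarily move modules to the GPU; on exit, return them
        to the CPU and release cached VRAM, mirroring the inline steps
        this diff adds to each function."""
        try:
            for m in modules:
                m.to(device)
            yield
        finally:
            for m in modules:
                m.cpu()
            torch.cuda.empty_cache()

    # Usage sketch: with on_gpu(qwen2vl, connector): run Qwen2VL inference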
@@ -147,7 +154,16 @@ def process_image(image):
         image_hidden_state = output_hidden_state[image_token_mask].view(1, -1, output_hidden_state.size(-1))
         image_hidden_state = connector(image_hidden_state)
 
-        return image_hidden_state, image_grid_thw
+        # Keep the result on the CPU
+        result = (image_hidden_state.cpu(), image_grid_thw)
+
+        # Move the models back to the CPU and free GPU memory
+        logger.info("Moving Qwen2VL models back to CPU...")
+        qwen2vl.cpu()
+        connector.cpu()
+        torch.cuda.empty_cache()
+
+        return result
 
     except Exception as e:
         logger.error(f"Error in process_image: {str(e)}")
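Note the ordering in this hunk: the hidden state is copied to the CPU before the models are moved off the GPU and the cache is emptied, so the returned tensors remain valid after the VRAM is released.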
@@ -167,8 +183,14 @@ def compute_t5_text_embeddings(prompt):
     ).to(device)
 
     prompt_embeds = text_encoder_two(text_inputs.input_ids)[0]
+
+    # Move t5_context_embedder to the GPU
+    t5_context_embedder.to(device)
     prompt_embeds = t5_context_embedder(prompt_embeds)
 
+    # Move t5_context_embedder back to the CPU
+    t5_context_embedder.cpu()
+
     return prompt_embeds
 
 def compute_text_embeddings(prompt=""):
@@ -216,8 +238,18 @@ def generate_images(input_image, prompt="", guidance_scale=3.5,
     # Generate images
     try:
         logger.info("Starting image generation...")
+
+        # Move the Transformer and VAE to the GPU
+        logger.info("Moving Transformer and VAE to GPU...")
+        transformer.to(device)
+        vae.to(device)
+
+        # Update the model references held by the pipeline
+        pipeline.transformer = transformer
+        pipeline.vae = vae
+
         output_images = pipeline(
-            prompt_embeds=qwen2_hidden_state.repeat(num_images, 1, 1),
+            prompt_embeds=qwen2_hidden_state.to(device).repeat(num_images, 1, 1),
            pooled_prompt_embeds=pooled_prompt_embeds,
            t5_prompt_embeds=t5_prompt_embeds.repeat(num_images, 1, 1) if t5_prompt_embeds is not None else None,
            num_inference_steps=num_inference_steps,
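A detail worth noting when reading this hunk: unlike `Tensor.to`, `nn.Module.to` moves parameters in place and returns the same object, so after `transformer.to(device)` the pipeline already holds the moved weights; the reassignments are defensive rather than required. A self-contained check:

    import torch.nn as nn

    m = nn.Linear(2, 2)
    assert m.to("cpu") is m  # nn.Module.to mutates in place and returns self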
@@ -225,8 +257,15 @@ def generate_images(input_image, prompt="", guidance_scale=3.5,
             height=height,
             width=width,
         ).images
+
         logger.info("Image generation completed")
 
+        # Move the Transformer and VAE back to the CPU
+        logger.info("Moving models back to CPU...")
+        transformer.cpu()
+        vae.cpu()
+        torch.cuda.empty_cache()
+
         return output_images
 
     except Exception as e:
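For context on the commit title: on Hugging Face ZeroGPU Spaces, the documented alternative to this manual shuttling is the `spaces` package, whose `GPU` decorator attaches a GPU for the duration of each decorated call. A minimal sketch, assuming ZeroGPU hardware and the `spaces` package:

    import spaces

    @spaces.GPU  # a GPU is allocated for each call and released afterwards
    def generate(prompt):
        ...  # run the pipeline here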