Jordan Legg committed
Commit 3be64a5 · 1 Parent(s): 383a90d

target the text encoder, merge latent space before the pipeline
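
In short: the change drops the 16→64 projection onto the transformer's input channels and instead projects the VAE latents to the CLIP text-embedding width (16→768), pads or truncates the resulting sequence to 77 tokens, and concatenates it with the prompt's text embeddings before the pipeline call. Hedged sketches of the two key steps follow the diff.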

Files changed (1): app.py (+23, -8)
app.py CHANGED
@@ -14,7 +14,8 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 2048
 LATENT_CHANNELS = 16
-TRANSFORMER_IN_CHANNELS = 64
+TEXT_EMBED_DIM = 768
+MAX_TEXT_EMBEDDINGS = 77
 SCALING_FACTOR = 0.3611
 
 # Load FLUX model
@@ -23,8 +24,8 @@ pipe.enable_model_cpu_offload()
 pipe.vae.enable_slicing()
 pipe.vae.enable_tiling()
 
-# Add a projection layer to match transformer input
-projection = nn.Linear(LATENT_CHANNELS, TRANSFORMER_IN_CHANNELS).to(device).to(dtype)
+# Add a projection layer to match text embedding dimension
+projection = nn.Linear(LATENT_CHANNELS, TEXT_EMBED_DIM).to(device).to(dtype)
 
 def preprocess_image(image, image_size):
     preprocess = transforms.Compose([
@@ -47,10 +48,19 @@ def process_latents(latents, height, width):
     latents = latents.permute(0, 2, 3, 1).reshape(1, -1, LATENT_CHANNELS)
     print(f"Reshaped latent shape: {latents.shape}")
 
-    # Project latents from 16 to 64 dimensions
+    # Project latents to match text embedding dimension
     latents = projection(latents)
     print(f"Projected latent shape: {latents.shape}")
 
+    # Adjust sequence length to match text embeddings
+    seq_len = latents.shape[1]
+    if seq_len > MAX_TEXT_EMBEDDINGS:
+        latents = latents[:, :MAX_TEXT_EMBEDDINGS, :]
+    elif seq_len < MAX_TEXT_EMBEDDINGS:
+        pad_len = MAX_TEXT_EMBEDDINGS - seq_len
+        latents = torch.nn.functional.pad(latents, (0, 0, 0, pad_len, 0, 0))
+    print(f"Final latent shape: {latents.shape}")
+
     return latents
 
 @spaces.GPU()
@@ -79,11 +89,16 @@ def infer(prompt, init_image=None, seed=42, randomize_seed=False, width=1024, he
     latents = pipe.vae.encode(init_image).latent_dist.sample() * SCALING_FACTOR
     print(f"Initial latent shape from VAE: {latents.shape}")
 
-    # Process latents to match transformer input
+    # Process latents to match text embedding format
     latents = process_latents(latents, height, width)
 
-    print(f"x_embedder weight shape: {pipe.transformer.x_embedder.weight.shape}")
-    print(f"First transformer block input shape: {pipe.transformer.transformer_blocks[0].attn.to_q.weight.shape}")
+    # Get text embeddings
+    text_embeddings = pipe.transformer.text_encoder([prompt])
+    print(f"Text embedding shape: {text_embeddings.shape}")
+
+    # Combine image latents and text embeddings
+    combined_embeddings = torch.cat([latents, text_embeddings], dim=1)
+    print(f"Combined embedding shape: {combined_embeddings.shape}")
 
     image = pipe(
         prompt=prompt,
@@ -92,7 +107,7 @@ def infer(prompt, init_image=None, seed=42, randomize_seed=False, width=1024, he
         num_inference_steps=num_inference_steps,
         generator=generator,
         guidance_scale=0.0,
-        latents=latents
+        latents=combined_embeddings
     ).images[0]
 
     return image, seed
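
For reference, a minimal, self-contained sketch of the projection-and-padding step the new process_latents performs. The helper name project_and_pad and the example input shape are illustrative, not taken from the Space:

import torch
import torch.nn as nn
import torch.nn.functional as F

LATENT_CHANNELS = 16      # FLUX VAE latent channels (from the diff)
TEXT_EMBED_DIM = 768      # CLIP text-embedding width (from the diff)
MAX_TEXT_EMBEDDINGS = 77  # CLIP sequence length (from the diff)

projection = nn.Linear(LATENT_CHANNELS, TEXT_EMBED_DIM)

def project_and_pad(latents):
    # Flatten the spatial grid into a token sequence: (b, c, h, w) -> (b, h*w, c)
    b, c, h, w = latents.shape
    tokens = latents.permute(0, 2, 3, 1).reshape(b, h * w, c)
    # Project each 16-dim latent "token" to the 768-dim text-embedding width
    tokens = projection(tokens)
    # Truncate or zero-pad the sequence to exactly 77 tokens
    seq_len = tokens.shape[1]
    if seq_len > MAX_TEXT_EMBEDDINGS:
        tokens = tokens[:, :MAX_TEXT_EMBEDDINGS, :]
    elif seq_len < MAX_TEXT_EMBEDDINGS:
        tokens = F.pad(tokens, (0, 0, 0, MAX_TEXT_EMBEDDINGS - seq_len))
    return tokens

# A hypothetical 1024x1024 image encodes to a (1, 16, 128, 128) latent if the
# VAE downsamples by 8, as FLUX's does: 128 * 128 = 16384 spatial tokens.
out = project_and_pad(torch.randn(1, LATENT_CHANNELS, 128, 128))
print(out.shape)  # torch.Size([1, 77, 768])

Note the truncation is severe on this path: 16384 spatial tokens are cut down to 77, discarding almost all of the encoded image.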
 
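One caveat on the new infer hunk: in stock diffusers the text encoders hang off the pipeline itself (pipe.text_encoder for CLIP, pipe.text_encoder_2 for T5), not off pipe.transformer, so pipe.transformer.text_encoder([prompt]) should raise AttributeError. A hedged sketch of the usual route, assuming a recent diffusers FluxPipeline whose encode_prompt returns (prompt_embeds, pooled_prompt_embeds, text_ids); the exact signature can differ between releases:

# Hedged sketch (not the committed code): assumes `pipe` is the FluxPipeline
# loaded in app.py and `prompt`/`device` come from infer's scope.
def get_text_embeddings(pipe, prompt, device):
    prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
        prompt=prompt,
        prompt_2=prompt,  # FLUX feeds a second prompt to the T5 encoder
        device=device,
        num_images_per_prompt=1,
    )
    # prompt_embeds typically comes from T5 at width 4096, while the 768-wide
    # CLIP vector is pooled_prompt_embeds, so the 768-wide projected latents
    # would not concatenate with prompt_embeds without a further projection.
    return prompt_embeds, pooled_prompt_embeds

A related caveat: as far as I can tell, FluxPipeline's latents argument expects packed image latents (FLUX packs 2x2 latent patches into 64-channel tokens), so passing the 768-wide combined_embeddings there would likely fail the transformer's x_embedder shape check rather than be used as intended.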