Ryukijano committed
Commit af5f1ec · verified · 1 Parent(s): eeb1f81

Upload 2 files

Files changed (2):
  1. app.py +5 -10
  2. custom_pipeline.py +17 -32
app.py CHANGED

@@ -1,6 +1,7 @@
 import gradio as gr
 import numpy as np
 import random
+import spaces
 import torch
 import time
 from diffusers import DiffusionPipeline, AutoencoderTiny
@@ -8,7 +9,6 @@ from diffusers.models.attention_processor import AttnProcessor2_0
 from custom_pipeline import FluxWithCFGPipeline
 
 torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.benchmark = True
 
 # Constants
 MAX_SEED = np.iinfo(np.int32).max
@@ -18,7 +18,7 @@ DEFAULT_HEIGHT = 1024
 DEFAULT_INFERENCE_STEPS = 1
 
 # Device and model setup
-dtype = torch.bfloat16
+dtype = torch.float16
 pipe = FluxWithCFGPipeline.from_pretrained(
     "black-forest-labs/FLUX.1-schnell", torch_dtype=dtype
 )
@@ -28,16 +28,11 @@ pipe.load_lora_weights('hugovntr/flux-schnell-realism', weight_name='schnell-rea
 pipe.set_adapters(["better"], adapter_weights=[1.0])
 pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
 pipe.unload_lora_weights()
-pipe.enable_xformers_memory_efficient_attention()
-pipe.unet.to(memory_format=torch.channels_last)
-pipe.vae.to(memory_format=torch.channels_last)
-
-pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead")
-pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead")
 
 torch.cuda.empty_cache()
 
 # Inference function
+@spaces.GPU(duration=25)
 def generate_image(prompt, seed=24, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT, randomize_seed=False, num_inference_steps=2, progress=gr.Progress(track_tqdm=True)):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
@@ -78,7 +73,7 @@ with gr.Blocks() as demo:
         with gr.Column(scale=2.5):
             result = gr.Image(label="Generated Image", show_label=False, interactive=False)
         with gr.Column(scale=1):
-            prompt = gr.Textbox(
+            prompt = gr.Text(
                 label="Prompt",
                 placeholder="Describe the image you want to generate...",
                 lines=3,
@@ -91,7 +86,7 @@ with gr.Blocks() as demo:
            with gr.Column("Advanced Options"):
                with gr.Row():
                    realtime = gr.Checkbox(label="Realtime Toggler", info="If TRUE then uses more GPU but create image in realtime.", value=False)
-                   latency = gr.Textbox(label="Latency")
+                   latency = gr.Text(label="Latency")
                with gr.Row():
                    seed = gr.Number(label="Seed", value=42)
                    randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
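
Note on the app.py changes: the commit trades always-on CUDA tuning done at import time (cudnn benchmarking, xformers attention, channels_last memory format, torch.compile) for Hugging Face ZeroGPU scheduling, where the new @spaces.GPU(duration=25) decorator requests a GPU only while generate_image is running; the working dtype also moves from bfloat16 to float16. A minimal sketch of the ZeroGPU pattern, assuming the Space runs on ZeroGPU hardware where the spaces package is preinstalled (the tiny Linear model is a hypothetical stand-in for the FLUX pipeline):

    import spaces
    import torch

    # Load the model on CPU at startup; ZeroGPU attaches a GPU to the
    # process only while a @spaces.GPU-decorated function is executing.
    model = torch.nn.Linear(4, 4)

    @spaces.GPU(duration=25)  # request up to 25 seconds of GPU time per call
    def run(x: torch.Tensor) -> torch.Tensor:
        return model.to("cuda")(x.to("cuda")).cpu()
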
custom_pipeline.py CHANGED

@@ -3,29 +3,20 @@ import numpy as np
 from diffusers import FluxPipeline, FlowMatchEulerDiscreteScheduler
 from typing import Any, Dict, List, Optional, Union
 from PIL import Image
-from torch.cuda import graphs
 
-# Enable TF32 and memory format optimizations
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-torch.backends.cudnn.benchmark = True
-
-# Constants with optimized values
+# Constants for shift calculation
 BASE_SEQ_LEN = 256
 MAX_SEQ_LEN = 4096
 BASE_SHIFT = 0.5
 MAX_SHIFT = 1.2
-BATCH_SIZE = 4  # Optimal batch size for A100
 
-@torch.jit.script
+# Helper functions
 def calculate_timestep_shift(image_seq_len: int) -> float:
-    BASE_SEQ_LEN = 256
-    MAX_SEQ_LEN = 4096
-    BASE_SHIFT = 0.5
-    MAX_SHIFT = 1.2
+    """Calculates the timestep shift (mu) based on the image sequence length."""
     m = (MAX_SHIFT - BASE_SHIFT) / (MAX_SEQ_LEN - BASE_SEQ_LEN)
     b = BASE_SHIFT - m * BASE_SEQ_LEN
-    return image_seq_len * m + b
+    mu = image_seq_len * m + b
+    return mu
 
 def prepare_timesteps(
     scheduler: FlowMatchEulerDiscreteScheduler,
@@ -35,25 +26,19 @@ def prepare_timesteps(
     sigmas: Optional[List[float]] = None,
     mu: Optional[float] = None,
 ) -> (torch.Tensor, int):
-    """Optimized timestep preparation with CUDA graphs support"""
-    if device is None:
-        device = torch.device("cuda")
-
-    # Pre-calculate timesteps using CUDA graph
-    static_input = torch.tensor([], device=device)
-    g = torch.cuda.CUDAGraph()
-
-    with torch.cuda.graph(g):
-        if timesteps is not None:
-            scheduler.set_timesteps(timesteps=timesteps, device=device)
-        elif sigmas is not None:
-            scheduler.set_timesteps(sigmas=sigmas, device=device)
-        else:
-            scheduler.set_timesteps(num_inference_steps, device=device, mu=mu)
-
-    timesteps = scheduler.timesteps.to(memory_format=torch.channels_last)
+    """Prepares the timesteps for the diffusion process."""
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
+
+    if timesteps is not None:
+        scheduler.set_timesteps(timesteps=timesteps, device=device)
+    elif sigmas is not None:
+        scheduler.set_timesteps(sigmas=sigmas, device=device)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, mu=mu)
+
+    timesteps = scheduler.timesteps
     num_inference_steps = len(timesteps)
-
     return timesteps, num_inference_steps
 
 # FLUX pipeline function
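
Note on the custom_pipeline.py changes: the de-duplicated calculate_timestep_shift is a straight linear interpolation of mu through the points (BASE_SEQ_LEN, BASE_SHIFT) and (MAX_SEQ_LEN, MAX_SHIFT). A standalone sanity check mirroring the constants above (the 4096-token figure assumes FLUX's 8x VAE downsample plus 2x2 latent packing at 1024x1024, i.e. 64 * 64 tokens):

    BASE_SEQ_LEN, MAX_SEQ_LEN = 256, 4096
    BASE_SHIFT, MAX_SHIFT = 0.5, 1.2

    def calculate_timestep_shift(image_seq_len: int) -> float:
        """Linearly interpolate mu between the base and max shifts."""
        m = (MAX_SHIFT - BASE_SHIFT) / (MAX_SEQ_LEN - BASE_SEQ_LEN)
        b = BASE_SHIFT - m * BASE_SEQ_LEN
        return image_seq_len * m + b

    print(calculate_timestep_shift(256))   # 0.5 -> BASE_SHIFT at the minimum length
    print(calculate_timestep_shift(4096))  # 1.2 -> MAX_SHIFT at 1024x1024

The rewritten prepare_timesteps also replaces the CUDA-graph capture with a plain argument check, which looks right: scheduler.set_timesteps does host-side work that a CUDA graph cannot capture, and applying channels_last memory format to a 1-D timestep tensor was never meaningful.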