Ryukijano committed (verified)
Commit 858fb7b · Parent: 105e0dd

Update app.py

Files changed (1)
  1. app.py +110 -112
app.py CHANGED
@@ -1,16 +1,15 @@
-import torch
-
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
 import gradio as gr
 import numpy as np
 import random
 import spaces
+import torch
 import time
 from diffusers import DiffusionPipeline, AutoencoderTiny
 from diffusers.models.attention_processor import AttnProcessor2_0
 from custom_pipeline import FluxWithCFGPipeline

+torch.backends.cuda.matmul.allow_tf32 = True
+
 # Constants
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 2048
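Note: the TF32 switch that moves below the imports trades a little float32 matmul precision for tensor-core throughput on Ampere-class and newer GPUs. A minimal standalone sketch; the cuDNN counterpart that this commit drops is shown for comparison:

    import torch

    # Allow TensorFloat-32 tensor cores for float32 matmuls (Ampere and newer).
    torch.backends.cuda.matmul.allow_tf32 = True
    # The previous revision also enabled the cuDNN counterpart:
    torch.backends.cudnn.allow_tf32 = True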
@@ -25,72 +24,110 @@ pipe = FluxWithCFGPipeline.from_pretrained(
 )
 pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype)
 pipe.to("cuda")
-pipe.load_lora_weights(
-    "hugovntr/flux-schnell-realism",
-    weight_name="schnell-realism_v2.3.safetensors",
-    adapter_name="better",
-)
+pipe.load_lora_weights('hugovntr/flux-schnell-realism', weight_name='schnell-realism_v2.3.safetensors', adapter_name="better")
 pipe.set_adapters(["better"], adapter_weights=[1.0])
 pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
 pipe.unload_lora_weights()

-# Correctly set memory format
-pipe.transformer.to(memory_format=torch.channels_last)
-pipe.vae.to(memory_format=torch.channels_last)
-
-# Conditionally enable xformers only for the transformer
-if hasattr(pipe, "transformer") and torch.cuda.is_available():
-    try:
-        pipe.transformer.enable_xformers_memory_efficient_attention()
-    except Exception as e:
-        print(
-            "Warning: Could not enable xformers for the transformer due to the following error:"
-        )
-        print(e)
+# Memory optimizations
+pipe.unet.to(memory_format=torch.channels_last)  # Channels last
+pipe.enable_xformers_memory_efficient_attention()  # Flash Attention
+
+# CUDA Graph setup
+static_inputs = None
+static_model = None
+graph = None
+
+def setup_cuda_graph(prompt, height, width, num_inference_steps):
+    global static_inputs, static_model, graph
+
+    batch_size = 1 if isinstance(prompt, str) else len(prompt)
+    device = "cuda"
+    num_images_per_prompt = 1
+
+    prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
+        prompt=prompt,
+        prompt_2=None,
+        prompt_embeds=None,
+        pooled_prompt_embeds=None,
+        device=device,
+        num_images_per_prompt=num_images_per_prompt,
+        max_sequence_length=300,
+        lora_scale=None,
+    )
+
+    latents, latent_image_ids = pipe.prepare_latents(
+        batch_size * num_images_per_prompt,
+        pipe.transformer.config.in_channels // 4,
+        height,
+        width,
+        prompt_embeds.dtype,
+        device,
+        None,
+        None,
+    )
+    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+    image_seq_len = latents.shape[1]
+    mu = calculate_timestep_shift(image_seq_len)
+
+    timesteps, num_inference_steps = prepare_timesteps(
+        pipe.scheduler,
+        num_inference_steps,
+        device,
+        None,
+        sigmas,
+        mu=mu,
+    )
+
+    guidance = torch.full([1], 3.5, device=device, dtype=torch.float16).expand(latents.shape[0]) if pipe.transformer.config.guidance_embeds else None
+
+    static_inputs = {
+        "hidden_states": latents,
+        "timestep": timesteps,
+        "guidance": guidance,
+        "pooled_projections": pooled_prompt_embeds,
+        "encoder_hidden_states": prompt_embeds,
+        "txt_ids": text_ids,
+        "img_ids": latent_image_ids,
+        "joint_attention_kwargs": None,
+    }

-torch.cuda.empty_cache()
+    static_model = torch.cuda.make_graphed_callables(pipe.transformer, (static_inputs,))
+    graph = torch.cuda.CUDAGraph()
+
+    with torch.cuda.graph(graph):
+        static_output = static_model(**static_inputs)

 # Inference function
 @spaces.GPU(duration=25)
-def generate_image(
-    prompt,
-    seed=24,
-    width=DEFAULT_WIDTH,
-    height=DEFAULT_HEIGHT,
-    randomize_seed=False,
-    num_inference_steps=2,
-    progress=gr.Progress(track_tqdm=True),
-):
+def generate_image(prompt, seed=24, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT, randomize_seed=False, num_inference_steps=2, progress=gr.Progress(track_tqdm=True)):
+    global static_inputs, graph
+
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(int(float(seed)))

     start_time = time.time()
+
+    if static_inputs is None:
+        setup_cuda_graph(prompt, height, width, num_inference_steps)

-    # Dynamically determine shapes based on input width/height
-    latents_shape = (1, 4, height // 8, width // 8)
-    prompt_embeds_shape = (
-        1,
-        pipe.transformer.text_encoder.config.max_position_embeddings,
-        pipe.transformer.text_encoder.config.hidden_size,
-    )
-    pooled_prompt_embeds_shape = (
+    static_inputs["hidden_states"].copy_(pipe.prepare_latents(
         1,
-        pipe.transformer.text_encoder.config.hidden_size,
-    )
-
-    # Only generate the last image in the sequence
-    img = pipe.generate_images(
-        prompt=prompt,
-        width=width,
-        height=height,
-        num_inference_steps=num_inference_steps,
-        generator=generator,
-        latents_shape=latents_shape,
-        prompt_embeds_shape=prompt_embeds_shape,
-        pooled_prompt_embeds_shape=pooled_prompt_embeds_shape
-    )
-    latency = f"Latency: {(time.time()-start_time):.2f} seconds"
+        pipe.transformer.config.in_channels // 4,
+        height,
+        width,
+        static_inputs["encoder_hidden_states"].dtype,
+        "cuda",
+        generator,
+        None,
+    )[0])
+
+    graph.replay()
+    latents = static_inputs["hidden_states"]
+
+    img = pipe._decode_latents_to_image(latents, height, width, "pil")
+    latency = f"Latency: {(time.time()-start_time):.2f} seconds"
     return img, seed, latency

 # Example prompts
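For context, the new setup_cuda_graph/generate_image pair follows PyTorch's CUDA Graphs capture-and-replay pattern: allocate static input tensors once, capture one forward pass into a graph, then serve each request by copying fresh data into the static tensors and calling replay(). A minimal self-contained sketch of that pattern, with a placeholder nn.Linear standing in for the FLUX transformer:

    import torch

    net = torch.nn.Linear(64, 64).cuda()            # placeholder model
    static_in = torch.randn(8, 64, device="cuda")   # static input buffer

    # Warm up on a side stream before capture, as the CUDA Graphs docs advise.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            net(static_in)
    torch.cuda.current_stream().wait_stream(s)

    # Capture one forward pass; static_out stays valid across replays.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_out = net(static_in)

    # Per request: refill the static buffer, replay, read the static output.
    static_in.copy_(torch.randn(8, 64, device="cuda"))
    graph.replay()
    result = static_out.clone()

Note that torch.cuda.make_graphed_callables already returns graph-backed callables, so the commit's additional manual capture of static_model is redundant in principle; either mechanism alone implements the pattern.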
@@ -108,18 +145,12 @@ examples = [
 with gr.Blocks() as demo:
     with gr.Column(elem_id="app-container"):
         gr.Markdown("# 🎨 Realtime FLUX Image Generator")
-        gr.Markdown(
-            "Generate stunning images in real-time with Modified Flux.Schnell pipeline."
-        )
-        gr.Markdown(
-            "<span style='color: red;'>Note: Sometimes it stucks or stops generating images (I don't know why). In that situation just refresh the site.</span>"
-        )
+        gr.Markdown("Generate stunning images in real-time with Modified Flux.Schnell pipeline.")
+        gr.Markdown("<span style='color: red;'>Note: Sometimes it stucks or stops generating images (I don't know why). In that situation just refresh the site.</span>")

         with gr.Row():
             with gr.Column(scale=2.5):
-                result = gr.Image(
-                    label="Generated Image", show_label=False, interactive=False
-                )
+                result = gr.Image(label="Generated Image", show_label=False, interactive=False)
             with gr.Column(scale=1):
                 prompt = gr.Text(
                     label="Prompt",
@@ -133,39 +164,15 @@ with gr.Blocks() as demo:

                 with gr.Column("Advanced Options"):
                     with gr.Row():
-                        realtime = gr.Checkbox(
-                            label="Realtime Toggler",
-                            info="If TRUE then uses more GPU but create image in realtime.",
-                            value=False,
-                        )
+                        realtime = gr.Checkbox(label="Realtime Toggler", info="If TRUE then uses more GPU but create image in realtime.", value=False)
                         latency = gr.Text(label="Latency")
                     with gr.Row():
                         seed = gr.Number(label="Seed", value=42)
-                        randomize_seed = gr.Checkbox(
-                            label="Randomize Seed", value=True
-                        )
+                        randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                     with gr.Row():
-                        width = gr.Slider(
-                            label="Width",
-                            minimum=256,
-                            maximum=MAX_IMAGE_SIZE,
-                            step=32,
-                            value=DEFAULT_WIDTH,
-                        )
-                        height = gr.Slider(
-                            label="Height",
-                            minimum=256,
-                            maximum=MAX_IMAGE_SIZE,
-                            step=32,
-                            value=DEFAULT_HEIGHT,
-                        )
-                        num_inference_steps = gr.Slider(
-                            label="Inference Steps",
-                            minimum=1,
-                            maximum=4,
-                            step=1,
-                            value=DEFAULT_INFERENCE_STEPS,
-                        )
+                        width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=DEFAULT_WIDTH)
+                        height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=DEFAULT_HEIGHT)
+                        num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=4, step=1, value=DEFAULT_INFERENCE_STEPS)

         with gr.Row():
             gr.Markdown("### 🌟 Inspiration Gallery")
@@ -175,7 +182,7 @@ with gr.Blocks() as demo:
         fn=generate_image,
         inputs=[prompt],
         outputs=[result, seed, latency],
-        cache_examples="lazy",
+        cache_examples="lazy"
     )

     enhanceBtn.click(
@@ -184,7 +191,7 @@ with gr.Blocks() as demo:
         outputs=[result, seed, latency],
         show_progress="full",
         queue=False,
-        concurrency_limit=None,
+        concurrency_limit=None
     )

     generateBtn.click(
@@ -199,7 +206,7 @@ with gr.Blocks() as demo:
     def update_ui(realtime_enabled):
         return {
             prompt: gr.update(interactive=True),
-            generateBtn: gr.update(visible=not realtime_enabled),
+            generateBtn: gr.update(visible=not realtime_enabled)
         }

     realtime.change(
@@ -207,13 +214,12 @@ with gr.Blocks() as demo:
         inputs=[realtime],
         outputs=[prompt, generateBtn],
         queue=False,
-        concurrency_limit=None,
+        concurrency_limit=None
     )

     def realtime_generation(*args):
         if args[0]:  # If realtime is enabled
-            img, seed, latency = generate_image(*args[1:])
-            return img, seed, latency
+            return next(generate_image(*args[1:]))

     prompt.submit(
         fn=generate_image,
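The switch to next(generate_image(*args[1:])) assumes generate_image yields results as a generator; as committed above it returns a plain tuple, on which next() would raise TypeError. A hypothetical sketch of the generator-style function this call expects (the name and stages are illustrative only, not the Space's actual code):

    def generate_image_streaming(prompt, seed=24):
        # Yield intermediate frames so callers can iterate or take the first.
        for stage in ("draft", "refined", "final"):   # placeholder stages
            yield f"{stage} image for {prompt!r}", seed, "Latency: ..."

    img, seed, latency = next(generate_image_streaming("a sunset"))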
@@ -221,27 +227,19 @@ with gr.Blocks() as demo:
         outputs=[result, seed, latency],
         show_progress="full",
         queue=False,
-        concurrency_limit=None,
+        concurrency_limit=None
     )

     for component in [prompt, width, height, num_inference_steps]:
         component.input(
             fn=realtime_generation,
-            inputs=[
-                realtime,
-                prompt,
-                seed,
-                width,
-                height,
-                randomize_seed,
-                num_inference_steps,
-            ],
+            inputs=[realtime, prompt, seed, width, height, randomize_seed, num_inference_steps],
             outputs=[result, seed, latency],
             show_progress="hidden",
             trigger_mode="always_last",
-            queue=True,
-            concurrency_limit=None,
+            queue=False,
+            concurrency_limit=None
         )

 # Launch the app
-demo.launch()
+demo.queue(max_size=5, concurrency_count=1).launch()
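One compatibility note on the new launch line: concurrency_count is the Gradio 3.x spelling, and Gradio 4.x removed it in favor of default_concurrency_limit. A sketch of both, assuming the Space pins Gradio 3.x as committed:

    # Gradio 3.x (as committed): one worker, queue capped at 5 waiting events.
    demo.queue(max_size=5, concurrency_count=1).launch()

    # Gradio 4.x equivalent after the rename:
    demo.queue(max_size=5, default_concurrency_limit=1).launch()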