Ryukijano committed
Commit 7c212f5 · verified · 1 Parent(s): af5f1ec

Update app.py

Files changed (1): app.py +71 -8
app.py CHANGED
@@ -1,14 +1,15 @@
+import torch
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
 import gradio as gr
 import numpy as np
 import random
 import spaces
-import torch
 import time
 from diffusers import DiffusionPipeline, AutoencoderTiny
 from diffusers.models.attention_processor import AttnProcessor2_0
 from custom_pipeline import FluxWithCFGPipeline
-
-torch.backends.cuda.matmul.allow_tf32 = True
+import asyncio
 
 # Constants
 MAX_SEED = np.iinfo(np.int32).max
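
Note: moving the torch import to the top lets the TF32 flags take effect before any CUDA work. These flags opt float32 matmuls and cuDNN convolutions into TF32 Tensor Cores on Ampere-or-newer GPUs, trading a few mantissa bits for throughput. A minimal sketch of the same setting via the newer single switch (assuming PyTorch 1.12+; roughly equivalent for matmuls only):

    import torch

    # "high" allows TF32 for float32 matmuls (PyTorch 1.12+).
    torch.set_float32_matmul_precision("high")

    # Per-backend flags, as used in this commit:
    torch.backends.cuda.matmul.allow_tf32 = True  # float32 matmuls
    torch.backends.cudnn.allow_tf32 = True        # cuDNN convolutions
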
@@ -29,6 +30,11 @@ pipe.set_adapters(["better"], adapter_weights=[1.0])
 pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
 pipe.unload_lora_weights()
 
+pipe.unet.to(memory_format=torch.channels_last)
+pipe.vae.to(memory_format=torch.channels_last)
+
+pipe.enable_xformers_memory_efficient_attention()
+
 torch.cuda.empty_cache()
 
 # Inference function
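
Note: this hunk is best read as a sketch. The app builds a Flux pipeline (FluxWithCFGPipeline), which exposes pipe.transformer rather than pipe.unet (the capture code below calls pipe.transformer), so pipe.unet would likely raise AttributeError; and xformers attention is not available in every environment or for every model family. A guarded variant, assuming only that the pipeline has a vae plus an optional unet or transformer:

    # Hypothetical guarded variant; `pipe` as constructed earlier in app.py.
    backbone = getattr(pipe, "unet", None) or getattr(pipe, "transformer", None)
    if backbone is not None:
        backbone.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)  # channels_last mainly helps conv nets

    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception:
        pass  # fall back to PyTorch SDPA attention if xformers is unsupported
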
@@ -40,13 +46,68 @@ def generate_image(prompt, seed=24, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT,
 
     start_time = time.time()
 
+    # Initialize static inputs for CUDA graph
+    static_latents = torch.randn((1, 4, height // 8, width // 8), dtype=dtype, device="cuda")
+    static_prompt_embeds = torch.randn((2, 77, 768), dtype=dtype, device="cuda")  # Adjust dimensions as needed
+    static_pooled_prompt_embeds = torch.randn((2, 768), dtype=dtype, device="cuda")  # Adjust dimensions as needed
+    static_text_ids = torch.tensor([[[1, 2, 3]]], dtype=torch.int32, device="cuda")
+    static_latent_image_ids = torch.tensor([1], dtype=torch.int64, device="cuda")
+    static_timestep = torch.tensor([999], dtype=dtype, device="cuda")
+
+    # Warmup
+    s = torch.cuda.Stream()
+    s.wait_stream(torch.cuda.current_stream())
+    with torch.cuda.stream(s):
+        for _ in range(3):
+            _ = pipe.transformer(
+                hidden_states=static_latents,
+                timestep=static_timestep / 1000,
+                guidance=None,
+                pooled_projections=static_pooled_prompt_embeds,
+                encoder_hidden_states=static_prompt_embeds,
+                txt_ids=static_text_ids,
+                img_ids=static_latent_image_ids,
+                joint_attention_kwargs=pipe.joint_attention_kwargs,
+                return_dict=False,
+            )
+    torch.cuda.current_stream().wait_stream(s)
+
+    # Capture CUDA Graph
+    g = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(g):
+        static_noise_pred = pipe.transformer(
+            hidden_states=static_latents,
+            timestep=static_timestep / 1000,
+            guidance=None,
+            pooled_projections=static_pooled_prompt_embeds,
+            encoder_hidden_states=static_prompt_embeds,
+            txt_ids=static_text_ids,
+            img_ids=static_latent_image_ids,
+            joint_attention_kwargs=pipe.joint_attention_kwargs,
+            return_dict=False,
+        )[0]
+        static_latents_out = pipe.scheduler.step(static_noise_pred, static_timestep, static_latents, return_dict=False)[0]
+        static_output = pipe._decode_latents_to_image(static_latents_out, height, width, "pil")
+
+    # Graph-based generation function
+    def generate_with_graph(latents, prompt_embeds, pooled_prompt_embeds, text_ids, latent_image_ids, timestep):
+        static_latents.copy_(latents)
+        static_prompt_embeds.copy_(prompt_embeds)
+        static_pooled_prompt_embeds.copy_(pooled_prompt_embeds)
+        static_text_ids.copy_(text_ids)
+        static_latent_image_ids.copy_(latent_image_ids)
+        static_timestep.copy_(timestep)
+        g.replay()
+        return static_output
+
     # Only generate the last image in the sequence
     img = pipe.generate_images(
         prompt=prompt,
         width=width,
         height=height,
         num_inference_steps=num_inference_steps,
-        generator=generator
+        generator=generator,
+        generate_with_graph=generate_with_graph
     )
     latency = f"Latency: {(time.time()-start_time):.2f} seconds"
     return img, seed, latency
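
Note: the added block follows PyTorch's standard side-stream-warmup / capture / replay recipe for CUDA Graphs, with caveats: the static tensor shapes are placeholders (per the author's own "Adjust dimensions as needed", and `dtype` is not defined in this hunk), capture happens inside generate_image so the graph is rebuilt on every request rather than amortized, and the decode-to-PIL call sits inside the capture region even though graph capture cannot record host-side work. The core pattern, as a self-contained sketch on a toy module (names like static_in and run_graph are illustrative, not from app.py; requires a CUDA device):

    import torch

    device = "cuda"
    model = torch.nn.Linear(64, 64).to(device)
    static_in = torch.randn(8, 64, device=device)  # fixed shape: replay reuses exact allocations

    # Warm up on a side stream so one-time kernel/initialization work is not captured.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        for _ in range(3):
            model(static_in)
    torch.cuda.current_stream().wait_stream(s)

    # Capture once; tensors created inside become graph-owned outputs.
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        static_out = model(static_in)

    def run_graph(x):
        static_in.copy_(x)   # refill the static input buffer in place
        g.replay()           # re-launch the recorded kernels
        return static_out    # same storage, now holding fresh results

    print(run_graph(torch.randn(8, 64, device=device)).sum().item())

Capturing once at startup (outside generate_image) and only replaying per request is what actually amortizes the launch overhead.
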
@@ -138,9 +199,11 @@ with gr.Blocks() as demo:
         concurrency_limit=None
     )
 
-    def realtime_generation(*args):
+    async def realtime_generation(*args):
         if args[0]:  # If realtime is enabled
-            return next(generate_image(*args[1:]))
+            loop = asyncio.get_event_loop()
+            result = await loop.run_in_executor(None, next, generate_image(*args[1:]))
+            return result
 
     prompt.submit(
         fn=generate_image,
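
Note: the async rewrite aims to keep Gradio's event loop responsive, but as written the arguments to run_in_executor are evaluated eagerly, so generate_image(*args[1:]) still runs on the event loop and only the next() call is offloaded; next() also assumes generate_image is a generator, whereas the version above returns a tuple. A simpler sketch that offloads the whole call (assuming Python 3.9+ and a plain, non-generator generate_image):

    import asyncio

    async def realtime_generation(realtime, *gen_args):
        if realtime:  # run the blocking generation in a worker thread
            return await asyncio.to_thread(generate_image, *gen_args)
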
@@ -158,9 +221,9 @@ with gr.Blocks() as demo:
         outputs=[result, seed, latency],
         show_progress="hidden",
         trigger_mode="always_last",
-        queue=False,
+        queue=True,
         concurrency_limit=None
     )
 
 # Launch the app
-demo.launch()
+demo.launch()