Spaces:

1inkusFace
/

StableDiffusion-3.5-Large-lora

Running on Zero

App Files Files Community

1inkusFace committed on Oct 13

Commit

e95c348

verified ·

1 Parent(s): 89200e0

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -0

app.py CHANGED Viewed

@@ -44,6 +44,40 @@ from diffusers import StableDiffusion3Pipeline, SD3Transformer2DModel, Autoencod
 from PIL import Image
 from image_gen_aux import UpscaleWithModel
 # --- GCS Configuration ---
 # Make sure to set these secrets in your Hugging Face Space settings
 GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME")
@@ -78,6 +112,17 @@ def upload_to_gcs(image_object, filename):
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 def load_model():
     pipe = StableDiffusion3Pipeline.from_pretrained(
         "ford442/stable-diffusion-3.5-large-bf16",
@@ -89,11 +134,21 @@ def load_model():
     pipe.transformer=ll_transformer
     pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/UltraReal.safetensors")
     pipe.to(device=device, dtype=torch.bfloat16)
     upscaler_2 = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(device)
     return pipe, upscaler_2
 pipe, upscaler_2 = load_model()
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 4096

 from PIL import Image
 from image_gen_aux import UpscaleWithModel
+from diffusers.models.attention_processor import Attention
+from kernels import get_kernel
+vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
+class FlashAttentionProcessor:
+    """Attention processor that runs attention through the ``vllm_flash_attn3`` kernel.
+
+    This is a plain callable rather than an ``Attention`` subclass: it holds no
+    parameters of its own and is invoked with the owning ``Attention`` module as
+    ``attn``. Instantiate it with no arguments.
+
+    Calling the processor:
+      1. projects ``hidden_states`` with ``attn.to_q`` and the context with
+         ``attn.to_k`` / ``attn.to_v`` (the context defaults to
+         ``hidden_states`` when ``encoder_hidden_states`` is ``None``);
+      2. multiplies the query by ``attn.scale``;
+      3. splits the last dimension into ``attn.heads`` heads;
+      4. calls ``vllm_flash_attn3.attention`` with a preallocated output tensor;
+      5. merges the heads and applies ``attn.to_out[0]`` then ``attn.to_out[1]``.
+
+    ``attention_mask`` and any extra keyword arguments are accepted for
+    signature compatibility but are not used.
+    """
+
+    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **kwargs):
+        # No separate context means self-attention over hidden_states.
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        # Scale the queries
+        # NOTE(review): if the kernel applies its own softmax scale this
+        # double-scales the logits -- confirm against the kernel's documentation.
+        query = query * attn.scale
+        # Reshape to match kernel requirements
+        b, t, c = query.shape
+        h = attn.heads
+        head_dim = c // h
+        q_reshaped = query.reshape(b, t, h, head_dim)
+        # The key/value sequence length comes from the context tensor and may
+        # differ from the query length, so it is inferred rather than reusing t.
+        k_reshaped = key.reshape(b, -1, h, head_dim)
+        v_reshaped = value.reshape(b, -1, h, head_dim)
+        out_reshaped = torch.empty_like(q_reshaped)
+        # Call the pre-compiled kernel
+        # NOTE(review): verify that the loaded kernel exposes
+        # attention(q, k, v, out) with this argument order.
+        vllm_flash_attn3.attention(q_reshaped, k_reshaped, v_reshaped, out_reshaped)
+        # Reshape output back
+        out = out_reshaped.reshape(b, t, c)
+        out = attn.to_out[0](out)
+        out = attn.to_out[1](out)
+        return out
 # --- GCS Configuration ---
 # Make sure to set these secrets in your Hugging Face Space settings
 GCS_BUCKET_NAME = os.getenv("GCS_BUCKET_NAME")
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+@spaces.GPU(duration=120)
+def compile_transformer():
+    """Export and ahead-of-time compile ``pipe.transformer``.
+
+    Runs the module-level ``pipe`` once on a fixed sample prompt inside a
+    ``spaces.aoti_capture`` context, feeds the captured ``args`` / ``kwargs``
+    to ``torch.export.export`` as example inputs, and returns the result of
+    ``spaces.aoti_compile`` on the exported program.
+    """
+    sample_prompt = "A majestic, ancient Egyptian Sphinx stands sentinel in a large, clear pool under a bright, golden desert sun. Around its weathered stone base, several sleek, playful dolphins gracefully navigate the turquoise waters. The surrounding environment features lush, exotic papyrus plants and distant pyramids under a cloudless sky, conveying a sense of timeless wonder and serene majesty."
+    # One real pipeline run supplies the example inputs for export.
+    with spaces.aoti_capture(pipe.transformer) as captured:
+        pipe(sample_prompt)
+    exported_program = torch.export.export(pipe.transformer, args=captured.args, kwargs=captured.kwargs)
+    return spaces.aoti_compile(exported_program)
 def load_model():
     pipe = StableDiffusion3Pipeline.from_pretrained(
         "ford442/stable-diffusion-3.5-large-bf16",
     pipe.transformer=ll_transformer
     pipe.load_lora_weights("ford442/sdxl-vae-bf16", weight_name="LoRA/UltraReal.safetensors")
     pipe.to(device=device, dtype=torch.bfloat16)
+    for name, module in pipe.unet.named_modules():
+    if isinstance(module, Attention):
+        module.processor = fa_processor
     upscaler_2 = UpscaleWithModel.from_pretrained("Kim2091/ClearRealityV1").to(device)
     return pipe, upscaler_2
+fa_processor = FlashAttentionProcessor()
 pipe, upscaler_2 = load_model()
+compiled_transformer = compile_transformer()
+spaces.aoti_apply(compiled_transformer, pipe.transformer)
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 4096