Ryukijano committed
Commit af5f1ec · verified · 1 Parent(s): eeb1f81

Upload 2 files

Files changed (2):
  1. app.py +5 -10
  2. custom_pipeline.py +17 -32
app.py CHANGED

@@ -1,6 +1,7 @@
 import gradio as gr
 import numpy as np
 import random
+import spaces
 import torch
 import time
 from diffusers import DiffusionPipeline, AutoencoderTiny
@@ -8,7 +9,6 @@ from diffusers.models.attention_processor import AttnProcessor2_0
 from custom_pipeline import FluxWithCFGPipeline
 
 torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.benchmark = True
 
 # Constants
 MAX_SEED = np.iinfo(np.int32).max
@@ -18,7 +18,7 @@ DEFAULT_HEIGHT = 1024
 DEFAULT_INFERENCE_STEPS = 1
 
 # Device and model setup
-dtype = torch.bfloat16
+dtype = torch.float16
 pipe = FluxWithCFGPipeline.from_pretrained(
     "black-forest-labs/FLUX.1-schnell", torch_dtype=dtype
 )
@@ -28,16 +28,11 @@ pipe.load_lora_weights('hugovntr/flux-schnell-realism', weight_name='schnell-rea
 pipe.set_adapters(["better"], adapter_weights=[1.0])
 pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0)
 pipe.unload_lora_weights()
-pipe.enable_xformers_memory_efficient_attention()
-pipe.unet.to(memory_format=torch.channels_last)
-pipe.vae.to(memory_format=torch.channels_last)
-
-pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead")
-pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead")
 
 torch.cuda.empty_cache()
 
 # Inference function
+@spaces.GPU(duration=25)
 def generate_image(prompt, seed=24, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT, randomize_seed=False, num_inference_steps=2, progress=gr.Progress(track_tqdm=True)):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
@@ -78,7 +73,7 @@ with gr.Blocks() as demo:
         with gr.Column(scale=2.5):
             result = gr.Image(label="Generated Image", show_label=False, interactive=False)
         with gr.Column(scale=1):
-            prompt = gr.Textbox(
+            prompt = gr.Text(
                 label="Prompt",
                 placeholder="Describe the image you want to generate...",
                 lines=3,
@@ -91,7 +86,7 @@ with gr.Blocks() as demo:
            with gr.Column("Advanced Options"):
                with gr.Row():
                    realtime = gr.Checkbox(label="Realtime Toggler", info="If TRUE then uses more GPU but create image in realtime.", value=False)
-                   latency = gr.Textbox(label="Latency")
+                   latency = gr.Text(label="Latency")
                with gr.Row():
                    seed = gr.Number(label="Seed", value=42)
                    randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
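
Note on the app.py changes: the commit trades always-on CUDA tuning done at import time (cudnn benchmarking, xformers attention, channels_last memory format, torch.compile) for Hugging Face ZeroGPU scheduling, where the new @spaces.GPU(duration=25) decorator requests a GPU only while generate_image is running; the working dtype also moves from bfloat16 to float16. A minimal sketch of the ZeroGPU pattern, assuming the Space runs on ZeroGPU hardware where the spaces package is preinstalled (the tiny Linear model is a hypothetical stand-in for the FLUX pipeline):

    import spaces
    import torch

    # Load the model on CPU at startup; ZeroGPU attaches a GPU to the
    # process only while a @spaces.GPU-decorated function is executing.
    model = torch.nn.Linear(4, 4)

    @spaces.GPU(duration=25)  # request up to 25 seconds of GPU time per call
    def run(x: torch.Tensor) -> torch.Tensor:
        return model.to("cuda")(x.to("cuda")).cpu()
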
custom_pipeline.py CHANGED

@@ -3,29 +3,20 @@ import numpy as np
 from diffusers import FluxPipeline, FlowMatchEulerDiscreteScheduler
 from typing import Any, Dict, List, Optional, Union
 from PIL import Image
-from torch.cuda import graphs
 
-# Enable TF32 and memory format optimizations
-torch.backends.cuda.matmul.allow_tf32 = True
-torch.backends.cudnn.allow_tf32 = True
-torch.backends.cudnn.benchmark = True
-
-# Constants with optimized values
+# Constants for shift calculation
 BASE_SEQ_LEN = 256
 MAX_SEQ_LEN = 4096
 BASE_SHIFT = 0.5
 MAX_SHIFT = 1.2
-BATCH_SIZE = 4  # Optimal batch size for A100
 
-@torch.jit.script
+# Helper functions
 def calculate_timestep_shift(image_seq_len: int) -> float:
-    BASE_SEQ_LEN = 256
-    MAX_SEQ_LEN = 4096
-    BASE_SHIFT = 0.5
-    MAX_SHIFT = 1.2
+    """Calculates the timestep shift (mu) based on the image sequence length."""
     m = (MAX_SHIFT - BASE_SHIFT) / (MAX_SEQ_LEN - BASE_SEQ_LEN)
     b = BASE_SHIFT - m * BASE_SEQ_LEN
-    return image_seq_len * m + b
+    mu = image_seq_len * m + b
+    return mu
 
 def prepare_timesteps(
     scheduler: FlowMatchEulerDiscreteScheduler,
@@ -35,25 +26,19 @@ def prepare_timesteps(
     sigmas: Optional[List[float]] = None,
     mu: Optional[float] = None,
 ) -> (torch.Tensor, int):
-    """Optimized timestep preparation with CUDA graphs support"""
-    if device is None:
-        device = torch.device("cuda")
-
-    # Pre-calculate timesteps using CUDA graph
-    static_input = torch.tensor([], device=device)
-    g = torch.cuda.CUDAGraph()
-
-    with torch.cuda.graph(g):
-        if timesteps is not None:
-            scheduler.set_timesteps(timesteps=timesteps, device=device)
-        elif sigmas is not None:
-            scheduler.set_timesteps(sigmas=sigmas, device=device)
-        else:
-            scheduler.set_timesteps(num_inference_steps, device=device, mu=mu)
-
-    timesteps = scheduler.timesteps.to(memory_format=torch.channels_last)
+    """Prepares the timesteps for the diffusion process."""
+    if timesteps is not None and sigmas is not None:
+        raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
+
+    if timesteps is not None:
+        scheduler.set_timesteps(timesteps=timesteps, device=device)
+    elif sigmas is not None:
+        scheduler.set_timesteps(sigmas=sigmas, device=device)
+    else:
+        scheduler.set_timesteps(num_inference_steps, device=device, mu=mu)
+
+    timesteps = scheduler.timesteps
     num_inference_steps = len(timesteps)
-
     return timesteps, num_inference_steps
 
 # FLUX pipeline function
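
Note on the custom_pipeline.py changes: the de-duplicated calculate_timestep_shift is a straight linear interpolation of mu through the points (BASE_SEQ_LEN, BASE_SHIFT) and (MAX_SEQ_LEN, MAX_SHIFT). A standalone sanity check mirroring the constants above (the 4096-token figure assumes FLUX's 8x VAE downsample plus 2x2 latent packing at 1024x1024, i.e. 64 * 64 tokens):

    BASE_SEQ_LEN, MAX_SEQ_LEN = 256, 4096
    BASE_SHIFT, MAX_SHIFT = 0.5, 1.2

    def calculate_timestep_shift(image_seq_len: int) -> float:
        """Linearly interpolate mu between the base and max shifts."""
        m = (MAX_SHIFT - BASE_SHIFT) / (MAX_SEQ_LEN - BASE_SEQ_LEN)
        b = BASE_SHIFT - m * BASE_SEQ_LEN
        return image_seq_len * m + b

    print(calculate_timestep_shift(256))   # 0.5 -> BASE_SHIFT at the minimum length
    print(calculate_timestep_shift(4096))  # 1.2 -> MAX_SHIFT at 1024x1024

The rewritten prepare_timesteps also replaces the CUDA-graph capture with a plain argument check, which looks right: scheduler.set_timesteps does host-side work that a CUDA graph cannot capture, and applying channels_last memory format to a 1-D timestep tensor was never meaningful.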