Gianpaolo Macario committed on
Commit b63b631 · 1 Parent(s): ae24614

feat: add stablediffusion.py


Based on sample code at modal.com

Files changed (5)
  1. app.py +1 -1
  2. flux.py +230 -0
  3. pyproject.toml +4 -0
  4. stablediffusion.py +72 -0
  5. uv.lock +0 -0
app.py CHANGED
@@ -95,4 +95,4 @@ with gr.Blocks() as demo:
   outputs=calc_output,
   api_name="calculate")
 
-demo.launch(mcp_server=True)
+demo.launch(mcp_server=True, share=True)
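For reference, mcp_server=True exposes the app's named API endpoints as MCP tools (it needs the gradio[mcp] extra already pinned in pyproject.toml), and share=True additionally tunnels the UI through a temporary public gradio.live URL. Below is a minimal sketch of the same launch pattern; the calculate function and component names are toy stand-ins, not the real logic in app.py.

import gradio as gr

def calculate(a: float, b: float) -> float:
    """Toy stand-in for the repo's real calculator logic."""
    return a + b

with gr.Blocks() as demo:
    a = gr.Number(label="a")
    b = gr.Number(label="b")
    calc_output = gr.Number(label="result")
    gr.Button("Calculate").click(
        calculate, inputs=[a, b], outputs=calc_output, api_name="calculate"
    )

# mcp_server=True serves the named endpoints as MCP tools (requires gradio[mcp]);
# share=True requests a temporary public share link in addition to localhost.
demo.launch(mcp_server=True, share=True)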
flux.py ADDED
@@ -0,0 +1,230 @@
+# Run Flux fast on H100s with torch.compile
+#
+# See https://modal.com/docs/examples/flux
+
+# Setting up the image and dependencies
+
+import time
+from io import BytesIO
+from pathlib import Path
+
+import modal
+
+# We’ll make use of the full CUDA toolkit in this example, so we’ll build our container image
+# off of the nvidia/cuda base.
+
+cuda_version = "12.4.0"  # should be no greater than host CUDA version
+flavor = "devel"  # includes full CUDA toolkit
+operating_sys = "ubuntu22.04"
+tag = f"{cuda_version}-{flavor}-{operating_sys}"
+
+cuda_dev_image = modal.Image.from_registry(
+    f"nvidia/cuda:{tag}", add_python="3.11"
+).entrypoint([])
+
+# Now we install most of our dependencies with apt and pip.
+# For Hugging Face’s [Diffusers](https://github.com/huggingface/diffusers) library
+# we install from GitHub source and so pin to a specific commit.
+#
+# PyTorch added faster attention kernels for Hopper GPUs in version 2.5,
+# so we pin to that version to ensure we get the best performance on H100s.
+
+diffusers_commit_sha = "81cf3b2f155f1de322079af28f625349ee21ec6b"
+
+flux_image = (
+    cuda_dev_image.apt_install(
+        "git",
+        "libglib2.0-0",
+        "libsm6",
+        "libxrender1",
+        "libxext6",
+        "ffmpeg",
+        "libgl1",
+    )
+    .pip_install(
+        "invisible_watermark==0.2.0",
+        "transformers==4.44.0",
+        "huggingface_hub[hf_transfer]==0.26.2",
+        "accelerate==0.33.0",
+        "safetensors==0.4.4",
+        "sentencepiece==0.2.0",
+        "torch==2.5.0",
+        f"git+https://github.com/huggingface/diffusers.git@{diffusers_commit_sha}",
+        "numpy<2",
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
+)
+
+# Later, we’ll also use torch.compile to increase the speed further.
+# Torch compilation needs to be re-executed when each new container starts,
+# so we turn on some extra caching to reduce compile times for later containers.
+
+flux_image = flux_image.env(
+    {
+        "TORCHINDUCTOR_CACHE_DIR": "/root/.inductor-cache",
+        "TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
+    }
+)
+
+# Finally, we construct our Modal App, set its default image to the one we just constructed,
+# and import FluxPipeline for downloading and running Flux.1.
+
+app = modal.App(
+    "example-flux",
+    image=flux_image,
+    secrets=[modal.Secret.from_name("huggingface-secret")],
+)
+
+# @app.function(
+#     image=modal.Image.debian_slim().pip_install("torch", "diffusers[torch]", "transformers", "ftfy"),
+#     gpu="any",
+# )
+
+with flux_image.imports():
+    import torch
+    from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
+
+# Defining a parameterized Model inference class
+#
+# Next, we map the model’s setup and inference code onto Modal.
+#
+# 1. We do the model setup in the method decorated with @modal.enter().
+#    This includes loading the weights and moving them to the GPU,
+#    along with an optional torch.compile step (see details below).
+#    The @modal.enter() decorator ensures that this method runs only once,
+#    when a new container starts, instead of in the path of every call.
+#
+# 2. We run the actual inference in methods decorated with @modal.method().
+
+MINUTES = 60  # seconds
+VARIANT = "schnell"  # or "dev", but note [dev] requires you to accept terms and conditions on HF
+NUM_INFERENCE_STEPS = 4  # use ~50 for [dev], smaller for [schnell]
+
+
+@app.cls(
+    gpu="H100",  # fastest GPU on Modal
+    scaledown_window=20 * MINUTES,
+    timeout=60 * MINUTES,  # leave plenty of time for compilation
+    volumes={  # add Volumes to store serializable compilation artifacts, see section on torch.compile below
+        "/cache": modal.Volume.from_name("hf-hub-cache", create_if_missing=True),
+        "/root/.nv": modal.Volume.from_name("nv-cache", create_if_missing=True),
+        "/root/.triton": modal.Volume.from_name("triton-cache", create_if_missing=True),
+        "/root/.inductor-cache": modal.Volume.from_name(
+            "inductor-cache", create_if_missing=True
+        ),
+    },
+)
+class Model:
+    compile: bool = (  # see section on torch.compile below for details
+        modal.parameter(default=False)
+    )
+
+    @modal.enter()
+    def enter(self):
+        pipe = FluxPipeline.from_pretrained(
+            f"black-forest-labs/FLUX.1-{VARIANT}", torch_dtype=torch.bfloat16
+        ).to("cuda")  # move model to GPU
+        self.pipe = optimize(pipe, compile=self.compile)
+
+    @modal.method()
+    def inference(self, prompt: str) -> bytes:
+        print("🎨 generating image...")
+        out = self.pipe(
+            prompt,
+            output_type="pil",
+            num_inference_steps=NUM_INFERENCE_STEPS,
+        ).images[0]  # type: ignore
+
+        byte_stream = BytesIO()
+        out.save(byte_stream, format="JPEG")
+        return byte_stream.getvalue()
+
+# Calling our inference function
+#
+# To generate an image we just need to call the Model’s inference method with .remote appended to it.
+# You can call .inference.remote from any Python environment that has access to your Modal credentials.
+# The local environment will get back the image as bytes.
+#
+# Here, we wrap the call in a Modal local_entrypoint so that it can be run with modal run:
+#
+#     modal run flux.py
+
+# By default, we call inference twice to demonstrate how much faster the inference is after cold start.
+# In our tests, clients received images in about 1.2 seconds. We save the output bytes to a temporary file.
+
+@app.local_entrypoint()
+def main(
+    prompt: str = "a computer screen showing ASCII terminal art of the"
+    " word 'Modal' in neon green. two programmers are pointing excitedly"
+    " at the screen.",
+    twice: bool = True,
+    compile: bool = False,
+):
+    t0 = time.time()
+    image_bytes = Model(compile=compile).inference.remote(prompt)
+    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")
+
+    if twice:
+        t0 = time.time()
+        image_bytes = Model(compile=compile).inference.remote(prompt)
+        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")
+
+    output_path = Path("/tmp") / "flux" / "output.jpg"
+    output_path.parent.mkdir(exist_ok=True, parents=True)
+    print(f"🎨 saving output to {output_path}")
+    output_path.write_bytes(image_bytes)
+
+# TODO: Speeding up Flux with torch.compile
+
+def optimize(pipe, compile=True):
+    # fuse QKV projections in Transformer and VAE
+    pipe.transformer.fuse_qkv_projections()
+    pipe.vae.fuse_qkv_projections()
+
+    # switch memory layout to Torch's preferred, channels_last
+    pipe.transformer.to(memory_format=torch.channels_last)
+    pipe.vae.to(memory_format=torch.channels_last)
+
+    if not compile:
+        return pipe
+
+    # set torch compile flags
+    config = torch._inductor.config  # type: ignore
+    config.disable_progress = False  # show progress bar
+    config.conv_1x1_as_mm = True  # treat 1x1 convolutions as matrix muls
+    # adjust autotuning algorithm
+    config.coordinate_descent_tuning = True
+    config.coordinate_descent_check_all_directions = True
+    config.epilogue_fusion = False  # do not fuse pointwise ops into matmuls
+
+    # tag the compute-intensive modules, the Transformer and VAE decoder, for compilation
+    pipe.transformer = torch.compile(
+        pipe.transformer, mode="max-autotune", fullgraph=True
+    )
+    pipe.vae.decode = torch.compile(
+        pipe.vae.decode, mode="max-autotune", fullgraph=True
+    )
+
+    # trigger torch compilation
+    print("🔦 running torch compilation (may take up to 20 minutes)...")
+
+    pipe(
+        "dummy prompt to trigger torch compilation",
+        output_type="pil",
+        num_inference_steps=NUM_INFERENCE_STEPS,  # use ~50 for [dev], smaller for [schnell]
+    ).images[0]
+
+    print("🔦 finished torch compilation")
+
+    return pipe
+
+# To run this script, use the command:
+#     modal run flux.py --prompt "a beautiful landscape with mountains and a river" --twice --compile
+
+# This will generate an image based on the provided prompt, run it twice,
+# and save the output to a file named `output.jpg` in the `/tmp/flux/` directory.
+#
+# Make sure to have the Modal CLI installed and configured with your API key.
+# You can install the Modal CLI with:
+#     pip install modal
+
+# EOF
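As the comments in flux.py note, the inference method can also be invoked from any Python environment that holds Modal credentials once the app has been deployed with modal deploy flux.py. The following is only a sketch of that pattern; it assumes the Modal 1.x client API modal.Cls.from_name and the app/class names used above ("example-flux" / "Model"), and the output filename is hypothetical.

# Sketch: call the deployed Flux Model from a separate Python process.
# Assumes `modal deploy flux.py` has already been run and a Modal >= 1.0 client.
import modal

Model = modal.Cls.from_name("example-flux", "Model")  # look up the deployed class

# Instantiate with the same parameter the class defines, then call remotely.
image_bytes = Model(compile=False).inference.remote(
    "a watercolor painting of a lighthouse at dawn"
)

with open("flux_remote.jpg", "wb") as f:  # hypothetical local output path
    f.write(image_bytes)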
pyproject.toml CHANGED
@@ -5,6 +5,10 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "diffusers>=0.33.1",
     "gradio[mcp]>=5.32.1",
+    "icecream>=2.1.4",
     "mcp>=1.9.0",
+    "modal>=1.0.2",
+    "torch>=2.7.1",
 ]
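These new pins (and the regenerated uv.lock below) can be sanity-checked from the project environment, e.g. after a uv sync. A minimal, hypothetical smoke test; the expected minimum versions simply mirror the pyproject.toml entries above.

# Hypothetical smoke test that the newly added dependencies import and report versions.
import diffusers, icecream, modal, torch

for mod, minimum in [
    (diffusers, "0.33.1"),
    (icecream, "2.1.4"),
    (modal, "1.0.2"),
    (torch, "2.7.1"),
]:
    installed = getattr(mod, "__version__", "unknown")
    print(f"{mod.__name__}: {installed} (pyproject requires >= {minimum})")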
stablediffusion.py ADDED
@@ -0,0 +1,72 @@
+# Run Stable Diffusion to generate images from text prompts.
+# This script uses the Modal framework to run Stable Diffusion in a cloud environment.
+# It requires the `modal` package and a Hugging Face token stored in a Modal secret.
+# Make sure to set up the Modal environment and install the necessary dependencies.
+
+# Usage: modal run stablediffusion.py
+
+from icecream import ic
+import io
+import os
+
+import modal
+
+app = modal.App()
+
+@app.function(
+    image=modal.Image.debian_slim().pip_install(
+        "icecream",
+        "torch",
+        "diffusers[torch]",
+        "transformers",
+        "ftfy"
+    ),
+    secrets=[modal.Secret.from_name("huggingface-secret")],
+    gpu="any",
+)
+def run_stable_diffusion(prompt: str):
+    from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline
+
+    pipe = StableDiffusionPipeline.from_pretrained(
+        "runwayml/stable-diffusion-v1-5",
+        use_auth_token=os.environ["HF_TOKEN"],
+    ).to("cuda")
+
+    image = pipe(prompt, num_inference_steps=10).images[0]  # type: ignore
+
+    buf = io.BytesIO()
+    image.save(buf, format="PNG")
+    img_bytes = buf.getvalue()
+
+    return img_bytes
+
+
+@app.local_entrypoint()
+def main():
+    prompt = "Wu-Tang Clan climbing Mount Everest"
+    # prompt = "A robot dog walking down a vineyard"  # Example prompt
+
+    # out_path = "/tmp/output.png"
+    out_path = "stablediffusion_output.png"
+
+    # ic(os.getcwd())
+    # img_bytes = b"<image_bytes>"  # Placeholder for the actual image bytes
+
+    print("DEBUG: Starting Stable Diffusion with prompt:", prompt)
+    img_bytes = run_stable_diffusion.remote(prompt=prompt)
+    print("DEBUG: Writing img_bytes length:", len(img_bytes))
+    with open(out_path, "wb") as f:
+        f.write(img_bytes)
+    print("DEBUG: Image saved to ", out_path)
+
+
+if __name__ == "__main__":
+    main()
+    print("Image saved to stablediffusion_output.png")
+    print("Run `modal deploy` to deploy this app.")
+    print("Run `modal serve` to serve this app locally.")
+    print("Run `modal run` to run this app in the cloud.")
+    print("Run `modal logs` to view the logs of this app.")
+    print("Run `modal shell` to open a shell in the cloud environment.")
+    print("Run `modal run --help` to see all available options.")
+    print("Run `modal deploy --help` to see all available options for deployment.")
uv.lock CHANGED
The diff for this file is too large to render. See raw diff