# Run Flux fast on H100s with torch.compile
#
# See https://modal.com/docs/examples/flux

import os
import sys
import time
from io import BytesIO
from pathlib import Path

import modal

# import modal.running_app

# Check for Modal credentials
MODAL_TOKEN_ID = os.getenv('MODAL_TOKEN_ID')
MODAL_TOKEN_SECRET = os.getenv('MODAL_TOKEN_SECRET')

if not MODAL_TOKEN_ID or not MODAL_TOKEN_SECRET:
    print(
        "WARNING: Modal credentials not found. Image generation will return placeholder images.",
        file=sys.stderr,
    )
    MODAL_AVAILABLE = False
else:
    try:
        # `modal` was imported at the top of the file; if that import had failed,
        # this module would not have loaded at all, so this branch is defensive.
        MODAL_AVAILABLE = True
    except ImportError:
        print(
            "WARNING: Modal package not available. Image generation will return placeholder images.",
            file=sys.stderr,
        )
        MODAL_AVAILABLE = False

# We'll make use of the full CUDA toolkit in this example, so we'll build our container image
# off of the nvidia/cuda base.

cuda_version = "12.4.0"  # should be no greater than host CUDA version
flavor = "devel"  # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

cuda_dev_image = modal.Image.from_registry(
    f"nvidia/cuda:{tag}", add_python="3.11"
).entrypoint([])

# Now we install most of our dependencies with apt and pip.
# For Hugging Face's [Diffusers](https://github.com/huggingface/diffusers) library
# we install from GitHub source and so pin to a specific commit.
#
# PyTorch added faster attention kernels for Hopper GPUs in version 2.5,
# so we pin to that version to ensure we get the best performance on H100s.

diffusers_commit_sha = "81cf3b2f155f1de322079af28f625349ee21ec6b"

flux_image = (
    cuda_dev_image.apt_install(
        "git",
        "libglib2.0-0",
        "libsm6",
        "libxrender1",
        "libxext6",
        "ffmpeg",
        "libgl1",
    )
    .pip_install(
        "invisible_watermark==0.2.0",
        "transformers==4.44.0",
        "huggingface_hub[hf_transfer]==0.26.2",
        "accelerate==0.33.0",
        "safetensors==0.4.4",
        "sentencepiece==0.2.0",
        "torch==2.5.0",
        f"git+https://github.com/huggingface/diffusers.git@{diffusers_commit_sha}",
        "numpy<2",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
)

# Later, we'll also use torch.compile to increase the speed further.
# Torch compilation needs to be re-executed when each new container starts,
# so we turn on some extra caching to reduce compile times for later containers.

flux_image = flux_image.env(
    {
        "TORCHINDUCTOR_CACHE_DIR": "/root/.inductor-cache",
        "TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
    }
)

# Finally, we construct our Modal App, set its default image to the one we just constructed,
# and import FluxPipeline for downloading and running Flux.1.

app = modal.App(
    "example-flux",
    image=flux_image,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)

# @app.function(
#     image=modal.Image.debian_slim().pip_install("torch", "diffusers[torch]", "transformers", "ftfy"),
#     gpu="any",
# )

with flux_image.imports():
    import torch
    from diffusers.pipelines.flux.pipeline_flux import FluxPipeline

# Defining a parameterized Model inference class
#
# Next, we map the model's setup and inference code onto Modal.
#
# 1. We run the model setup in the method decorated with @modal.enter().
#    This includes loading the weights and moving them to the GPU,
#    along with an optional torch.compile step (see details below).
#    The @modal.enter() decorator ensures that this method runs only once,
#    when a new container starts, instead of in the path of every call.
#
# 2. We run the actual inference in methods decorated with @modal.method().
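#
# As a minimal, hypothetical illustration of this enter/method split (a toy class,
# not part of this app): expensive one-time setup goes in the @modal.enter() method
# and per-request work goes in a @modal.method():
#
#     @app.cls()
#     class Echo:
#         @modal.enter()
#         def setup(self):
#             self.greeting = "hello"  # expensive setup (e.g. loading weights) would go here
#
#         @modal.method()
#         def run(self, name: str) -> str:
#             return f"{self.greeting}, {name}!"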
MINUTES = 60  # seconds
VARIANT = "schnell"  # or "dev", but note [dev] requires you to accept terms and conditions on HF
NUM_INFERENCE_STEPS = 4  # use ~50 for [dev], smaller for [schnell]


@app.cls(
    gpu="H100",  # fastest GPU on Modal
    scaledown_window=20 * MINUTES,
    timeout=60 * MINUTES,  # leave plenty of time for compilation
    volumes={  # add Volumes to store serializable compilation artifacts, see section on torch.compile below
        "/cache": modal.Volume.from_name("hf-hub-cache", create_if_missing=True),
        "/root/.nv": modal.Volume.from_name("nv-cache", create_if_missing=True),
        "/root/.triton": modal.Volume.from_name("triton-cache", create_if_missing=True),
        "/root/.inductor-cache": modal.Volume.from_name(
            "inductor-cache", create_if_missing=True
        ),
    },
)
class Model:
    compile: bool = (  # see section on torch.compile below for details
        modal.parameter(default=False)
    )

    @modal.enter()
    def enter(self):
        pipe = FluxPipeline.from_pretrained(
            f"black-forest-labs/FLUX.1-{VARIANT}", torch_dtype=torch.bfloat16
        ).to("cuda")  # move model to GPU
        self.pipe = optimize(pipe, compile=self.compile)

    @modal.method()
    def inference(self, prompt: str) -> bytes:
        print("🎨 generating image...")
        out = self.pipe(
            prompt,
            output_type="pil",
            num_inference_steps=NUM_INFERENCE_STEPS,
        ).images[0]  # type: ignore

        byte_stream = BytesIO()
        out.save(byte_stream, format="JPEG")
        return byte_stream.getvalue()


# Calling our inference function
#
# To generate an image we just need to call the Model's inference method with .remote appended to it.
# You can call .inference.remote from any Python environment that has access to your Modal credentials.
# The local environment will get back the image as bytes.
#
# Here, we wrap the call in a Modal local_entrypoint so that it can be run with modal run:
#
#     modal run flux.py
#
# By default, we call inference twice to demonstrate how much faster it is once the container is warm,
# after the cold start. In our tests, clients received images in about 1.2 seconds.
# We save the output bytes to a temporary file.
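#
# For example (an illustrative sketch, not part of this app), the raw JPEG bytes
# returned by inference.remote can be decoded into a PIL image on the client side.
# This assumes Pillow is installed locally and that the call runs inside a Modal
# context, such as the local entrypoint below or an app.run() block:
#
#     from io import BytesIO
#     from PIL import Image
#
#     image_bytes = Model().inference.remote("a sunrise over snowy mountains")
#     image = Image.open(BytesIO(image_bytes))
#     print(image.size)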
@app.local_entrypoint()
def main(
    prompt: str = "a computer screen showing ASCII terminal art of the"
    " word 'Modal' in neon green. two programmers are pointing excitedly"
    " at the screen.",
    twice: bool = True,
    compile: bool = False,
):
    t0 = time.time()
    image_bytes = Model(compile=compile).inference.remote(prompt)
    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")

    if twice:
        t0 = time.time()
        image_bytes = Model(compile=compile).inference.remote(prompt)
        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")

    output_path = Path("/tmp") / "flux" / "output.jpg"
    output_path.parent.mkdir(exist_ok=True, parents=True)
    print(f"🎨 saving output to {output_path}")
    output_path.write_bytes(image_bytes)


# Speeding up Flux with torch.compile


def optimize(pipe, compile=True):
    # fuse QKV projections in Transformer and VAE
    pipe.transformer.fuse_qkv_projections()
    pipe.vae.fuse_qkv_projections()

    # switch memory layout to Torch's preferred, channels_last
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)

    if not compile:
        return pipe

    # set torch compile flags
    config = torch._inductor.config  # type: ignore
    config.disable_progress = False  # show progress bar
    config.conv_1x1_as_mm = True  # treat 1x1 convolutions as matrix muls

    # adjust autotuning algorithm
    config.coordinate_descent_tuning = True
    config.coordinate_descent_check_all_directions = True
    config.epilogue_fusion = False  # do not fuse pointwise ops into matmuls

    # tag the compute-intensive modules, the Transformer and VAE decoder, for compilation
    pipe.transformer = torch.compile(
        pipe.transformer, mode="max-autotune", fullgraph=True
    )
    pipe.vae.decode = torch.compile(
        pipe.vae.decode, mode="max-autotune", fullgraph=True
    )

    # trigger torch compilation
    print("🔦 running torch compilation (may take up to 20 minutes)...")

    pipe(
        "dummy prompt to trigger torch compilation",
        output_type="pil",
        num_inference_steps=NUM_INFERENCE_STEPS,  # use ~50 for [dev], smaller for [schnell]
    ).images[0]

    print("🔦 finished torch compilation")

    return pipe
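#
# To see what torch.compile(..., mode="max-autotune", fullgraph=True) does in
# isolation, here is a tiny, self-contained sketch (a hypothetical toy module,
# unrelated to the Flux pipeline; it requires a CUDA GPU). The first call pays
# the compilation and autotuning cost, and subsequent calls reuse the compiled kernels:
#
#     import torch
#
#     layer = torch.nn.Linear(8, 8).cuda()
#     compiled_layer = torch.compile(layer, mode="max-autotune", fullgraph=True)
#     x = torch.randn(4, 8, device="cuda")
#     _ = compiled_layer(x)  # slow: kernels are compiled and autotuned here
#     _ = compiled_layer(x)  # fast: reuses the compiled kernels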
@app.function()
def generate_image(
    prompt: str = "Question Mark",
    twice: bool = True,
    compile: bool = False,
):
    """
    Generates an image based on a text prompt using the Flux model running on
    Modal serverless infrastructure.

    Args:
        prompt (str): The text prompt to generate the image from.
        twice (bool): Whether to run the inference twice.
        compile (bool): Whether to compile the model.

    Returns:
        A bytes object containing the generated image.
    """
    print("DEBUG: generate_image called with parameters:")
    print(f"  prompt: {prompt}")
    print(f"  twice: {twice}")
    print(f"  compile: {compile}")

    print("DEBUG: Starting image generation...")
    t0 = time.time()
    image_bytes = Model(compile=compile).inference.remote(prompt)
    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")

    if twice:
        t0 = time.time()
        image_bytes = Model(compile=compile).inference.remote(prompt)
        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")

    print(f"DEBUG: Image generation completed - {len(image_bytes)} image_bytes")
    return image_bytes


def generate_image1(prompt: str):
    """
    Generates an image based on a text prompt using the Flux model.
    For demonstration, we'll return a placeholder image URL.

    Args:
        prompt (str): The text prompt to generate the image from.

    Returns:
        str: The URL of the generated image.
    """
    return "https://avatars.githubusercontent.com/u/75182?v=4"


def generate_image2(prompt: str):
    """
    Generates an image based on a text prompt using the Flux model running on
    Modal serverless infrastructure.

    Args:
        prompt (str): The text prompt to generate the image from.

    Returns:
        str or Path: The URL of a placeholder image, or the local path of the generated image.
    """
    print("DEBUG: generate_image2 called with prompt:", prompt)

    if not MODAL_AVAILABLE:
        print("DEBUG: Modal not available, returning placeholder image")
        return generate_image1(prompt)

    if prompt is None or prompt.strip() == "A portrait of a handsome software developer":
        print("DEBUG: Returning hardcoded image URL for default prompt")
        result = generate_image1(prompt)
    else:
        # Call the generate_image function in the Modal app context.
        # This ensures that the function is executed in the Modal environment
        # and can access the necessary resources and configurations.
        print("DEBUG: Calling generate_image.remote with prompt:", prompt)
        with app.run():
            print("DEBUG: Running in Modal app context")
            # This is a blocking call: .remote() runs the function on Modal's
            # serverless infrastructure and returns the generated image bytes
            # once generation has completed.
            image_bytes = generate_image.remote(prompt=prompt)

            output_path = Path("/tmp") / "flux2" / "output.jpg"
            output_path.parent.mkdir(exist_ok=True, parents=True)
            print(f"🎨 Writing {len(image_bytes)} bytes to {output_path}")
            output_path.write_bytes(image_bytes)
            print(f"✅ Image generated and saved to {output_path}")
            result = output_path

    print(f"DEBUG: Image generation completed, returning result: {result}")
    return result


# To run this script, use the command:
#
#     modal run flux.py --prompt "a beautiful landscape with mountains and a river" --twice --compile
#
# This will generate an image based on the provided prompt, run the inference twice,
# and save the output to a file named `output.jpg` in the `/tmp/flux/` directory.
#
# Make sure to have the Modal CLI installed and configured with your API token.
# You can install it with:
#
#     pip install modal
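#
# As a usage sketch (hypothetical driver code, not part of the Modal app itself),
# generate_image2 can also be called from a plain local Python process; it returns
# either a placeholder URL or the path of the generated file, depending on whether
# Modal credentials are configured:
#
#     if __name__ == "__main__":
#         result = generate_image2("a watercolor painting of a lighthouse at dusk")
#         print(f"result: {result}")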