# Run Flux fast on H100s with torch.compile
#
# See https://modal.com/docs/examples/flux
import os
import sys
import time
from io import BytesIO
from pathlib import Path
import modal
# import modal.running_app
# Check for Modal credentials
MODAL_TOKEN_ID = os.getenv('MODAL_TOKEN_ID')
MODAL_TOKEN_SECRET = os.getenv('MODAL_TOKEN_SECRET')
if not MODAL_TOKEN_ID or not MODAL_TOKEN_SECRET:
print("WARNING: Modal credentials not found. Image generation will return placeholder images.", file=sys.stderr)
MODAL_AVAILABLE = False
else:
    # `import modal` at the top of this file would already have failed if the
    # package were missing, so having both credentials is enough to enable
    # real image generation.
    MODAL_AVAILABLE = True
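# A minimal sketch of one way to supply these credentials when this module runs inside a
# container (the image name below is an assumption for illustration, not something defined
# in this repo):
#
#     docker run -e MODAL_TOKEN_ID=... -e MODAL_TOKEN_SECRET=... my-flux-app
#
# Any mechanism that exports both variables before this module is imported behaves the same way.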
# We’ll make use of the full CUDA toolkit in this example, so we’ll build our container image
# off of the nvidia/cuda base.
cuda_version = "12.4.0" # should be no greater than host CUDA version
flavor = "devel" # includes full CUDA toolkit
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"
cuda_dev_image = modal.Image.from_registry(
f"nvidia/cuda:{tag}", add_python="3.11"
).entrypoint([])
# Now we install most of our dependencies with apt and pip.
# For Hugging Face’s [Diffusers](https://github.com/huggingface/diffusers) library
# we install from GitHub source and so pin to a specific commit.
#
# PyTorch added faster attention kernels for Hopper GPUs in version 2.5,
# so we pin to that version to ensure we get the best performance on H100s.
diffusers_commit_sha = "81cf3b2f155f1de322079af28f625349ee21ec6b"
flux_image = (
cuda_dev_image.apt_install(
"git",
"libglib2.0-0",
"libsm6",
"libxrender1",
"libxext6",
"ffmpeg",
"libgl1",
)
.pip_install(
"invisible_watermark==0.2.0",
"transformers==4.44.0",
"huggingface_hub[hf_transfer]==0.26.2",
"accelerate==0.33.0",
"safetensors==0.4.4",
"sentencepiece==0.2.0",
"torch==2.5.0",
f"git+https://github.com/huggingface/diffusers.git@{diffusers_commit_sha}",
"numpy<2",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
)
# Later, we’ll also use torch.compile to increase the speed further.
# Torch compilation needs to be re-executed each time a new container starts,
# so we turn on some extra caching to reduce compile times for later containers.
flux_image = flux_image.env(
{
"TORCHINDUCTOR_CACHE_DIR": "/root/.inductor-cache",
"TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
}
)
# Finally, we construct our Modal App, set its default image to the one we just constructed,
# and import FluxPipeline for downloading and running Flux.1.
app = modal.App(
"example-flux",
image=flux_image,
secrets=[modal.Secret.from_name("huggingface-secret")],
)
# @app.function(
# image=modal.Image.debian_slim().pip_install("torch", "diffusers[torch]", "transformers", "ftfy"),
# gpu="any",
# )
with flux_image.imports():
import torch
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
# Defining a parameterized Model inference class
#
# Next, we map the model’s setup and inference code onto Modal.
#
# 1. We run the model setup in the method decorated with @modal.enter().
# This includes loading the weights and moving them to the GPU,
# along with an optional torch.compile step (see details below).
# The @modal.enter() decorator ensures that this method runs only once,
# when a new container starts, instead of in the path of every call.
#
# 2. We run the actual inference in methods decorated with @modal.method().
MINUTES = 60 # seconds
VARIANT = "schnell" # or "dev", but note [dev] requires you to accept terms and conditions on HF
NUM_INFERENCE_STEPS = 4 # use ~50 for [dev], smaller for [schnell]
@app.cls(
gpu="H100", # fastest GPU on Modal
scaledown_window=20 * MINUTES,
timeout=60 * MINUTES, # leave plenty of time for compilation
volumes={ # add Volumes to store serializable compilation artifacts, see section on torch.compile below
"/cache": modal.Volume.from_name("hf-hub-cache", create_if_missing=True),
"/root/.nv": modal.Volume.from_name("nv-cache", create_if_missing=True),
"/root/.triton": modal.Volume.from_name("triton-cache", create_if_missing=True),
"/root/.inductor-cache": modal.Volume.from_name(
"inductor-cache", create_if_missing=True
),
},
)
class Model:
compile: bool = ( # see section on torch.compile below for details
modal.parameter(default=False)
)
@modal.enter()
def enter(self):
pipe = FluxPipeline.from_pretrained(
f"black-forest-labs/FLUX.1-{VARIANT}", torch_dtype=torch.bfloat16
).to("cuda") # move model to GPU
self.pipe = optimize(pipe, compile=self.compile)
@modal.method()
def inference(self, prompt: str) -> bytes:
print("🎨 generating image...")
out = self.pipe(
prompt,
output_type="pil",
num_inference_steps=NUM_INFERENCE_STEPS,
).images[0] # type: ignore
byte_stream = BytesIO()
out.save(byte_stream, format="JPEG")
return byte_stream.getvalue()
# Calling our inference function
#
# To generate an image we just need to call the Model's inference method with .remote appended to it.
# You can call .inference.remote from any Python environment that has access to your Modal credentials.
# The local environment will get back the image as bytes.
#
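# As a hypothetical sketch (assuming the app has been deployed with `modal deploy flux.py`
# and a recent Modal client that provides modal.Cls.from_name), the same class could be
# called from a completely separate Python process:
#
#     import modal
#     Model = modal.Cls.from_name("example-flux", "Model")
#     image_bytes = Model().inference.remote("a watercolor of a lighthouse at dawn")
#     with open("output.jpg", "wb") as f:
#         f.write(image_bytes)
#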
# Here, we wrap the call in a Modal local_entrypoint so that it can be run with modal run:
#
# modal run flux.py
# By default, we call inference twice to demonstrate how much faster the inference is after the cold start.
# In our tests, clients received images in about 1.2 seconds. We save the output bytes to a temporary file.
@app.local_entrypoint()
def main(
prompt: str = "a computer screen showing ASCII terminal art of the"
" word 'Modal' in neon green. two programmers are pointing excitedly"
" at the screen.",
twice: bool = True,
compile: bool = False,
):
t0 = time.time()
image_bytes = Model(compile=compile).inference.remote(prompt)
print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")
if twice:
t0 = time.time()
image_bytes = Model(compile=compile).inference.remote(prompt)
print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")
output_path = Path("/tmp") / "flux" / "output.jpg"
output_path.parent.mkdir(exist_ok=True, parents=True)
print(f"🎨 saving output to {output_path}")
output_path.write_bytes(image_bytes)
# Speeding up Flux with torch.compile
#
# The optimize helper below fuses the QKV projections in the transformer and the VAE,
# switches to the channels_last memory layout, and, when compile=True, compiles the
# transformer and the VAE decoder with torch.compile in max-autotune mode. The first
# compiled run is slow, which is why the Volumes mounted on the Model class above cache
# the inductor, Triton, and CUDA compilation artifacts across containers.
def optimize(pipe, compile=True):
# fuse QKV projections in Transformer and VAE
pipe.transformer.fuse_qkv_projections()
pipe.vae.fuse_qkv_projections()
# switch memory layout to Torch's preferred, channels_last
pipe.transformer.to(memory_format=torch.channels_last)
pipe.vae.to(memory_format=torch.channels_last)
if not compile:
return pipe
# set torch compile flags
config = torch._inductor.config # type: ignore
config.disable_progress = False # show progress bar
config.conv_1x1_as_mm = True # treat 1x1 convolutions as matrix muls
# adjust autotuning algorithm
config.coordinate_descent_tuning = True
config.coordinate_descent_check_all_directions = True
config.epilogue_fusion = False # do not fuse pointwise ops into matmuls
# tag the compute-intensive modules, the Transformer and VAE decoder, for compilation
pipe.transformer = torch.compile(
pipe.transformer, mode="max-autotune", fullgraph=True
)
pipe.vae.decode = torch.compile(
pipe.vae.decode, mode="max-autotune", fullgraph=True
)
# trigger torch compilation
print("🔦 running torch compilation (may take up to 20 minutes)...")
pipe(
"dummy prompt to trigger torch compilation",
output_type="pil",
num_inference_steps=NUM_INFERENCE_STEPS, # use ~50 for [dev], smaller for [schnell]
).images[0]
print("🔦 finished torch compilation")
return pipe
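# To exercise the compiled path end to end, pass the flag through the local entrypoint
# (Modal's CLI turns local_entrypoint keyword arguments into command-line options; exact
# flag handling may vary with the client version):
#
#     modal run flux.py --compile
#
# The first container pays the compilation cost; later containers reuse the cached
# inductor and Triton artifacts from the Volumes mounted on the Model class.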
@app.function()
def generate_image(
prompt: str = "Question Mark",
twice: bool = True,
compile: bool = False,
):
"""
Generates an image based on a text prompt using the Flux model
running on Modal serverless infrastructure.
Args:
prompt (str): The text prompt to generate the image from.
twice (bool): Whether to run the inference twice.
compile (bool): Whether to compile the model.
Returns:
A bytes object containing the generated image.
"""
print("DEBUG: generate_image called with parameters:")
print(f" prompt: {prompt}")
print(f" twice: {twice}")
print(f" compile: {compile}")
print("DEBUG: Starting image generation...")
t0 = time.time()
image_bytes = Model(compile=compile).inference.remote(prompt)
print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")
if twice:
t0 = time.time()
image_bytes = Model(compile=compile).inference.remote(prompt)
print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")
print(f"DEBUG: Image generation completed - {len(image_bytes)} image_bytes")
return image_bytes
def generate_image1(prompt: str):
"""
Generates an image based on a text prompt using the Flux model.
    For demonstration, we'll return a placeholder image URL.
Args:
prompt (str): The text prompt to generate the image from.
Returns:
str: The URL of the generated image.
"""
return "https://avatars.githubusercontent.com/u/75182?v=4"
def generate_image2(prompt: str):
"""
Generates an image based on a text prompt using the Flux model
running on Modal serverless infrastructure.
Args:
prompt (str): The text prompt to generate the image from.
Returns:
        str or Path: a placeholder image URL, or the path of the saved image file.
"""
print("DEBUG: generate_image2 called with prompt:", prompt)
if not MODAL_AVAILABLE:
print("DEBUG: Modal not available, returning placeholder image")
return generate_image1(prompt)
if prompt is None or prompt.strip() == "A portrait of a handsome software developer":
print("DEBUG: Returning hardcoded image URL for default prompt")
result = generate_image1(prompt)
else:
# Call the generate_image function in the Modal app context
# This will ensure that the function is executed in the Modal environment
# and can access the necessary resources and configurations.
print("DEBUG: Calling generate_image.remote with prompt:", prompt)
with app.run():
print("DEBUG: Running in Modal app context")
            # generate_image runs remotely on Modal's serverless infrastructure.
            # .remote() is a blocking call: it waits for generation to finish and
            # returns the image bytes produced by Model.inference.
            image_bytes = generate_image.remote(prompt=prompt)
output_path = Path("/tmp") / "flux2" / "output.jpg"
output_path.parent.mkdir(exist_ok=True, parents=True)
print(f"🎨 Writing {len(image_bytes)} to {output_path}")
output_path.write_bytes(image_bytes)
print(f"✅ Image generated and saved to {output_path}")
result = output_path
print(f"DEBUG: Image generation completed, returning result: {result}")
return result
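# A hypothetical usage sketch (assuming this file is importable as `flux` from another
# module, such as a web app entry point):
#
#     from flux import generate_image2
#     result = generate_image2("a lighthouse at sunset")
#     # result is either a placeholder URL (str) or the Path of the saved JPEG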
# To run this script, use the command:
# modal run flux.py --prompt "a beautiful landscape with mountains and a river" --twice --compile
# This will generate an image based on the provided prompt, run it twice,
# and save the output to a file named `output.jpg` in the `/tmp/flux/` directory.
#
# Make sure you have the Modal CLI installed and configured with your token (run `modal setup`).
# You can install it with:
# pip install modal
# EOF