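"""Text-to-image generation with FLUX.1 on Modal.

Defines a GPU-backed Modal app that runs the FLUX.1-schnell diffusion
pipeline, plus local helpers that fall back to a placeholder image when
Modal credentials are not configured.
"""
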
import os
import sys
import time
from io import BytesIO
from pathlib import Path

import modal
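
# Modal is imported above; image generation additionally requires API
# credentials. Without them, the helpers below return a placeholder image.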
MODAL_TOKEN_ID = os.getenv("MODAL_TOKEN_ID")
MODAL_TOKEN_SECRET = os.getenv("MODAL_TOKEN_SECRET")

if not MODAL_TOKEN_ID or not MODAL_TOKEN_SECRET:
    print(
        "WARNING: Modal credentials not found. Image generation will return placeholder images.",
        file=sys.stderr,
    )
    MODAL_AVAILABLE = False
else:
    # `modal` is already imported unconditionally above, so there is no
    # ImportError to catch here; having credentials is what makes Modal usable.
    MODAL_AVAILABLE = True
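
# Build the container image on top of NVIDIA's CUDA "devel" image (which
# ships the full CUDA toolkit), with Python 3.11 added.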
cuda_version = "12.4.0"
flavor = "devel"
operating_sys = "ubuntu22.04"
tag = f"{cuda_version}-{flavor}-{operating_sys}"

cuda_dev_image = modal.Image.from_registry(
    f"nvidia/cuda:{tag}", add_python="3.11"
).entrypoint([])
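
# Install system libraries and pinned Python dependencies. diffusers is
# installed from a specific git commit, and Hugging Face downloads are
# accelerated (hf_transfer) and cached under /cache.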
diffusers_commit_sha = "81cf3b2f155f1de322079af28f625349ee21ec6b"

flux_image = (
    cuda_dev_image.apt_install(
        "git",
        "libglib2.0-0",
        "libsm6",
        "libxrender1",
        "libxext6",
        "ffmpeg",
        "libgl1",
    )
    .pip_install(
        "invisible_watermark==0.2.0",
        "transformers==4.44.0",
        "huggingface_hub[hf_transfer]==0.26.2",
        "accelerate==0.33.0",
        "safetensors==0.4.4",
        "sentencepiece==0.2.0",
        "torch==2.5.0",
        f"git+https://github.com/huggingface/diffusers.git@{diffusers_commit_sha}",
        "numpy<2",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1", "HF_HUB_CACHE": "/cache"})
)
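
# Persist torch.compile (Inductor) artifacts and enable the FX graph cache
# so recompilation can be skipped on later runs.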
flux_image = flux_image.env(
    {
        "TORCHINDUCTOR_CACHE_DIR": "/root/.inductor-cache",
        "TORCHINDUCTOR_FX_GRAPH_CACHE": "1",
    }
)
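
# The Modal App attaches the image and the "huggingface-secret" Modal Secret
# to every function defined below.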
app = modal.App(
    "example-flux",
    image=flux_image,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
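
# Heavyweight imports are done inside the image so they are only required in
# the remote container, not on the local machine.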
with flux_image.imports():
    import torch
    from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
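
# Inference settings: FLUX.1-schnell is a few-step model, so 4 denoising
# steps are enough.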
MINUTES = 60
VARIANT = "schnell"
NUM_INFERENCE_STEPS = 4
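

# The model runs on an H100 GPU. Volumes persist the Hugging Face hub cache
# and the torch/Triton/Inductor compilation caches across containers, and the
# pipeline is loaded (and optionally compiled) once per container on enter.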
@app.cls(
    gpu="H100",
    scaledown_window=20 * MINUTES,
    timeout=60 * MINUTES,
    volumes={
        "/cache": modal.Volume.from_name("hf-hub-cache", create_if_missing=True),
        "/root/.nv": modal.Volume.from_name("nv-cache", create_if_missing=True),
        "/root/.triton": modal.Volume.from_name("triton-cache", create_if_missing=True),
        "/root/.inductor-cache": modal.Volume.from_name(
            "inductor-cache", create_if_missing=True
        ),
    },
)
class Model:
    compile: bool = modal.parameter(default=False)

    @modal.enter()
    def enter(self):
        pipe = FluxPipeline.from_pretrained(
            f"black-forest-labs/FLUX.1-{VARIANT}", torch_dtype=torch.bfloat16
        ).to("cuda")
        self.pipe = optimize(pipe, compile=self.compile)

    @modal.method()
    def inference(self, prompt: str) -> bytes:
        print("🎨 generating image...")
        out = self.pipe(
            prompt,
            output_type="pil",
            num_inference_steps=NUM_INFERENCE_STEPS,
        ).images[0]

        byte_stream = BytesIO()
        out.save(byte_stream, format="JPEG")
        return byte_stream.getvalue()
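

# `modal run` entrypoint: generates an image (optionally twice, to show
# warm-container latency) and saves it to /tmp/flux/output.jpg locally.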
@app.local_entrypoint()
def main(
    prompt: str = "a computer screen showing ASCII terminal art of the"
    " word 'Modal' in neon green. two programmers are pointing excitedly"
    " at the screen.",
    twice: bool = True,
    compile: bool = False,
):
    t0 = time.time()
    image_bytes = Model(compile=compile).inference.remote(prompt)
    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")

    if twice:
        t0 = time.time()
        image_bytes = Model(compile=compile).inference.remote(prompt)
        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")

    output_path = Path("/tmp") / "flux" / "output.jpg"
    output_path.parent.mkdir(exist_ok=True, parents=True)
    print(f"🎨 saving output to {output_path}")
    output_path.write_bytes(image_bytes)
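

# Optimizations applied to the pipeline: QKV projection fusing and
# channels_last memory format always; torch.compile of the transformer and
# VAE decoder with "max-autotune" when compile=True.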
def optimize(pipe, compile=True):
    # fuse the QKV projections in the transformer and VAE
    pipe.transformer.fuse_qkv_projections()
    pipe.vae.fuse_qkv_projections()

    # switch to the channels_last memory layout
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)

    if not compile:
        return pipe

    # tune the torch.compile (Inductor) flags
    config = torch._inductor.config
    config.disable_progress = False
    config.conv_1x1_as_mm = True
    config.coordinate_descent_tuning = True
    config.coordinate_descent_check_all_directions = True
    config.epilogue_fusion = False

    # compile the compute-heavy modules: the transformer and the VAE decoder
    pipe.transformer = torch.compile(
        pipe.transformer, mode="max-autotune", fullgraph=True
    )
    pipe.vae.decode = torch.compile(
        pipe.vae.decode, mode="max-autotune", fullgraph=True
    )

    # run one generation to trigger compilation before serving real requests
    print("🔦 running torch compilation (may take up to 20 minutes)...")

    pipe(
        "dummy prompt to trigger torch compilation",
        output_type="pil",
        num_inference_steps=NUM_INFERENCE_STEPS,
    ).images[0]

    print("🔦 finished torch compilation")

    return pipe
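

# A Modal Function wrapper around Model.inference, so image generation can be
# invoked from other Python code with .remote().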
@app.function()
def generate_image(
    prompt: str = "Question Mark",
    twice: bool = True,
    compile: bool = False,
):
    """
    Generates an image based on a text prompt using the Flux model
    running on Modal serverless infrastructure.

    Args:
        prompt (str): The text prompt to generate the image from.
        twice (bool): Whether to run the inference twice.
        compile (bool): Whether to compile the model.

    Returns:
        A bytes object containing the generated image.
    """
    print("DEBUG: generate_image called with parameters:")
    print(f"  prompt: {prompt}")
    print(f"  twice: {twice}")
    print(f"  compile: {compile}")
    print("DEBUG: Starting image generation...")

    t0 = time.time()
    image_bytes = Model(compile=compile).inference.remote(prompt)
    print(f"🎨 first inference latency: {time.time() - t0:.2f} seconds")

    if twice:
        t0 = time.time()
        image_bytes = Model(compile=compile).inference.remote(prompt)
        print(f"🎨 second inference latency: {time.time() - t0:.2f} seconds")

    print(f"DEBUG: Image generation completed - {len(image_bytes)} bytes")
    return image_bytes
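

# Placeholder used when Modal is not available: returns a fixed image URL
# instead of running the model.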
def generate_image1(prompt: str) -> str:
    """
    Placeholder for image generation: instead of running the Flux model,
    returns a fixed placeholder image URL for demonstration purposes.

    Args:
        prompt (str): The text prompt the image would be generated from.

    Returns:
        str: The URL of the placeholder image.
    """
    return "https://avatars.githubusercontent.com/u/75182?v=4"
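

# Local convenience wrapper: runs the app ephemerally with app.run() and
# saves the generated image, or falls back to the placeholder.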
def generate_image2(prompt: str):
    """
    Generates an image based on a text prompt using the Flux model
    running on Modal serverless infrastructure.

    Args:
        prompt (str): The text prompt to generate the image from.

    Returns:
        The path of the saved image file, or the URL of a placeholder image
        when Modal is unavailable or the default prompt is used.
    """
    print("DEBUG: generate_image2 called with prompt:", prompt)

    if not MODAL_AVAILABLE:
        print("DEBUG: Modal not available, returning placeholder image")
        return generate_image1(prompt)

    # A particular default prompt is answered with the placeholder instead of
    # running the model.
    if prompt is None or prompt.strip() == "A portrait of a handsome software developer":
        print("DEBUG: Returning hardcoded image URL for default prompt")
        result = generate_image1(prompt)
    else:
        print("DEBUG: Calling generate_image.remote with prompt:", prompt)
        with app.run():
            print("DEBUG: Running in Modal app context")
            image_bytes = generate_image.remote(prompt=prompt)

        output_path = Path("/tmp") / "flux2" / "output.jpg"
        output_path.parent.mkdir(exist_ok=True, parents=True)
        print(f"🎨 Writing {len(image_bytes)} bytes to {output_path}")
        output_path.write_bytes(image_bytes)

        print(f"✅ Image generated and saved to {output_path}")
        result = output_path

    print(f"DEBUG: Image generation completed, returning result: {result}")
    return result
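

# A minimal local usage sketch (illustrative; the prompt below is arbitrary).
# It assumes Modal credentials are configured in the environment; otherwise
# generate_image2 returns the placeholder URL instead of a file path.
if __name__ == "__main__":
    result = generate_image2("a watercolor painting of a lighthouse at dusk")
    print(f"Result: {result}")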