import gradio as gr
import numpy as np
import random
import torch
from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler, AutoencoderTiny, AutoencoderKL
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast, AutoTokenizer
from live_preview_helpers import calculate_shift, retrieve_timesteps, flux_pipe_call_that_returns_an_iterable_of_images
import modal
import io
from config.config import prompts, models  # Indirect import
import os
import sentencepiece
from huggingface_hub import login
from datetime import datetime
from PIL import Image



CACHE_DIR = "/model_cache"

# Define the Modal image
image = (
    modal.Image.from_registry("nvidia/cuda:12.2.0-devel-ubuntu22.04", add_python="3.9")
    .pip_install_from_requirements("requirements.txt")
    # Alternative base image kept for reference:
    # modal.Image.debian_slim(python_version="3.9")
    # .apt_install("git")
    # .pip_install(
    #     "diffusers",
    #     "git+https://github.com/huggingface/transformers.git",
    # )
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",  # faster Hugging Face Hub downloads
            "HF_HOME": CACHE_DIR,  # cache root inside the container
            "HF_HUB_CACHE": CACHE_DIR,
        }
    )
)

# Create a Modal app
app = modal.App("img-gen-modal-live", image=image)

flux_model_vol = modal.Volume.from_name("flux-model-vol", create_if_missing=True)  # persisted volume holding the model weights


# GPU FUNCTION
@app.function(
    volumes={"/data": flux_model_vol},
    secrets=[modal.Secret.from_name("huggingface-token")],
    gpu="L40S",
    timeout=300,
)
def main():

    def latents_to_rgb(latents):
        # Cheap linear projection from latent channels to RGB for step-by-step
        # previews (no VAE decode). NOTE: these 4-channel weights come from the
        # Stable-Diffusion-family latent space; FLUX latents have 16 channels
        # and arrive packed — see the note after decode_tensors below.
        weights = (
            (60, -60, 25, -70),
            (60,  -5, 15, -50),
            (60,  10, -5, -35),
        )

        weights_tensor = torch.t(torch.tensor(weights, dtype=latents.dtype).to(latents.device))
        biases_tensor = torch.tensor((150, 140, 130), dtype=latents.dtype).to(latents.device)
        rgb_tensor = torch.einsum("...lxy,lr -> ...rxy", latents, weights_tensor) + biases_tensor.unsqueeze(-1).unsqueeze(-1)
        image_array = rgb_tensor.clamp(0, 255).byte().cpu().numpy().transpose(1, 2, 0)

        return Image.fromarray(image_array)

    def decode_tensors(pipe, step, timestep, callback_kwargs):
        # Per-step callback: convert the current latents to a rough RGB image
        # and write it to disk so progress can be watched live.
        latents = callback_kwargs["latents"]

        image = latents_to_rgb(latents[0])
        image.save(f"{step}.png")

        return callback_kwargs
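
    # NOTE (assumption): FLUX keeps its latents packed as (batch, patches, 64),
    # so the SD-style callback above would fail on the raw tensor. A FLUX-aware
    # variant would unpack first via diffusers' private
    # FluxPipeline._unpack_latents helper (internal API, may change):
    #
    #   unpacked = pipe._unpack_latents(latents, height, width, pipe.vae_scale_factor)
    #
    # and then either extend latents_to_rgb with 16-channel weights or decode
    # `unpacked` with a tiny VAE such as
    # AutoencoderTiny.from_pretrained("madebyollin/taef1").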

    model_name = "FLUX.1-dev"
    model_path = f"/data/{model_name}"  # weights live on the mounted volume

    pipeline = DiffusionPipeline.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        use_safetensors=True,
    ).to("cuda")
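
    # Hedged sketch: the imports of AutoencoderTiny and
    # flux_pipe_call_that_returns_an_iterable_of_images suggest the streaming
    # preview path from the FLUX.1-dev Space those helpers come from; roughly:
    #
    #   taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=torch.bfloat16).to("cuda")
    #   pipeline.flux_pipe_call_that_returns_an_iterable_of_images = (
    #       flux_pipe_call_that_returns_an_iterable_of_images.__get__(pipeline)
    #   )
    #   for img in pipeline.flux_pipe_call_that_returns_an_iterable_of_images(
    #       prompt=..., num_inference_steps=28, width=304, height=208, good_vae=pipeline.vae
    #   ):
    #       ...  # each `img` is an intermediate PIL image
    #
    # Signatures are assumptions based on that Space; verify against
    # live_preview_helpers.py before relying on them.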

    # FLUX.1-dev is guidance-distilled: standard FluxPipeline calls ignore
    # negative prompts (recent diffusers only honors `negative_prompt` when
    # true_cfg_scale > 1), so it is omitted here. Width/height are rounded up
    # to multiples of 16, which FLUX requires.
    final_image = pipeline(
        prompt="A croissant shaped like a cute bear.",
        width=304,
        height=208,
        callback_on_step_end=decode_tensors,
        callback_on_step_end_tensor_inputs=["latents"],
    ).images[0]
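
    # Persist the final render with a timestamped name (datetime is imported
    # above but was unused); saved to the working directory next to the
    # per-step previews. A minimal sketch — adjust the destination as needed.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_image.save(f"final_{timestamp}.png")


# Local entrypoint so the script can be driven with `modal run <this file>`.
# (A minimal sketch, assuming no arguments are needed.)
@app.local_entrypoint()
def run():
    main.remote()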