import gradio as gr
import numpy as np
import random
import spaces
import torch
import time
from diffusers import DiffusionPipeline, AutoencoderTiny
from diffusers.models.attention_processor import AttnProcessor2_0
from custom_pipeline import FluxWithCFGPipeline

# Let CUDA matrix multiplications run in TF32.
torch.backends.cuda.matmul.allow_tf32 = True

# Constants
MAX_SEED = np.iinfo(np.int32).max  # upper bound for randomly drawn seeds
MAX_IMAGE_SIZE = 2048  # largest value offered by the width/height sliders
DEFAULT_WIDTH = DEFAULT_HEIGHT = 1024  # square default canvas
DEFAULT_INFERENCE_STEPS = 1  # initial value of the "Inference Steps" slider

# Device and model setup
dtype = torch.float16
pipe = FluxWithCFGPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", torch_dtype=dtype
)
# Swap the pipeline's VAE for the tiny autoencoder checkpoint.
pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype)
pipe.to("cuda")

# Load the LoRA, bake it into the base weights, then drop the adapter modules
# so inference runs on the plain (already fused) weights.
pipe.load_lora_weights(
    'hugovntr/flux-schnell-realism',
    weight_name='schnell-realism_v2.3.safetensors',
    adapter_name="better",
)
pipe.set_adapters(["better"], adapter_weights=[1.0])
# The keyword is `adapter_names` (plural); the previous `adapter_name=` did
# not match fuse_lora's parameter and therefore did not select the adapter.
pipe.fuse_lora(adapter_names=["better"], lora_scale=1.0)
pipe.unload_lora_weights()

# Memory optimizations
# The denoiser is accessed as `pipe.transformer` everywhere else in this file;
# the previous `pipe.unet` did not match that and would fail on attribute
# lookup. Channels-last only affects 4-D tensors.
pipe.transformer.to(memory_format=torch.channels_last)
# xFormers memory-efficient attention.
# NOTE(review): this may replace the transformer's own attention processors —
# confirm the swapped-in processor still handles this model's attention
# inputs, otherwise drop this call.
pipe.enable_xformers_memory_efficient_attention()

# CUDA Graph setup
# The captured graph replays ONE transformer forward pass. It always reads
# from the tensors stored in `static_inputs` and writes into `static_output`,
# so fresh data has to be copied *into* those tensors (never rebound).
static_inputs = None   # dict of keyword arguments the captured forward pass reads
static_model = None    # the module whose forward pass was captured
static_output = None   # tensor the captured forward pass writes its prediction to
graph = None           # torch.cuda.CUDAGraph, or None until the first capture
_graph_key = None      # (batch_size, height, width) the current graph was captured for
_WARMUP_ITERS = 2      # eager forward passes run before capture


def _encode_prompt(prompt, device="cuda"):
    """Encode `prompt` with the pipeline's text encoders.

    Returns the `(prompt_embeds, pooled_prompt_embeds, text_ids)` tuple
    produced by `pipe.encode_prompt`.
    """
    with torch.no_grad():
        return pipe.encode_prompt(
            prompt=prompt,
            prompt_2=None,
            prompt_embeds=None,
            pooled_prompt_embeds=None,
            device=device,
            num_images_per_prompt=1,
            max_sequence_length=300,
            lora_scale=None,
        )


def setup_cuda_graph(prompt, height, width, num_inference_steps):
    """Allocate the static buffers and capture one transformer forward pass.

    Args:
        prompt: Prompt string (or list of prompts) used to size and fill the
            text-conditioning buffers.
        height: Image height in pixels; fixes the latent buffer shape.
        width: Image width in pixels; fixes the latent buffer shape.
        num_inference_steps: Unused here (kept for interface compatibility).
            The step count only affects the timestep schedule, which
            `generate_image` builds on every call.

    Side effects:
        Rebinds the module-level `static_inputs`, `static_model`,
        `static_output`, `graph` and `_graph_key`.
    """
    global static_inputs, static_model, static_output, graph, _graph_key

    batch_size = 1 if isinstance(prompt, str) else len(prompt)
    device = "cuda"

    # Release any previous capture before allocating the new buffers.
    graph = None
    static_output = None
    static_inputs = None

    prompt_embeds, pooled_prompt_embeds, text_ids = _encode_prompt(prompt, device)

    latents, latent_image_ids = pipe.prepare_latents(
        batch_size,
        pipe.transformer.config.in_channels // 4,
        height,
        width,
        prompt_embeds.dtype,
        device,
        None,
        None,
    )

    if pipe.transformer.config.guidance_embeds:
        guidance = torch.full([1], 3.5, device=device, dtype=torch.float16).expand(latents.shape[0])
    else:
        guidance = None

    static_inputs = {
        "hidden_states": latents,
        # Placeholder value; overwritten with the real timestep before each replay.
        "timestep": torch.ones(latents.shape[0], device=device, dtype=latents.dtype),
        "guidance": guidance,
        "pooled_projections": pooled_prompt_embeds,
        "encoder_hidden_states": prompt_embeds,
        "txt_ids": text_ids,
        "img_ids": latent_image_ids,
        "joint_attention_kwargs": None,
    }
    static_model = pipe.transformer

    with torch.no_grad():
        # Warm up on a side stream before capture, as the PyTorch CUDA-graph
        # documentation requires.
        side_stream = torch.cuda.Stream()
        side_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side_stream):
            for _ in range(_WARMUP_ITERS):
                static_model(**static_inputs, return_dict=False)
        torch.cuda.current_stream().wait_stream(side_stream)

        # Capture the forward pass directly. The previous code first wrapped
        # the module with make_graphed_callables (passing a dict that
        # contains None) and then captured it a second time, and it dropped
        # the captured output in a local variable.
        captured = torch.cuda.CUDAGraph()
        with torch.cuda.graph(captured):
            static_output = static_model(**static_inputs, return_dict=False)[0]

    # Publish the graph only after a successful capture so a failed attempt
    # is retried on the next request.
    graph = captured
    _graph_key = (batch_size, int(height), int(width))


# Inference function
@spaces.GPU(duration=25)
def generate_image(prompt, seed=24, width=DEFAULT_WIDTH, height=DEFAULT_HEIGHT, randomize_seed=False, num_inference_steps=2, progress=gr.Progress(track_tqdm=True)):
    """Generate one image for `prompt` by replaying the captured transformer graph.

    Args:
        prompt: Text prompt.
        seed: Seed for the noise generator; replaced by a random one when
            `randomize_seed` is true or when `seed` is None.
        width: Image width in pixels.
        height: Image height in pixels.
        randomize_seed: Draw a fresh seed in [0, MAX_SEED] instead of using `seed`.
        num_inference_steps: Number of denoising steps.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        `(image, seed, latency_text)` — the decoded image, the seed actually
        used, and a human-readable latency string.
    """
    # NOTE(review): these helpers are called by this file but were never
    # imported; they are assumed to live in custom_pipeline next to
    # FluxWithCFGPipeline — confirm.
    from custom_pipeline import calculate_timestep_shift, prepare_timesteps

    # A cleared seed field arrives as None; treat it like "randomize".
    if randomize_seed or seed is None:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(int(float(seed)))

    start_time = time.time()

    height, width = int(height), int(width)
    num_inference_steps = int(num_inference_steps)
    batch_size = 1 if isinstance(prompt, str) else len(prompt)

    if graph is None or _graph_key != (batch_size, height, width):
        # First call, or the buffer shapes changed: (re)capture. This also
        # fills the text-conditioning buffers for `prompt`.
        setup_cuda_graph(prompt, height, width, num_inference_steps)
    else:
        # Same shapes: only refresh the text conditioning in place so the
        # replayed graph sees the new prompt.
        prompt_embeds, pooled_prompt_embeds, _ = _encode_prompt(prompt)
        static_inputs["encoder_hidden_states"].copy_(prompt_embeds)
        static_inputs["pooled_projections"].copy_(pooled_prompt_embeds)

    latents = pipe.prepare_latents(
        batch_size,
        pipe.transformer.config.in_channels // 4,
        height,
        width,
        static_inputs["encoder_hidden_states"].dtype,
        "cuda",
        generator,
        None,
    )[0]

    # Build the schedule on every call so a changed step count takes effect.
    # NOTE(review): prepare_timesteps is assumed to (re)initialise
    # pipe.scheduler for this schedule — confirm.
    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
    mu = calculate_timestep_shift(latents.shape[1])
    timesteps, num_inference_steps = prepare_timesteps(
        pipe.scheduler,
        num_inference_steps,
        "cuda",
        None,
        sigmas,
        mu=mu,
    )

    with torch.no_grad():
        for t in timesteps:
            static_inputs["hidden_states"].copy_(latents)
            # The transformer takes the timestep divided by 1000, as in the
            # upstream FLUX pipeline's denoising loop.
            static_inputs["timestep"].copy_(t / 1000)
            graph.replay()
            # Step from the *prediction*. Previously the untouched input
            # noise was decoded, so the result never depended on the model.
            latents = pipe.scheduler.step(static_output, t, latents, return_dict=False)[0]

        img = pipe._decode_latents_to_image(latents, height, width, "pil")

    latency = f"Latency: {(time.time()-start_time):.2f} seconds"
    return img, seed, latency

# Example prompts shown in the "Inspiration Gallery".
# Long prompts are wrapped with implicit string concatenation; the resulting
# strings are unchanged.
examples = [
    "a tiny astronaut hatching from an egg on the moon",
    "a cute white cat holding a sign that says hello world",
    "an anime illustration of Steve Jobs",
    "Create image of Modern house in minecraft style",
    (
        "photo of a woman on the beach, shot from above. "
        "She is facing the sea, while wearing a white dress. "
        "She has long blonde hair"
    ),
    (
        "Selfie photo of a wizard with long beard and purple robes, "
        "he is apparently in the middle of Tokyo. "
        "Probably taken from a phone."
    ),
    (
        "Photo of a young woman with long, wavy brown hair tied in a bun and glasses. "
        "She has a fair complexion and is wearing subtle makeup, "
        "emphasizing her eyes and lips. "
        "She is dressed in a black top. "
        "The background appears to be an urban setting with a building facade, "
        "and the sunlight casts a warm glow on her face."
    ),
]

# --- Gradio UI ---
# Layout: image on the left, prompt + controls on the right, example gallery
# below. Every generation event calls generate_image and writes to
# (result, seed, latency).
with gr.Blocks() as demo:
    with gr.Column(elem_id="app-container"):
        gr.Markdown("# 🎨 Realtime FLUX Image Generator")
        gr.Markdown("Generate stunning images in real-time with Modified Flux.Schnell pipeline.")
        gr.Markdown("<span style='color: red;'>Note: Sometimes it stucks or stops generating images (I don't know why). In that situation just refresh the site.</span>")

        with gr.Row():
            # Integer scales in the same 2.5 : 1 ratio as before; Gradio
            # expects `scale` to be an integer.
            with gr.Column(scale=5):
                result = gr.Image(label="Generated Image", show_label=False, interactive=False)
            with gr.Column(scale=2):
                prompt = gr.Text(
                    label="Prompt",
                    placeholder="Describe the image you want to generate...",
                    lines=3,
                    show_label=False,
                    container=False,
                )
                generateBtn = gr.Button("🖼️ Generate Image")
                enhanceBtn = gr.Button("🚀 Enhance Image")

                # NOTE(review): the string is passed positionally to
                # gr.Column; it looks like a section title was intended —
                # confirm whether gr.Accordion("Advanced Options") was meant.
                with gr.Column("Advanced Options"):
                    with gr.Row():
                        realtime = gr.Checkbox(label="Realtime Toggler", info="If TRUE then uses more GPU but create image in realtime.", value=False)
                        latency = gr.Text(label="Latency")
                    with gr.Row():
                        seed = gr.Number(label="Seed", value=42)
                        randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                    with gr.Row():
                        width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=DEFAULT_WIDTH)
                        height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=DEFAULT_HEIGHT)
                        num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=4, step=1, value=DEFAULT_INFERENCE_STEPS)

        with gr.Row():
            gr.Markdown("### 🌟 Inspiration Gallery")
        with gr.Row():
            gr.Examples(
                examples=examples,
                fn=generate_image,
                inputs=[prompt],
                outputs=[result, seed, latency],
                cache_examples="lazy",
            )

    # "Enhance" passes only (prompt, seed, width, height), so generate_image
    # runs with its own defaults for randomize_seed and num_inference_steps.
    enhanceBtn.click(
        fn=generate_image,
        inputs=[prompt, seed, width, height],
        outputs=[result, seed, latency],
        show_progress="full",
        queue=False,
        concurrency_limit=None
    )

    generateBtn.click(
        fn=generate_image,
        inputs=[prompt, seed, width, height, randomize_seed, num_inference_steps],
        outputs=[result, seed, latency],
        show_progress="full",
        api_name="RealtimeFlux",
        queue=False
    )

    def update_ui(realtime_enabled):
        """Hide the Generate button while realtime mode is on; keep the prompt editable."""
        return {
            prompt: gr.update(interactive=True),
            generateBtn: gr.update(visible=not realtime_enabled)
        }

    realtime.change(
        fn=update_ui,
        inputs=[realtime],
        outputs=[prompt, generateBtn],
        queue=False,
        concurrency_limit=None
    )

    def realtime_generation(*args):
        """Generate on every input change, but only while realtime mode is on.

        `args` is (realtime, prompt, seed, width, height, randomize_seed,
        num_inference_steps); everything after the flag is forwarded to
        generate_image.
        """
        if not args[0]:
            # Realtime is off: leave all three outputs untouched instead of
            # returning a single None for three output components.
            return gr.update(), gr.update(), gr.update()
        # generate_image returns a plain (image, seed, latency) tuple, so it
        # is returned directly; the previous next(...) call raised TypeError
        # because a tuple is not an iterator.
        return generate_image(*args[1:])

    prompt.submit(
        fn=generate_image,
        inputs=[prompt, seed, width, height, randomize_seed, num_inference_steps],
        outputs=[result, seed, latency],
        show_progress="full",
        queue=False,
        concurrency_limit=None
    )

    # Any edit to these controls triggers a realtime generation;
    # "always_last" is set so only the most recent pending change is kept.
    for component in [prompt, width, height, num_inference_steps]:
        component.input(
            fn=realtime_generation,
            inputs=[realtime, prompt, seed, width, height, randomize_seed, num_inference_steps],
            outputs=[result, seed, latency],
            show_progress="hidden",
            trigger_mode="always_last",
            queue=False,
            concurrency_limit=None
        )

# Launch the app
# `concurrency_count` is the pre-4.0 queue argument. This file already uses
# the 4.x event arguments (`concurrency_limit`, `trigger_mode`), where the
# queue-wide default is set via `default_concurrency_limit` instead.
demo.queue(max_size=5, default_concurrency_limit=1).launch()