import spaces
import gradio as gr
import torch
from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor, pipeline
from diffusers import StableDiffusion3Pipeline
import re
import random
import numpy as np
import os
from huggingface_hub import snapshot_download

# Initialize models
# Use the GPU when torch reports one; every model below is moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Weight dtype passed to the SD3 pipeline when it is loaded further down.
dtype = torch.float16

# Hub access token read from the environment (None when HUGGINGFACE_TOKEN is unset).
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# Download the SD3 Medium snapshot into ./SD3 and keep the path that
# snapshot_download returns; the pipeline is loaded from that path.
model_path = snapshot_download(
    repo_id="stabilityai/stable-diffusion-3-medium",
    revision="refs/pr/26",
    repo_type="model",
    # Skip files that are not needed to load the pipeline. The previous
    # pattern "*..gitattributes" (doubled dot) could never match the
    # ".gitattributes" file it was meant to exclude.
    ignore_patterns=["*.md", "*.gitattributes"],
    local_dir="SD3",
    token=huggingface_token,
)

# VLM Captioner: PaliGemma checkpoint and its processor, both loaded from the
# same Hub repository. The model is moved to `device` and put in eval mode.
vlm_model = PaliGemmaForConditionalGeneration.from_pretrained(
    "gokaygokay/sd3-long-captioner-v2"
)
vlm_model = vlm_model.to(device).eval()
vlm_processor = PaliGemmaProcessor.from_pretrained(
    "gokaygokay/sd3-long-captioner-v2"
)

# Prompt Enhancer: two "summarization" pipelines, one per enhancer model
# selectable in the UI ("Medium" / "Long").
enhancer_medium = pipeline(
    "summarization",
    model="gokaygokay/Lamini-Prompt-Enchance",
    device=device,
)
enhancer_long = pipeline(
    "summarization",
    model="gokaygokay/Lamini-Prompt-Enchance-Long",
    device=device,
)

# SD3: load the pipeline from the downloaded snapshot, then move it to `device`.
sd3_pipe = StableDiffusion3Pipeline.from_pretrained(
    model_path,
    torch_dtype=dtype,
)
sd3_pipe = sd3_pipe.to(device)

# Upper bounds for the seed (largest int32) and for the image width/height.
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1344

# VLM Captioner function
def create_captions_rich(image):
    """Caption *image* with the VLM captioner and post-process the text.

    The processor receives the fixed task prompt ``"caption en"`` together
    with the image. Generation is non-sampling (``do_sample=False``) with a
    repetition penalty of 1.10 and at most 256 new tokens. Only the tokens
    produced after the prompt are decoded.

    Args:
        image: The image to caption, in whatever form ``vlm_processor`` accepts.

    Returns:
        The decoded caption after it has been passed through ``modify_caption``.
    """
    inputs = vlm_processor(text="caption en", images=image, return_tensors="pt").to(device)
    # Number of prompt tokens; used to cut the prompt off the generated ids.
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        output_ids = vlm_model.generate(
            **inputs,
            repetition_penalty=1.10,
            max_new_tokens=256,
            do_sample=False,
        )
        new_token_ids = output_ids[0][prompt_length:]
        caption = vlm_processor.decode(new_token_ids, skip_special_tokens=True)

    return modify_caption(caption)

# Helper function for caption modification
def modify_caption(caption: str) -> str:
    """Remove the first "captured from " / "captured at " phrase from *caption*.

    Matching is case-insensitive and is not anchored to the start of the
    string; only the first occurrence is replaced.

    Args:
        caption: Caption text to clean up.

    Returns:
        The caption with the first matching phrase replaced by its
        substitution (currently the empty string), or the caption unchanged
        when no phrase is present.
    """
    prefix_substrings = [
        ('captured from ', ''),
        ('captured at ', ''),
    ]
    pattern = '|'.join(re.escape(opening) for opening, _ in prefix_substrings)
    # Keys are stored lower-cased so that the lookup below is case-insensitive.
    replacers = {opening.lower(): replacer for opening, replacer in prefix_substrings}

    def replace_fn(match):
        # The pattern ignores case, so the matched text may be e.g.
        # "Captured from "; looking it up verbatim raised KeyError.
        return replacers[match.group(0).lower()]

    return re.sub(pattern, replace_fn, caption, count=1, flags=re.IGNORECASE)

# Prompt Enhancer function
def _strip_of_lead_in(text):
    """Return *text* with a leading "<anything> of " lead-in removed.

    Everything up to and including the first standalone word "of" (matched
    case-insensitively) is dropped, the first character of what follows is
    upper-cased, and the rest of the text is kept as written. Text without
    such a lead-in, or with nothing after it, is returned unchanged.
    """
    # \b keeps "of" from matching inside words such as "waterproof".
    match = re.match(r'^.*?\bof\s+(.*?(?:\.|$))', text, re.IGNORECASE | re.DOTALL)
    if match is None or not match.group(1).strip():
        return text

    first_sentence = match.group(1)
    # Upper-case only the first character: str.capitalize() would also
    # lower-case the rest of the sentence (names, acronyms).
    first_sentence = first_sentence[:1].upper() + first_sentence[1:]
    remaining_text = text[match.end():].strip()
    # Avoid a dangling trailing space when nothing follows the first sentence.
    return f"{first_sentence} {remaining_text}" if remaining_text else first_sentence


def enhance_prompt(input_prompt, model_choice):
    """Expand *input_prompt* with one of the prompt-enhancer pipelines.

    Args:
        input_prompt: The prompt text to enhance; ``None`` is treated as an
            empty string.
        model_choice: ``"Medium"`` selects ``enhancer_medium``; any other
            value selects ``enhancer_long``.

    Returns:
        The ``summary_text`` of the first pipeline result. For the "Medium"
        model a leading "... of " lead-in is additionally stripped from the
        first sentence.
    """
    use_medium = model_choice == "Medium"
    enhancer = enhancer_medium if use_medium else enhancer_long

    result = enhancer("Enhance the description: " + (input_prompt or ""))
    enhanced_text = result[0]['summary_text']

    if use_medium:
        enhanced_text = _strip_of_lead_in(enhanced_text)

    return enhanced_text

# SD3 Generation function
def generate_image(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
    """Run the SD3 pipeline once and return the image with the seed used.

    Args:
        prompt: Text prompt passed to the pipeline.
        negative_prompt: Negative prompt passed to the pipeline.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: When true, ``seed`` is ignored and a random seed in
            ``[0, MAX_SEED]`` is drawn instead.
        width: Output width passed to the pipeline.
        height: Output height passed to the pipeline.
        guidance_scale: Guidance scale passed to the pipeline.
        num_inference_steps: Number of inference steps passed to the pipeline.

    Returns:
        A ``(image, seed)`` tuple: the first image produced by the pipeline
        and the integer seed that was actually used.
    """
    # torch.Generator.manual_seed requires an int, but values coming from UI
    # sliders or API clients may arrive as integral floats (e.g. 42.0).
    seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)

    generator = torch.Generator().manual_seed(seed)

    image = sd3_pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        # Integer-valued settings are coerced the same way as the seed.
        num_inference_steps=int(num_inference_steps),
        width=int(width),
        height=int(height),
        generator=generator,
    ).images[0]

    return image, seed

# End-to-end workflow (click handler): optional caption -> optional enhance -> generate
@spaces.GPU
def process_workflow(image, text_prompt, use_vlm, use_enhancer, model_choice, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
    """Build the final prompt and generate an image from it.

    The prompt is the VLM caption of ``image`` when ``use_vlm`` is set and an
    image was supplied, otherwise ``text_prompt``. When ``use_enhancer`` is
    set, the prompt is then rewritten by ``enhance_prompt`` using
    ``model_choice``. The remaining arguments are forwarded unchanged to
    ``generate_image``.

    Returns:
        A ``(generated_image, prompt, used_seed)`` tuple, where ``prompt`` is
        the final prompt that was sent to the image generator.
    """
    # NOTE(review): the spaces.GPU decorator presumably requests a GPU for the
    # duration of this call on Hugging Face Spaces -- confirm.
    caption_from_image = use_vlm and image is not None
    prompt = create_captions_rich(image) if caption_from_image else text_prompt

    if use_enhancer:
        prompt = enhance_prompt(prompt, model_choice)

    result_image, seed_used = generate_image(
        prompt,
        negative_prompt,
        seed,
        randomize_seed,
        width,
        height,
        guidance_scale,
        num_inference_steps,
    )

    return result_image, prompt, seed_used


# Extra CSS passed to gr.Blocks: a bordered, rounded, light-grey card for the
# "input-group" / "output-group" containers and a blue "submit-btn" button
# with a lighter blue on hover.
custom_css = """
.input-group, .output-group {
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin-bottom: 20px;
    background-color: #f9f9f9;
}
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
"""

# HTML header rendered at the top of the page via gr.HTML: the app title,
# links to the models and the author's pages, and a reminder to tick the
# captioner / enhancer checkboxes.
title = """<h1 align="center">VLM Captioner + Prompt Enhancer + SD3 Image Generator</h1>
<p><center>
<a href="https://huggingface.co/spaces/gokaygokay/SD3-Long-Captioner-V2" target="_blank">[VLM Model]</a>
<a href="https://huggingface.co/gokaygokay/Lamini-Prompt-Enchance-Long" target="_blank">[Prompt Enhancer Long]</a>
<a href="https://huggingface.co/gokaygokay/Lamini-Prompt-Enchance" target="_blank">[Prompt Enhancer Medium]</a>
<a href="https://github.com/gokayfem" target="_blank">[Github]</a>
<a href="https://x.com/NONDA30" target="_blank">[X/twitter]</a>
<p align="center">Dont forget to click <b>Use VLM Captioner</b> or <b>Use Prompt Enhancer</b> Buttons!</p>
</center></p>
"""

# Gradio Interface
# Two-column page: inputs and settings on the left, results on the right.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as demo:
    
    # Page header (title and links).
    gr.HTML(title)
    
    with gr.Row():
        # Left column: image input, text prompt, advanced settings, button.
        with gr.Column(scale=1):
            # The caption is only used when the checkbox is ticked AND an
            # image is supplied; otherwise the text prompt is used.
            with gr.Group(elem_classes="input-group"):
                input_image = gr.Image(label="Input Image for VLM")
                use_vlm = gr.Checkbox(label="Use VLM Captioner", value=False)
            
            # Text prompt plus optional enhancement; the radio picks which
            # enhancer model is applied.
            with gr.Group(elem_classes="input-group"):
                text_prompt = gr.Textbox(label="Text Prompt")
                use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
                model_choice = gr.Radio(["Medium", "Long"], label="Enhancer Model", value="Long")
            
            # Generation settings, collapsed by default.
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt = gr.Textbox(label="Negative Prompt")
                # The seed slider is ignored while "Randomize Seed" is ticked.
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=64, value=1024)
                height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=64, value=1024)
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=10.0, step=0.1, value=5.0)
                num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=28)
            
            generate_btn = gr.Button("Generate Image", elem_classes="submit-btn")
        
        # Right column: generated image, the prompt that was actually used,
        # and the seed that was actually used.
        with gr.Column(scale=1):
            with gr.Group(elem_classes="output-group"):
                output_image = gr.Image(label="Generated Image")
                final_prompt = gr.Textbox(label="Final Prompt Used")
                used_seed = gr.Number(label="Seed Used")
    
    # Wire the button to the workflow. The order of `inputs` must match the
    # positional parameters of process_workflow, and `outputs` must match the
    # order of its returned tuple (image, prompt, seed).
    generate_btn.click(
        fn=process_workflow,
        inputs=[
            input_image, text_prompt, use_vlm, use_enhancer, model_choice,
            negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps
        ],
        outputs=[output_image, final_prompt, used_seed]
    )

# Start the app; this runs at module level (no __main__ guard).
demo.launch(debug=True)