"""Gradio demo: FLUX.1-dev image generation driven by a Florence-2 caption
or a (optionally enhanced) text prompt."""

import os
import random

# NOTE(review): `spaces` is kept ahead of `torch`, matching the original
# import order; ZeroGPU setups generally expect this -- confirm before
# reordering.
import spaces
import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
from diffusers import DiffusionPipeline
import numpy as np

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# FLUX.1-dev model
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=dtype,
    token=huggingface_token,
).to(device)

# Initialize Florence model
florence_model = (
    AutoModelForCausalLM.from_pretrained(
        'microsoft/Florence-2-base', trust_remote_code=True
    )
    .to(device)
    .eval()
)
florence_processor = AutoProcessor.from_pretrained(
    'microsoft/Florence-2-base', trust_remote_code=True
)

# Task token sent to Florence-2 and used as the key of its parsed answer.
# NOTE(review): the source had empty strings here, apparently because
# angle-bracket content was stripped from the file; confirm this is the
# intended task.
FLORENCE_TASK = "<MORE_DETAILED_CAPTION>"

# Prompt Enhancer
enhancer_long = pipeline(
    "summarization",
    model="gokaygokay/Lamini-Prompt-Enchance-Long",
    device=device,
)

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 2048


# Florence caption function
def florence_caption(image):
    """Caption ``image`` with the Florence-2 model.

    Args:
        image: A ``PIL.Image.Image``, or any object accepted by
            ``Image.fromarray``.

    Returns:
        The entry of the processor's parsed answer stored under
        ``FLORENCE_TASK``.
    """
    # Convert image to PIL if it's not already
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    inputs = florence_processor(
        text=FLORENCE_TASK, images=image, return_tensors="pt"
    ).to(device)
    generated_ids = florence_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Special tokens are kept because the post-processing step receives the
    # raw decoded text.
    generated_text = florence_processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    parsed_answer = florence_processor.post_process_generation(
        generated_text,
        task=FLORENCE_TASK,
        image_size=(image.width, image.height),
    )
    return parsed_answer[FLORENCE_TASK]


# Prompt Enhancer function
def enhance_prompt(input_prompt):
    """Return the prompt-enhancer pipeline's rewrite of ``input_prompt``."""
    result = enhancer_long("Enhance the description: " + input_prompt)
    return result[0]['summary_text']


@spaces.GPU(duration=190)
def process_workflow(image, text_prompt, use_enhancer, seed, randomize_seed,
                     width, height, guidance_scale, num_inference_steps,
                     progress=gr.Progress(track_tqdm=True)):
    """Build a prompt and generate one image with the FLUX pipeline.

    The prompt is the Florence-2 caption of ``image`` when an image is
    given, otherwise ``text_prompt``; it is passed through the prompt
    enhancer when ``use_enhancer`` is true.

    Args:
        image: Optional input image (PIL image or array).
        text_prompt: Prompt text used when ``image`` is ``None``.
        use_enhancer: Whether to run the prompt through ``enhance_prompt``.
        seed: Seed for the generator; ignored when ``randomize_seed`` is set.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` when true.
        width: Output width passed to the pipeline.
        height: Output height passed to the pipeline.
        guidance_scale: Guidance scale passed to the pipeline.
        num_inference_steps: Number of inference steps passed to the pipeline.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        A tuple ``(generated_image, prompt_used, seed_used)``.

    Raises:
        gr.Error: If no image is given and ``text_prompt`` is empty.
    """
    if image is not None:
        # florence_caption handles the array -> PIL conversion itself.
        prompt = florence_caption(image)
    else:
        if not text_prompt or not text_prompt.strip():
            raise gr.Error("Please upload an image or enter a text prompt.")
        prompt = text_prompt

    if use_enhancer:
        prompt = enhance_prompt(prompt)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # UI sliders may deliver numeric values as floats; manual_seed needs an int.
    seed = int(seed)

    generator = torch.Generator(device=device).manual_seed(seed)

    image = pipe(
        prompt=prompt,
        generator=generator,
        num_inference_steps=int(num_inference_steps),
        width=int(width),
        height=int(height),
        guidance_scale=guidance_scale,
    ).images[0]

    return image, prompt, seed


custom_css = """
.input-group, .output-group {
    border: 1px solid #e0e0e0;
    border-radius: 10px;
    padding: 20px;
    margin-bottom: 20px;
    background-color: #f9f9f9;
}
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
"""

# NOTE(review): this text is rendered through gr.HTML but contains no tags;
# the original markup appears to have been lost -- restore it if available.
title = """

FLUX.1-dev with Florence-2 Captioner and Prompt Enhancer

[FLUX.1-dev Model] [Florence-2 Model] [Prompt Enhancer Long]

Create long prompts from images or enhance your short prompts with prompt enhancer

"""

# NOTE(review): the nesting below is reconstructed from the order of the
# widgets; confirm against the intended layout.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as demo:
    gr.HTML(title)

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group(elem_classes="input-group"):
                input_image = gr.Image(label="Input Image (Florence-2 Captioner)")

            with gr.Accordion("Advanced Settings", open=False):
                text_prompt = gr.Textbox(label="Text Prompt (optional, used if no image is uploaded)")
                use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=1, maximum=15, step=0.1, value=3.5)
                num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=28)

            generate_btn = gr.Button("Generate Image", elem_classes="submit-btn")

        with gr.Column(scale=1):
            with gr.Group(elem_classes="output-group"):
                output_image = gr.Image(label="Result", elem_id="gallery", show_label=False)
                final_prompt = gr.Textbox(label="Final Prompt Used")
                used_seed = gr.Number(label="Seed Used")

    generate_btn.click(
        fn=process_workflow,
        inputs=[
            input_image, text_prompt, use_enhancer, seed, randomize_seed,
            width, height, guidance_scale, num_inference_steps
        ],
        outputs=[output_image, final_prompt, used_seed]
    )

demo.launch(debug=True)