Spaces:

gokaygokay
/

FLUX.1-dev-with-Captioner

Running on Zero

App Files Files Community

FLUX.1-dev-with-Captioner / app.py

gokaygokay

Update app.py

0a0ceda verified about 1 year ago

raw

history blame

7 kB

	import spaces
	import gradio as gr
	import torch
	from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor, pipeline
	from diffusers import StableDiffusion3Pipeline
	import re
	import random
	import numpy as np
	import os
	from huggingface_hub import snapshot_download

	# Initialize models
	device = "cuda" if torch.cuda.is_available() else "cpu"
	dtype = torch.float16

	huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

	model_path = snapshot_download(
	repo_id="stabilityai/stable-diffusion-3-medium",
	revision="refs/pr/26",
	repo_type="model",
	ignore_patterns=[".md", "..gitattributes"],
	local_dir="SD3",
	token=huggingface_token, # type a new token-id.
	)

	# VLM Captioner
	vlm_model = PaliGemmaForConditionalGeneration.from_pretrained("gokaygokay/sd3-long-captioner").to(device).eval()
	vlm_processor = PaliGemmaProcessor.from_pretrained("gokaygokay/sd3-long-captioner")

	# Prompt Enhancer
	enhancer_medium = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance", device=device)
	enhancer_long = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance-Long", device=device)

	# SD3
	sd3_pipe = StableDiffusion3Pipeline.from_pretrained(model_path, torch_dtype=dtype).to(device)

	MAX_SEED = np.iinfo(np.int32).max
	MAX_IMAGE_SIZE = 1344

	# VLM Captioner function
	def create_captions_rich(image):
	prompt = "caption en"
	model_inputs = vlm_processor(text=prompt, images=image, return_tensors="pt").to(device)
	input_len = model_inputs["input_ids"].shape[-1]

	with torch.inference_mode():
	generation = vlm_model.generate(**model_inputs, max_new_tokens=256, do_sample=False)
	generation = generation[0][input_len:]
	decoded = vlm_processor.decode(generation, skip_special_tokens=True)

	return modify_caption(decoded)

	# Helper function for caption modification
	def modify_caption(caption: str) -> str:
	prefix_substrings = [
	('captured from ', ''),
	('captured at ', '')
	]
	pattern = '\|'.join([re.escape(opening) for opening, _ in prefix_substrings])
	replacers = {opening: replacer for opening, replacer in prefix_substrings}

	def replace_fn(match):
	return replacers[match.group(0)]

	return re.sub(pattern, replace_fn, caption, count=1, flags=re.IGNORECASE)

	# Prompt Enhancer function
	def enhance_prompt(input_prompt, model_choice):
	if model_choice == "Medium":
	result = enhancer_medium("Enhance the description: " + input_prompt)
	enhanced_text = result[0]['summary_text']

	pattern = r'^.?of\s+(.?(?:\.\|$))'
	match = re.match(pattern, enhanced_text, re.IGNORECASE \| re.DOTALL)

	if match:
	remaining_text = enhanced_text[match.end():].strip()
	modified_sentence = match.group(1).capitalize()
	enhanced_text = modified_sentence + ' ' + remaining_text
	else: # Long
	result = enhancer_long("Enhance the description: " + input_prompt)
	enhanced_text = result[0]['summary_text']

	return enhanced_text

	# SD3 Generation function
	def generate_image(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
	if randomize_seed:
	seed = random.randint(0, MAX_SEED)

	generator = torch.Generator().manual_seed(seed)

	image = sd3_pipe(
	prompt=prompt,
	negative_prompt=negative_prompt,
	guidance_scale=guidance_scale,
	num_inference_steps=num_inference_steps,
	width=width,
	height=height,
	generator=generator
	).images[0]

	return image, seed

	# Gradio Interface
	@spaces.GPU
	def process_workflow(image, text_prompt, use_vlm, use_enhancer, model_choice, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):
	if use_vlm and image is not None:
	prompt = create_captions_rich(image)
	else:
	prompt = text_prompt

	if use_enhancer:
	prompt = enhance_prompt(prompt, model_choice)

	generated_image, used_seed = generate_image(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps)

	return generated_image, prompt, used_seed


	css = """
	body {
	font-family: 'Arial', sans-serif;
	background-color: #f0f4f8;
	}
	.container {
	max-width: 800px;
	margin: 0 auto;
	padding: 20px;
	background-color: white;
	border-radius: 10px;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
	}
	h1 {
	color: #2c3e50;
	text-align: center;
	margin-bottom: 20px;
	}
	.input-box, .output-box {
	border: 1px solid #bdc3c7;
	border-radius: 5px;
	padding: 10px;
	}
	.input-box:focus, .output-box:focus {
	border-color: #3498db;
	box-shadow: 0 0 5px rgba(52, 152, 219, 0.5);
	}
	.submit-btn {
	background-color: #2980b9;
	color: white;
	border: none;
	padding: 10px 20px;
	border-radius: 5px;
	cursor: pointer;
	transition: background-color 0.3s;
	}
	.submit-btn:hover {
	background-color: #3498db;
	}
	"""


	# Gradio Interface
	with gr.Blocks(css=css) as demo:
	gr.Markdown("# SD3 Image Generator + VLM Captioner + Prompt Enhancer")

	with gr.Row():
	with gr.Column():
	input_image = gr.Image(label="Input Image for VLM")
	text_prompt = gr.Textbox(label="Text Prompt")
	use_vlm = gr.Checkbox(label="Use VLM Captioner", value=False)
	use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
	model_choice = gr.Radio(["Medium", "Long"], label="Enhancer Model", value="Long")

	with gr.Accordion("Advanced Settings", open=False):
	negative_prompt = gr.Textbox(label="Negative Prompt")
	seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
	randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
	width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=64, value=1024)
	height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=64, value=1024)
	guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.0, maximum=10.0, step=0.1, value=5.0)
	num_inference_steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=28)

	generate_btn = gr.Button("Generate Image")

	with gr.Column():
	output_image = gr.Image(label="Generated Image")
	final_prompt = gr.Textbox(label="Final Prompt Used")
	used_seed = gr.Number(label="Seed Used")

	generate_btn.click(
	fn=process_workflow,
	inputs=[
	input_image, text_prompt, use_vlm, use_enhancer, model_choice,
	negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps
	],
	outputs=[output_image, final_prompt, used_seed]
	)

	demo.launch()