# Hugging Face Space application script (the Space was shown as "Paused" when this page was captured).
import os

import gradio as gr
import torch
from diffusers import (
    AutoencoderKL,
    ControlNetModel,
    SD3ControlNetModel,  # ControlNet architecture for SD3 / SD3.5 transformers
    SD3Transformer2DModel,  # Replacing UNet with SD3 transformer
    StableDiffusion3ControlNetPipeline,
    StableDiffusion3Pipeline,  # For SD3 models like Stable Diffusion 3.5
    UniPCMultistepScheduler,
)
from huggingface_hub import login
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
# Log in to Hugging Face with the token from the environment variables.
# login(token=None) falls back to an interactive prompt, which cannot be
# answered in a headless deployment, so authenticate only when a token is set.
token = os.getenv("HF_TOKEN")
if token:
    login(token=token)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login (gated model downloads may fail).")
# Model IDs for the base Stable Diffusion 3.5 model and its ControlNet.
# The previously configured ControlNet (lllyasviel/control_v11p_sd15_inpaint)
# targets the SD 1.5 UNet and cannot be attached to an SD3 transformer, so an
# SD3.5-Large ControlNet checkpoint is used instead.
# NOTE(review): the canny variant expects an edge map as its control image and
# is published alongside the non-turbo 3.5-large base -- confirm that this
# checkpoint (or the blur/depth variants) matches the intended use.
model_id = "stabilityai/stable-diffusion-3.5-large-turbo"
controlnet_id = "stabilityai/stable-diffusion-3.5-large-controlnet-canny"

# Half precision is only worthwhile on a GPU; use float32 on CPU-only hosts.
_USE_CUDA = torch.cuda.is_available()
_DTYPE = torch.float16 if _USE_CUDA else torch.float32

# Load the ControlNet, then let from_pretrained assemble the remaining
# components (transformer, VAE, text encoders, tokenizers and scheduler) from
# the model repository. StableDiffusion3Pipeline's constructor accepts no
# `controlnet` or `torch_dtype` argument and needs all three text
# encoder/tokenizer pairs, so building it by hand from a partial component
# set fails.
controlnet = SD3ControlNetModel.from_pretrained(controlnet_id, torch_dtype=_DTYPE)
pipeline = StableDiffusion3ControlNetPipeline.from_pretrained(
    model_id,
    controlnet=controlnet,
    torch_dtype=_DTYPE,
)

# Keep the module-level component names that existed before this rewrite.
transformer = pipeline.transformer
vae = pipeline.vae
text_encoder = pipeline.text_encoder
tokenizer = pipeline.tokenizer
feature_extractor = getattr(pipeline, "feature_extractor", None)

# Enable model CPU offloading for memory optimization. Offloading manages
# device placement itself, so the pipeline must not also be moved with
# .to("cuda"); without a GPU the pipeline simply stays on the CPU.
if _USE_CUDA:
    pipeline.enable_model_cpu_offload()


# Gradio interface function
def generate_image(
    prompt,
    reference_image,
    guidance_scale=7.5,
    num_inference_steps=50,
    controlnet_conditioning_scale=1.0,
):
    """Generate an image from a text prompt, conditioned on a reference image.

    Args:
        prompt: Text prompt describing the desired image.
        reference_image: PIL image supplied by the Gradio image input; it is
            converted to RGB, resized to 512x512 and passed to the pipeline
            as the ControlNet control image.
        guidance_scale: Classifier-free guidance scale passed to the pipeline.
        num_inference_steps: Number of denoising steps.
        controlnet_conditioning_scale: Weight of the ControlNet conditioning.

    Returns:
        The first generated PIL image.

    Raises:
        gr.Error: If no reference image was provided.
    """
    # Gradio passes None when the image input is left empty.
    if reference_image is None:
        raise gr.Error("Please upload a reference image.")

    # Resize and prepare reference image
    control_image = reference_image.convert("RGB").resize((512, 512))

    # NOTE(review): the turbo checkpoint's model card suggests very few steps
    # with guidance disabled -- confirm whether the defaults above should be
    # lowered.
    result = pipeline(
        prompt=prompt,
        control_image=control_image,  # SD3 ControlNet pipelines use `control_image`, not `image`
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
    )
    return result.images[0]
# Set up Gradio interface: a text prompt and a PIL reference image go in,
# a single generated image comes out.
interface = gr.Interface(
    fn=generate_image,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Image(type="pil", label="Reference Image (Style)"),
    ],
    outputs="image",
    title="Image Generation with ControlNet (Reference-Only Style Transfer)",
    description="Generates an image based on a text prompt and style reference image using Stable Diffusion 3.5 and ControlNet (reference-only mode).",
)

# Launch the Gradio interface
interface.launch()