import os
import torch
import spaces
import safetensors.torch
import gradio as gr
from PIL import Image
from loguru import logger
from torchvision import transforms
from huggingface_hub import hf_hub_download, login
from diffusers import FluxPipeline, FluxTransformer2DModel
from projection import ImageEncoder
from transformer_flux_custom import FluxTransformer2DModel as FluxTransformer2DModelWithIP
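
# Gradio Space: subject-consistent generation with FLUX.1-dev plus a reference-image
# adapter downloaded from the Hub (ashen0209/Flux-Character-Consitancy).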
model_config = './config.json'
pretrained_model_name = 'black-forest-labs/FLUX.1-dev'
adapter_path = 'model.safetensors'
adapter_repo_id = "ashen0209/Flux-Character-Consitancy"
conditioner_base_model = 'eva02_large_patch14_448.mim_in22k_ft_in1k'
conditioner_layer_num = 12
device = "cuda" if torch.cuda.is_available() else "cpu"
output_dim = 4096
logger.info(f"pretrained_model_name: {pretrained_model_name}, adapter_repo_id: {adapter_repo_id}, adapter_path: {adapter_path}, conditioner_layer: {conditioner_layer_num}, output_dim {output_dim}, device: {device}")
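
# Instantiate the custom transformer (with image-prompt conditioning) from the local
# config, then copy in the pretrained FLUX.1-dev transformer weights.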
logger.info("init model")
model = FluxTransformer2DModelWithIP.from_config(model_config, torch_dtype=torch.bfloat16) # type: ignore
logger.info("load model")
copy = FluxTransformer2DModel.from_pretrained(pretrained_model_name, subfolder='transformer', torch_dtype=torch.bfloat16)
model.load_state_dict(copy.state_dict(), strict=False)
del copy
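
# Image encoder that projects EVA-02 features of the reference image so they can be
# concatenated with the 4096-dim FLUX prompt embeddings.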
logger.info("load proj")
extra_embedder = ImageEncoder(output_dim, layer_num=conditioner_layer_num, seq_len=2, device=device, base_model=conditioner_base_model).to(device=device, dtype=torch.bfloat16)
logger.info("load pipe")
pipe = FluxPipeline.from_pretrained(pretrained_model_name, transformer=model, torch_dtype=torch.bfloat16)
pipe.to(dtype=torch.bfloat16, device=device)
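
# Log in with the HF_TOKEN secret before downloading the adapter weights.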
logger.info("download adapter")
login(token=os.environ['HF_TOKEN'])
file_path = hf_hub_download(repo_id=adapter_repo_id, filename=adapter_path)
logger.info("load adapter")
state_dict = safetensors.torch.load_file(file_path)
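# Strip the leading module prefix from every key, then load the adapter weights into
# both the transformer and the image encoder (strict=False: each picks up only its own keys).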
state_dict = {'.'.join(k.split('.')[1:]): v for k, v in state_dict.items()}
diff = model.load_state_dict(state_dict, strict=False)
diff = extra_embedder.load_state_dict(state_dict, strict=False)
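
# Reference-image preprocessing: resize to the 448x448 EVA-02 input size and apply
# its (CLIP-style) normalization statistics.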
IMAGE_PROCESS_TRANSFORM = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2686, 0.2613, 0.276])
])
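
# @spaces.GPU allocates a GPU for each call when the Space runs on ZeroGPU hardware.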
@spaces.GPU
def generate_image(ref_image, prompt, height=512, width=512, num_steps=25, guidance_scale=3.5, ip_scale=1.0):
    with torch.no_grad():
        # Preprocess the single reference image and batch it for the encoder.
        image_refs = [torch.stack([IMAGE_PROCESS_TRANSFORM(ref_image)])]
        image_refs = [i.to(dtype=torch.bfloat16, device='cuda') for i in image_refs]
        # Encode the text prompt, project the reference image into the same embedding
        # space, and append the visual tokens to the text tokens.
        prompt_embeds, pooled_prompt_embeds, txt_ids = pipe.encode_prompt(prompt, prompt)
        visual_prompt_embeds = extra_embedder(image_refs)
        prompt_embeds_with_ref = torch.cat([prompt_embeds, visual_prompt_embeds], dim=1)
        # Strength of the reference-image conditioning inside the custom transformer.
        pipe.transformer.ip_scale = ip_scale
        image = pipe(
            prompt_embeds=prompt_embeds_with_ref,
            pooled_prompt_embeds=pooled_prompt_embeds,
            # negative_prompt_embeds=negative_prompt_embeds,
            # negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            height=height,
            width=width,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
        ).images[0]
    return image
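
# Gradio UI: reference image + text prompt in, generated image out.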
iface = gr.Interface(
    fn=generate_image,
    inputs=[
        gr.Image(type="pil", label="Upload Reference Subject Image"),
        gr.Textbox(lines=2, placeholder="Describe the desired contents", label="Description Text"),
        gr.Slider(minimum=256, maximum=1024, value=512, label="Height"),
        gr.Slider(minimum=256, maximum=1024, value=512, label="Width"),
        gr.Slider(minimum=20, maximum=50, value=25, label="Number of Steps"),
        gr.Slider(minimum=1.0, maximum=8.0, value=3.5, label="Guidance Scale"),
        gr.Slider(minimum=0.0, maximum=2.0, value=1.0, label="Reference Image Scale"),
    ],
    outputs=gr.Image(type="pil", label="Generated Image"),
)

if __name__ == "__main__":
    iface.launch()