import os
import sys
sys.path.append('app/')
import torch
import spaces
import safetensors.torch
import gradio as gr
from PIL import Image
from loguru import logger
from torchvision import transforms
from huggingface_hub import hf_hub_download, login
from diffusers import FluxPipeline, FluxTransformer2DModel
from projection import ImageEncoder
from transformer_flux_custom import FluxTransformer2DModel as FluxTransformer2DModelWithIP
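
# Configuration: local transformer config, the base FLUX model, and the
# character-consistency adapter hosted on the Hugging Face Hub.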
model_config = './config.json'
pretrained_model_name = 'black-forest-labs/FLUX.1-dev'
adapter_path = 'model.safetensors'
adapter_repo_id = "ashen0209/Flux-Character-Consitancy"
conditioner_base_model = 'eva02_large_patch14_448.mim_in22k_ft_in1k'
conditioner_layer_num = 12
device = "cuda" if torch.cuda.is_available() else "cpu"
output_dim = 4096
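
# Build the custom transformer (with image-prompt, "IP", conditioning) from the
# local config, then copy the pretrained FLUX weights into it; the extra IP
# layers stay uninitialized until the adapter checkpoint is loaded below.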
logger.info("init model") | |
model = FluxTransformer2DModelWithIP.from_config(model_config, torch_dtype=torch.bfloat16) # type: ignore | |
logger.info("load model") | |
copy = FluxTransformer2DModel.from_pretrained(pretrained_model_name, subfolder='transformer', torch_dtype=torch.bfloat16) | |
model.load_state_dict(copy.state_dict(), strict=False) | |
del copy | |
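
# Projection encoder: maps EVA-02 vision features into the 4096-dim prompt
# embedding space so reference-image tokens can be concatenated with text tokens.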
logger.info("load proj") | |
extra_embedder = ImageEncoder(output_dim, layer_num=conditioner_layer_num, seq_len=2, device=device, base_model=conditioner_base_model).to(device=device, dtype=torch.bfloat16) | |
logger.info("load pipe") | |
pipe = FluxPipeline.from_pretrained(pretrained_model_name, transformer=model, torch_dtype=torch.bfloat16) | |
pipe.to(dtype=torch.bfloat16, device=device) | |
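
# black-forest-labs/FLUX.1-dev is a gated model, so a valid HF_TOKEN must be
# available in the environment (e.g. as a Space secret).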
logger.info("download adapter") | |
login(token=os.environ['HF_TOKEN']) | |
file_path = hf_hub_download(repo_id=adapter_repo_id, filename=adapter_path) | |
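
# The adapter checkpoint holds weights for both the transformer's IP layers and
# the projection encoder; keys are prefixed with the owning module's name, so
# the prefix is stripped before loading into each submodule.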
logger.info("load adapter") | |
state_dict = safetensors.torch.load_file(adapter_path) | |
state_dict = {'.'.join(k.split('.')[1:]): state_dict[k] for k in state_dict.keys()} | |
diff = model.load_state_dict(state_dict, strict=False) | |
diff = extra_embedder.load_state_dict(state_dict, strict=False) | |
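
# EVA-02 preprocessing: 448x448 input with the encoder's normalization statistics.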
IMAGE_PROCESS_TRANSFORM = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4815, 0.4578, 0.4082], std=[0.2686, 0.2613, 0.276]),
])
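
# End-to-end generation: encode the text prompt, project the reference image
# into extra prompt tokens, and run the FLUX pipeline on the combined embedding.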
@spaces.GPU  # ZeroGPU: allocate a GPU only for the duration of each call
def generate_image(ref_image, prompt, height=512, width=512, num_steps=25, guidance_scale=3.5, ip_scale=1.0):
    with torch.no_grad():
        # Preprocess the reference image and batch it for the projection encoder
        image_refs = [torch.stack([IMAGE_PROCESS_TRANSFORM(ref_image)])]
        image_refs = [i.to(dtype=torch.bfloat16, device=device) for i in image_refs]
        prompt_embeds, pooled_prompt_embeds, txt_ids = pipe.encode_prompt(prompt, prompt)
        # Append the projected image tokens to the text prompt tokens
        visual_prompt_embeds = extra_embedder(image_refs)
        prompt_embeds_with_ref = torch.cat([prompt_embeds, visual_prompt_embeds], dim=1)
        pipe.transformer.ip_scale = ip_scale  # strength of the reference-image conditioning
        image = pipe(
            prompt_embeds=prompt_embeds_with_ref,
            pooled_prompt_embeds=pooled_prompt_embeds,
            height=height,
            width=width,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
        ).images[0]
    return image
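
# Illustrative direct call (assumes a local "ref.png"; normally invoked via the UI):
#   out = generate_image(Image.open("ref.png").convert("RGB"), "a watercolor portrait", ip_scale=0.8)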
iface = gr.Interface(
    fn=generate_image,
    inputs=[
        gr.Image(type="pil", label="Upload Reference Subject Image"),
        gr.Textbox(lines=2, placeholder="Describe the desired contents", label="Description Text"),
    ],
    outputs=gr.Image(type="pil", label="Generated Image"),
    # generation runs on Submit; live mode would re-run the full pipeline on every input change
)
if __name__ == "__main__": | |
iface.launch() | |