from typing import TYPE_CHECKING, Mapping, Any

import torch
import weakref
from toolkit.config_modules import AdapterConfig
from toolkit.models.clip_fusion import ZipperBlock
from toolkit.models.zipper_resampler import ZipperModule
from toolkit.prompt_utils import PromptEmbeds
from toolkit.train_tools import get_torch_dtype

if TYPE_CHECKING:
    from toolkit.stable_diffusion_model import StableDiffusion

from transformers import (
    CLIPImageProcessor,
    CLIPVisionModelWithProjection,
    CLIPVisionModel
)

from toolkit.resampler import Resampler
import torch.nn as nn
class Embedder(nn.Module):
    def __init__(
            self,
            num_input_tokens: int = 1,
            input_dim: int = 1024,
            num_output_tokens: int = 8,
            output_dim: int = 768,
            mid_dim: int = 1024
    ):
        super(Embedder, self).__init__()
        self.num_output_tokens = num_output_tokens
        self.num_input_tokens = num_input_tokens
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.layer_norm = nn.LayerNorm(input_dim)
        self.fc1 = nn.Linear(input_dim, mid_dim)
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(mid_dim, mid_dim)
        self.fc2.weight.data.zero_()
        self.layer_norm2 = nn.LayerNorm(mid_dim)
        self.fc3 = nn.Linear(mid_dim, mid_dim)
        self.gelu2 = nn.GELU()
        self.fc4 = nn.Linear(mid_dim, output_dim * num_output_tokens)
        # set the weights of the later projections to 0
        self.fc3.weight.data.zero_()
        self.fc4.weight.data.zero_()
        # self.static_tokens = nn.Parameter(torch.zeros(num_output_tokens, output_dim))
        # self.scaler = nn.Parameter(torch.zeros(num_output_tokens, output_dim))

    def forward(self, x):
        if len(x.shape) == 2:
            x = x.unsqueeze(1)
        x = self.layer_norm(x)
        x = self.fc1(x)
        x = self.gelu(x)
        x = self.fc2(x)
        x = self.layer_norm2(x)
        x = self.fc3(x)
        x = self.gelu2(x)
        x = self.fc4(x)
        x = x.view(-1, self.num_output_tokens, self.output_dim)
        return x
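
# Illustrative sketch (not part of the original module): shows the shape contract of Embedder.
# A pooled image embedding of shape (batch, input_dim) is projected to num_output_tokens vectors
# of output_dim, the UNet cross-attention width. The concrete dimensions below are assumptions
# for an SD 1.5 style setup, not values taken from the adapter config.
def _embedder_shape_demo():
    embedder = Embedder(num_input_tokens=1, input_dim=1024, num_output_tokens=8, output_dim=768, mid_dim=1024)
    image_embeds = torch.randn(2, 1024)  # e.g. pooled image_embeds for a batch of 2
    out = embedder(image_embeds)  # unsqueezed to (2, 1, 1024) internally, projected to (2, 8, 768)
    return out.shape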

class ClipVisionAdapter(torch.nn.Module):
    def __init__(self, sd: 'StableDiffusion', adapter_config: AdapterConfig):
        super().__init__()
        self.config = adapter_config
        self.trigger = adapter_config.trigger
        self.trigger_class_name = adapter_config.trigger_class_name
        self.sd_ref: weakref.ref = weakref.ref(sd)
        # assumed default: get_clip_image_embeds_from_tensors references this attribute,
        # but the original file never defined it
        self.clip_noise_zero = True
        # embedding stuff
        self.text_encoder_list = sd.text_encoder if isinstance(sd.text_encoder, list) else [sd.text_encoder]
        self.tokenizer_list = sd.tokenizer if isinstance(sd.tokenizer, list) else [sd.tokenizer]

        placeholder_tokens = [self.trigger]

        # add dummy tokens for multi-vector
        additional_tokens = []
        for i in range(1, self.config.num_tokens):
            additional_tokens.append(f"{self.trigger}_{i}")
        placeholder_tokens += additional_tokens

        # handle dual tokenizer
        self.tokenizer_list = self.sd_ref().tokenizer if isinstance(self.sd_ref().tokenizer, list) else [
            self.sd_ref().tokenizer]
        self.text_encoder_list = self.sd_ref().text_encoder if isinstance(self.sd_ref().text_encoder, list) else [
            self.sd_ref().text_encoder]

        self.placeholder_token_ids = []
        self.embedding_tokens = []

        print(f"Adding {placeholder_tokens} tokens to tokenizer")
        print(f"Adding {self.config.num_tokens} tokens to tokenizer")

        for text_encoder, tokenizer in zip(self.text_encoder_list, self.tokenizer_list):

            num_added_tokens = tokenizer.add_tokens(placeholder_tokens)
            if num_added_tokens != self.config.num_tokens:
                raise ValueError(
                    f"The tokenizer already contains the token {self.trigger}. Please pass a different"
                    f" `placeholder_token` that is not already in the tokenizer. Only added {num_added_tokens}"
                )

            # Convert the initializer_token and placeholder_token to ids
            init_token_ids = tokenizer.encode(self.config.trigger_class_name, add_special_tokens=False)
            # if the class name encodes to more ids than we have embedding tokens, truncate; if fewer, pad with "*"
            if len(init_token_ids) > self.config.num_tokens:
                init_token_ids = init_token_ids[:self.config.num_tokens]
            elif len(init_token_ids) < self.config.num_tokens:
                pad_token_id = tokenizer.encode(["*"], add_special_tokens=False)
                init_token_ids += pad_token_id * (self.config.num_tokens - len(init_token_ids))

            placeholder_token_ids = tokenizer.encode(placeholder_tokens, add_special_tokens=False)
            self.placeholder_token_ids.append(placeholder_token_ids)

            # Resize the token embeddings as we are adding new special tokens to the tokenizer
            text_encoder.resize_token_embeddings(len(tokenizer))

            # Initialize the newly added placeholder tokens with the embeddings of the initializer tokens
            token_embeds = text_encoder.get_input_embeddings().weight.data
            with torch.no_grad():
                for initializer_token_id, token_id in zip(init_token_ids, placeholder_token_ids):
                    token_embeds[token_id] = token_embeds[initializer_token_id].clone()

            # Replaces "[name]" with this during training. It is generated automatically in the pipeline at inference.
            self.embedding_tokens.append(" ".join(tokenizer.convert_ids_to_tokens(placeholder_token_ids)))

        # backup text encoder embeddings
        self.orig_embeds_params = [x.get_input_embeddings().weight.data.clone() for x in self.text_encoder_list]

        try:
            self.clip_image_processor = CLIPImageProcessor.from_pretrained(self.config.image_encoder_path)
        except EnvironmentError:
            self.clip_image_processor = CLIPImageProcessor()
        self.device = self.sd_ref().unet.device
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            self.config.image_encoder_path,
            ignore_mismatched_sizes=True
        ).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        if self.config.train_image_encoder:
            self.image_encoder.train()
        else:
            self.image_encoder.eval()

        # max_seq_len = CLIP tokens + CLS token
        image_encoder_state_dict = self.image_encoder.state_dict()
        in_tokens = 257
        if "vision_model.embeddings.position_embedding.weight" in image_encoder_state_dict:
            # clip
            in_tokens = int(image_encoder_state_dict["vision_model.embeddings.position_embedding.weight"].shape[0])

        if hasattr(self.image_encoder.config, 'hidden_sizes'):
            embedding_dim = self.image_encoder.config.hidden_sizes[-1]
        else:
            embedding_dim = self.image_encoder.config.hidden_size

        if self.config.clip_layer == 'image_embeds':
            in_tokens = 1
            embedding_dim = self.image_encoder.config.projection_dim

        self.embedder = Embedder(
            num_output_tokens=self.config.num_tokens,
            num_input_tokens=in_tokens,
            input_dim=embedding_dim,
            output_dim=self.sd_ref().unet.config['cross_attention_dim'],
            mid_dim=embedding_dim * self.config.num_tokens,
        ).to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        self.embedder.train()
    def state_dict(self, *args, destination=None, prefix='', keep_vars=False):
        state_dict = {
            'embedder': self.embedder.state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars)
        }
        if self.config.train_image_encoder:
            state_dict['image_encoder'] = self.image_encoder.state_dict(
                *args, destination=destination, prefix=prefix,
                keep_vars=keep_vars)
        return state_dict

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        self.embedder.load_state_dict(state_dict["embedder"], strict=strict)
        if self.config.train_image_encoder and 'image_encoder' in state_dict:
            self.image_encoder.load_state_dict(state_dict["image_encoder"], strict=strict)

    def parameters(self, *args, **kwargs):
        yield from self.embedder.parameters(*args, **kwargs)

    def named_parameters(self, *args, **kwargs):
        yield from self.embedder.named_parameters(*args, **kwargs)
    def get_clip_image_embeds_from_tensors(
            self, tensors_0_1: torch.Tensor, drop=False,
            is_training=False,
            has_been_preprocessed=False
    ) -> torch.Tensor:
        with torch.no_grad():
            if not has_been_preprocessed:
                # tensors should be 0-1
                if tensors_0_1.ndim == 3:
                    tensors_0_1 = tensors_0_1.unsqueeze(0)
                # training tensors are 0 - 1
                tensors_0_1 = tensors_0_1.to(self.device, dtype=torch.float16)
                # if images are out of this range, throw an error
                if tensors_0_1.min() < -0.3 or tensors_0_1.max() > 1.3:
                    raise ValueError("image tensor values must be between 0 and 1. Got min: {}, max: {}".format(
                        tensors_0_1.min(), tensors_0_1.max()
                    ))
                # unconditional
                if drop:
                    if self.clip_noise_zero:
                        tensors_0_1 = torch.rand_like(tensors_0_1).detach()
                        noise_scale = torch.rand([tensors_0_1.shape[0], 1, 1, 1], device=self.device,
                                                 dtype=get_torch_dtype(self.sd_ref().dtype))
                        tensors_0_1 = tensors_0_1 * noise_scale
                    else:
                        tensors_0_1 = torch.zeros_like(tensors_0_1).detach()
                    # tensors_0_1 = tensors_0_1 * 0
                clip_image = self.clip_image_processor(
                    images=tensors_0_1,
                    return_tensors="pt",
                    do_resize=True,
                    do_rescale=False,
                ).pixel_values
            else:
                if drop:
                    # scale the noise down
                    if self.clip_noise_zero:
                        tensors_0_1 = torch.rand_like(tensors_0_1).detach()
                        noise_scale = torch.rand([tensors_0_1.shape[0], 1, 1, 1], device=self.device,
                                                 dtype=get_torch_dtype(self.sd_ref().dtype))
                        tensors_0_1 = tensors_0_1 * noise_scale
                    else:
                        tensors_0_1 = torch.zeros_like(tensors_0_1).detach()
                    # tensors_0_1 = tensors_0_1 * 0
                    mean = torch.tensor(self.clip_image_processor.image_mean).to(
                        self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
                    ).detach()
                    std = torch.tensor(self.clip_image_processor.image_std).to(
                        self.device, dtype=get_torch_dtype(self.sd_ref().dtype)
                    ).detach()
                    tensors_0_1 = torch.clip((255. * tensors_0_1), 0, 255).round() / 255.0
                    clip_image = (tensors_0_1 - mean.view([1, 3, 1, 1])) / std.view([1, 3, 1, 1])
                else:
                    clip_image = tensors_0_1
            clip_image = clip_image.to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype)).detach()

        with torch.set_grad_enabled(is_training):
            if is_training:
                self.image_encoder.train()
            else:
                self.image_encoder.eval()
            clip_output = self.image_encoder(clip_image, output_hidden_states=True)
            if self.config.clip_layer == 'penultimate_hidden_states':
                # they skip the last layer for ip+
                # https://github.com/tencent-ailab/IP-Adapter/blob/f4b6742db35ea6d81c7b829a55b0a312c7f5a677/tutorial_train_plus.py#L403C26-L403C26
                clip_image_embeds = clip_output.hidden_states[-2]
            elif self.config.clip_layer == 'last_hidden_state':
                clip_image_embeds = clip_output.hidden_states[-1]
            else:
                clip_image_embeds = clip_output.image_embeds
        return clip_image_embeds
    def set_vec(self, new_vector, text_encoder_idx=0):
        # Get the embedding layer
        embedding_layer = self.text_encoder_list[text_encoder_idx].get_input_embeddings()
        # Indices to replace in the embeddings
        indices_to_replace = self.placeholder_token_ids[text_encoder_idx]
        # Replace the specified embeddings with new_vector
        for idx in indices_to_replace:
            vector_idx = idx - indices_to_replace[0]
            embedding_layer.weight[idx] = new_vector[vector_idx]

    # projects the image embeddings into token embeddings and writes them onto the placeholder tokens
    def forward(self, clip_image_embeds: torch.Tensor) -> PromptEmbeds:
        clip_image_embeds = clip_image_embeds.to(self.device, dtype=get_torch_dtype(self.sd_ref().dtype))
        if clip_image_embeds.ndim == 2:
            # expand the token dimension
            clip_image_embeds = clip_image_embeds.unsqueeze(1)
        image_prompt_embeds = self.embedder(clip_image_embeds)
        # todo add support for multiple batch sizes
        if image_prompt_embeds.shape[0] != 1:
            raise ValueError("Batch size must be 1 for embedder for now")

        # output on sd1.5 is (bs, num_tokens, 768)
        if len(self.text_encoder_list) == 1:
            # add it to the text encoder
            self.set_vec(image_prompt_embeds[0], text_encoder_idx=0)
        elif len(self.text_encoder_list) == 2:
            if self.text_encoder_list[0].config.hidden_size + self.text_encoder_list[1].config.hidden_size != \
                    image_prompt_embeds.shape[2]:
                raise ValueError("Something went wrong. The embeddings do not match the text encoder sizes")
            # sdxl variants
            # image_prompt_embeds = 2048
            # te1 = 768
            # te2 = 1280
            te1_embeds = image_prompt_embeds[:, :, :self.text_encoder_list[0].config.hidden_size]
            te2_embeds = image_prompt_embeds[:, :, self.text_encoder_list[0].config.hidden_size:]
            self.set_vec(te1_embeds[0], text_encoder_idx=0)
            self.set_vec(te2_embeds[0], text_encoder_idx=1)
        else:
            raise ValueError("Unsupported number of text encoders")
        # just a place to put a breakpoint
        pass
    def restore_embeddings(self):
        # Make sure we don't update any embedding weights besides the newly added tokens
        for text_encoder, tokenizer, orig_embeds, placeholder_token_ids in zip(
                self.text_encoder_list,
                self.tokenizer_list,
                self.orig_embeds_params,
                self.placeholder_token_ids
        ):
            index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool)
            index_no_updates[
                min(placeholder_token_ids): max(placeholder_token_ids) + 1] = False
            with torch.no_grad():
                text_encoder.get_input_embeddings().weight[
                    index_no_updates
                ] = orig_embeds[index_no_updates]
            # detach it all
            text_encoder.get_input_embeddings().weight.detach_()

    def enable_gradient_checkpointing(self):
        self.image_encoder.gradient_checkpointing = True
    def inject_trigger_into_prompt(self, prompt, expand_token=False, to_replace_list=None, add_if_not_present=True):
        output_prompt = prompt
        embedding_tokens = self.embedding_tokens[0]  # should be the same for all text encoders
        default_replacements = ["[name]", "[trigger]"]

        replace_with = embedding_tokens if expand_token else self.trigger
        if to_replace_list is None:
            to_replace_list = default_replacements
        else:
            to_replace_list += default_replacements

        # remove duplicates
        to_replace_list = list(set(to_replace_list))

        # replace them all
        for to_replace in to_replace_list:
            # replace it
            output_prompt = output_prompt.replace(to_replace, replace_with)

        # check how many times replace_with appears in the prompt
        num_instances = output_prompt.count(replace_with)

        if num_instances == 0 and add_if_not_present:
            # add it to the beginning of the prompt
            output_prompt = replace_with + " " + output_prompt

        if num_instances > 1:
            print(
                f"Warning: {replace_with} token appears {num_instances} times in prompt {output_prompt}. This may cause issues.")

        return output_prompt
    # reverses the injection, substituting the class name instead; useful for normalizations
    def inject_trigger_class_name_into_prompt(self, prompt):
        output_prompt = prompt
        embedding_tokens = self.embedding_tokens[0]  # should be the same for all text encoders
        default_replacements = ["[name]", "[trigger]", embedding_tokens, self.trigger]

        replace_with = self.config.trigger_class_name
        to_replace_list = default_replacements

        # remove duplicates
        to_replace_list = list(set(to_replace_list))

        # replace them all
        for to_replace in to_replace_list:
            # replace it
            output_prompt = output_prompt.replace(to_replace, replace_with)

        # check how many times replace_with appears in the prompt
        num_instances = output_prompt.count(replace_with)

        if num_instances > 1:
            print(
                f"Warning: {replace_with} token appears {num_instances} times in prompt {output_prompt}. This may cause issues.")

        return output_prompt
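
# Illustrative sketch (not part of the original module): a standalone version of the manual
# normalization branch in get_clip_image_embeds_from_tensors, which re-normalizes regenerated
# 0-1 noise when inputs were already preprocessed and the unconditional drop path is taken.
# The mean/std values are assumed to be the usual OpenAI CLIP defaults; in the adapter they
# are read from self.clip_image_processor instead.
def _manual_clip_normalize_demo():
    mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1)
    std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1)
    tensors_0_1 = torch.rand(1, 3, 224, 224)
    # quantize to 8-bit levels so the values match what the image processor would produce
    tensors_0_1 = torch.clip(255.0 * tensors_0_1, 0, 255).round() / 255.0
    clip_image = (tensors_0_1 - mean) / std
    return clip_image  # shape (1, 3, 224, 224), normalized pixel values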