diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..4a11ac3faf66ebfec3f32edba29b42098f087c58 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +assets/demo/fix_face_weight.gif filter=lfs diff=lfs merge=lfs -text +assets/demo/gt_generate_compare.gif filter=lfs diff=lfs merge=lfs -text +assets/demo/talk_tys_fix_face_post_processing.gif filter=lfs diff=lfs merge=lfs -text +assets/demo/talk_tys_naive_retarget_post_processing.gif filter=lfs diff=lfs merge=lfs -text +assets/demo/talk_tys_offset_retarget_post_processing.gif filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..61213d44f171bb360b9911993a293f24f54f0615 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +--- +tags: + - text-to-image + - stable-diffusion + - audio-to-video +license: apache-2.0 +language: + - en +library_name: diffusers +--- + +# V-Express Model Card + +
+ +[**Project Page**](https://tenvence.github.io/p/v-express/) **|** [**Paper**](https://arxiv.org/abs/2406.02511) **|** [**Code**](https://github.com/tencent-ailab/V-Express) + +
+ +--- + +## Introduction + +## Models + +### Audio Encoder + +- [model_ckpts/wav2vec2-base-960h](https://huggingface.co/tk93/V-Express/tree/main/model_ckpts/wav2vec2-base-960h). (It is also available from the original model card [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h)) + +### Face Analysis + +- [model_ckpts/insightface_models/models/buffalo_l](https://huggingface.co/tk93/V-Express/tree/main/model_ckpts/insightface_models/models/buffalo_l). (It is also available from the original repository [insightface/buffalo_l](https://github.com/deepinsight/insightface/releases/download/v0.7/buffalo_l.zip)) + +### V-Express + +- [model_ckpts/sd-vae-ft-mse](https://huggingface.co/tk93/V-Express/tree/main/model_ckpts/sd-vae-ft-mse). VAE encoder. (original model card [stabilityai/sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse)) +- [model_ckpts/stable-diffusion-v1-5](https://huggingface.co/tk93/V-Express/tree/main/model_ckpts/stable-diffusion-v1-5). Only the model configuration file for unet is needed here. (original model card [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)) +- [model_ckpts/v-express](https://huggingface.co/tk93/V-Express/tree/main/model_ckpts/v-express). The video generation model conditional on audio and V-kps we call V-Express. +- You should download and put all `.bin` model to `model_ckpts/v-express` directory, which includes `audio_projection.bin`, `denoising_unet.bin`, `motion_module.bin`, `reference_net.bin`, and `v_kps_guider.bin`. diff --git a/SECourses.py b/SECourses.py new file mode 100644 index 0000000000000000000000000000000000000000..98a9993f73cfb66b3b4b9aaec99913f795d5e274 --- /dev/null +++ b/SECourses.py @@ -0,0 +1,393 @@ +import os +import subprocess +import gradio as gr +from retinaface import RetinaFace +from PIL import Image +import filetype +from datetime import datetime +import re +import sys +import torch +import argparse + +import platform, os + +def open_folder(): + open_folder_path = os.path.abspath("outputs") + if platform.system() == "Windows": + os.startfile(open_folder_path) + elif platform.system() == "Linux": + os.system(f'xdg-open "{open_folder_path}"') + + +# Get the path to the currently activated Python executable +python_executable = sys.executable + +def display_media(file): + # Determine the type of the uploaded file using filetype + if file is None: + return gr.update(visible=False), gr.update(visible=False) + kind = filetype.guess(file.name) + + if kind is None: + return gr.update(visible=False), gr.update(visible=False) + + if kind.mime.startswith('video'): + return gr.update(value=file.name, visible=True), gr.update(visible=False) + elif kind.mime.startswith('audio'): + return gr.update(visible=False), gr.update(value=file.name, visible=True) + else: + return gr.update(visible=False), gr.update(visible=False) + + +parser = argparse.ArgumentParser() +parser.add_argument("--share", type=str, default=False, help="Set to True to share the app publicly.") +args = parser.parse_args() + + +# Function to extract audio from video using FFmpeg +def extract_audio(video_path, audio_path): + command = [python_executable, "-m", "ffmpeg", "-i", video_path, "-vn", "-acodec", "libmp3lame", "-q:a", "2", audio_path] + subprocess.call(command) + +# Function to convert audio to MP3 using FFmpeg +def convert_audio_to_mp3(audio_path, mp3_path): + command = ["ffmpeg", "-i", audio_path, "-acodec", "libmp3lame", "-q:a", "2", mp3_path] + subprocess.call(command) + +def crop_and_save_image(image_path, auto_crop, crop_width, crop_height, crop_expansion): + cropped_image = auto_crop_image(image_path, crop_expansion, crop_size=(crop_width, crop_height)) + if cropped_image is not None: + cropped_folder = os.path.join("outputs", "cropped_images") + os.makedirs(cropped_folder, exist_ok=True) + + # Get the base name and extension of the image file + base_name, extension = os.path.splitext(os.path.basename(image_path)) + + # Initialize the counter for the image number + counter = 1 + + # Generate the new image name with the incremented number + new_image_name = f"{base_name}_{counter:04d}{extension}" + cropped_image_path = os.path.join(cropped_folder, new_image_name) + + # Check if the image already exists and increment the counter until a unique name is found + while os.path.exists(cropped_image_path): + counter += 1 + new_image_name = f"{base_name}_{counter:04d}{extension}" + cropped_image_path = os.path.join(cropped_folder, new_image_name) + + # Save the cropped image with the new name + cropped_image.save(cropped_image_path, format='PNG') + return cropped_image_path + return None + +# Function to generate kps sequence and audio from video +def generate_kps_sequence_and_audio(video_path, kps_sequence_save_path, audio_save_path): + command = [python_executable, "scripts/extract_kps_sequence_and_audio.py", "--video_path", video_path, "--kps_sequence_save_path", kps_sequence_save_path, "--audio_save_path", audio_save_path] + subprocess.call(command) + +def auto_crop_image(image_path, expand_percent, crop_size=(512, 512)): + # Check if CUDA is available + if torch.cuda.is_available(): + device = 'cuda' + print("Using GPU for RetinaFace detection.") + else: + device = 'cpu' + print("Using CPU for RetinaFace detection.") + + # Load image + img = Image.open(image_path) + + # Perform face detection + faces = RetinaFace.detect_faces(image_path) + + if not faces: + print("No faces detected.") + return None + + # Assuming 'faces' is a dictionary of detected faces + # Pick the first face detected + face = list(faces.values())[0] + landmarks = face['landmarks'] + + # Extract the landmarks + right_eye = landmarks['right_eye'] + left_eye = landmarks['left_eye'] + right_mouth = landmarks['mouth_right'] + left_mouth = landmarks['mouth_left'] + + # Calculate the distance between the eyes + eye_distance = abs(right_eye[0] - left_eye[0]) + + # Estimate the head width and height + head_width = eye_distance * 4.5 # Increase the width multiplier + head_height = eye_distance * 6.5 # Increase the height multiplier + + # Calculate the center point between the eyes + eye_center_x = (right_eye[0] + left_eye[0]) // 2 + eye_center_y = (right_eye[1] + left_eye[1]) // 2 + + # Calculate the top-left and bottom-right coordinates of the assumed head region + head_left = max(0, int(eye_center_x - head_width // 2)) + head_top = max(0, int(eye_center_y - head_height // 2)) # Adjust the top coordinate + head_right = min(img.width, int(eye_center_x + head_width // 2)) + head_bottom = min(img.height, int(eye_center_y + head_height // 2)) # Adjust the bottom coordinate + + # Save the assumed head image + assumed_head_img = img.crop((head_left, head_top, head_right, head_bottom)) + assumed_head_img.save("assumed_head.png", format='PNG') + + # Calculate the expansion in pixels and the new dimensions + expanded_w = int(head_width * (1 + expand_percent)) + expanded_h = int(head_height * (1 + expand_percent)) + + # Calculate the top-left and bottom-right points of the expanded box + center_x, center_y = head_left + head_width // 2, head_top + head_height // 2 + left = max(0, center_x - expanded_w // 2) + right = min(img.width, center_x + expanded_w // 2) + top = max(0, center_y - expanded_h // 2) + bottom = min(img.height, center_y + expanded_h // 2) + + # Crop the image with the expanded boundaries + cropped_img = img.crop((left, top, right, bottom)) + cropped_img.save("expanded_face.png", format='PNG') + + # Calculate the aspect ratio of the cropped image + cropped_width, cropped_height = cropped_img.size + aspect_ratio = cropped_width / cropped_height + + # Calculate the target dimensions based on the desired crop size + target_width = crop_size[0] + target_height = crop_size[1] + + # Adjust the crop to match the desired aspect ratio + if aspect_ratio > target_width / target_height: + # Crop from left and right + new_width = int(cropped_height * target_width / target_height) + left_crop = (cropped_width - new_width) // 2 + right_crop = left_crop + new_width + top_crop = 0 + bottom_crop = cropped_height + else: + # Crop from top and bottom + new_height = int(cropped_width * target_height / target_width) + top_crop = (cropped_height - new_height) // 2 + bottom_crop = top_crop + new_height + left_crop = 0 + right_crop = cropped_width + + # Crop the image with the adjusted boundaries + final_cropped_img = cropped_img.crop((left_crop, top_crop, right_crop, bottom_crop)) + final_cropped_img.save("final_cropped_img.png", format='PNG') + + # Resize the cropped image to the desired size (512x512 by default) with best quality + resized_img = final_cropped_img.resize(crop_size, resample=Image.LANCZOS) + + # Save the resized image as PNG + resized_img.save(image_path, format='PNG') + return resized_img + + +def generate_output_video(reference_image_path, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion,image_width,image_height, low_vram): + print("auto cropping...") + if auto_crop: + auto_crop_image(reference_image_path,crop_expansion, crop_size=(crop_width, crop_height)) + + print("starting inference...") + command = [ + python_executable, "inference.py", + "--reference_image_path", reference_image_path, + "--audio_path", audio_path, + "--kps_path", kps_path, + "--output_path", output_path, + "--retarget_strategy", retarget_strategy, + "--num_inference_steps", str(num_inference_steps), + "--reference_attention_weight", str(reference_attention_weight), + "--audio_attention_weight", str(audio_attention_weight), + "--image_width", str(image_width), + "--image_height", str(image_height) + ] + + if low_vram: # Add the --save_gpu_memory flag if Low VRAM is checked + command.append("--save_gpu_memory") + + with open("executed_command.txt", "w") as file: + file.write(" ".join(command)) + + subprocess.call(command) + return output_path, reference_image_path + +def sanitize_folder_name(name): + # Define a regex pattern to match invalid characters for both Linux and Windows + invalid_chars = r'[<>:"/\\|?*\x00-\x1F]' + # Replace invalid characters with an underscore + sanitized_name = re.sub(invalid_chars, '_', name) + return sanitized_name + +# Function to handle the input and generate the output +def process_input(reference_image, target_input, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop, crop_width, crop_height, crop_expansion,image_width,image_height,low_vram): + # Create temp_process directory for intermediate files + temp_process_dir = "temp_process" + os.makedirs(temp_process_dir, exist_ok=True) + + input_file_name = os.path.splitext(os.path.basename(reference_image))[0] + input_file_name=sanitize_folder_name(input_file_name) + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + temp_dir = os.path.join(temp_process_dir, f"{input_file_name}_{timestamp}") + os.makedirs(temp_dir, exist_ok=True) + + kind = filetype.guess(target_input) + if not kind: + raise ValueError("Cannot determine file type. Please provide a valid video or audio file.") + + mime_type = kind.mime + + if mime_type.startswith("video/"): # Video input + audio_path = os.path.join(temp_dir, "target_audio.mp3") + kps_path = os.path.join(temp_dir, "kps.pth") + print("generating generate_kps_sequence_and_audio...") + generate_kps_sequence_and_audio(target_input, kps_path, audio_path) + elif mime_type.startswith("audio/"): # Audio input + audio_path = target_input + if mime_type != "audio/mpeg": + mp3_path = os.path.join(temp_dir, "target_audio_converted.mp3") + convert_audio_to_mp3(target_input, mp3_path) + audio_path = mp3_path + kps_path = "" + else: + raise ValueError("Unsupported file type. Please provide a video or audio file.") + + output_dir = "outputs" + os.makedirs(output_dir, exist_ok=True) + output_file_name = f"{input_file_name}_result_" + output_file_name=sanitize_folder_name(output_file_name) + output_file_ext = ".mp4" + output_file_count = 1 + while os.path.exists(os.path.join(output_dir, f"{output_file_name}{output_file_count:04d}{output_file_ext}")): + output_file_count += 1 + output_path = os.path.join(output_dir, f"{output_file_name}{output_file_count:04d}{output_file_ext}") + + + output_video_path, cropped_image_path = generate_output_video(reference_image, audio_path, kps_path, output_path, retarget_strategy, num_inference_steps, reference_attention_weight, audio_attention_weight, auto_crop,crop_width,crop_height, crop_expansion,image_width,image_height,low_vram) + + return output_video_path, cropped_image_path + +def launch_interface(): + retarget_strategies = ["fix_face", "no_retarget", "offset_retarget", "naive_retarget"] + + with gr.Blocks() as demo: + gr.Markdown("# Tencent AI Lab - V-Express Image to Animation V4 : https://www.patreon.com/posts/105251204") + with gr.Row(): + with gr.Column(): + input_image = gr.Image(label="Reference Image", format="png", type="filepath", height=512) + generate_button = gr.Button("Generate Talking Video") + low_vram = gr.Checkbox(label="Low VRAM - Greatly reduces VRAM usage but takes longer", value=False,visible=False) + crop_button = gr.Button("Crop Image") + with gr.Row(): + + with gr.Column(min_width=0): + image_width = gr.Number(label="Target Video Width", value=512) + + with gr.Column(min_width=0): + image_height = gr.Number(label="Target Video Height", value=512) + + with gr.Row(): + with gr.Column(min_width=0): + retarget_strategy = gr.Dropdown(retarget_strategies, label="Retarget Strategy", value="fix_face") + with gr.Column(min_width=0): + inference_steps = gr.Slider(10, 90, step=1, label="Number of Inference Steps", value=30) + + with gr.Row(): + with gr.Column(min_width=0): + reference_attention = gr.Slider(0.80, 1.1, step=0.01, label="Reference Attention Weight", value=0.95) + with gr.Column(min_width=0): + audio_attention = gr.Slider(1.0, 5.0, step=0.1, label="Audio Attention Weight", value=3.0) + + with gr.Row(visible=True) as crop_size_row: + with gr.Column(min_width=0): + auto_crop = gr.Checkbox(label="Auto Crop Image", value=True) + with gr.Column(min_width=0): + crop_expansion = gr.Slider(0.0, 1.0, step=0.01, label="Face Focus Expansion Percent", value=0.15) + with gr.Row(): + with gr.Column(min_width=0): + crop_width = gr.Number(label="Crop Width", value=512) + with gr.Column(min_width=0): + crop_height = gr.Number(label="Crop Height", value=512) + + with gr.Column(): + input_video = gr.File( + label="Target Input (Image or Video)", + type="filepath", + file_count="single", + file_types=[ + ".mp4", ".avi", ".mov", ".wmv", ".flv", ".mkv", ".webm", # Video extensions + ".3gp", ".m4v", ".mpg", ".mpeg", ".m2v", ".m4v", ".mts", # More video extensions + ".mp3", ".wav", ".aac", ".flac", ".m4a", ".wma", ".ogg" # Audio extensions + ], + height=512 ) + video_output = gr.Video(visible=False) + audio_output = gr.Audio(visible=False) + + input_video.change(display_media, inputs=input_video, outputs=[video_output, audio_output]) + btn_open_outputs = gr.Button("Open Outputs Folder") + btn_open_outputs.click(fn=open_folder) + gr.Markdown(""" + + Retarget Strategies + + Only target audio : fix_face + + Input picture and target video (same person - best practice) select : no_retarget + + Input picture and target video (different person) select : offset_retarget or naive_retarget + + Please look examples in Tests folder to see which settings you like most. I feel like offset_retarget is best + + You can turn up reference_attention_weight to make the model maintain higher character consistency, and turn down audio_attention_weight to reduce mouth artifacts. E.g. setting both values to 1.0 + """) + + + + with gr.Column(): + output_video = gr.Video(label="Generated Video", height=512) + output_image = gr.Image(label="Cropped Image") + + + generate_button.click( + fn=process_input, + inputs=[ + input_image, + input_video, + retarget_strategy, + inference_steps, + reference_attention, + audio_attention, + auto_crop, + crop_width, + crop_height, + crop_expansion, + image_width, + image_height, + low_vram + ], + outputs=[output_video, output_image] + ) + + crop_button.click( + fn=crop_and_save_image, + inputs=[ + input_image, + auto_crop, + crop_width, + crop_height, + crop_expansion + ], + outputs=output_image + ) + + demo.queue() + demo.launch(inbrowser=True,share=args.share) + +# Run the Gradio interface +launch_interface() \ No newline at end of file diff --git a/assets/crop_example.jpeg b/assets/crop_example.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..080532561a8d2d825d7c410895060c7810e8c2db Binary files /dev/null and b/assets/crop_example.jpeg differ diff --git a/assets/demo/fix_face_weight.gif b/assets/demo/fix_face_weight.gif new file mode 100644 index 0000000000000000000000000000000000000000..ffe41ae92dd19bcd26e72e17be22e7c5086cb0fd --- /dev/null +++ b/assets/demo/fix_face_weight.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff0a3eca118822dd6676b1856854be99ad7a11a571f08e735a2777c13613db5f +size 41657508 diff --git a/assets/demo/gt_generate_compare.gif b/assets/demo/gt_generate_compare.gif new file mode 100644 index 0000000000000000000000000000000000000000..43cfe2f046c40f4a0dc7332f744df4519a14e1ab --- /dev/null +++ b/assets/demo/gt_generate_compare.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:943a0b94df873eeffe1334148fb8935251e3a200cf4bd7631aae89c718aab5e3 +size 13286675 diff --git a/assets/demo/talk_tys_fix_face_post_processing.gif b/assets/demo/talk_tys_fix_face_post_processing.gif new file mode 100644 index 0000000000000000000000000000000000000000..e30229c80ef6a5e37ec73a8a0a4d829a18548566 --- /dev/null +++ b/assets/demo/talk_tys_fix_face_post_processing.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20924e3ab2e7e7d32fe5ab0adc70d4918fefac11e3fa2da77f265a5cc63296cb +size 3955774 diff --git a/assets/demo/talk_tys_naive_retarget_post_processing.gif b/assets/demo/talk_tys_naive_retarget_post_processing.gif new file mode 100644 index 0000000000000000000000000000000000000000..697efd128856e5bb581696825929a58104df97a9 --- /dev/null +++ b/assets/demo/talk_tys_naive_retarget_post_processing.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d476f224799620c161ac199eb3748312e385c5f169435db9f677c3cdb10709e +size 7673609 diff --git a/assets/demo/talk_tys_offset_retarget_post_processing.gif b/assets/demo/talk_tys_offset_retarget_post_processing.gif new file mode 100644 index 0000000000000000000000000000000000000000..98e9078671749c7d1e8ffeebb5cb2fab89a6d73e --- /dev/null +++ b/assets/demo/talk_tys_offset_retarget_post_processing.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0132d055bd25875a0e70dd4828afc0273f754293c083cbd0051f6895d6722c41 +size 7315651 diff --git a/assets/global_framework.png b/assets/global_framework.png new file mode 100644 index 0000000000000000000000000000000000000000..ef82b40ab721eeac7c23753967364ee8b8017227 Binary files /dev/null and b/assets/global_framework.png differ diff --git a/executed_command.txt b/executed_command.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e35ec4f5db6f68ad7b455db6716af828732165e --- /dev/null +++ b/executed_command.txt @@ -0,0 +1 @@ +R:\V_Express_Installers_V6\V-Express\venv\Scripts\python.exe inference.py --reference_image_path C:\Users\King\AppData\Local\Temp\gradio\e57288fe0b82a668f63c481f82c0fc65eaa84618\Biden_Photo_Big.png --audio_path temp_process\Biden_Photo_Big_20240620112047\target_audio.mp3 --kps_path temp_process\Biden_Photo_Big_20240620112047\kps.pth --output_path outputs\Biden_Photo_Big_result_0006.mp4 --retarget_strategy offset_retarget --num_inference_steps 30 --reference_attention_weight 0.95 --audio_attention_weight 3 --image_width 512 --image_height 512 \ No newline at end of file diff --git a/git b/git new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/inference.py b/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..c71977b5350421d0e54e14a390d4dbe0dac445d9 --- /dev/null +++ b/inference.py @@ -0,0 +1,291 @@ +import argparse +import os +import time + +import accelerate +import cv2 +import numpy as np +import torch +import torchaudio.functional +import torchvision.io +from PIL import Image +from diffusers import AutoencoderKL, DDIMScheduler +from diffusers.utils.import_utils import is_xformers_available +from insightface.app import FaceAnalysis +from omegaconf import OmegaConf +from transformers import Wav2Vec2Model, Wav2Vec2Processor + +from modules import UNet2DConditionModel, UNet3DConditionModel, VKpsGuider, AudioProjection +from pipelines import VExpressPipeline +from pipelines.utils import draw_kps_image, save_video +from pipelines.utils import retarget_kps + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument('--unet_config_path', type=str, default='./model_ckpts/stable-diffusion-v1-5/unet/config.json') + parser.add_argument('--vae_path', type=str, default='./model_ckpts/sd-vae-ft-mse/') + parser.add_argument('--audio_encoder_path', type=str, default='./model_ckpts/wav2vec2-base-960h/') + parser.add_argument('--insightface_model_path', type=str, default='./model_ckpts/insightface_models/') + + parser.add_argument('--denoising_unet_path', type=str, default='./model_ckpts/v-express/denoising_unet.bin') + parser.add_argument('--reference_net_path', type=str, default='./model_ckpts/v-express/reference_net.bin') + parser.add_argument('--v_kps_guider_path', type=str, default='./model_ckpts/v-express/v_kps_guider.bin') + parser.add_argument('--audio_projection_path', type=str, default='./model_ckpts/v-express/audio_projection.bin') + parser.add_argument('--motion_module_path', type=str, default='./model_ckpts/v-express/motion_module.bin') + + parser.add_argument('--retarget_strategy', type=str, default='fix_face', + help='{fix_face, no_retarget, offset_retarget, naive_retarget}') + + parser.add_argument('--dtype', type=str, default='fp16') + parser.add_argument('--device', type=str, default='cuda') + parser.add_argument('--gpu_id', type=int, default=0) + parser.add_argument('--do_multi_devices_inference', action='store_true') + parser.add_argument('--save_gpu_memory', action='store_true') + + parser.add_argument('--num_pad_audio_frames', type=int, default=2) + parser.add_argument('--standard_audio_sampling_rate', type=int, default=16000) + + parser.add_argument('--reference_image_path', type=str, default='./test_samples/emo/talk_emotion/ref.jpg') + parser.add_argument('--audio_path', type=str, default='./test_samples/emo/talk_emotion/aud.mp3') + parser.add_argument('--kps_path', type=str, default='./test_samples/emo/talk_emotion/kps.pth') + parser.add_argument('--output_path', type=str, default='./output/emo/talk_emotion.mp4') + + parser.add_argument('--image_width', type=int, default=512) + parser.add_argument('--image_height', type=int, default=512) + parser.add_argument('--fps', type=float, default=30.0) + parser.add_argument('--seed', type=int, default=42) + parser.add_argument('--num_inference_steps', type=int, default=25) + parser.add_argument('--guidance_scale', type=float, default=3.5) + parser.add_argument('--context_frames', type=int, default=12) + parser.add_argument('--context_overlap', type=int, default=4) + parser.add_argument('--reference_attention_weight', default=0.95, type=float) + parser.add_argument('--audio_attention_weight', default=3., type=float) + + args = parser.parse_args() + + return args + + +def load_reference_net(unet_config_path, reference_net_path, dtype, device): + reference_net = UNet2DConditionModel.from_config(unet_config_path).to(dtype=dtype, device=device) + reference_net.load_state_dict(torch.load(reference_net_path, map_location="cpu"), strict=False) + print(f'Loaded weights of Reference Net from {reference_net_path}.') + return reference_net + + +def load_denoising_unet(inf_config_path, unet_config_path, denoising_unet_path, motion_module_path, dtype, device): + inference_config = OmegaConf.load(inf_config_path) + denoising_unet = UNet3DConditionModel.from_config_2d( + unet_config_path, + unet_additional_kwargs=inference_config.unet_additional_kwargs, + ).to(dtype=dtype, device=device) + denoising_unet.load_state_dict(torch.load(denoising_unet_path, map_location="cpu"), strict=False) + print(f'Loaded weights of Denoising U-Net from {denoising_unet_path}.') + + denoising_unet.load_state_dict(torch.load(motion_module_path, map_location="cpu"), strict=False) + print(f'Loaded weights of Denoising U-Net Motion Module from {motion_module_path}.') + + return denoising_unet + + +def load_v_kps_guider(v_kps_guider_path, dtype, device): + v_kps_guider = VKpsGuider(320, block_out_channels=(16, 32, 96, 256)).to(dtype=dtype, device=device) + v_kps_guider.load_state_dict(torch.load(v_kps_guider_path, map_location="cpu")) + print(f'Loaded weights of V-Kps Guider from {v_kps_guider_path}.') + return v_kps_guider + + +def load_audio_projection( + audio_projection_path, + dtype, + device, + inp_dim: int, + mid_dim: int, + out_dim: int, + inp_seq_len: int, + out_seq_len: int, +): + audio_projection = AudioProjection( + dim=mid_dim, + depth=4, + dim_head=64, + heads=12, + num_queries=out_seq_len, + embedding_dim=inp_dim, + output_dim=out_dim, + ff_mult=4, + max_seq_len=inp_seq_len, + ).to(dtype=dtype, device=device) + audio_projection.load_state_dict(torch.load(audio_projection_path, map_location='cpu')) + print(f'Loaded weights of Audio Projection from {audio_projection_path}.') + return audio_projection + + +def get_scheduler(inference_config_path): + inference_config = OmegaConf.load(inference_config_path) + scheduler_kwargs = OmegaConf.to_container(inference_config.noise_scheduler_kwargs) + scheduler = DDIMScheduler(**scheduler_kwargs) + return scheduler + + +def main(): + args = parse_args() + start_time = time.time() + + if not args.do_multi_devices_inference: + # TODO + accelerator = None + device = torch.device(f'{args.device}:{args.gpu_id}' if args.device == 'cuda' else args.device) + else: + accelerator = accelerate.Accelerator() + device = torch.device(f'cuda:{accelerator.process_index}') + dtype = torch.float16 if args.dtype == 'fp16' else torch.float32 + + vae_path = args.vae_path + audio_encoder_path = args.audio_encoder_path + + vae = AutoencoderKL.from_pretrained(vae_path).to(dtype=dtype, device=device) + audio_encoder = Wav2Vec2Model.from_pretrained(audio_encoder_path).to(dtype=dtype, device=device) + audio_processor = Wav2Vec2Processor.from_pretrained(audio_encoder_path) + + unet_config_path = args.unet_config_path + reference_net_path = args.reference_net_path + denoising_unet_path = args.denoising_unet_path + v_kps_guider_path = args.v_kps_guider_path + audio_projection_path = args.audio_projection_path + motion_module_path = args.motion_module_path + + inference_config_path = './inference_v2.yaml' + scheduler = get_scheduler(inference_config_path) + reference_net = load_reference_net(unet_config_path, reference_net_path, dtype, device) + denoising_unet = load_denoising_unet( + inference_config_path, unet_config_path, denoising_unet_path, motion_module_path, + dtype, device + ) + v_kps_guider = load_v_kps_guider(v_kps_guider_path, dtype, device) + audio_projection = load_audio_projection( + audio_projection_path, + dtype, + device, + inp_dim=denoising_unet.config.cross_attention_dim, + mid_dim=denoising_unet.config.cross_attention_dim, + out_dim=denoising_unet.config.cross_attention_dim, + inp_seq_len=2 * (2 * args.num_pad_audio_frames + 1), + out_seq_len=2 * args.num_pad_audio_frames + 1, + ) + + if is_xformers_available(): + reference_net.enable_xformers_memory_efficient_attention() + denoising_unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + generator = torch.manual_seed(args.seed) + pipeline = VExpressPipeline( + vae=vae, + reference_net=reference_net, + denoising_unet=denoising_unet, + v_kps_guider=v_kps_guider, + audio_processor=audio_processor, + audio_encoder=audio_encoder, + audio_projection=audio_projection, + scheduler=scheduler, + ).to(dtype=dtype, device=device) + + app = FaceAnalysis( + providers=['CUDAExecutionProvider' if args.device == 'cuda' else 'CPUExecutionProvider'], + provider_options=[{'device_id': args.gpu_id}] if args.device == 'cuda' else [], + root=args.insightface_model_path, + ) + app.prepare(ctx_id=0, det_size=(args.image_height, args.image_width)) + + reference_image = Image.open(args.reference_image_path).convert('RGB') + reference_image = reference_image.resize((args.image_height, args.image_width)) + + reference_image_for_kps = cv2.imread(args.reference_image_path) + reference_image_for_kps = cv2.resize(reference_image_for_kps, (args.image_width, args.image_height)) + reference_kps = app.get(reference_image_for_kps)[0].kps[:3] + if args.save_gpu_memory: + del app + torch.cuda.empty_cache() + + _, audio_waveform, meta_info = torchvision.io.read_video(os.path.join(os.path.dirname(args.audio_path), os.path.basename(args.audio_path)), pts_unit='sec') + audio_sampling_rate = meta_info['audio_fps'] + print(f'Length of audio is {audio_waveform.shape[1]} with the sampling rate of {audio_sampling_rate}.') + if audio_sampling_rate != args.standard_audio_sampling_rate: + audio_waveform = torchaudio.functional.resample( + audio_waveform, + orig_freq=audio_sampling_rate, + new_freq=args.standard_audio_sampling_rate, + ) + audio_waveform = audio_waveform.mean(dim=0) + + duration = audio_waveform.shape[0] / args.standard_audio_sampling_rate + init_video_length = int(duration * args.fps) + num_contexts = np.around((init_video_length + args.context_overlap) / args.context_frames) + video_length = int(num_contexts * args.context_frames - args.context_overlap) + fps = video_length / duration + print(f'The corresponding video length is {video_length}.') + + kps_sequence = None + if args.kps_path != "": + assert os.path.exists(args.kps_path), f'{args.kps_path} does not exist' + kps_sequence = torch.tensor(torch.load(args.kps_path)) # [len, 3, 2] + print(f'The original length of kps sequence is {kps_sequence.shape[0]}.') + + if kps_sequence.shape[0] > video_length: + kps_sequence = kps_sequence[:video_length, :, :] + + kps_sequence = torch.nn.functional.interpolate(kps_sequence.permute(1, 2, 0), size=video_length, mode='linear') + kps_sequence = kps_sequence.permute(2, 0, 1) + print(f'The interpolated length of kps sequence is {kps_sequence.shape[0]}.') + + retarget_strategy = args.retarget_strategy + if retarget_strategy == 'fix_face': + kps_sequence = torch.tensor([reference_kps] * video_length) + elif retarget_strategy == 'no_retarget': + kps_sequence = kps_sequence + elif retarget_strategy == 'offset_retarget': + kps_sequence = retarget_kps(reference_kps, kps_sequence, only_offset=True) + elif retarget_strategy == 'naive_retarget': + kps_sequence = retarget_kps(reference_kps, kps_sequence, only_offset=False) + else: + raise ValueError(f'The retarget strategy {retarget_strategy} is not supported.') + + kps_images = [] + for i in range(video_length): + kps_image = draw_kps_image(args.image_height, args.image_width, kps_sequence[i]) + kps_images.append(Image.fromarray(kps_image)) + + video_tensor = pipeline( + reference_image=reference_image, + kps_images=kps_images, + audio_waveform=audio_waveform, + width=args.image_width, + height=args.image_height, + video_length=video_length, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + context_frames=args.context_frames, + context_overlap=args.context_overlap, + reference_attention_weight=args.reference_attention_weight, + audio_attention_weight=args.audio_attention_weight, + num_pad_audio_frames=args.num_pad_audio_frames, + generator=generator, + do_multi_devices_inference=args.do_multi_devices_inference, + save_gpu_memory=args.save_gpu_memory, + ) + + if accelerator is None or accelerator.is_main_process: + save_video(video_tensor, args.audio_path, args.output_path, device, fps) + consumed_time = time.time() - start_time + generation_fps = video_tensor.shape[2] / consumed_time + print(f'The generated video has been saved at {args.output_path}. ' + f'The generation time is {consumed_time:.1f} seconds. ' + f'The generation FPS is {generation_fps:.2f}.') + + +if __name__ == '__main__': + main() diff --git a/inference_v2.yaml b/inference_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99ece426f40d762b3c49ec63fb0289d6f1747efe --- /dev/null +++ b/inference_v2.yaml @@ -0,0 +1,35 @@ +unet_additional_kwargs: + use_inflated_groupnorm: true + unet_use_cross_frame_attention: false + unet_use_temporal_attention: false + use_motion_module: true + motion_module_resolutions: + - 1 + - 2 + - 4 + - 8 + motion_module_mid_block: true + motion_module_decoder_only: false + motion_module_type: Vanilla + motion_module_kwargs: + num_attention_heads: 8 + num_transformer_block: 1 + attention_block_types: + - Temporal_Self + - Temporal_Self + temporal_position_encoding: true + temporal_position_encoding_max_len: 32 + temporal_attention_dim_div: 1 + +noise_scheduler_kwargs: + beta_start: 0.00085 + beta_end: 0.012 + beta_schedule: "scaled_linear" + clip_sample: false + steps_offset: 1 + ### Zero-SNR params + prediction_type: "v_prediction" + rescale_betas_zero_snr: True + timestep_spacing: "trailing" + +sampler: DDIM diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..500b8edd6e562ab5e56bf986a28d2f16701914eb --- /dev/null +++ b/modules/__init__.py @@ -0,0 +1,5 @@ +from .unet_2d_condition import UNet2DConditionModel +from .unet_3d import UNet3DConditionModel +from .v_kps_guider import VKpsGuider +from .audio_projection import AudioProjection +from .mutual_self_attention import ReferenceAttentionControl diff --git a/modules/__pycache__/__init__.cpython-310.pyc b/modules/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd5b35c90765e3cb06d8aa075865c980343e9765 Binary files /dev/null and b/modules/__pycache__/__init__.cpython-310.pyc differ diff --git a/modules/__pycache__/attention.cpython-310.pyc b/modules/__pycache__/attention.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89818c8154261e8f79b1e026f344f80e5e85e1a5 Binary files /dev/null and b/modules/__pycache__/attention.cpython-310.pyc differ diff --git a/modules/__pycache__/audio_projection.cpython-310.pyc b/modules/__pycache__/audio_projection.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8f32ec80fdeb025e363b242fae4474e92609ffe Binary files /dev/null and b/modules/__pycache__/audio_projection.cpython-310.pyc differ diff --git a/modules/__pycache__/motion_module.cpython-310.pyc b/modules/__pycache__/motion_module.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bc04b892e154f72e5d24be719e430c887cb46af Binary files /dev/null and b/modules/__pycache__/motion_module.cpython-310.pyc differ diff --git a/modules/__pycache__/mutual_self_attention.cpython-310.pyc b/modules/__pycache__/mutual_self_attention.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2806d05b2966f5310643f22671a14a928550724c Binary files /dev/null and b/modules/__pycache__/mutual_self_attention.cpython-310.pyc differ diff --git a/modules/__pycache__/resnet.cpython-310.pyc b/modules/__pycache__/resnet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cbf60606812ff046dbc54352f5da9f7aeb82199 Binary files /dev/null and b/modules/__pycache__/resnet.cpython-310.pyc differ diff --git a/modules/__pycache__/transformer_2d.cpython-310.pyc b/modules/__pycache__/transformer_2d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..191b38ec4ef443601e094b77b3f1ee90aa515ea5 Binary files /dev/null and b/modules/__pycache__/transformer_2d.cpython-310.pyc differ diff --git a/modules/__pycache__/transformer_3d.cpython-310.pyc b/modules/__pycache__/transformer_3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53fd1b1a090f8685187c23cc2137be1cc6f4d640 Binary files /dev/null and b/modules/__pycache__/transformer_3d.cpython-310.pyc differ diff --git a/modules/__pycache__/unet_2d_blocks.cpython-310.pyc b/modules/__pycache__/unet_2d_blocks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c5ff5d769839cf4c01e5b1c4c4098f01ba31551 Binary files /dev/null and b/modules/__pycache__/unet_2d_blocks.cpython-310.pyc differ diff --git a/modules/__pycache__/unet_2d_condition.cpython-310.pyc b/modules/__pycache__/unet_2d_condition.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..856bc8cbdc8b7d19772afa0c165941e99e4fc610 Binary files /dev/null and b/modules/__pycache__/unet_2d_condition.cpython-310.pyc differ diff --git a/modules/__pycache__/unet_3d.cpython-310.pyc b/modules/__pycache__/unet_3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d74d953055eb969d2286c429493f3a9b2e593cce Binary files /dev/null and b/modules/__pycache__/unet_3d.cpython-310.pyc differ diff --git a/modules/__pycache__/unet_3d_blocks.cpython-310.pyc b/modules/__pycache__/unet_3d_blocks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..131a7e3cf1da0bf30b904bfaa602b79c00869e52 Binary files /dev/null and b/modules/__pycache__/unet_3d_blocks.cpython-310.pyc differ diff --git a/modules/__pycache__/v_kps_guider.cpython-310.pyc b/modules/__pycache__/v_kps_guider.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..201197425810b283e8de8ebd7b928d57a8e9a678 Binary files /dev/null and b/modules/__pycache__/v_kps_guider.cpython-310.pyc differ diff --git a/modules/attention.py b/modules/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..1f9d4f20e6909b19521e7463ceb054a1f53e0fbd --- /dev/null +++ b/modules/attention.py @@ -0,0 +1,626 @@ +# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py + +from typing import Any, Dict, Optional + +import torch +from diffusers.models.attention import AdaLayerNorm, AdaLayerNormZero, Attention, FeedForward, GatedSelfAttentionDense +from diffusers.models.embeddings import SinusoidalPositionalEmbedding +from einops import rearrange +from torch import nn + + +class BasicTransformerBlock(nn.Module): + r""" + A basic Transformer block. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm (: + obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (: + obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the attention computation to float32. This is useful for mixed precision training. + norm_elementwise_affine (`bool`, *optional*, defaults to `True`): + Whether to use learnable elementwise affine parameters for normalization. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`. + final_dropout (`bool` *optional*, defaults to False): + Whether to apply a final dropout after the last feed-forward layer. + attention_type (`str`, *optional*, defaults to `"default"`): + The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`. + positional_embeddings (`str`, *optional*, defaults to `None`): + The type of positional embeddings to apply to. + num_positional_embeddings (`int`, *optional*, defaults to `None`): + The maximum number of positional embeddings to apply. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single' + norm_eps: float = 1e-5, + final_dropout: bool = False, + attention_type: str = "default", + positional_embeddings: Optional[str] = None, + num_positional_embeddings: Optional[int] = None, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm_zero = ( + num_embeds_ada_norm is not None + ) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = ( + num_embeds_ada_norm is not None + ) and norm_type == "ada_norm" + self.use_ada_layer_norm_single = norm_type == "ada_norm_single" + self.use_layer_norm = norm_type == "layer_norm" + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + if positional_embeddings and (num_positional_embeddings is None): + raise ValueError( + "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined." + ) + + if positional_embeddings == "sinusoidal": + self.pos_embed = SinusoidalPositionalEmbedding( + dim, max_seq_length=num_positional_embeddings + ) + else: + self.pos_embed = None + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif self.use_ada_layer_norm_zero: + self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm( + dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps + ) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm( + dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps + ) + ) + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim + if not double_self_attention + else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) # is self-attn if encoder_hidden_states is none + else: + self.norm2 = None + self.attn2 = None + + # 3. Feed-forward + if not self.use_ada_layer_norm_single: + self.norm3 = nn.LayerNorm( + dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps + ) + + self.ff = FeedForward( + dim, + dropout=dropout, + activation_fn=activation_fn, + final_dropout=final_dropout, + ) + + # 4. Fuser + if attention_type == "gated" or attention_type == "gated-text-image": + self.fuser = GatedSelfAttentionDense( + dim, cross_attention_dim, num_attention_heads, attention_head_dim + ) + + # 5. Scale-shift for PixArt-Alpha. + if self.use_ada_layer_norm_single: + self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + ) -> torch.FloatTensor: + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.use_layer_norm: + norm_hidden_states = self.norm1(hidden_states) + elif self.use_ada_layer_norm_single: + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + norm_hidden_states = norm_hidden_states.squeeze(1) + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. Retrieve lora scale. + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + + # 2. Prepare GLIGEN inputs + cross_attention_kwargs = ( + cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + ) + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states + if self.only_cross_attention + else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if self.use_ada_layer_norm_zero: + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.use_ada_layer_norm_single: + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 2.5 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. Cross-Attention + if self.attn2 is not None: + if self.use_ada_layer_norm: + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.use_ada_layer_norm_zero or self.use_layer_norm: + norm_hidden_states = self.norm2(hidden_states) + elif self.use_ada_layer_norm_single: + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.use_ada_layer_norm_single is False: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 4. Feed-forward + if not self.use_ada_layer_norm_single: + norm_hidden_states = self.norm3(hidden_states) + + if self.use_ada_layer_norm_zero: + norm_hidden_states = ( + norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + ) + + if self.use_ada_layer_norm_single: + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + ff_output = self.ff(norm_hidden_states, scale=lora_scale) + + if self.use_ada_layer_norm_zero: + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.use_ada_layer_norm_single: + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + + +class TemporalBasicTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + self.use_ada_layer_norm = num_embeds_ada_norm is not None + self.unet_use_cross_frame_attention = unet_use_cross_frame_attention + self.unet_use_temporal_attention = unet_use_temporal_attention + + # old self attention layer for only self-attention + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) + self.norm1 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim) + ) + + # new self attention layer for reference features + self.attn1_5 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) + self.norm1_5 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim) + ) + + # Cross-Attn + if cross_attention_dim is not None: + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) + else: + self.attn2 = None + + if cross_attention_dim is not None: + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim) + ) + else: + self.norm2 = None + + # Feed-forward + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn) + self.norm3 = nn.LayerNorm(dim) + self.use_ada_layer_norm_zero = False + + # Temp-Attn + assert unet_use_temporal_attention is not None + if unet_use_temporal_attention: + self.attn_temp = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) + nn.init.zeros_(self.attn_temp.to_out[0].weight.data) + self.norm_temp = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim) + ) + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + attention_mask=None, + video_length=None, + ): + norm_hidden_states = ( + self.norm1(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm1(hidden_states) + ) + + if self.unet_use_cross_frame_attention: + hidden_states = ( + self.attn1( + norm_hidden_states, + attention_mask=attention_mask, + video_length=video_length, + ) + + hidden_states + ) + else: + hidden_states = ( + self.attn1(norm_hidden_states, attention_mask=attention_mask) + + hidden_states + ) + + norm_hidden_states = ( + self.norm1_5(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm1_5(hidden_states) + ) + + if self.unet_use_cross_frame_attention: + hidden_states = ( + self.attn1_5( + norm_hidden_states, + attention_mask=attention_mask, + video_length=video_length, + ) + + hidden_states + ) + else: + hidden_states = ( + self.attn1_5(norm_hidden_states, attention_mask=attention_mask) + + hidden_states + ) + + if self.attn2 is not None: + # Cross-Attention + norm_hidden_states = ( + self.norm2(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm2(hidden_states) + ) + hidden_states = ( + self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + ) + + hidden_states + ) + + # Feed-forward + hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states + + # Temporal-Attention + if self.unet_use_temporal_attention: + d = hidden_states.shape[1] + hidden_states = rearrange( + hidden_states, "(b f) d c -> (b d) f c", f=video_length + ) + norm_hidden_states = ( + self.norm_temp(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm_temp(hidden_states) + ) + hidden_states = self.attn_temp(norm_hidden_states) + hidden_states + hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d) + + return hidden_states + +class TemporalBasicTransformerBlockOld(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + self.use_ada_layer_norm = num_embeds_ada_norm is not None + self.unet_use_cross_frame_attention = unet_use_cross_frame_attention + self.unet_use_temporal_attention = unet_use_temporal_attention + + # SC-Attn + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) + self.norm1 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim) + ) + + # Cross-Attn + if cross_attention_dim is not None: + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) + else: + self.attn2 = None + + if cross_attention_dim is not None: + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim) + ) + else: + self.norm2 = None + + # Feed-forward + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn) + self.norm3 = nn.LayerNorm(dim) + self.use_ada_layer_norm_zero = False + + # Temp-Attn + assert unet_use_temporal_attention is not None + if unet_use_temporal_attention: + self.attn_temp = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) + nn.init.zeros_(self.attn_temp.to_out[0].weight.data) + self.norm_temp = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim) + ) + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + attention_mask=None, + video_length=None, + ): + norm_hidden_states = ( + self.norm1(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm1(hidden_states) + ) + + if self.unet_use_cross_frame_attention: + hidden_states = ( + self.attn1( + norm_hidden_states, + attention_mask=attention_mask, + video_length=video_length, + ) + + hidden_states + ) + else: + hidden_states = ( + self.attn1(norm_hidden_states, attention_mask=attention_mask) + + hidden_states + ) + + if self.attn2 is not None: + # Cross-Attention + norm_hidden_states = ( + self.norm2(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm2(hidden_states) + ) + hidden_states = ( + self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + ) + + hidden_states + ) + + # Feed-forward + hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states + + # Temporal-Attention + if self.unet_use_temporal_attention: + d = hidden_states.shape[1] + hidden_states = rearrange( + hidden_states, "(b f) d c -> (b d) f c", f=video_length + ) + norm_hidden_states = ( + self.norm_temp(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm_temp(hidden_states) + ) + hidden_states = self.attn_temp(norm_hidden_states) + hidden_states + hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d) + + return hidden_states \ No newline at end of file diff --git a/modules/audio_projection.py b/modules/audio_projection.py new file mode 100644 index 0000000000000000000000000000000000000000..0ce2040afd3647b2d4b16e9c7259472615aac221 --- /dev/null +++ b/modules/audio_projection.py @@ -0,0 +1,150 @@ +import math + +import torch +import torch.nn as nn +from diffusers.models.modeling_utils import ModelMixin +from einops import rearrange +from einops.layers.torch import Rearrange + + +def reshape_tensor(x, heads): + bs, length, width = x.shape + # (bs, length, width) --> (bs, length, n_heads, dim_per_head) + x = x.view(bs, length, heads, -1) + # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) + x = x.transpose(1, 2) + # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) + x = x.reshape(bs, heads, length, -1) + return x + + +def masked_mean(t, *, dim, mask=None): + if mask is None: + return t.mean(dim=dim) + + denom = mask.sum(dim=dim, keepdim=True) + mask = rearrange(mask, "b n -> b n 1") + masked_t = t.masked_fill(~mask, 0.0) + + return masked_t.sum(dim=dim) / denom.clamp(min=1e-5) + + +class PerceiverAttention(nn.Module): + def __init__(self, *, dim, dim_head=64, heads=8): + super().__init__() + self.scale = dim_head ** -0.5 + self.dim_head = dim_head + self.heads = heads + inner_dim = dim_head * heads + + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) + self.to_out = nn.Linear(inner_dim, dim, bias=False) + + def forward(self, x, latents): + """ + Args: + x (torch.Tensor): image features + shape (b, n1, D) + latent (torch.Tensor): latent features + shape (b, n2, D) + """ + x = self.norm1(x) + latents = self.norm2(latents) + + b, l, _ = latents.shape + + q = self.to_q(latents) + kv_input = torch.cat((x, latents), dim=-2) + k, v = self.to_kv(kv_input).chunk(2, dim=-1) + + q = reshape_tensor(q, self.heads) + k = reshape_tensor(k, self.heads) + v = reshape_tensor(v, self.heads) + + # attention + scale = 1 / math.sqrt(math.sqrt(self.dim_head)) + weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + out = weight @ v + + out = out.permute(0, 2, 1, 3).reshape(b, l, -1) + + return self.to_out(out) + + +def FeedForward(dim, mult=4): + inner_dim = int(dim * mult) + return nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, inner_dim, bias=False), + nn.GELU(), + nn.Linear(inner_dim, dim, bias=False), + ) + + +class AudioProjection(ModelMixin): + def __init__( + self, + dim=1024, + depth=8, + dim_head=64, + heads=16, + num_queries=8, + embedding_dim=768, + output_dim=1024, + ff_mult=4, + max_seq_len: int = 257, + num_latents_mean_pooled: int = 0, + ): + super().__init__() + + self.pos_emb = nn.Embedding(max_seq_len, embedding_dim) + self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim ** 0.5) + + self.proj_in = nn.Linear(embedding_dim, dim) + + self.proj_out = nn.Linear(dim, output_dim) + self.norm_out = nn.LayerNorm(output_dim) + + self.to_latents_from_mean_pooled_seq = ( + nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, dim * num_latents_mean_pooled), + Rearrange("b (n d) -> b n d", n=num_latents_mean_pooled), + ) + if num_latents_mean_pooled > 0 + else None + ) + + self.layers = nn.ModuleList([]) + for _ in range(depth): + self.layers.append(nn.ModuleList([ + PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads), + FeedForward(dim=dim, mult=ff_mult), + ])) + + def forward(self, x): + if self.pos_emb is not None: + n, device = x.shape[1], x.device + pos_emb = self.pos_emb(torch.arange(n, device=device)) + x = x + pos_emb + + latents = self.latents.repeat(x.size(0), 1, 1) + + x = self.proj_in(x) + + if self.to_latents_from_mean_pooled_seq: + meanpooled_seq = masked_mean(x, dim=1, mask=torch.ones(x.shape[:2], device=x.device, dtype=torch.bool)) + meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq) + latents = torch.cat((meanpooled_latents, latents), dim=-2) + + for attn, ff in self.layers: + latents = attn(x, latents) + latents + latents = ff(latents) + latents + + latents = self.proj_out(latents) + return self.norm_out(latents) diff --git a/modules/motion_module.py b/modules/motion_module.py new file mode 100644 index 0000000000000000000000000000000000000000..afb2f485edc28eb84e9110cb169cb790dce306cf --- /dev/null +++ b/modules/motion_module.py @@ -0,0 +1,388 @@ +# Adapt from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/motion_module.py +import math +from dataclasses import dataclass +from typing import Callable, Optional + +import torch +from diffusers.models.attention import FeedForward +from diffusers.models.attention_processor import Attention, AttnProcessor +from diffusers.utils import BaseOutput +from diffusers.utils.import_utils import is_xformers_available +from einops import rearrange, repeat +from torch import nn + + +def zero_module(module): + # Zero out the parameters of a module and return it. + for p in module.parameters(): + p.detach().zero_() + return module + + +@dataclass +class TemporalTransformer3DModelOutput(BaseOutput): + sample: torch.FloatTensor + + +if is_xformers_available(): + import xformers + import xformers.ops +else: + xformers = None + + +def get_motion_module(in_channels, motion_module_type: str, motion_module_kwargs: dict): + if motion_module_type == "Vanilla": + return VanillaTemporalModule( + in_channels=in_channels, + **motion_module_kwargs, + ) + else: + raise ValueError + + +class VanillaTemporalModule(nn.Module): + def __init__( + self, + in_channels, + num_attention_heads=8, + num_transformer_block=2, + attention_block_types=("Temporal_Self", "Temporal_Self"), + cross_frame_attention_mode=None, + temporal_position_encoding=False, + temporal_position_encoding_max_len=24, + temporal_attention_dim_div=1, + zero_initialize=True, + ): + super().__init__() + + self.temporal_transformer = TemporalTransformer3DModel( + in_channels=in_channels, + num_attention_heads=num_attention_heads, + attention_head_dim=in_channels + // num_attention_heads + // temporal_attention_dim_div, + num_layers=num_transformer_block, + attention_block_types=attention_block_types, + cross_frame_attention_mode=cross_frame_attention_mode, + temporal_position_encoding=temporal_position_encoding, + temporal_position_encoding_max_len=temporal_position_encoding_max_len, + ) + + if zero_initialize: + self.temporal_transformer.proj_out = zero_module( + self.temporal_transformer.proj_out + ) + + def forward( + self, + input_tensor, + temb, + encoder_hidden_states, + attention_mask=None, + anchor_frame_idx=None, + ): + hidden_states = input_tensor + hidden_states = self.temporal_transformer( + hidden_states, encoder_hidden_states, attention_mask + ) + + output = hidden_states + return output + + +class TemporalTransformer3DModel(nn.Module): + def __init__( + self, + in_channels, + num_attention_heads, + attention_head_dim, + num_layers, + attention_block_types=( + "Temporal_Self", + "Temporal_Self", + ), + dropout=0.0, + norm_num_groups=32, + cross_attention_dim=768, + activation_fn="geglu", + attention_bias=False, + upcast_attention=False, + cross_frame_attention_mode=None, + temporal_position_encoding=False, + temporal_position_encoding_max_len=24, + ): + super().__init__() + + inner_dim = num_attention_heads * attention_head_dim + + self.norm = torch.nn.GroupNorm( + num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True + ) + self.proj_in = nn.Linear(in_channels, inner_dim) + + self.transformer_blocks = nn.ModuleList( + [ + TemporalTransformerBlock( + dim=inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + attention_block_types=attention_block_types, + dropout=dropout, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + attention_bias=attention_bias, + upcast_attention=upcast_attention, + cross_frame_attention_mode=cross_frame_attention_mode, + temporal_position_encoding=temporal_position_encoding, + temporal_position_encoding_max_len=temporal_position_encoding_max_len, + ) + for d in range(num_layers) + ] + ) + self.proj_out = nn.Linear(inner_dim, in_channels) + + def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None): + assert ( + hidden_states.dim() == 5 + ), f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}." + video_length = hidden_states.shape[2] + hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") + + batch, channel, height, weight = hidden_states.shape + residual = hidden_states + + hidden_states = self.norm(hidden_states) + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( + batch, height * weight, inner_dim + ) + hidden_states = self.proj_in(hidden_states) + + # Transformer Blocks + for block in self.transformer_blocks: + hidden_states = block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + video_length=video_length, + ) + + # output + hidden_states = self.proj_out(hidden_states) + hidden_states = ( + hidden_states.reshape(batch, height, weight, inner_dim) + .permute(0, 3, 1, 2) + .contiguous() + ) + + output = hidden_states + residual + output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length) + + return output + + +class TemporalTransformerBlock(nn.Module): + def __init__( + self, + dim, + num_attention_heads, + attention_head_dim, + attention_block_types=( + "Temporal_Self", + "Temporal_Self", + ), + dropout=0.0, + norm_num_groups=32, + cross_attention_dim=768, + activation_fn="geglu", + attention_bias=False, + upcast_attention=False, + cross_frame_attention_mode=None, + temporal_position_encoding=False, + temporal_position_encoding_max_len=24, + ): + super().__init__() + + attention_blocks = [] + norms = [] + + for block_name in attention_block_types: + attention_blocks.append( + VersatileAttention( + attention_mode=block_name.split("_")[0], + cross_attention_dim=cross_attention_dim + if block_name.endswith("_Cross") + else None, + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + cross_frame_attention_mode=cross_frame_attention_mode, + temporal_position_encoding=temporal_position_encoding, + temporal_position_encoding_max_len=temporal_position_encoding_max_len, + ) + ) + norms.append(nn.LayerNorm(dim)) + + self.attention_blocks = nn.ModuleList(attention_blocks) + self.norms = nn.ModuleList(norms) + + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn) + self.ff_norm = nn.LayerNorm(dim) + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + video_length=None, + ): + for attention_block, norm in zip(self.attention_blocks, self.norms): + norm_hidden_states = norm(hidden_states) + hidden_states = ( + attention_block( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states + if attention_block.is_cross_attention + else None, + video_length=video_length, + ) + + hidden_states + ) + + hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states + + output = hidden_states + return output + + +class PositionalEncoding(nn.Module): + def __init__(self, d_model, dropout=0.0, max_len=24): + super().__init__() + self.dropout = nn.Dropout(p=dropout) + position = torch.arange(max_len).unsqueeze(1) + div_term = torch.exp( + torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model) + ) + pe = torch.zeros(1, max_len, d_model) + pe[0, :, 0::2] = torch.sin(position * div_term) + pe[0, :, 1::2] = torch.cos(position * div_term) + self.register_buffer("pe", pe) + + def forward(self, x): + x = x + self.pe[:, : x.size(1)] + return self.dropout(x) + + +class VersatileAttention(Attention): + def __init__( + self, + attention_mode=None, + cross_frame_attention_mode=None, + temporal_position_encoding=False, + temporal_position_encoding_max_len=24, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + assert attention_mode == "Temporal" + + self.attention_mode = attention_mode + self.is_cross_attention = kwargs["cross_attention_dim"] is not None + + self.pos_encoder = ( + PositionalEncoding( + kwargs["query_dim"], + dropout=0.0, + max_len=temporal_position_encoding_max_len, + ) + if (temporal_position_encoding and attention_mode == "Temporal") + else None + ) + + def extra_repr(self): + return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}" + + def set_use_memory_efficient_attention_xformers( + self, + use_memory_efficient_attention_xformers: bool, + attention_op: Optional[Callable] = None, + ): + if use_memory_efficient_attention_xformers: + if not is_xformers_available(): + raise ModuleNotFoundError( + ( + "Refer to https://github.com/facebookresearch/xformers for more information on how to install" + " xformers" + ), + name="xformers", + ) + elif not torch.cuda.is_available(): + raise ValueError( + "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is" + " only available for GPU " + ) + else: + try: + # Make sure we can run the memory efficient attention + _ = xformers.ops.memory_efficient_attention( + torch.randn((1, 2, 40), device="cuda"), + torch.randn((1, 2, 40), device="cuda"), + torch.randn((1, 2, 40), device="cuda"), + ) + except Exception as e: + raise e + + # XFormersAttnProcessor corrupts video generation and work with Pytorch 1.13. + # Pytorch 2.0.1 AttnProcessor works the same as XFormersAttnProcessor in Pytorch 1.13. + # You don't need XFormersAttnProcessor here. + # processor = XFormersAttnProcessor( + # attention_op=attention_op, + # ) + processor = AttnProcessor() + else: + processor = AttnProcessor() + + self.set_processor(processor) + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + video_length=None, + **cross_attention_kwargs, + ): + if self.attention_mode == "Temporal": + d = hidden_states.shape[1] # d means HxW + hidden_states = rearrange( + hidden_states, "(b f) d c -> (b d) f c", f=video_length + ) + + if self.pos_encoder is not None: + hidden_states = self.pos_encoder(hidden_states) + + encoder_hidden_states = ( + repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d) + if encoder_hidden_states is not None + else encoder_hidden_states + ) + + else: + raise NotImplementedError + + hidden_states = self.processor( + self, + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + if self.attention_mode == "Temporal": + hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d) + + return hidden_states diff --git a/modules/mutual_self_attention.py b/modules/mutual_self_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..f6fb78590755d645c0efd2310bf61e027127c8e8 --- /dev/null +++ b/modules/mutual_self_attention.py @@ -0,0 +1,376 @@ +# Adapted from https://github.com/magic-research/magic-animate/blob/main/magicanimate/models/mutual_self_attention.py +from typing import Any, Dict, Optional + +import torch +from einops import rearrange + +from .attention import BasicTransformerBlock +from .attention import TemporalBasicTransformerBlock + + +def torch_dfs(model: torch.nn.Module): + result = [model] + for child in model.children(): + result += torch_dfs(child) + return result + + +class ReferenceAttentionControl: + def __init__( + self, + unet, + mode="write", + do_classifier_free_guidance=False, + attention_auto_machine_weight=float("inf"), + gn_auto_machine_weight=1.0, + style_fidelity=1.0, + reference_attn=True, + reference_adain=False, + fusion_blocks="midup", + batch_size=1, + reference_attention_weight=1., + audio_attention_weight=1., + ) -> None: + # 10. Modify self attention and group norm + self.unet = unet + assert mode in ["read", "write"] + assert fusion_blocks in ["midup", "full"] + self.reference_attn = reference_attn + self.reference_adain = reference_adain + self.fusion_blocks = fusion_blocks + self.reference_attention_weight = reference_attention_weight + self.audio_attention_weight = audio_attention_weight + self.register_reference_hooks( + mode, + do_classifier_free_guidance, + attention_auto_machine_weight, + gn_auto_machine_weight, + style_fidelity, + reference_attn, + reference_adain, + fusion_blocks, + batch_size=batch_size, + ) + + def register_reference_hooks( + self, + mode, + do_classifier_free_guidance, + attention_auto_machine_weight, + gn_auto_machine_weight, + style_fidelity, + reference_attn, + reference_adain, + dtype=torch.float16, + batch_size=1, + num_images_per_prompt=1, + device=torch.device("cpu"), + fusion_blocks="midup", + ): + MODE = mode + do_classifier_free_guidance = do_classifier_free_guidance + attention_auto_machine_weight = attention_auto_machine_weight + gn_auto_machine_weight = gn_auto_machine_weight + style_fidelity = style_fidelity + reference_attn = reference_attn + reference_adain = reference_adain + fusion_blocks = fusion_blocks + num_images_per_prompt = num_images_per_prompt + reference_attention_weight = self.reference_attention_weight + audio_attention_weight = self.audio_attention_weight + dtype = dtype + if do_classifier_free_guidance: + uc_mask = ( + torch.Tensor( + [1] * batch_size * num_images_per_prompt * 16 + + [0] * batch_size * num_images_per_prompt * 16 + ) + .to(device) + .bool() + ) + else: + uc_mask = ( + torch.Tensor([0] * batch_size * num_images_per_prompt * 2) + .to(device) + .bool() + ) + + def hacked_basic_transformer_inner_forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + video_length=None, + ): + if self.use_ada_layer_norm: # False + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + ( + norm_hidden_states, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + ) = self.norm1( + hidden_states, + timestep, + class_labels, + hidden_dtype=hidden_states.dtype, + ) + else: + norm_hidden_states = self.norm1(hidden_states) + + # 1. Self-Attention + # self.only_cross_attention = False + cross_attention_kwargs = ( + cross_attention_kwargs if cross_attention_kwargs is not None else {} + ) + if self.only_cross_attention: + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states + if self.only_cross_attention + else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + else: + if MODE == "write": + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states + if self.only_cross_attention + else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + if self.use_ada_layer_norm_zero: + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = attn_output + hidden_states + + if self.attn2 is not None: + norm_hidden_states = ( + self.norm2(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm2(hidden_states) + ) + self.bank.append(norm_hidden_states.clone()) + + # 2. Cross-Attention + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + if MODE == "read": + hidden_states = ( + self.attn1( + norm_hidden_states, + encoder_hidden_states=norm_hidden_states, + attention_mask=attention_mask, + ) + + hidden_states + ) + + if self.use_ada_layer_norm: # False + norm_hidden_states = self.norm1_5(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + ( + norm_hidden_states, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + ) = self.norm1_5( + hidden_states, + timestep, + class_labels, + hidden_dtype=hidden_states.dtype, + ) + else: + norm_hidden_states = self.norm1_5(hidden_states) + + bank_fea = [] + for d in self.bank: + if len(d.shape) == 3: + d = d.unsqueeze(1).repeat(1, video_length, 1, 1) + bank_fea.append(rearrange(d, "b t l c -> (b t) l c")) + + attn_hidden_states = self.attn1_5( + norm_hidden_states, + encoder_hidden_states=bank_fea[0], + attention_mask=attention_mask, + ) + + if reference_attention_weight != 1.: + attn_hidden_states *= reference_attention_weight + + hidden_states = (attn_hidden_states + hidden_states) + + # self.bank.clear() + if self.attn2 is not None: + # Cross-Attention + norm_hidden_states = ( + self.norm2(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm2(hidden_states) + ) + + attn_hidden_states = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + ) + + if audio_attention_weight != 1.: + attn_hidden_states *= audio_attention_weight + + hidden_states = (attn_hidden_states + hidden_states) + + # Feed-forward + hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states + + # Temporal-Attention + if self.unet_use_temporal_attention: + d = hidden_states.shape[1] + hidden_states = rearrange( + hidden_states, "(b f) d c -> (b d) f c", f=video_length + ) + norm_hidden_states = ( + self.norm_temp(hidden_states, timestep) + if self.use_ada_layer_norm + else self.norm_temp(hidden_states) + ) + hidden_states = ( + self.attn_temp(norm_hidden_states) + hidden_states + ) + hidden_states = rearrange( + hidden_states, "(b d) f c -> (b f) d c", d=d + ) + + return hidden_states + + # 3. Feed-forward + norm_hidden_states = self.norm3(hidden_states) + + if self.use_ada_layer_norm_zero: + norm_hidden_states = ( + norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + ) + + ff_output = self.ff(norm_hidden_states) + + if self.use_ada_layer_norm_zero: + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = ff_output + hidden_states + + return hidden_states + + if self.reference_attn: + if self.fusion_blocks == "midup": + attn_modules = [ + module + for module in ( + torch_dfs(self.unet.mid_block) + torch_dfs(self.unet.up_blocks) + ) + if isinstance(module, BasicTransformerBlock) + or isinstance(module, TemporalBasicTransformerBlock) + ] + elif self.fusion_blocks == "full": + attn_modules = [ + module + for module in torch_dfs(self.unet) + if isinstance(module, BasicTransformerBlock) + or isinstance(module, TemporalBasicTransformerBlock) + ] + attn_modules = sorted( + attn_modules, key=lambda x: -x.norm1.normalized_shape[0] + ) + + for i, module in enumerate(attn_modules): + module._original_inner_forward = module.forward + if isinstance(module, BasicTransformerBlock): + module.forward = hacked_basic_transformer_inner_forward.__get__( + module, BasicTransformerBlock + ) + if isinstance(module, TemporalBasicTransformerBlock): + module.forward = hacked_basic_transformer_inner_forward.__get__( + module, TemporalBasicTransformerBlock + ) + + module.bank = [] + module.attn_weight = float(i) / float(len(attn_modules)) + + def update( + self, + writer, + do_classifier_free_guidance=True, + dtype=torch.float16, + ): + if self.reference_attn: + if self.fusion_blocks == "midup": + reader_attn_modules = [ + module + for module in (torch_dfs(self.unet.mid_block) + torch_dfs(self.unet.up_blocks)) + if isinstance(module, TemporalBasicTransformerBlock) + ] + writer_attn_modules = [ + module + for module in (torch_dfs(writer.unet.mid_block) + torch_dfs(writer.unet.up_blocks)) + if isinstance(module, BasicTransformerBlock) + ] + elif self.fusion_blocks == "full": + reader_attn_modules = [ + module + for module in torch_dfs(self.unet) + if isinstance(module, TemporalBasicTransformerBlock) + ] + writer_attn_modules = [ + module + for module in torch_dfs(writer.unet) + if isinstance(module, BasicTransformerBlock) + ] + reader_attn_modules = sorted( + reader_attn_modules, key=lambda x: -x.norm1.normalized_shape[0] + ) + writer_attn_modules = sorted( + writer_attn_modules, key=lambda x: -x.norm1.normalized_shape[0] + ) + for r, w in zip(reader_attn_modules, writer_attn_modules): + if do_classifier_free_guidance: + r.bank = [torch.cat([torch.zeros_like(v), v]).to(dtype) for v in w.bank] + else: + r.bank = [v.clone().to(dtype) for v in w.bank] + + def clear(self): + if self.reference_attn: + if self.fusion_blocks == "midup": + reader_attn_modules = [ + module + for module in ( + torch_dfs(self.unet.mid_block) + torch_dfs(self.unet.up_blocks) + ) + if isinstance(module, BasicTransformerBlock) + or isinstance(module, TemporalBasicTransformerBlock) + ] + elif self.fusion_blocks == "full": + reader_attn_modules = [ + module + for module in torch_dfs(self.unet) + if isinstance(module, BasicTransformerBlock) + or isinstance(module, TemporalBasicTransformerBlock) + ] + reader_attn_modules = sorted( + reader_attn_modules, key=lambda x: -x.norm1.normalized_shape[0] + ) + for r in reader_attn_modules: + r.bank.clear() diff --git a/modules/resnet.py b/modules/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..63b4464e727338f605eb3852957cf34d9e31a8af --- /dev/null +++ b/modules/resnet.py @@ -0,0 +1,256 @@ +# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange + + +class InflatedConv3d(nn.Conv2d): + def forward(self, x): + video_length = x.shape[2] + + x = rearrange(x, "b c f h w -> (b f) c h w") + x = super().forward(x) + x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) + + return x + + +class InflatedGroupNorm(nn.GroupNorm): + def forward(self, x): + video_length = x.shape[2] + + x = rearrange(x, "b c f h w -> (b f) c h w") + x = super().forward(x) + x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length) + + return x + + +class Upsample3D(nn.Module): + def __init__( + self, + channels, + use_conv=False, + use_conv_transpose=False, + out_channels=None, + name="conv", + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_conv_transpose = use_conv_transpose + self.name = name + + conv = None + if use_conv_transpose: + raise NotImplementedError + elif use_conv: + self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) + + def forward(self, hidden_states, output_size=None): + assert hidden_states.shape[1] == self.channels + + if self.use_conv_transpose: + raise NotImplementedError + + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + dtype = hidden_states.dtype + if dtype == torch.bfloat16: + hidden_states = hidden_states.to(torch.float32) + + # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 + if hidden_states.shape[0] >= 64: + hidden_states = hidden_states.contiguous() + + # if `output_size` is passed we force the interpolation output + # size and do not make use of `scale_factor=2` + if output_size is None: + hidden_states = F.interpolate( + hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest" + ) + else: + hidden_states = F.interpolate( + hidden_states, size=output_size, mode="nearest" + ) + + # If the input is bfloat16, we cast back to bfloat16 + if dtype == torch.bfloat16: + hidden_states = hidden_states.to(dtype) + + # if self.use_conv: + # if self.name == "conv": + # hidden_states = self.conv(hidden_states) + # else: + # hidden_states = self.Conv2d_0(hidden_states) + hidden_states = self.conv(hidden_states) + + return hidden_states + + +class Downsample3D(nn.Module): + def __init__( + self, channels, use_conv=False, out_channels=None, padding=1, name="conv" + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.padding = padding + stride = 2 + self.name = name + + if use_conv: + self.conv = InflatedConv3d( + self.channels, self.out_channels, 3, stride=stride, padding=padding + ) + else: + raise NotImplementedError + + def forward(self, hidden_states): + assert hidden_states.shape[1] == self.channels + if self.use_conv and self.padding == 0: + raise NotImplementedError + + assert hidden_states.shape[1] == self.channels + hidden_states = self.conv(hidden_states) + + return hidden_states + + +class ResnetBlock3D(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout=0.0, + temb_channels=512, + groups=32, + groups_out=None, + pre_norm=True, + eps=1e-6, + non_linearity="swish", + time_embedding_norm="default", + output_scale_factor=1.0, + use_in_shortcut=None, + use_inflated_groupnorm=None, + ): + super().__init__() + self.pre_norm = pre_norm + self.pre_norm = True + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.time_embedding_norm = time_embedding_norm + self.output_scale_factor = output_scale_factor + + if groups_out is None: + groups_out = groups + + assert use_inflated_groupnorm != None + if use_inflated_groupnorm: + self.norm1 = InflatedGroupNorm( + num_groups=groups, num_channels=in_channels, eps=eps, affine=True + ) + else: + self.norm1 = torch.nn.GroupNorm( + num_groups=groups, num_channels=in_channels, eps=eps, affine=True + ) + + self.conv1 = InflatedConv3d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + + if temb_channels is not None: + if self.time_embedding_norm == "default": + time_emb_proj_out_channels = out_channels + elif self.time_embedding_norm == "scale_shift": + time_emb_proj_out_channels = out_channels * 2 + else: + raise ValueError( + f"unknown time_embedding_norm : {self.time_embedding_norm} " + ) + + self.time_emb_proj = torch.nn.Linear( + temb_channels, time_emb_proj_out_channels + ) + else: + self.time_emb_proj = None + + if use_inflated_groupnorm: + self.norm2 = InflatedGroupNorm( + num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True + ) + else: + self.norm2 = torch.nn.GroupNorm( + num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True + ) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = InflatedConv3d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + + if non_linearity == "swish": + self.nonlinearity = lambda x: F.silu(x) + elif non_linearity == "mish": + self.nonlinearity = Mish() + elif non_linearity == "silu": + self.nonlinearity = nn.SiLU() + + self.use_in_shortcut = ( + self.in_channels != self.out_channels + if use_in_shortcut is None + else use_in_shortcut + ) + + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = InflatedConv3d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, input_tensor, temb): + hidden_states = input_tensor + + hidden_states = self.norm1(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.conv1(hidden_states) + + if temb is not None: + temb = self.time_emb_proj(self.nonlinearity(temb)) + if len(temb.shape) == 2: + temb = temb[:, :, None, None, None] + elif len(temb.shape) == 3: + temb = temb[:, :, :, None, None].permute(0, 2, 1, 3, 4) + + if temb is not None and self.time_embedding_norm == "default": + hidden_states = hidden_states + temb + + hidden_states = self.norm2(hidden_states) + + if temb is not None and self.time_embedding_norm == "scale_shift": + scale, shift = torch.chunk(temb, 2, dim=1) + hidden_states = hidden_states * (1 + scale) + shift + + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + + return output_tensor + + +class Mish(torch.nn.Module): + def forward(self, hidden_states): + return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states)) diff --git a/modules/transformer_2d.py b/modules/transformer_2d.py new file mode 100644 index 0000000000000000000000000000000000000000..686affaf62e46db8cffd6b6cd66daac69b346a26 --- /dev/null +++ b/modules/transformer_2d.py @@ -0,0 +1,401 @@ +# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py +from dataclasses import dataclass +from typing import Any, Dict, Optional + +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config + +try: + from diffusers.models.embeddings import CaptionProjection +except: + from diffusers.models.embeddings import PixArtAlphaTextProjection as CaptionProjection + +from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.normalization import AdaLayerNormSingle +from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, is_torch_version +from torch import nn + +from .attention import BasicTransformerBlock + + +@dataclass +class Transformer2DModelOutput(BaseOutput): + """ + The output of [`Transformer2DModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): + The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability + distributions for the unnoised latent pixels. + """ + + sample: torch.FloatTensor + ref_feature: torch.FloatTensor + + +class Transformer2DModel(ModelMixin, ConfigMixin): + """ + A 2D Transformer model for image-like data. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + The number of channels in the input and output (specify if the input is **continuous**). + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**). + This is fixed during training since it is used to learn a number of position embeddings. + num_vector_embeds (`int`, *optional*): + The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**). + Includes the class for the masked latent pixel. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): + The number of diffusion steps used during training. Pass if at least one of the norm_layers is + `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are + added to the hidden states. + + During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`. + attention_bias (`bool`, *optional*): + Configure if the `TransformerBlocks` attention should contain a bias parameter. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + attention_type: str = "default", + caption_channels: int = None, + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv + linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear + + # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` + # Define whether input is continuous or discrete depending on configuration + self.is_input_continuous = (in_channels is not None) and (patch_size is None) + self.is_input_vectorized = num_vector_embeds is not None + self.is_input_patches = in_channels is not None and patch_size is not None + + if norm_type == "layer_norm" and num_embeds_ada_norm is not None: + deprecation_message = ( + f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or" + " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config." + " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect" + " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it" + " would be very nice if you could open a Pull request for the `transformer/config.json` file" + ) + deprecate( + "norm_type!=num_embeds_ada_norm", + "1.0.0", + deprecation_message, + standard_warn=False, + ) + norm_type = "ada_norm" + + if self.is_input_continuous and self.is_input_vectorized: + raise ValueError( + f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make" + " sure that either `in_channels` or `num_vector_embeds` is None." + ) + elif self.is_input_vectorized and self.is_input_patches: + raise ValueError( + f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make" + " sure that either `num_vector_embeds` or `num_patches` is None." + ) + elif ( + not self.is_input_continuous + and not self.is_input_vectorized + and not self.is_input_patches + ): + raise ValueError( + f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:" + f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None." + ) + + # 2. Define input layers + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm( + num_groups=norm_num_groups, + num_channels=in_channels, + eps=1e-6, + affine=True, + ) + if use_linear_projection: + self.proj_in = linear_cls(in_channels, inner_dim) + else: + self.proj_in = conv_cls( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0 + ) + + # 3. Define transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + double_self_attention=double_self_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + attention_type=attention_type, + ) + for d in range(num_layers) + ] + ) + + # 4. Define output layers + self.out_channels = in_channels if out_channels is None else out_channels + # TODO: should use out_channels for continuous projections + if use_linear_projection: + self.proj_out = linear_cls(inner_dim, in_channels) + else: + self.proj_out = conv_cls( + inner_dim, in_channels, kernel_size=1, stride=1, padding=0 + ) + + # 5. PixArt-Alpha blocks. + self.adaln_single = None + self.use_additional_conditions = False + if norm_type == "ada_norm_single": + self.use_additional_conditions = self.config.sample_size == 128 + # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use + # additional conditions until we find better name + self.adaln_single = AdaLayerNormSingle( + inner_dim, use_additional_conditions=self.use_additional_conditions + ) + + self.caption_projection = None + if caption_channels is not None: + self.caption_projection = CaptionProjection( + in_features=caption_channels, hidden_size=inner_dim + ) + + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + timestep: Optional[torch.LongTensor] = None, + added_cond_kwargs: Dict[str, torch.Tensor] = None, + class_labels: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ): + """ + The [`Transformer2DModel`] forward method. + + Args: + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + Input `hidden_states`. + encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.LongTensor`, *optional*): + Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. + class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): + Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in + `AdaLayerZeroNorm`. + cross_attention_kwargs ( `Dict[str, Any]`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + attention_mask ( `torch.Tensor`, *optional*): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + encoder_attention_mask ( `torch.Tensor`, *optional*): + Cross-attention mask applied to `encoder_hidden_states`. Two formats supported: + + * Mask `(batch, sequence_length)` True = keep, False = discard. + * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard. + + If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format + above. This bias will be added to the cross-attention scores. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. + # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. + # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None and attention_mask.ndim == 2: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: + encoder_attention_mask = ( + 1 - encoder_attention_mask.to(hidden_states.dtype) + ) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # Retrieve lora scale. + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + + # 1. Input + batch, _, height, width = hidden_states.shape + residual = hidden_states + + hidden_states = self.norm(hidden_states) + if not self.use_linear_projection: + hidden_states = ( + self.proj_in(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_in(hidden_states) + ) + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( + batch, height * width, inner_dim + ) + else: + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( + batch, height * width, inner_dim + ) + hidden_states = ( + self.proj_in(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_in(hidden_states) + ) + + # 2. Blocks + if self.caption_projection is not None: + batch_size = hidden_states.shape[0] + encoder_hidden_states = self.caption_projection(encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states.view( + batch_size, -1, hidden_states.shape[-1] + ) + + ref_feature = hidden_states.reshape(batch, height, width, inner_dim) + for block in self.transformer_blocks: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + timestep, + cross_attention_kwargs, + class_labels, + **ckpt_kwargs, + ) + else: + hidden_states = block( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + + # 3. Output + if self.is_input_continuous: + if not self.use_linear_projection: + hidden_states = ( + hidden_states.reshape(batch, height, width, inner_dim) + .permute(0, 3, 1, 2) + .contiguous() + ) + hidden_states = ( + self.proj_out(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_out(hidden_states) + ) + else: + hidden_states = ( + self.proj_out(hidden_states, scale=lora_scale) + if not USE_PEFT_BACKEND + else self.proj_out(hidden_states) + ) + hidden_states = ( + hidden_states.reshape(batch, height, width, inner_dim) + .permute(0, 3, 1, 2) + .contiguous() + ) + + output = hidden_states + residual + if not return_dict: + return (output, ref_feature) + + return Transformer2DModelOutput(sample=output, ref_feature=ref_feature) diff --git a/modules/transformer_3d.py b/modules/transformer_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..4f231b0084c38cc22622ad1f8dac36b214817f9d --- /dev/null +++ b/modules/transformer_3d.py @@ -0,0 +1,169 @@ +from dataclasses import dataclass +from typing import Optional + +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models import ModelMixin +from diffusers.utils import BaseOutput +from diffusers.utils.import_utils import is_xformers_available +from einops import rearrange, repeat +from torch import nn + +from .attention import TemporalBasicTransformerBlock + + +@dataclass +class Transformer3DModelOutput(BaseOutput): + sample: torch.FloatTensor + + +if is_xformers_available(): + import xformers + import xformers.ops +else: + xformers = None + + +class Transformer3DModel(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + # Define input layers + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm( + num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True + ) + if use_linear_projection: + self.proj_in = nn.Linear(in_channels, inner_dim) + else: + self.proj_in = nn.Conv2d( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0 + ) + + # Define transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + TemporalBasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + ) + for d in range(num_layers) + ] + ) + + # 4. Define output layers + if use_linear_projection: + self.proj_out = nn.Linear(in_channels, inner_dim) + else: + self.proj_out = nn.Conv2d( + inner_dim, in_channels, kernel_size=1, stride=1, padding=0 + ) + + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + return_dict: bool = True, + ): + # Input + assert ( + hidden_states.dim() == 5 + ), f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}." + video_length = hidden_states.shape[2] + hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w") + if encoder_hidden_states.shape[0] != hidden_states.shape[0]: + encoder_hidden_states = repeat( + encoder_hidden_states, "b n c -> (b f) n c", f=video_length + ) + + batch, channel, height, weight = hidden_states.shape + residual = hidden_states + + hidden_states = self.norm(hidden_states) + if not self.use_linear_projection: + hidden_states = self.proj_in(hidden_states) + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( + batch, height * weight, inner_dim + ) + else: + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape( + batch, height * weight, inner_dim + ) + hidden_states = self.proj_in(hidden_states) + + # Blocks + for i, block in enumerate(self.transformer_blocks): + hidden_states = block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + video_length=video_length, + ) + + # Output + if not self.use_linear_projection: + hidden_states = ( + hidden_states.reshape(batch, height, weight, inner_dim) + .permute(0, 3, 1, 2) + .contiguous() + ) + hidden_states = self.proj_out(hidden_states) + else: + hidden_states = self.proj_out(hidden_states) + hidden_states = ( + hidden_states.reshape(batch, height, weight, inner_dim) + .permute(0, 3, 1, 2) + .contiguous() + ) + + output = hidden_states + residual + + output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length) + if not return_dict: + return (output,) + + return Transformer3DModelOutput(sample=output) diff --git a/modules/unet_2d_blocks.py b/modules/unet_2d_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..00339dc625fc62858663cd2300d420a26ccacd21 --- /dev/null +++ b/modules/unet_2d_blocks.py @@ -0,0 +1,1072 @@ +# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py +from typing import Any, Dict, Optional, Tuple, Union + +import torch +from diffusers.models.activations import get_activation +from diffusers.models.attention_processor import Attention +from diffusers.models.dual_transformer_2d import DualTransformer2DModel +from diffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D +from diffusers.utils import is_torch_version, logging +from diffusers.utils.torch_utils import apply_freeu +from torch import nn + +from .transformer_2d import Transformer2DModel + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_down_block( + down_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + temb_channels: int, + add_downsample: bool, + resnet_eps: float, + resnet_act_fn: str, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + downsample_padding: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: float = 1.0, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = None, + downsample_type: Optional[str] = None, + dropout: float = 0.0, +): + # If attn head dim is not defined, we default it to the number of heads + if attention_head_dim is None: + logger.warn( + f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}." + ) + attention_head_dim = num_attention_heads + + down_block_type = ( + down_block_type[7:] + if down_block_type.startswith("UNetRes") + else down_block_type + ) + if down_block_type == "DownBlock2D": + return DownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "CrossAttnDownBlock2D": + if cross_attention_dim is None: + raise ValueError( + "cross_attention_dim must be specified for CrossAttnDownBlock2D" + ) + return CrossAttnDownBlock2D( + num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + ) + raise ValueError(f"{down_block_type} does not exist.") + + +def get_up_block( + up_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + add_upsample: bool, + resnet_eps: float, + resnet_act_fn: str, + resolution_idx: Optional[int] = None, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: float = 1.0, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = None, + upsample_type: Optional[str] = None, + dropout: float = 0.0, +) -> nn.Module: + # If attn head dim is not defined, we default it to the number of heads + if attention_head_dim is None: + logger.warn( + f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}." + ) + attention_head_dim = num_attention_heads + + up_block_type = ( + up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + ) + if up_block_type == "UpBlock2D": + return UpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif up_block_type == "CrossAttnUpBlock2D": + if cross_attention_dim is None: + raise ValueError( + "cross_attention_dim must be specified for CrossAttnUpBlock2D" + ) + return CrossAttnUpBlock2D( + num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + ) + + raise ValueError(f"{up_block_type} does not exist.") + + +class AutoencoderTinyBlock(nn.Module): + """ + Tiny Autoencoder block used in [`AutoencoderTiny`]. It is a mini residual module consisting of plain conv + ReLU + blocks. + + Args: + in_channels (`int`): The number of input channels. + out_channels (`int`): The number of output channels. + act_fn (`str`): + ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`. + + Returns: + `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to + `out_channels`. + """ + + def __init__(self, in_channels: int, out_channels: int, act_fn: str): + super().__init__() + act_fn = get_activation(act_fn) + self.conv = nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1), + act_fn, + nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), + act_fn, + nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), + ) + self.skip = ( + nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False) + if in_channels != out_channels + else nn.Identity() + ) + self.fuse = nn.ReLU() + + def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + return self.fuse(self.conv(x) + self.skip(x)) + + +class UNetMidBlock2D(nn.Module): + """ + A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks. + + Args: + in_channels (`int`): The number of input channels. + temb_channels (`int`): The number of temporal embedding channels. + dropout (`float`, *optional*, defaults to 0.0): The dropout rate. + num_layers (`int`, *optional*, defaults to 1): The number of residual blocks. + resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks. + resnet_time_scale_shift (`str`, *optional*, defaults to `default`): + The type of normalization to apply to the time embeddings. This can help to improve the performance of the + model on tasks with long-range temporal dependencies. + resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks. + resnet_groups (`int`, *optional*, defaults to 32): + The number of groups to use in the group normalization layers of the resnet blocks. + attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks. + resnet_pre_norm (`bool`, *optional*, defaults to `True`): + Whether to use pre-normalization for the resnet blocks. + add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks. + attention_head_dim (`int`, *optional*, defaults to 1): + Dimension of a single attention head. The number of attention heads is determined based on this value and + the number of input channels. + output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. + + Returns: + `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, + in_channels, height, width)`. + + """ + + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", # default, spatial + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + attn_groups: Optional[int] = None, + resnet_pre_norm: bool = True, + add_attention: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + ): + super().__init__() + resnet_groups = ( + resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + ) + self.add_attention = add_attention + + if attn_groups is None: + attn_groups = ( + resnet_groups if resnet_time_scale_shift == "default" else None + ) + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}." + ) + attention_head_dim = in_channels + + for _ in range(num_layers): + if self.add_attention: + attentions.append( + Attention( + in_channels, + heads=in_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=attn_groups, + spatial_norm_dim=temb_channels + if resnet_time_scale_shift == "spatial" + else None, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + else: + attentions.append(None) + + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward( + self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None + ) -> torch.FloatTensor: + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + if attn is not None: + hidden_states = attn(hidden_states, temb=temb) + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +class UNetMidBlock2DCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + resnet_groups = ( + resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + ) + + # support for variable transformer layers per block + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + for i in range(num_layers): + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + hidden_states, ref_feature = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + else: + hidden_states, ref_feature = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + ) + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + + return hidden_states + + +class CrossAttnDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + + blocks = list(zip(self.resnets, self.attentions)) + + for i, (resnet, attn) in enumerate(blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states, ref_feature = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + ) + else: + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + hidden_states, ref_feature = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + ) + + # apply additional residuals to the output of the last pair of resnet and attention blocks + if i == len(blocks) - 1 and additional_residuals is not None: + hidden_states = hidden_states + additional_residuals + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, scale=lora_scale) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class DownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb, scale=scale) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, scale=scale) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states, ref_feature = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + ) + else: + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + hidden_states, ref_feature = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler( + hidden_states, upsample_size, scale=lora_scale + ) + + return hidden_states + + +class UpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + scale: float = 1.0, + ) -> torch.FloatTensor: + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb, scale=scale) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size, scale=scale) + + return hidden_states diff --git a/modules/unet_2d_condition.py b/modules/unet_2d_condition.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c67c5dc74590a0cbb750f535ed7c16dccc2c9b --- /dev/null +++ b/modules/unet_2d_condition.py @@ -0,0 +1,1312 @@ +# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.loaders import UNet2DConditionLoadersMixin +from diffusers.models.activations import get_activation +from diffusers.models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from diffusers.models.embeddings import ( + GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +try: + from diffusers.models.embeddings import PositionNet +except: + from diffusers.models.embeddings import GLIGENTextBoundingboxProjection as PositionNet + +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils import ( + USE_PEFT_BACKEND, + BaseOutput, + deprecate, + logging, + scale_lora_layers, + unscale_lora_layers, +) + +from .unet_2d_blocks import ( + UNetMidBlock2D, + UNetMidBlock2DCrossAttn, + get_down_block, + get_up_block, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet2DConditionOutput(BaseOutput): + """ + The output of [`UNet2DConditionModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: torch.FloatTensor = None + ref_features: Tuple[torch.FloatTensor] = None + + +class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or + `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): + Whether to include self-attention in the basic transformer blocks, see + [`~models.attention.BasicTransformerBlock`]. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling + blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim` + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + time_embedding_type (`str`, *optional*, defaults to `positional`): + The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): + The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, + *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, + *optional*): The dimension of the `class_labels` input when + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. + mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): + Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False` + otherwise. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + dropout: float = 0.0, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, + reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + attention_type: str = "default", + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len( + only_cross_attention + ) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len( + down_block_types + ): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len( + down_block_types + ): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len( + down_block_types + ): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(layers_per_block, int) and len(layers_per_block) != len( + down_block_types + ): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." + ) + if ( + isinstance(transformer_layers_per_block, list) + and reverse_transformer_layers_per_block is None + ): + for layer_number_per_block in transformer_layers_per_block: + if isinstance(layer_number_per_block, list): + raise ValueError( + "Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet." + ) + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[0], + kernel_size=conv_in_kernel, + padding=conv_in_padding, + ) + + # time + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError( + f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." + ) + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, + set_W_to_weight=False, + log=False, + flip_sin_to_cos=flip_sin_to_cos, + ) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps( + block_out_channels[0], flip_sin_to_cos, freq_shift + ) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." + ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info( + "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined." + ) + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding( + timestep_input_dim, time_embed_dim, act_fn=act_fn + ) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. + self.class_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim + ) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = nn.Linear( + projection_class_embeddings_input_dim, time_embed_dim + ) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, + time_embed_dim, + num_heads=addition_embed_type_num_heads, + ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, + image_embed_dim=cross_attention_dim, + time_embed_dim=time_embed_dim, + ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps( + addition_time_embed_dim, flip_sin_to_cos, freq_shift + ) + self.add_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim + ) + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type is not None: + raise ValueError( + f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'." + ) + + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = get_activation(time_embedding_act_fn) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = only_cross_attention + + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = False + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len( + down_block_types + ) + + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None + else output_channel, + dropout=dropout, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + dropout=dropout, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + raise NotImplementedError(f"Unsupport mid_block_type: {mid_block_type}") + elif mid_block_type == "UNetMidBlock2D": + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + dropout=dropout, + num_layers=0, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + add_attention=False, + ) + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = ( + list(reversed(transformer_layers_per_block)) + if reverse_transformer_layers_per_block is None + else reverse_transformer_layers_per_block + ) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[ + min(i + 1, len(block_out_channels) - 1) + ] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resolution_idx=i, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None + else output_channel, + dropout=dropout, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + eps=norm_eps, + ) + + self.conv_act = get_activation(act_fn) + + else: + self.conv_norm_out = None + self.conv_act = None + self.conv_norm_out = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d( + block_out_channels[0], + out_channels, + kernel_size=conv_out_kernel, + padding=conv_out_padding, + ) + + if attention_type in ["gated", "gated-text-image"]: + positive_len = 768 + if isinstance(cross_attention_dim, int): + positive_len = cross_attention_dim + elif isinstance(cross_attention_dim, tuple) or isinstance( + cross_attention_dim, list + ): + positive_len = cross_attention_dim[0] + + feature_type = "text-only" if attention_type == "gated" else "text-image" + self.position_net = PositionNet( + positive_len=positive_len, + out_dim=cross_attention_dim, + feature_type=feature_type, + ) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors( + name: str, + module: torch.nn.Module, + processors: Dict[str, AttentionProcessor], + ): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor( + return_deprecated_lora=True + ) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor( + self, + processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], + _remove_lora=False, + ): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor, _remove_lora=_remove_lora) + else: + module.set_processor( + processor.pop(f"{name}.processor"), _remove_lora=_remove_lora + ) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all( + proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS + for proc in self.attn_processors.values() + ): + processor = AttnAddedKVProcessor() + elif all( + proc.__class__ in CROSS_ATTENTION_PROCESSORS + for proc in self.attn_processors.values() + ): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor, _remove_lora=True) + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = ( + num_sliceable_layers * [slice_size] + if not isinstance(slice_size, list) + else slice_size + ) + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice( + module: torch.nn.Module, slice_size: List[int] + ): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def enable_freeu(self, s1, s2, b1, b2): + r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stage blocks where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that + are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + for i, upsample_block in enumerate(self.up_blocks): + setattr(upsample_block, "s1", s1) + setattr(upsample_block, "s2", s2) + setattr(upsample_block, "b1", b1) + setattr(upsample_block, "b2", b2) + + def disable_freeu(self): + """Disables the FreeU mechanism.""" + freeu_keys = {"s1", "s2", "b1", "b2"} + for i, upsample_block in enumerate(self.up_blocks): + for k in freeu_keys: + if ( + hasattr(upsample_block, k) + or getattr(upsample_block, k, None) is not None + ): + setattr(upsample_block, k, None) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the timestep embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of down unet blocks. + mid_block_additional_residual: (`torch.Tensor`, *optional*): + A tensor that if specified is added to the residual of the middle unet block. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*): + additional residuals to be added to UNet long skip connections from down blocks to up blocks for + example from ControlNet side model(s) + mid_block_additional_residual (`torch.Tensor`, *optional*): + additional residual to be added to UNet mid block output, for example from ControlNet side model + down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*): + additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s) + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2 ** self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + for dim in sample.shape[-2:]: + if dim % default_overall_up_factor != 0: + # Forward upsample size to force interpolation output size. + forward_upsample_size = True + break + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = ( + 1 - encoder_attention_mask.to(sample.dtype) + ) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError( + "class_labels should be provided when num_class_embeds > 0" + ) + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. + class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) + + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + # Kandinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if ( + "image_embeds" not in added_cond_kwargs + or "hint" not in added_cond_kwargs + ): + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + if ( + self.encoder_hid_proj is not None + and self.config.encoder_hid_dim_type == "text_proj" + ): + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif ( + self.encoder_hid_proj is not None + and self.config.encoder_hid_dim_type == "text_image_proj" + ): + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj( + encoder_hidden_states, image_embeds + ) + elif ( + self.encoder_hid_proj is not None + and self.config.encoder_hid_dim_type == "image_proj" + ): + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + elif ( + self.encoder_hid_proj is not None + and self.config.encoder_hid_dim_type == "ip_image_proj" + ): + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = self.encoder_hid_proj(image_embeds).to( + encoder_hidden_states.dtype + ) + encoder_hidden_states = torch.cat( + [encoder_hidden_states, image_embeds], dim=1 + ) + + # 2. pre-process + sample = self.conv_in(sample) + + # 2.5 GLIGEN position net + if ( + cross_attention_kwargs is not None + and cross_attention_kwargs.get("gligen", None) is not None + ): + cross_attention_kwargs = cross_attention_kwargs.copy() + gligen_args = cross_attention_kwargs.pop("gligen") + cross_attention_kwargs["gligen"] = { + "objs": self.position_net(**gligen_args) + } + + # 3. down + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + + is_controlnet = ( + mid_block_additional_residual is not None + and down_block_additional_residuals is not None + ) + # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets + is_adapter = down_intrablock_additional_residuals is not None + # maintain backward compatibility for legacy usage, where + # T2I-Adapter and ControlNet both use down_block_additional_residuals arg + # but can only use one or the other + if ( + not is_adapter + and mid_block_additional_residual is None + and down_block_additional_residuals is not None + ): + deprecate( + "T2I should not use down_block_additional_residuals", + "1.3.0", + "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \ + and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \ + for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ", + standard_warn=False, + ) + down_intrablock_additional_residuals = down_block_additional_residuals + is_adapter = True + + down_block_res_samples = (sample,) + tot_referece_features = () + for downsample_block in self.down_blocks: + if ( + hasattr(downsample_block, "has_cross_attention") + and downsample_block.has_cross_attention + ): + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_intrablock_additional_residuals) > 0: + additional_residuals[ + "additional_residuals" + ] = down_intrablock_additional_residuals.pop(0) + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + sample, res_samples = downsample_block( + hidden_states=sample, temb=emb, scale=lora_scale + ) + if is_adapter and len(down_intrablock_additional_residuals) > 0: + sample += down_intrablock_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = ( + down_block_res_sample + down_block_additional_residual + ) + new_down_block_res_samples = new_down_block_res_samples + ( + down_block_res_sample, + ) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + if ( + hasattr(self.mid_block, "has_cross_attention") + and self.mid_block.has_cross_attention + ): + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = self.mid_block(sample, emb) + + # To support T2I-Adapter-XL + if ( + is_adapter + and len(down_intrablock_additional_residuals) > 0 + and sample.shape == down_intrablock_additional_residuals[0].shape + ): + sample += down_intrablock_additional_residuals.pop(0) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets):] + down_block_res_samples = down_block_res_samples[ + : -len(upsample_block.resnets) + ] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if ( + hasattr(upsample_block, "has_cross_attention") + and upsample_block.has_cross_attention + ): + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + scale=lora_scale, + ) + + # 6. post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + if not return_dict: + return (sample,) + + return UNet2DConditionOutput(sample=sample) diff --git a/modules/unet_3d.py b/modules/unet_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..8ea5474224cf5248be83d3acf17ebdd17cebf28e --- /dev/null +++ b/modules/unet_3d.py @@ -0,0 +1,698 @@ +# Adapted from https://github.com/guoyww/AnimateDiff/blob/main/animatediff/models/unet_blocks.py + +from collections import OrderedDict +from dataclasses import dataclass +from os import PathLike +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import AttentionProcessor +from diffusers.models.embeddings import TimestepEmbedding, Timesteps +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, BaseOutput, logging +from safetensors.torch import load_file + +from .resnet import InflatedConv3d, InflatedGroupNorm +from .unet_3d_blocks import UNetMidBlock3DCrossAttn, get_down_block, get_up_block + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet3DConditionOutput(BaseOutput): + sample: torch.FloatTensor + + +class UNet3DConditionModel(ModelMixin, ConfigMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ), + mid_block_type: str = "UNetMidBlock3DCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: int = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + attention_head_dim: Union[int, Tuple[int]] = 8, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + use_inflated_groupnorm=False, + # Additional + use_motion_module=False, + motion_module_resolutions=(1, 2, 4, 8), + motion_module_mid_block=False, + motion_module_decoder_only=False, + motion_module_type=None, + motion_module_kwargs={}, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + ): + super().__init__() + + self.sample_size = sample_size + time_embed_dim = block_out_channels[0] * 4 + + # input + self.conv_in = InflatedConv3d( + in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1) + ) + + # time + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + else: + self.class_embedding = None + + self.down_blocks = nn.ModuleList([]) + self.mid_block = None + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + res = 2 ** i + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module + and (res in motion_module_resolutions) + and (not motion_module_decoder_only), + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock3DCrossAttn": + self.mid_block = UNetMidBlock3DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attention_head_dim[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module and motion_module_mid_block, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the videos + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_attention_head_dim = list(reversed(attention_head_dim)) + only_cross_attention = list(reversed(only_cross_attention)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + res = 2 ** (3 - i) + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[ + min(i + 1, len(block_out_channels) - 1) + ] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=reversed_attention_head_dim[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module + and (res in motion_module_resolutions), + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if use_inflated_groupnorm: + self.conv_norm_out = InflatedGroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + eps=norm_eps, + ) + else: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + eps=norm_eps, + ) + self.conv_act = nn.SiLU() + self.conv_out = InflatedConv3d( + block_out_channels[0], out_channels, kernel_size=3, padding=1 + ) + + @property + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors( + name: str, + module: torch.nn.Module, + processors: Dict[str, AttentionProcessor], + ): + # if hasattr(module, "set_processor"): + # processors[f"{name}.processor"] = module.processor + + if hasattr(module, "get_processor") or hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + if "temporal_transformer" not in sub_name: + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + if "temporal_transformer" not in name: + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_slicable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_slicable_dims(module) + + num_slicable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_slicable_layers * [1] + + slice_size = ( + num_slicable_layers * [slice_size] + if not isinstance(slice_size, list) + else slice_size + ) + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice( + module: torch.nn.Module, slice_size: List[int] + ): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor( + self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]] + ): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + if "temporal_transformer" not in sub_name: + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + if "temporal_transformer" not in name: + fn_recursive_attn_processor(name, module, processor) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + kps_features: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple]: + r""" + Args: + sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor + timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps + encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2 ** self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # time + timesteps = timestep + if not torch.is_tensor(timesteps): + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + emb = self.time_embedding(t_emb) + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError( + "class_labels should be provided when num_class_embeds > 0" + ) + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + # pre-process + sample = self.conv_in(sample) + if kps_features is not None: + sample = sample + kps_features + + # down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if ( + hasattr(downsample_block, "has_cross_attention") + and downsample_block.has_cross_attention + ): + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + ) + else: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + ) + + down_block_res_samples += res_samples + + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = ( + down_block_res_sample + down_block_additional_residual + ) + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # mid + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + ) + + if mid_block_additional_residual is not None: + sample = sample + mid_block_additional_residual + + # up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets):] + down_block_res_samples = down_block_res_samples[ + : -len(upsample_block.resnets) + ] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if ( + hasattr(upsample_block, "has_cross_attention") + and upsample_block.has_cross_attention + ): + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + upsample_size=upsample_size, + attention_mask=attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + encoder_hidden_states=encoder_hidden_states, + ) + + # post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if not return_dict: + return (sample,) + + return UNet3DConditionOutput(sample=sample) + + @classmethod + def from_pretrained_2d( + cls, + pretrained_model_path: PathLike, + motion_module_path: PathLike, + subfolder=None, + unet_additional_kwargs=None, + mm_zero_proj_out=False, + ): + pretrained_model_path = Path(pretrained_model_path) + motion_module_path = Path(motion_module_path) + if subfolder is not None: + pretrained_model_path = pretrained_model_path.joinpath(subfolder) + logger.info( + f"loaded temporal unet's pretrained weights from {pretrained_model_path} ..." + ) + + config_file = pretrained_model_path / "config.json" + if not (config_file.exists() and config_file.is_file()): + raise RuntimeError(f"{config_file} does not exist or is not a file") + + unet_config = cls.load_config(config_file) + unet_config["_class_name"] = cls.__name__ + unet_config["down_block_types"] = [ + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ] + unet_config["up_block_types"] = [ + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ] + unet_config["mid_block_type"] = "UNetMidBlock3DCrossAttn" + + model = cls.from_config(unet_config, **unet_additional_kwargs) + # load the vanilla weights + if pretrained_model_path.joinpath(SAFETENSORS_WEIGHTS_NAME).exists(): + logger.debug( + f"loading safeTensors weights from {pretrained_model_path} ..." + ) + state_dict = load_file( + pretrained_model_path.joinpath(SAFETENSORS_WEIGHTS_NAME), device="cpu" + ) + + elif pretrained_model_path.joinpath(WEIGHTS_NAME).exists(): + logger.debug(f"loading weights from {pretrained_model_path} ...") + state_dict = torch.load( + pretrained_model_path.joinpath(WEIGHTS_NAME), + map_location="cpu", + weights_only=True, + ) + else: + raise FileNotFoundError(f"no weights file found in {pretrained_model_path}") + + # load the motion module weights + if motion_module_path.exists() and motion_module_path.is_file(): + if motion_module_path.suffix.lower() in [".pth", ".pt", ".ckpt", ".bin"]: + logger.info(f"Load motion module params from {motion_module_path}") + motion_state_dict = torch.load( + motion_module_path, map_location="cpu", weights_only=True + ) + elif motion_module_path.suffix.lower() == ".safetensors": + motion_state_dict = load_file(motion_module_path, device="cpu") + else: + raise RuntimeError( + f"unknown file format for motion module weights: {motion_module_path.suffix}" + ) + if mm_zero_proj_out: + logger.info(f"Zero initialize proj_out layers in motion module...") + new_motion_state_dict = OrderedDict() + for k in motion_state_dict: + if "proj_out" in k: + continue + new_motion_state_dict[k] = motion_state_dict[k] + motion_state_dict = new_motion_state_dict + + # merge the state dicts + state_dict.update(motion_state_dict) + + # load the weights into the model + m, u = model.load_state_dict(state_dict, strict=False) + logger.debug(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};") + + params = [ + p.numel() if "temporal" in n else 0 for n, p in model.named_parameters() + ] + logger.info(f"Loaded {sum(params) / 1e6}M-parameter motion module") + + return model + + @classmethod + def from_config_2d( + cls, + unet_config_path: PathLike, + unet_additional_kwargs=None, + ): + config_file = unet_config_path + + unet_config = cls.load_config(config_file) + unet_config["_class_name"] = cls.__name__ + unet_config["down_block_types"] = [ + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ] + unet_config["up_block_types"] = [ + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ] + unet_config["mid_block_type"] = "UNetMidBlock3DCrossAttn" + + model = cls.from_config(unet_config, **unet_additional_kwargs) + return model diff --git a/modules/unet_3d_blocks.py b/modules/unet_3d_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..eccda5024008bebae4094d8c6370b7a20817575d --- /dev/null +++ b/modules/unet_3d_blocks.py @@ -0,0 +1,862 @@ +# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py + +import pdb + +import torch +from torch import nn + +from .motion_module import get_motion_module + +# from .motion_module import get_motion_module +from .resnet import Downsample3D, ResnetBlock3D, Upsample3D +from .transformer_3d import Transformer3DModel + + +def get_down_block( + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + use_inflated_groupnorm=None, + use_motion_module=None, + motion_module_type=None, + motion_module_kwargs=None, +): + down_block_type = ( + down_block_type[7:] + if down_block_type.startswith("UNetRes") + else down_block_type + ) + if down_block_type == "DownBlock3D": + return DownBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + elif down_block_type == "CrossAttnDownBlock3D": + if cross_attention_dim is None: + raise ValueError( + "cross_attention_dim must be specified for CrossAttnDownBlock3D" + ) + return CrossAttnDownBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attn_num_head_channels, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + raise ValueError(f"{down_block_type} does not exist.") + + +def get_up_block( + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + attn_num_head_channels, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + use_inflated_groupnorm=None, + use_motion_module=None, + motion_module_type=None, + motion_module_kwargs=None, +): + up_block_type = ( + up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + ) + if up_block_type == "UpBlock3D": + return UpBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + elif up_block_type == "CrossAttnUpBlock3D": + if cross_attention_dim is None: + raise ValueError( + "cross_attention_dim must be specified for CrossAttnUpBlock3D" + ) + return CrossAttnUpBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + attn_num_head_channels=attn_num_head_channels, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + use_inflated_groupnorm=use_inflated_groupnorm, + use_motion_module=use_motion_module, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + raise ValueError(f"{up_block_type} does not exist.") + + +class UNetMidBlock3DCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + output_scale_factor=1.0, + cross_attention_dim=1280, + dual_cross_attention=False, + use_linear_projection=False, + upcast_attention=False, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + use_inflated_groupnorm=None, + use_motion_module=None, + motion_module_type=None, + motion_module_kwargs=None, + ): + super().__init__() + + self.has_cross_attention = True + self.attn_num_head_channels = attn_num_head_channels + resnet_groups = ( + resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + ) + + # there is always at least one resnet + resnets = [ + ResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_inflated_groupnorm=use_inflated_groupnorm, + ) + ] + attentions = [] + motion_modules = [] + + for _ in range(num_layers): + if dual_cross_attention: + raise NotImplementedError + attentions.append( + Transformer3DModel( + attn_num_head_channels, + in_channels // attn_num_head_channels, + in_channels=in_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + ) + ) + motion_modules.append( + get_motion_module( + in_channels=in_channels, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + if use_motion_module + else None + ) + resnets.append( + ResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_inflated_groupnorm=use_inflated_groupnorm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + def forward( + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + ): + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet, motion_module in zip( + self.attentions, self.resnets[1:], self.motion_modules + ): + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ).sample + hidden_states = ( + motion_module( + hidden_states, temb, encoder_hidden_states=encoder_hidden_states + ) + if motion_module is not None + else hidden_states + ) + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +class CrossAttnDownBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + downsample_padding=1, + add_downsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + use_inflated_groupnorm=None, + use_motion_module=None, + motion_module_type=None, + motion_module_kwargs=None, + ): + super().__init__() + resnets = [] + attentions = [] + motion_modules = [] + + self.has_cross_attention = True + self.attn_num_head_channels = attn_num_head_channels + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock3D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_inflated_groupnorm=use_inflated_groupnorm, + ) + ) + if dual_cross_attention: + raise NotImplementedError + attentions.append( + Transformer3DModel( + attn_num_head_channels, + out_channels // attn_num_head_channels, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + ) + ) + motion_modules.append( + get_motion_module( + in_channels=out_channels, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + if use_motion_module + else None + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample3D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + ): + output_states = () + + for i, (resnet, attn, motion_module) in enumerate( + zip(self.resnets, self.attentions, self.motion_modules) + ): + # self.gradient_checkpointing = False + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + )[0] + + # add motion module + hidden_states = ( + motion_module( + hidden_states, temb, encoder_hidden_states=encoder_hidden_states + ) + if motion_module is not None + else hidden_states + ) + + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ).sample + + # add motion module + hidden_states = ( + motion_module( + hidden_states, temb, encoder_hidden_states=encoder_hidden_states + ) + if motion_module is not None + else hidden_states + ) + + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class DownBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_downsample=True, + downsample_padding=1, + use_inflated_groupnorm=None, + use_motion_module=None, + motion_module_type=None, + motion_module_kwargs=None, + ): + super().__init__() + resnets = [] + motion_modules = [] + + # use_motion_module = False + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock3D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_inflated_groupnorm=use_inflated_groupnorm, + ) + ) + motion_modules.append( + get_motion_module( + in_channels=out_channels, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + if use_motion_module + else None + ) + + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample3D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward(self, hidden_states, temb=None, encoder_hidden_states=None): + output_states = () + + for resnet, motion_module in zip(self.resnets, self.motion_modules): + # print(f"DownBlock3D {self.gradient_checkpointing = }") + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + if motion_module is not None: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(motion_module), + hidden_states.requires_grad_(), + temb, + encoder_hidden_states, + ) + else: + hidden_states = resnet(hidden_states, temb) + + # add motion module + hidden_states = ( + motion_module( + hidden_states, temb, encoder_hidden_states=encoder_hidden_states + ) + if motion_module is not None + else hidden_states + ) + + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnUpBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attn_num_head_channels=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + add_upsample=True, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + unet_use_cross_frame_attention=None, + unet_use_temporal_attention=None, + use_motion_module=None, + use_inflated_groupnorm=None, + motion_module_type=None, + motion_module_kwargs=None, + ): + super().__init__() + resnets = [] + attentions = [] + motion_modules = [] + + self.has_cross_attention = True + self.attn_num_head_channels = attn_num_head_channels + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock3D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_inflated_groupnorm=use_inflated_groupnorm, + ) + ) + if dual_cross_attention: + raise NotImplementedError + attentions.append( + Transformer3DModel( + attn_num_head_channels, + out_channels // attn_num_head_channels, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + unet_use_cross_frame_attention=unet_use_cross_frame_attention, + unet_use_temporal_attention=unet_use_temporal_attention, + ) + ) + motion_modules.append( + get_motion_module( + in_channels=out_channels, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + if use_motion_module + else None + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [Upsample3D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + upsample_size=None, + attention_mask=None, + ): + for i, (resnet, attn, motion_module) in enumerate( + zip(self.resnets, self.attentions, self.motion_modules) + ): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ).sample + if motion_module is not None: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(motion_module), + hidden_states.requires_grad_(), + temb, + encoder_hidden_states, + ) + + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ).sample + + # add motion module + hidden_states = ( + motion_module( + hidden_states, temb, encoder_hidden_states=encoder_hidden_states + ) + if motion_module is not None + else hidden_states + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +class UpBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor=1.0, + add_upsample=True, + use_inflated_groupnorm=None, + use_motion_module=None, + motion_module_type=None, + motion_module_kwargs=None, + ): + super().__init__() + resnets = [] + motion_modules = [] + + # use_motion_module = False + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock3D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_inflated_groupnorm=use_inflated_groupnorm, + ) + ) + motion_modules.append( + get_motion_module( + in_channels=out_channels, + motion_module_type=motion_module_type, + motion_module_kwargs=motion_module_kwargs, + ) + if use_motion_module + else None + ) + + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [Upsample3D(out_channels, use_conv=True, out_channels=out_channels)] + ) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + upsample_size=None, + encoder_hidden_states=None, + ): + for resnet, motion_module in zip(self.resnets, self.motion_modules): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + # print(f"UpBlock3D {self.gradient_checkpointing = }") + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + if motion_module is not None: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(motion_module), + hidden_states.requires_grad_(), + temb, + encoder_hidden_states, + ) + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = ( + motion_module( + hidden_states, temb, encoder_hidden_states=encoder_hidden_states + ) + if motion_module is not None + else hidden_states + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states diff --git a/modules/v_kps_guider.py b/modules/v_kps_guider.py new file mode 100644 index 0000000000000000000000000000000000000000..37306505ae7e28a940fd931d6e3ba3a080dd6f3f --- /dev/null +++ b/modules/v_kps_guider.py @@ -0,0 +1,45 @@ +from typing import Tuple + +import torch.nn as nn +import torch.nn.functional as F +from diffusers.models.modeling_utils import ModelMixin +from .motion_module import zero_module +from .resnet import InflatedConv3d + + +class VKpsGuider(ModelMixin): + def __init__( + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: Tuple[int] = (16, 32, 64, 128), + ): + super().__init__() + self.conv_in = InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + + self.blocks = nn.ModuleList([]) + + for i in range(len(block_out_channels) - 1): + channel_in = block_out_channels[i] + channel_out = block_out_channels[i + 1] + self.blocks.append(InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(InflatedConv3d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = zero_module(InflatedConv3d( + block_out_channels[-1], + conditioning_embedding_channels, + kernel_size=3, + padding=1, + )) + + def forward(self, conditioning): + embedding = self.conv_in(conditioning) + embedding = F.silu(embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = F.silu(embedding) + + embedding = self.conv_out(embedding) + + return embedding diff --git a/pipelines/__init__.py b/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1de7444d67d2beabf5503cbd3bad522148d35d77 --- /dev/null +++ b/pipelines/__init__.py @@ -0,0 +1 @@ +from .v_express_pipeline import VExpressPipeline diff --git a/pipelines/__pycache__/__init__.cpython-310.pyc b/pipelines/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b331e874ec901085bd706b4280c4d5320f340e2e Binary files /dev/null and b/pipelines/__pycache__/__init__.cpython-310.pyc differ diff --git a/pipelines/__pycache__/context.cpython-310.pyc b/pipelines/__pycache__/context.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58556de127d4b8ce46a8f609c8d566a67b59c670 Binary files /dev/null and b/pipelines/__pycache__/context.cpython-310.pyc differ diff --git a/pipelines/__pycache__/utils.cpython-310.pyc b/pipelines/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84d4d6c9377b876d1393ec1e0a5560515ecdceae Binary files /dev/null and b/pipelines/__pycache__/utils.cpython-310.pyc differ diff --git a/pipelines/__pycache__/v_express_pipeline.cpython-310.pyc b/pipelines/__pycache__/v_express_pipeline.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0317adc831826df750b2ed7a8817c84c6fbdb674 Binary files /dev/null and b/pipelines/__pycache__/v_express_pipeline.cpython-310.pyc differ diff --git a/pipelines/context.py b/pipelines/context.py new file mode 100644 index 0000000000000000000000000000000000000000..eefa6afebc1a2d0f1a94f44635269e3f9335d5b1 --- /dev/null +++ b/pipelines/context.py @@ -0,0 +1,78 @@ +# TODO: Adapted from cli +from typing import Callable, List, Optional + +import numpy as np + + +def ordered_halving(val): + bin_str = f"{val:064b}" + bin_flip = bin_str[::-1] + as_int = int(bin_flip, 2) + + return as_int / (1 << 64) + + +def uniform( + step: int = ..., + num_frames: int = ..., + context_size: Optional[int] = None, + context_stride: int = 3, + context_overlap: int = 4, + closed_loop: bool = True, +): + if num_frames <= context_size: + yield list(range(num_frames)) + return + + context_stride = min( + context_stride, int(np.ceil(np.log2(num_frames / context_size))) + 1 + ) + + for context_step in 1 << np.arange(context_stride): + pad = int(round(num_frames * ordered_halving(step))) + for j in range( + int(ordered_halving(step) * context_step) + pad, + num_frames + pad + (0 if closed_loop else -context_overlap), + (context_size * context_step - context_overlap), + ): + next_itr = [] + for e in range(j, j + context_size * context_step, context_step): + if e >= num_frames: + e = num_frames - 2 - e % num_frames + next_itr.append(e) + + yield next_itr + + +def get_context_scheduler(name: str) -> Callable: + if name == "uniform": + return uniform + else: + raise ValueError(f"Unknown context_overlap policy {name}") + + +def get_total_steps( + scheduler, + timesteps: List[int], + num_steps: Optional[int] = None, + num_frames: int = ..., + context_size: Optional[int] = None, + context_stride: int = 3, + context_overlap: int = 4, + closed_loop: bool = True, +): + return sum( + len( + list( + scheduler( + i, + num_steps, + num_frames, + context_size, + context_stride, + context_overlap, + ) + ) + ) + for i in range(len(timesteps)) + ) diff --git a/pipelines/utils.py b/pipelines/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8995b235571410a693f3a1275f838752f3f18712 --- /dev/null +++ b/pipelines/utils.py @@ -0,0 +1,218 @@ +import math +import os +import pathlib + +import cv2 +import numpy as np +import torch +import torch.nn.functional as func +import tqdm +from imageio_ffmpeg import get_ffmpeg_exe + +tensor_interpolation = None + + +def get_tensor_interpolation_method(): + return tensor_interpolation + + +def set_tensor_interpolation_method(is_slerp): + global tensor_interpolation + tensor_interpolation = slerp if is_slerp else linear + + +def linear(v1, v2, t): + return (1.0 - t) * v1 + t * v2 + + +def slerp(v0: torch.Tensor, v1: torch.Tensor, t: float, DOT_THRESHOLD: float = 0.9995) -> torch.Tensor: + u0 = v0 / v0.norm() + u1 = v1 / v1.norm() + dot = (u0 * u1).sum() + if dot.abs() > DOT_THRESHOLD: + # logger.info(f'warning: v0 and v1 close to parallel, using linear interpolation instead.') + return (1.0 - t) * v0 + t * v1 + omega = dot.acos() + return (((1.0 - t) * omega).sin() * v0 + (t * omega).sin() * v1) / omega.sin() + + +def draw_kps_image(height, width, kps, color_list=[(255, 0, 0), (0, 255, 0), (0, 0, 255)]): + stick_width = 4 + limb_seq = np.array([[0, 2], [1, 2]]) + kps = np.array(kps) + + canvas = np.zeros((height, width, 3), dtype=np.uint8) + + for i in range(len(limb_seq)): + index = limb_seq[i] + color = color_list[index[0]] + + x = kps[index][:, 0] + y = kps[index][:, 1] + length = ((x[0] - x[1]) ** 2 + (y[0] - y[1]) ** 2) ** 0.5 + angle = int(math.degrees(math.atan2(y[0] - y[1], x[0] - x[1]))) + polygon = cv2.ellipse2Poly((int(np.mean(x)), int(np.mean(y))), (int(length / 2), stick_width), angle, 0, 360, 1) + cv2.fillConvexPoly(canvas, polygon, [int(float(c) * 0.6) for c in color]) + + for idx_kp, kp in enumerate(kps): + color = color_list[idx_kp] + x, y = kp + cv2.circle(canvas, (int(x), int(y)), 4, color, -1) + + return canvas + + + +import os +import pathlib +import shutil +import cv2 +import numpy as np +from scipy.ndimage.filters import median_filter + +def get_ffmpeg_exe(): + if os.name == 'nt': # Windows + return 'ffmpeg' + else: # Ubuntu and other Unix-based systems + return 'ffmpeg' + + +def median_filter_3d(video_tensor, kernel_size, device): + _, video_length, height, width = video_tensor.shape + + pad_size = kernel_size // 2 + video_tensor = func.pad(video_tensor, (pad_size, pad_size, pad_size, pad_size, pad_size, pad_size), mode='reflect') + + filtered_video_tensor = [] + for i in tqdm.tqdm(range(video_length), desc='Median Filtering'): + video_segment = video_tensor[:, i:i + kernel_size, ...].to(device) + video_segment = video_segment.unfold(dimension=2, size=kernel_size, step=1) + video_segment = video_segment.unfold(dimension=3, size=kernel_size, step=1) + video_segment = video_segment.permute(0, 2, 3, 1, 4, 5).reshape(3, height, width, -1) + filtered_video_frame = torch.median(video_segment, dim=-1)[0] + filtered_video_tensor.append(filtered_video_frame.cpu()) + filtered_video_tensor = torch.stack(filtered_video_tensor, dim=1) + return filtered_video_tensor + + +def save_video(video_tensor, audio_path, output_path, device, fps=30.0): + pathlib.Path(output_path).parent.mkdir(exist_ok=True, parents=True) + + video_tensor = video_tensor[0, ...] + _, num_frames, height, width = video_tensor.shape + + video_tensor = median_filter_3d(video_tensor, kernel_size=3, device=device) + video_tensor = video_tensor.permute(1, 2, 3, 0) + video_frames = (video_tensor * 255).numpy().astype(np.uint8) + + output_name = pathlib.Path(output_path).stem + temp_output_path = output_path.replace(output_name, output_name + '-temp') + video_writer = cv2.VideoWriter(temp_output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height)) + + for i in tqdm.tqdm(range(num_frames), 'Writing frames into file'): + frame_image = video_frames[i, ...] + frame_image = cv2.cvtColor(frame_image, cv2.COLOR_RGB2BGR) + video_writer.write(frame_image) + video_writer.release() + + cmd = (f'{get_ffmpeg_exe()} -i "{temp_output_path}" -i "{audio_path}" ' + f'-map 0:v -map 1:a -c:v h264 -shortest -y "{output_path}" -loglevel quiet') + os.system(cmd) + + os.remove(temp_output_path) + + + +def compute_dist(x1, y1, x2, y2): + return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2) + + +def compute_ratio(kps): + l_eye_x, l_eye_y = kps[0][0], kps[0][1] + r_eye_x, r_eye_y = kps[1][0], kps[1][1] + nose_x, nose_y = kps[2][0], kps[2][1] + d_left = compute_dist(l_eye_x, l_eye_y, nose_x, nose_y) + d_right = compute_dist(r_eye_x, r_eye_y, nose_x, nose_y) + ratio = d_left / (d_right + 1e-6) + return ratio + + +def point_to_line_dist(point, line_points): + point = np.array(point) + line_points = np.array(line_points) + line_vec = line_points[1] - line_points[0] + point_vec = point - line_points[0] + line_norm = line_vec / np.sqrt(np.sum(line_vec ** 2)) + point_vec_scaled = point_vec * 1.0 / np.sqrt(np.sum(line_vec ** 2)) + t = np.dot(line_norm, point_vec_scaled) + if t < 0.0: + t = 0.0 + elif t > 1.0: + t = 1.0 + nearest = line_points[0] + t * line_vec + dist = np.sqrt(np.sum((point - nearest) ** 2)) + return dist + + +def get_face_size(kps): + # 0: left eye, 1: right eye, 2: nose + A = kps[0, :] + B = kps[1, :] + C = kps[2, :] + + AB_dist = math.sqrt((A[0] - B[0]) ** 2 + (A[1] - B[1]) ** 2) + C_AB_dist = point_to_line_dist(C, [A, B]) + return AB_dist, C_AB_dist + + +def get_rescale_params(kps_ref, kps_target): + kps_ref = np.array(kps_ref) + kps_target = np.array(kps_target) + + ref_AB_dist, ref_C_AB_dist = get_face_size(kps_ref) + target_AB_dist, target_C_AB_dist = get_face_size(kps_target) + + scale_width = ref_AB_dist / target_AB_dist + scale_height = ref_C_AB_dist / target_C_AB_dist + + return scale_width, scale_height + + +def retarget_kps(ref_kps, tgt_kps_list, only_offset=True): + ref_kps = np.array(ref_kps) + tgt_kps_list = np.array(tgt_kps_list) + + ref_ratio = compute_ratio(ref_kps) + + ratio_delta = 10000 + selected_tgt_kps_idx = None + for idx, tgt_kps in enumerate(tgt_kps_list): + tgt_ratio = compute_ratio(tgt_kps) + if math.fabs(tgt_ratio - ref_ratio) < ratio_delta: + selected_tgt_kps_idx = idx + ratio_delta = tgt_ratio + + scale_width, scale_height = get_rescale_params( + kps_ref=ref_kps, + kps_target=tgt_kps_list[selected_tgt_kps_idx], + ) + + rescaled_tgt_kps_list = np.array(tgt_kps_list) + rescaled_tgt_kps_list[:, :, 0] *= scale_width + rescaled_tgt_kps_list[:, :, 1] *= scale_height + + if only_offset: + nose_offset = rescaled_tgt_kps_list[:, 2, :] - rescaled_tgt_kps_list[0, 2, :] + nose_offset = nose_offset[:, np.newaxis, :] + ref_kps_repeat = np.tile(ref_kps, (tgt_kps_list.shape[0], 1, 1)) + + ref_kps_repeat[:, :, :] -= (nose_offset / 2.0) + rescaled_tgt_kps_list = ref_kps_repeat + else: + nose_offset_x = rescaled_tgt_kps_list[0, 2, 0] - ref_kps[2][0] + nose_offset_y = rescaled_tgt_kps_list[0, 2, 1] - ref_kps[2][1] + + rescaled_tgt_kps_list[:, :, 0] -= nose_offset_x + rescaled_tgt_kps_list[:, :, 1] -= nose_offset_y + + return rescaled_tgt_kps_list diff --git a/pipelines/v_express_pipeline.py b/pipelines/v_express_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..fb3006bf453150122c12d30b0c3dfc92e8289cc8 --- /dev/null +++ b/pipelines/v_express_pipeline.py @@ -0,0 +1,586 @@ +# Adapted from https://github.com/magic-research/magic-animate/blob/main/magicanimate/pipelines/pipeline_animation.py +import inspect +import math +from typing import Callable, List, Optional, Union + +import torch +from diffusers import DiffusionPipeline +from diffusers.image_processor import VaeImageProcessor +from diffusers.schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from diffusers.utils import is_accelerate_available +from diffusers.utils.torch_utils import randn_tensor +from einops import rearrange +from tqdm import tqdm +from transformers import CLIPImageProcessor + +from modules import ReferenceAttentionControl +from .context import get_context_scheduler + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class VExpressPipeline(DiffusionPipeline): + _optional_components = [] + + def __init__( + self, + vae, + reference_net, + denoising_unet, + v_kps_guider, + audio_processor, + audio_encoder, + audio_projection, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + image_proj_model=None, + tokenizer=None, + text_encoder=None, + ): + super().__init__() + + self.register_modules( + vae=vae, + reference_net=reference_net, + denoising_unet=denoising_unet, + v_kps_guider=v_kps_guider, + audio_processor=audio_processor, + audio_encoder=audio_encoder, + audio_projection=audio_projection, + scheduler=scheduler, + image_proj_model=image_proj_model, + tokenizer=tokenizer, + text_encoder=text_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.clip_image_processor = CLIPImageProcessor() + self.reference_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True + ) + self.condition_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, + do_convert_rgb=True, + do_normalize=False, + ) + + def enable_vae_slicing(self): + self.vae.enable_slicing() + + def disable_vae_slicing(self): + self.vae.disable_slicing() + + def enable_sequential_cpu_offload(self, gpu_id=0): + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + def decode_latents(self, latents): + video_length = latents.shape[2] + latents = 1 / 0.18215 * latents + latents = rearrange(latents, "b c f h w -> (b f) c h w") + video = [] + for frame_idx in tqdm(range(latents.shape[0]), desc='Decoding latents into frames'): + image = self.vae.decode(latents[frame_idx: frame_idx + 1].to(self.vae.device)).sample + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().float() + video.append(image) + video = torch.cat(video) + video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length) + + return video + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def prepare_latents( + self, + batch_size, + num_channels_latents, + width, + height, + video_length, + dtype, + device, + generator, + latents=None + ): + shape = ( + batch_size, + num_channels_latents, + video_length, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor( + shape, generator=generator, device=device, dtype=dtype + ) + + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_videos_per_prompt, + do_classifier_free_guidance, + negative_prompt, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer( + prompt, padding="longest", return_tensors="pt" + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1: -1] + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + text_embeddings = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + text_embeddings = text_embeddings[0] + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = text_embeddings.shape + text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1) + text_embeddings = text_embeddings.view( + bs_embed * num_videos_per_prompt, seq_len, -1 + ) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + uncond_embeddings = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + uncond_embeddings = uncond_embeddings[0] + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = uncond_embeddings.shape[1] + uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1) + uncond_embeddings = uncond_embeddings.view( + batch_size * num_videos_per_prompt, seq_len, -1 + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + return text_embeddings + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order:] + + return timesteps, num_inference_steps - t_start + + def prepare_reference_latent(self, reference_image, height, width): + reference_image_tensor = self.reference_image_processor.preprocess(reference_image, height=height, width=width) + reference_image_tensor = reference_image_tensor.to(dtype=self.dtype, device=self.device) + reference_image_latents = self.vae.encode(reference_image_tensor).latent_dist.mean + reference_image_latents = reference_image_latents * 0.18215 + return reference_image_latents + + def prepare_kps_feature(self, kps_images, height, width, do_classifier_free_guidance): + kps_image_tensors = [] + for idx, kps_image in enumerate(kps_images): + kps_image_tensor = self.condition_image_processor.preprocess(kps_image, height=height, width=width) + kps_image_tensor = kps_image_tensor.unsqueeze(2) # [bs, c, 1, h, w] + kps_image_tensors.append(kps_image_tensor) + kps_images_tensor = torch.cat(kps_image_tensors, dim=2) # [bs, c, t, h, w] + + bs = 16 + num_forward = math.ceil(kps_images_tensor.shape[2] / bs) + kps_feature = [] + for i in range(num_forward): + tensor = kps_images_tensor[:, :, i * bs:(i + 1) * bs, ...].to(device=self.device, dtype=self.dtype) + feature = self.v_kps_guider(tensor).cpu() + kps_feature.append(feature) + torch.cuda.empty_cache() + kps_feature = torch.cat(kps_feature, dim=2) + + if do_classifier_free_guidance: + uc_kps_feature = torch.zeros_like(kps_feature) + kps_feature = torch.cat([uc_kps_feature, kps_feature], dim=0) + + return kps_feature + + def prepare_audio_embeddings(self, audio_waveform, video_length, num_pad_audio_frames, do_classifier_free_guidance): + audio_waveform = self.audio_processor(audio_waveform, return_tensors="pt", sampling_rate=16000)['input_values'] + audio_waveform = audio_waveform.to(self.device, self.dtype) + audio_embeddings = self.audio_encoder(audio_waveform).last_hidden_state # [1, num_embeds, d] + + audio_embeddings = torch.nn.functional.interpolate( + audio_embeddings.permute(0, 2, 1), + size=2 * video_length, + mode='linear', + )[0, :, :].permute(1, 0) # [2*vid_len, dim] + + audio_embeddings = torch.cat([ + torch.zeros_like(audio_embeddings)[:2 * num_pad_audio_frames, :], + audio_embeddings, + torch.zeros_like(audio_embeddings)[:2 * num_pad_audio_frames, :], + ], dim=0) # [2*num_pad+2*vid_len+2*num_pad, dim] + + frame_audio_embeddings = [] + for frame_idx in range(video_length): + start_sample = frame_idx + end_sample = frame_idx + 2 * num_pad_audio_frames + + frame_audio_embedding = audio_embeddings[2 * start_sample:2 * (end_sample + 1), :] # [2*num_pad+1, dim] + frame_audio_embeddings.append(frame_audio_embedding) + audio_embeddings = torch.stack(frame_audio_embeddings, dim=0) # [vid_len, 2*num_pad+1, dim] + + audio_embeddings = self.audio_projection(audio_embeddings).unsqueeze(0) + if do_classifier_free_guidance: + uc_audio_embeddings = torch.zeros_like(audio_embeddings) + audio_embeddings = torch.cat([uc_audio_embeddings, audio_embeddings], dim=0) + return audio_embeddings + + @torch.no_grad() + def __call__( + self, + reference_image, + kps_images, + audio_waveform, + width, + height, + video_length, + num_inference_steps, + guidance_scale, + strength=1., + num_images_per_prompt=1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "tensor", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + context_schedule="uniform", + context_frames=24, + context_overlap=4, + reference_attention_weight=1., + audio_attention_weight=1., + num_pad_audio_frames=2, + do_multi_devices_inference=False, + save_gpu_memory=False, + **kwargs, + ): + # Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + batch_size = 1 + + # Prepare timesteps + timesteps = None + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + + reference_control_writer = ReferenceAttentionControl( + self.reference_net, + do_classifier_free_guidance=do_classifier_free_guidance, + mode="write", + batch_size=batch_size, + fusion_blocks="full", + ) + reference_control_reader = ReferenceAttentionControl( + self.denoising_unet, + do_classifier_free_guidance=do_classifier_free_guidance, + mode="read", + batch_size=batch_size, + fusion_blocks="full", + reference_attention_weight=reference_attention_weight, + audio_attention_weight=audio_attention_weight, + ) + + num_channels_latents = self.denoising_unet.in_channels + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + reference_image_latents = self.prepare_reference_latent(reference_image, height, width) + kps_feature = self.prepare_kps_feature(kps_images, height, width, do_classifier_free_guidance) + if save_gpu_memory: + del self.v_kps_guider + torch.cuda.empty_cache() + audio_embeddings = self.prepare_audio_embeddings( + audio_waveform, + video_length, + num_pad_audio_frames, + do_classifier_free_guidance, + ) + if save_gpu_memory: + del self.audio_processor, self.audio_encoder, self.audio_projection + torch.cuda.empty_cache() + + context_scheduler = get_context_scheduler(context_schedule) + context_queue = list( + context_scheduler( + step=0, + num_frames=video_length, + context_size=context_frames, + context_stride=1, + context_overlap=context_overlap, + closed_loop=False, + ) + ) + + num_frame_context = torch.zeros(video_length, device=device, dtype=torch.long) + for context in context_queue: + num_frame_context[context] += 1 + + encoder_hidden_states = torch.zeros((1, 1, 768), dtype=self.dtype, device=self.device) + self.reference_net( + reference_image_latents, + timestep=0, + encoder_hidden_states=encoder_hidden_states, + return_dict=False, + ) + reference_control_reader.update(reference_control_writer, do_classifier_free_guidance) + if save_gpu_memory: + del self.reference_net + torch.cuda.empty_cache() + + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + width, + height, + video_length, + self.dtype, + torch.device('cpu'), + generator, + ) + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + context_counter = torch.zeros(video_length, device=device, dtype=torch.long) + noise_preds = [None] * video_length + for context_idx, context in enumerate(context_queue): + latent_kps_feature = kps_feature[:, :, context].to(device, self.dtype) + + latent_audio_embeddings = audio_embeddings[:, context, ...] + _, _, num_tokens, dim = latent_audio_embeddings.shape + latent_audio_embeddings = latent_audio_embeddings.reshape(-1, num_tokens, dim) + + input_latents = latents[:, :, context, ...].to(device) + input_latents = input_latents.repeat(2 if do_classifier_free_guidance else 1, 1, 1, 1, 1) + input_latents = self.scheduler.scale_model_input(input_latents, t) + noise_pred = self.denoising_unet( + input_latents, + t, + encoder_hidden_states=latent_audio_embeddings.reshape(-1, num_tokens, dim), + kps_features=latent_kps_feature, + return_dict=False, + )[0] + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + context_counter[context] += 1 + noise_pred /= num_frame_context[context][None, None, :, None, None] + step_frame_ids = [] + step_noise_preds = [] + for latent_idx, frame_idx in enumerate(context): + if noise_preds[frame_idx] is None: + noise_preds[frame_idx] = noise_pred[:, :, latent_idx, ...] + else: + noise_preds[frame_idx] += noise_pred[:, :, latent_idx, ...] + if context_counter[frame_idx] == num_frame_context[frame_idx]: + step_frame_ids.append(frame_idx) + step_noise_preds.append(noise_preds[frame_idx]) + noise_preds[frame_idx] = None + step_noise_preds = torch.stack(step_noise_preds, dim=2) + output_latents = self.scheduler.step( + step_noise_preds, + t, + latents[:, :, step_frame_ids, ...].to(device), + **extra_step_kwargs, + ).prev_sample + latents[:, :, step_frame_ids, ...] = output_latents.cpu() + + progress_bar.set_description( + f'Denoising Step Index: {i + 1} / {len(timesteps)}, ' + f'Context Index: {context_idx + 1} / {len(context_queue)}' + ) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + reference_control_reader.clear() + reference_control_writer.clear() + + video_tensor = self.decode_latents(latents) + return video_tensor diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..26b9acbca2c8dc680efbab126608d680f1c34dfc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +diffusers==0.24.0 +imageio-ffmpeg==0.4.9 +insightface==0.7.3 +omegaconf==2.2.3 +onnx==1.14.0 +onnxruntime-gpu==1.16.3 +safetensors==0.4.2 +torch==2.0.1 +torchaudio==2.0.2 +torchvision==0.15.2 +transformers==4.41.1 +einops==0.4.1 +tqdm==4.66.4 +xformers==0.0.22 +av==11.0.0 +gradio +retina-face +tf-keras +filetype +bitsandbytes==0.43.0 +accelerate diff --git a/scripts/crop.py b/scripts/crop.py new file mode 100644 index 0000000000000000000000000000000000000000..c54a70d84bc102aa384bebe8ba3bb245413a0b80 --- /dev/null +++ b/scripts/crop.py @@ -0,0 +1,108 @@ +from retinaface import RetinaFace +from PIL import Image +import torch + + + +def auto_crop_image(image_path=r"F:\V_Express_V1\Material\Biden_Photo_Big.png", expand_percent=0.15, crop_size=(512, 512)): + # Check if CUDA is available + if torch.cuda.is_available(): + device = 'cuda' + print("Using GPU for RetinaFace detection.") + else: + device = 'cpu' + print("Using CPU for RetinaFace detection.") + + # Load image + img = Image.open(image_path) + + # Perform face detection + faces = RetinaFace.detect_faces(image_path) + + if not faces: + print("No faces detected.") + return None + + # Assuming 'faces' is a dictionary of detected faces + # Pick the first face detected + face = list(faces.values())[0] + landmarks = face['landmarks'] + + # Extract the landmarks + right_eye = landmarks['right_eye'] + left_eye = landmarks['left_eye'] + right_mouth = landmarks['mouth_right'] + left_mouth = landmarks['mouth_left'] + + # Calculate the distance between the eyes + eye_distance = abs(right_eye[0] - left_eye[0]) + + # Estimate the head width and height + head_width = eye_distance * 4.5 # Increase the width multiplier + head_height = eye_distance * 6.5 # Increase the height multiplier + + # Calculate the center point between the eyes + eye_center_x = (right_eye[0] + left_eye[0]) // 2 + eye_center_y = (right_eye[1] + left_eye[1]) // 2 + + # Calculate the top-left and bottom-right coordinates of the assumed head region + head_left = max(0, int(eye_center_x - head_width // 2)) + head_top = max(0, int(eye_center_y - head_height // 2)) # Adjust the top coordinate + head_right = min(img.width, int(eye_center_x + head_width // 2)) + head_bottom = min(img.height, int(eye_center_y + head_height // 2)) # Adjust the bottom coordinate + + # Save the assumed head image + assumed_head_img = img.crop((head_left, head_top, head_right, head_bottom)) + assumed_head_img.save("assumed_head.png", format='PNG') + + # Calculate the expansion in pixels and the new dimensions + expanded_w = int(head_width * (1 + expand_percent)) + expanded_h = int(head_height * (1 + expand_percent)) + + # Calculate the top-left and bottom-right points of the expanded box + center_x, center_y = head_left + head_width // 2, head_top + head_height // 2 + left = max(0, center_x - expanded_w // 2) + right = min(img.width, center_x + expanded_w // 2) + top = max(0, center_y - expanded_h // 2) + bottom = min(img.height, center_y + expanded_h // 2) + + # Crop the image with the expanded boundaries + cropped_img = img.crop((left, top, right, bottom)) + cropped_img.save("expanded_face.png", format='PNG') + + # Calculate the aspect ratio of the cropped image + cropped_width, cropped_height = cropped_img.size + aspect_ratio = cropped_width / cropped_height + + # Calculate the target dimensions based on the desired crop size + target_width = crop_size[0] + target_height = crop_size[1] + + # Adjust the crop to match the desired aspect ratio + if aspect_ratio > target_width / target_height: + # Crop from left and right + new_width = int(cropped_height * target_width / target_height) + left_crop = (cropped_width - new_width) // 2 + right_crop = left_crop + new_width + top_crop = 0 + bottom_crop = cropped_height + else: + # Crop from top and bottom + new_height = int(cropped_width * target_height / target_width) + top_crop = (cropped_height - new_height) // 2 + bottom_crop = top_crop + new_height + left_crop = 0 + right_crop = cropped_width + + # Crop the image with the adjusted boundaries + final_cropped_img = cropped_img.crop((left_crop, top_crop, right_crop, bottom_crop)) + final_cropped_img.save("final_cropped_img.png", format='PNG') + + # Resize the cropped image to the desired size (512x512 by default) with best quality + resized_img = final_cropped_img.resize(crop_size, resample=Image.LANCZOS) + + # Save the resized image as PNG + resized_img_path = image_path.rsplit('.', 1)[0] + '_cropped.png' # Change file name to avoid overwriting + resized_img.save("resized_img.png", format='PNG') + +auto_crop_image() \ No newline at end of file diff --git a/scripts/extract_kps_sequence_and_audio.py b/scripts/extract_kps_sequence_and_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..2a62b988ab69b1eb4b4a531976b49855a6713935 --- /dev/null +++ b/scripts/extract_kps_sequence_and_audio.py @@ -0,0 +1,69 @@ +import argparse +import os +import cv2 +import torch +from insightface.app import FaceAnalysis +from imageio_ffmpeg import get_ffmpeg_exe +import time +import subprocess + +parser = argparse.ArgumentParser() +parser.add_argument('--video_path', type=str, default='') +parser.add_argument('--kps_sequence_save_path', type=str, default='') +parser.add_argument('--audio_save_path', type=str, default='') +parser.add_argument('--device', type=str, default='cuda') +parser.add_argument('--gpu_id', type=int, default=0) +parser.add_argument('--insightface_model_path', type=str, default='./model_ckpts/insightface_models/') +parser.add_argument('--height', type=int, default=512) +parser.add_argument('--width', type=int, default=512) +args = parser.parse_args() + +# Convert paths to absolute paths +args.video_path = os.path.abspath(args.video_path) +args.kps_sequence_save_path = os.path.abspath(args.kps_sequence_save_path) +args.audio_save_path = os.path.abspath(args.audio_save_path) +args.insightface_model_path = os.path.abspath(args.insightface_model_path) + +app = FaceAnalysis( + providers=['CUDAExecutionProvider' if args.device == 'cuda' else 'CPUExecutionProvider'], + provider_options=[{'device_id': args.gpu_id}] if args.device == 'cuda' else [], + root=args.insightface_model_path, +) +app.prepare(ctx_id=0, det_size=(args.height, args.width)) + +# Use subprocess.run() with shell=True for paths with space characters +subprocess.run(f'"{get_ffmpeg_exe()}" -i "{args.video_path}" -y -vn "{args.audio_save_path}"', shell=True) + +kps_sequence = [] +video_capture = cv2.VideoCapture(args.video_path) +total_frames = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT)) +frame_idx = 0 + +while video_capture.isOpened(): + ret, frame = video_capture.read() + if not ret: + break + + + start_time = time.time() + + frame = cv2.resize(frame, (args.width, args.height)) + + faces = app.get(frame) + end_time = time.time() + duration = end_time - start_time + + assert len(faces) == 1, f'There are {len(faces)} faces in the {frame_idx}-th frame. Only one face is supported.' + + kps = faces[0].kps[:3] + kps_sequence.append(kps) + frame_idx += 1 + + processed_frames = frame_idx + remaining_frames = total_frames - frame_idx + + print(f"Frame {frame_idx}: Face detection duration = {duration:.4f} seconds") + print(f"Status: Processed {processed_frames} frames, {remaining_frames} frames remaining") + +# Use double quotes for paths with space characters +torch.save(kps_sequence, args.kps_sequence_save_path) \ No newline at end of file