jmanhype committed on
Commit 06e9d12 · 0 Parent(s)

Initial commit without binary files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitignore +65 -0
  2. .gitmodules +11 -0
  3. CHANGES +5 -0
  4. Dockerfile +18 -0
  5. LICENSE +175 -0
  6. MMCM +1 -0
  7. README-zh.md +465 -0
  8. README.md +37 -0
  9. configs/model/T2I_all_model.py +15 -0
  10. configs/model/ip_adapter.py +66 -0
  11. configs/model/lcm_model.py +17 -0
  12. configs/model/motion_model.py +22 -0
  13. configs/model/negative_prompt.py +32 -0
  14. configs/model/referencenet.py +14 -0
  15. configs/tasks/example.yaml +210 -0
  16. controlnet_aux +1 -0
  17. data/models/musev_structure.png +0 -0
  18. data/models/parallel_denoise.png +0 -0
  19. diffusers +1 -0
  20. environment.yml +312 -0
  21. musev/__init__.py +9 -0
  22. musev/auto_prompt/__init__.py +0 -0
  23. musev/auto_prompt/attributes/__init__.py +8 -0
  24. musev/auto_prompt/attributes/attr2template.py +127 -0
  25. musev/auto_prompt/attributes/attributes.py +227 -0
  26. musev/auto_prompt/attributes/human.py +424 -0
  27. musev/auto_prompt/attributes/render.py +33 -0
  28. musev/auto_prompt/attributes/style.py +12 -0
  29. musev/auto_prompt/human.py +40 -0
  30. musev/auto_prompt/load_template.py +37 -0
  31. musev/auto_prompt/util.py +25 -0
  32. musev/data/__init__.py +0 -0
  33. musev/data/data_util.py +681 -0
  34. musev/logging.conf +32 -0
  35. musev/models/__init__.py +3 -0
  36. musev/models/attention.py +431 -0
  37. musev/models/attention_processor.py +750 -0
  38. musev/models/controlnet.py +399 -0
  39. musev/models/embeddings.py +87 -0
  40. musev/models/facein_loader.py +120 -0
  41. musev/models/ip_adapter_face_loader.py +179 -0
  42. musev/models/ip_adapter_loader.py +340 -0
  43. musev/models/referencenet.py +1216 -0
  44. musev/models/referencenet_loader.py +124 -0
  45. musev/models/resnet.py +135 -0
  46. musev/models/super_model.py +253 -0
  47. musev/models/temporal_transformer.py +308 -0
  48. musev/models/text_model.py +40 -0
  49. musev/models/transformer_2d.py +445 -0
  50. musev/models/unet_2d_blocks.py +1537 -0
.gitignore ADDED
@@ -0,0 +1,65 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Checkpoints
+ checkpoints/
+
+ # Logs
+ *.log
+ logs/
+ tensorboard/
+
+ # Environment
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Large files
+ data/result_video/
+ *.mp4
+ *.png
+ *.jpg
+ *.jpeg
+ *.gif
+ *.webp
+ *.avi
+ *.mov
+ *.mkv
+ *.flv
+ *.wmv
+
+ # Demo and source files
+ data/demo/
+ data/source_video/
+ data/images/
.gitmodules ADDED
@@ -0,0 +1,11 @@
+ [submodule "MMCM"]
+     path = MMCM
+     url = https://github.com/TMElyralab/MMCM.git
+ [submodule "controlnet_aux"]
+     path = controlnet_aux
+     url = https://github.com/TMElyralab/controlnet_aux.git
+     branch = tme
+ [submodule "diffusers"]
+     path = diffusers
+     url = https://github.com/TMElyralab/diffusers.git
+     branch = tme
CHANGES ADDED
@@ -0,0 +1,5 @@
+ Version 1.0.0 (2024.03.27)
+
+ * init musev, support video generation with text and image
+ * controlnet_aux: enrich interface and function of dwpose.
+ * diffusers: controlnet support latent instead of images only.
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ FROM anchorxia/musev:1.0.0
+
+ # MAINTAINER maintainer information
+ LABEL MAINTAINER="anchorxia"
+ LABEL Email="[email protected]"
+ LABEL Description="musev gpu runtime image, base docker is pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel"
+ ARG DEBIAN_FRONTEND=noninteractive
+
+ USER root
+
+ SHELL ["/bin/bash", "--login", "-c"]
+
+ RUN . /opt/conda/etc/profile.d/conda.sh \
+     && echo "source activate musev" >> ~/.bashrc \
+     && conda activate musev \
+     && conda env list \
+     && pip --no-cache-dir install cuid gradio==4.12 spaces
+ USER root
LICENSE ADDED
@@ -0,0 +1,175 @@
1
+
2
+ MIT License
3
+
4
+ Copyright (c) 2024 Tencent Music Entertainment Group
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+
24
+
25
+ Other dependencies and licenses:
26
+
27
+
28
+ Open Source Software Licensed under the MIT License:
29
+ --------------------------------------------------------------------
30
+ 1. BriVL-BUA-applications
31
+ Files:https://github.com/chuhaojin/BriVL-BUA-applications
32
+ License:MIT License
33
+ Copyright (c) 2021 chuhaojin
34
+ For details:https://github.com/chuhaojin/BriVL-BUA-applications/blob/master/LICENSE
35
+
36
+ 2.deep-person-reid
37
+ Files:https://github.com/KaiyangZhou/deep-person-reid
38
+ License:MIT License
39
+ Copyright (c) 2018 Kaiyang Zhou
40
+ For details:https://github.com/KaiyangZhou/deep-person-reid/blob/master/LICENSE
41
+
42
+
43
+ Open Source Software Licensed under the Apache License Version 2.0:
44
+ --------------------------------------------------------------------
45
+ 1. diffusers
46
+ Files:https://github.com/huggingface/diffusers
47
+ License:Apache License 2.0
48
+ Copyright 2024 The HuggingFace Team. All rights reserved.
49
+ For details:https://github.com/huggingface/diffusers/blob/main/LICENSE
50
+ https://github.com/huggingface/diffusers/blob/main/setup.py
51
+
52
+
53
+ 2. controlnet_aux
54
+ Files: https://github.com/huggingface/controlnet_aux
55
+ License: Apache License 2.0
56
+ Copyright 2023 The HuggingFace Team. All rights reserved.
57
+ For details: https://github.com/huggingface/controlnet_aux/blob/master/LICENSE.txt
58
+ https://github.com/huggingface/controlnet_aux/blob/master/setup.py
59
+
60
+ 3. decord
61
+ Files:https://github.com/dmlc/decord
62
+ License:Apache License 2.0
63
+ For details:https://github.com/dmlc/decord/blob/master/LICENSE
64
+
65
+
66
+ Terms of the Apache License Version 2.0:
67
+ --------------------------------------------------------------------
68
+ Apache License
69
+
70
+ Version 2.0, January 2004
71
+
72
+ http://www.apache.org/licenses/
73
+
74
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
75
+ 1. Definitions.
76
+
77
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
78
+
79
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
80
+
81
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
82
+
83
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
84
+
85
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
86
+
87
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
88
+
89
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
90
+
91
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
92
+
93
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
94
+
95
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
96
+
97
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
98
+
99
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
100
+
101
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
102
+
103
+ You must give any other recipients of the Work or Derivative Works a copy of this License; and
104
+
105
+ You must cause any modified files to carry prominent notices stating that You changed the files; and
106
+
107
+ You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
108
+
109
+ If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
110
+
111
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
112
+
113
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
114
+
115
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
116
+
117
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
118
+
119
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
120
+
121
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
122
+
123
+ END OF TERMS AND CONDITIONS
124
+
125
+
126
+
127
+ Open Source Software Licensed under the BSD 3-Clause License:
128
+ --------------------------------------------------------------------
129
+ 1. pynvml
130
+ Files:https://github.com/gpuopenanalytics/pynvml/tree/master
131
+ License:BSD 3-Clause
132
+ Copyright (c) 2011-2021, NVIDIA Corporation.
133
+ All rights reserved.
134
+ For details:https://github.com/gpuopenanalytics/pynvml/blob/master/LICENSE.txt
135
+
136
+
137
+ Terms of the BSD 3-Clause License:
138
+ --------------------------------------------------------------------
139
+ Redistribution and use in source and binary forms, with or without
140
+ modification, are permitted provided that the following conditions are met:
141
+
142
+ * Redistributions of source code must retain the above copyright notice, this
143
+ list of conditions and the following disclaimer.
144
+
145
+ * Redistributions in binary form must reproduce the above copyright notice,
146
+ this list of conditions and the following disclaimer in the documentation
147
+ and/or other materials provided with the distribution.
148
+
149
+ * Neither the name of the copyright holder nor the names of its
150
+ contributors may be used to endorse or promote products derived from
151
+ this software without specific prior written permission.
152
+
153
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
154
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
155
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
156
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
157
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
158
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
159
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
160
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
161
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
162
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
163
+
164
+
165
+ Other Open Source Software:
166
+ --------------------------------------------------------------------
167
+ 1.SceneSeg
168
+ Files:https://github.com/AnyiRao/SceneSeg/tree/master
169
+
170
+
171
+
172
+
173
+
174
+
175
+
MMCM ADDED
@@ -0,0 +1 @@
+ Subproject commit 1e2b6e6a848f0f116e8acaf0621c2ee64d3642ce
README-zh.md ADDED
@@ -0,0 +1,465 @@
+ # MuseV [English](README.md) [中文](README-zh.md)
+
+ <font size=5>MuseV: Infinite-length and High-Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising.
+ </br>
+ Zhiqiang Xia <sup>\*</sup>,
+ Zhaokang Chen<sup>\*</sup>,
+ Bin Wu<sup>†</sup>,
+ Chao Li,
+ Kwok-Wai Hung,
+ Chao Zhan,
+ Yingjie He,
+ Wenjiang Zhou
+ (<sup>*</sup>co-first author, <sup>†</sup>Corresponding Author, [email protected])
+ </font>
+
+ **[github](https://github.com/TMElyralab/MuseV)** **[huggingface](https://huggingface.co/TMElyralab/MuseV)** **[HuggingfaceSpace](https://huggingface.co/spaces/AnchorFake/MuseVDemo)** **[project](https://tmelyralab.github.io/MuseV_Page/)** **Technical report (coming soon)**
+
+
+ In March 2023 we came to believe that diffusion models can simulate the world, and we started building a visual world simulator on top of them. `MuseV` was a milestone reached around July 2023. Inspired by the progress of Sora, we decided to open-source MuseV. MuseV grew on the shoulders of open source, and we hope to give back to the community in the same way. Next, we will move on to the promising diffusion+transformer scheme.
+
+ We have released <a href="https://github.com/TMElyralab/MuseTalk" style="font-size:24px; color:red;">MuseTalk</a>. `MuseTalk` is a real-time, high-quality lip-sync model that can be combined with `MuseV` to build a complete `virtual human generation solution`. Stay tuned!
+
+ :new: We have newly released <a href="https://github.com/TMElyralab/MusePose" style="font-size:24px; color:red;">MusePose</a>. MusePose is an image-to-video generation framework for virtual humans that generates videos from control signals (poses). Together with MuseV and MuseTalk, we hope the community can join us on the way toward the vision of an end-to-end virtual human with full-body movement and interaction ability.
+
+ # Overview
+
+ `MuseV` is a diffusion-based virtual human video generation framework with the following features:
+
+ 1. Supports infinite-length generation with a novel visual-conditioned parallel denoising scheme, free of error accumulation, and especially well suited to scenes with a fixed camera.
+ 1. Provides pretrained virtual human video generation models trained on human-centric datasets.
+ 1. Supports image-to-video, text-to-image-to-video, and video-to-video generation.
+ 1. Compatible with the `Stable Diffusion` text-to-image ecosystem, including `base_model`, `lora`, `controlnet`, etc.
+ 1. Supports multi-reference-image techniques, including `IPAdapter`, `ReferenceOnly`, `ReferenceNet`, and `IPAdapterFaceID`.
+ 1. Training code will be released later.
+
+ # Important Updates
+ 1. `musev_referencenet_pose`: the model names for `unet` and `ip_adapter` were specified incorrectly; please use `musev_referencenet_pose` instead of `musev_referencenet`, and please use the latest main branch.
+
+ # Progress
+ - [03/27/2024] Released the `MuseV` project and the trained models `musev`, `muse_referencenet`, and `muse_referencenet_pose`.
+ - [03/30/2024] Added a [gui](https://huggingface.co/spaces/AnchorFake/MuseVDemo) on Hugging Face Space to generate videos interactively.
+
+ ## Models
+ ### Overview of the model structure
+ ![model_structure](./data/models/musev_structure.png)
+ ### Overview of the parallel denoising algorithm
+ ![parallel_denoise](./data//models/parallel_denoise.png)
+
+ ## Cases
+ All frames of the generated results are produced directly by `MuseV`, without any post-processing such as temporal or spatial super-resolution.
+ See [MuseVPage]() for more results.
+
+ <!-- # TODO: // use youtu video link? -->
+ All of the following cases are maintained in `configs/tasks/example.yaml` and can be reproduced directly.
+ The **[project](https://tmelyralab.github.io/)** page has more cases, including one- to two-minute long videos generated directly.
+
+ ### Video generation from text and image input
+ #### Human
+ <table class="center">
+ <tr style="font-weight: bolder;text-align:center;">
+ <td width="50%">image</td>
+ <td width="45%">video </td>
+ <td width="5%">prompt</td>
+ </tr>
+
+ <tr>
+ <td>
+ <img src=./data/images/yongen.jpeg width="400">
+ </td>
+ <td >
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/732cf1fd-25e7-494e-b462-969c9425d277" width="100" controls preload></video>
+ </td>
+ <td>(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
+ </td>
+ </tr>
+
+ <tr>
+ <td>
+ <img src=./data/images/seaside4.jpeg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/9b75a46c-f4e6-45ef-ad02-05729f091c8f" width="100" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1), peaceful beautiful sea scene
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <img src=./data/images/seaside_girl.jpeg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/d0f3b401-09bf-4018-81c3-569ec24a4de9" width="100" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1), peaceful beautiful sea scene
+ </td>
+ </tr>
+ <!-- guitar -->
+ <tr>
+ <td>
+ <img src=./data/images/boy_play_guitar.jpeg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/61bf955e-7161-44c8-a498-8811c4f4eb4f" width="100" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1), playing guitar
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <img src=./data/images/girl_play_guitar2.jpeg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/40982aa7-9f6a-4e44-8ef6-3f185d284e6a" width="100" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1), playing guitar
+ </td>
+ </tr>
+ <!-- famous people -->
+ <tr>
+ <td>
+ <img src=./data/images/dufu.jpeg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/28294baa-b996-420f-b1fb-046542adf87d" width="100" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
+ </td>
+ </tr>
+
+ <tr>
+ <td>
+ <img src=./data/images/Mona_Lisa.jpg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/1ce11da6-14c6-4dcd-b7f9-7a5f060d71fb" width="100" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
+ soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
+ </td>
+ </tr>
+ </table >
+
+ #### Scene
+ <table class="center">
+ <tr style="font-weight: bolder;text-align:center;">
+ <td width="35%">image</td>
+ <td width="50%">video</td>
+ <td width="15%">prompt</td>
+ </tr>
+
+ <tr>
+ <td>
+ <img src=./data/images/waterfall4.jpeg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/852daeb6-6b58-4931-81f9-0dddfa1b4ea5" width="100" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1), peaceful beautiful waterfall, an
+ endless waterfall
+ </td>
+ </tr>
+
+ <tr>
+ <td>
+ <img src=./data/images/seaside2.jpeg width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/4a4d527a-6203-411f-afe9-31c992d26816" width="100" controls preload></video>
+ </td>
+ <td>(masterpiece, best quality, highres:1), peaceful beautiful sea scene
+ </td>
+ </tr>
+ </table >
+
+ ### Video generation conditioned on an input video
+ In the current generation mode, the first-frame condition of the reference video needs to be aligned with the condition of the reference image; otherwise the first-frame information is broken and the result gets worse. So the typical workflow is:
+ 1. Choose a reference video;
+ 2. Run the first frame of the reference video through an image-to-image / controlnet pipeline (platforms such as `MJ` also work);
+ 3. Use the image generated in step 2 and the reference video to generate the video with MuseV;
+ 4.
+ **pose2video**
+
+ In the `duffy` case, the pose of the visual condition frame is not aligned with the first frame of the control video; the `posealign` module will solve this problem.
+
+ <table class="center">
+ <tr style="font-weight: bolder;text-align:center;">
+ <td width="25%">image</td>
+ <td width="65%">video</td>
+ <td width="10%">prompt</td>
+ </tr>
+ <tr>
+ <td>
+ <img src=./data/images/spark_girl.png width="200">
+ <img src=./data/images/cyber_girl.png width="200">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/484cc69d-c316-4464-a55b-3df929780a8e" width="400" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1) , a girl is dancing, animation
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <img src=./data/images/duffy.png width="400">
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/c44682e6-aafc-4730-8fc1-72825c1bacf2" width="400" controls preload></video>
+ </td>
+ <td>
+ (masterpiece, best quality, highres:1), is dancing, animation
+ </td>
+ </tr>
+ </table >
+
+ ### MuseTalk
+
+ The character in `talk`, Sun Xinying (孙昕荧), is a well-known online influencer; you can follow her on [Douyin](https://www.douyin.com/user/MS4wLjABAAAAWDThbMPN_6Xmm_JgXexbOii1K-httbu2APdG8DvDyM8).
+
+ <table class="center">
+ <tr style="font-weight: bolder;">
+ <td width="35%">name</td>
+ <td width="50%">video</td>
+ </tr>
+
+ <tr>
+ <td>
+ talk
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/951188d1-4731-4e7f-bf40-03cacba17f2f" width="100" controls preload></video>
+ </td>
+ <tr>
+ <td>
+ sing
+ </td>
+ <td>
+ <video src="https://github.com/TMElyralab/MuseV/assets/163980830/50b8ffab-9307-4836-99e5-947e6ce7d112" width="100" controls preload></video>
+ </td>
+ </tr>
+ </table >
+
+
+ # TODO:
+ - [ ] technical report (coming soon).
+ - [ ] training code.
+ - [ ] diffusion-transformer generation framework.
+ - [ ] `posealign` module.
+
+ # Quick Start
+ Prepare the Python environment and install the extra packages, such as `diffusers`, `controlnet_aux`, and `mmcm`.
+
+ ## Third-party integrations
+ Some third-party integrations make installation and use easier; thanks to the third parties for their work.
+ Please note, however, that we have not verified, maintained, or followed up on third-party support; refer to this project for the actual behavior.
+ ### [ComfyUI](https://github.com/chaojie/ComfyUI-MuseV)
+ ### [Windows all-in-one package](https://www.bilibili.com/video/BV1ux4y1v7pF/?vd_source=fe03b064abab17b79e22a692551405c3)
+ netdisk:https://www.123pan.com/s/Pf5Yjv-Bb9W3.html
+ code: glut
+
+ ## Prepare the environment
+ We recommend preparing the Python environment with `docker` first.
+
+ ### Prepare the Python environment
+ **Attention**: we have only tested with Docker; conda or other environments may run into problems. We will try our best to resolve them, but please still prefer `docker`.
+
+ #### Method 1: Docker
+ 1. Pull the Docker image
+ ```bash
+ docker pull anchorxia/musev:latest
+ ```
+ 2. Run the Docker container
+ ```bash
+ docker run --gpus all -it --entrypoint /bin/bash anchorxia/musev:latest
+ ```
+ The default conda environment after the container starts is `musev`.
+
+ #### Method 2: conda
+ Create the conda environment from environment.yml
+ ```
+ conda env create --name musev --file ./environment.yml
+ ```
+ #### Method 3: pip requirements
+ ```
+ pip install -r requirements.txt
+ ```
+ #### Prepare the [openmmlab](https://openmmlab.com/) packages
+ If you do not use Docker, the mmlab packages also need to be installed.
+ ```bash
+ pip install --no-cache-dir -U openmim
+ mim install mmengine
+ mim install "mmcv>=2.0.1"
+ mim install "mmdet>=3.1.0"
+ mim install "mmpose>=1.1.0"
+ ```
+
+ ### Prepare the packages we developed
+ #### Download
+ ```bash
+ git clone --recursive https://github.com/TMElyralab/MuseV.git
+ ```
+ #### Prepare PYTHONPATH
+ ```bash
+ current_dir=$(pwd)
+ export PYTHONPATH=${PYTHONPATH}:${current_dir}/MuseV
+ export PYTHONPATH=${PYTHONPATH}:${current_dir}/MuseV/MMCM
+ export PYTHONPATH=${PYTHONPATH}:${current_dir}/MuseV/diffusers/src
+ export PYTHONPATH=${PYTHONPATH}:${current_dir}/MuseV/controlnet_aux/src
+ cd MuseV
+ ```
+
+ 1. `MMCM`: multimedia and cross-modal processing package.
+ 1. `diffusers`: a modified diffusers package based on [diffusers](https://github.com/huggingface/diffusers).
+ 1. `controlnet_aux`: a modified package based on [controlnet_aux](https://github.com/TMElyralab/controlnet_aux).
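
As an alternative to the `PYTHONPATH` exports above, the same search paths can be set from inside Python before importing the project packages. This is only a minimal sketch, assuming the repository was cloned into the current working directory as shown in the download step:

```python
import os
import sys

# Equivalent of the PYTHONPATH exports above, but only for the current process.
repo_root = os.path.join(os.getcwd(), "MuseV")
for sub_path in ("", "MMCM", "diffusers/src", "controlnet_aux/src"):
    sys.path.insert(0, os.path.join(repo_root, sub_path))

# After this, e.g. `import musev` should resolve to the checked-out sources.
```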
+
+
+ ## Download models
+ ```bash
+ git clone https://huggingface.co/TMElyralab/MuseV ./checkpoints
+ ```
+ - `motion`: several versions of the video generation model, trained on the small `ucf101` dataset and a small `webvid` subset, about 60K video-text pairs. GPU memory consumption is measured at `resolution` $=512*512$, `time_size=12`.
+     - `musev/unet`: this version only trains the `unet` motion module. Inference `GPU memory consumption` $\approx 8G$.
+     - `musev_referencenet`: this version trains the `unet` motion module, `referencenet`, and `IPAdapter`. Inference `GPU memory consumption` $\approx 12G$.
+         - `unet`: the `motion` module, with `to_k` and `to_v` in the `Attention` layers, following `IPAdapter`.
+         - `referencenet`: similar to `AnimateAnyone`.
+         - `ip_adapter_image_proj.bin`: the image feature projection layer, following `IPAdapter`.
+     - `musev_referencenet_pose`: based on `musev_referencenet`, this version fixes `referencenet` and `controlnet_pose` and trains `unet motion` and `IPAdapter`. Inference `GPU memory consumption` $\approx 12G$.
+ - `t2i/sd1.5`: the text2image model, whose parameters are frozen when training the motion module.
+     - `majicmixRealv6Fp16`: an example; it can be replaced with other t2i bases. Download from [majicmixRealv6Fp16](https://civitai.com/models/43331/majicmix-realistic).
+     - `fantasticmix_v10`: available at [fantasticmix_v10](https://civitai.com/models/22402?modelVersionId=26744).
+ - `IP-Adapter/models`: download from [IPAdapter](https://huggingface.co/h94/IP-Adapter/tree/main).
+     - `image_encoder`: the visual feature extraction model.
+     - `ip-adapter_sd15.bin`: pretrained weights of the original IPAdapter model.
+     - `ip-adapter-faceid_sd15.bin`: pretrained weights of the original IPAdapter model.
+
+ ## Inference
+
+ ### Prepare model paths
+ You can skip this step when you run the example tasks with the example inference commands.
+ This part sets model paths and abbreviations in config files, so that the inference scripts can use short names instead of full paths (a lookup sketch follows the list below).
+ - T2I SD: see `musev/configs/model/T2I_all_model.py`
+ - Motion Unet: see `musev/configs/model/motion_model.py`
+ - Task: see `musev/configs/tasks/example.yaml`
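
A minimal sketch of how such a name-to-path lookup can work, based on the `MODEL_CFG` dictionaries defined in `configs/model/T2I_all_model.py` and `configs/model/motion_model.py` in this commit; the loader function here is illustrative, not the project's actual loader:

```python
import importlib.util


def load_model_cfg(cfg_path: str) -> dict:
    """Load the MODEL_CFG dict from a config .py file given its path."""
    spec = importlib.util.spec_from_file_location("model_cfg", cfg_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.MODEL_CFG


# Resolve the abbreviation used on the command line (e.g. --sd_model_name)
# to the full checkpoint path configured in the file.
sd_cfg = load_model_cfg("./configs/model/T2I_all_model.py")
sd_path = sd_cfg["majicmixRealv6Fp16"]["sd"]
print(sd_path)  # .../checkpoints/t2i/sd1.5/majicmixRealv6Fp16
```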
+
+ ### musev_referencenet
+ #### Video generation from text and image input
+ ```bash
+ python scripts/inference/text2video.py --sd_model_name majicmixRealv6Fp16 --unet_model_name musev_referencenet --referencenet_model_name musev_referencenet --ip_adapter_model_name musev_referencenet -test_data_path ./configs/tasks/example.yaml --output_dir ./output --n_batch 1 --target_datas yongen --vision_clip_extractor_class_name ImageClipVisionFeatureExtractor --vision_clip_model_path ./checkpoints/IP-Adapter/models/image_encoder --time_size 12 --fps 12
+ ```
+ **Common parameters**:
+ - `test_data_path`: path of the task file with the test cases
+ - `target_datas`: if a `name` in `test_data_path` is in `target_datas`, only those sub-tasks are run. The separator is `,`;
+ - `sd_model_cfg_path`: T2I sd model path; either a model config path or a model path.
+ - `sd_model_name`: sd model name, used to select the full model path in `sd_model_cfg_path`. Multiple model names separated by `,`, or `all`.
+ - `unet_model_cfg_path`: motion unet model config path or model path.
+ - `unet_model_name`: unet model name, used to get the model path in `unet_model_cfg_path` and to initialize the unet class instance in `musev/models/unet_loader.py`. Multiple model names separated by `,`, or `all`. If `unet_model_cfg_path` is a model path, `unet_name` must be supported in `musev/models/unet_loader.py`.
+ - `time_size`: the diffusion model generates one clip at a time; this is the number of frames per clip. Default `12`.
+ - `n_batch`: number of clips generated in the head-to-tail manner, $total\_frames=n\_batch * time\_size + n\_viscond$. Default `1`.
+ - `context_frames`: number of frames generated at once by a parallel-denoising sub-window. If `time_size` > `context_frames`, the parallel denoising logic is enabled and the `time_size` window is split into several sub-windows for parallel denoising. Default `12`.
+
+ There are two ways to generate a **long video**, and they can be used together (a small sketch of the frame accounting follows this list):
+ 1. `visual-conditioned parallel denoising`: set `n_batch=1` and `time_size` = the total number of frames you want.
+ 2. `traditional head-to-tail concatenation`: set `time_size` = `context_frames` = the number of frames per clip (`12`) and `context_overlap` = 0. `n_batch` clips are generated head-to-tail; error accumulates across the concatenation, so the larger `n_batch` is, the worse the final result.
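
A small sketch of the frame accounting behind the two modes, using the relation $total\_frames = n\_batch * time\_size + n\_viscond$ from the parameter list above. The window-splitting helper is only an illustration of the sub-window idea, not MuseV's actual scheduling code:

```python
def total_frames(n_batch: int, time_size: int, n_viscond: int = 1) -> int:
    # Relation given above: n_batch clips chained after the visual condition frame(s).
    return n_batch * time_size + n_viscond


def parallel_denoise_windows(time_size: int, context_frames: int, context_overlap: int):
    """Split one long time_size window into overlapping sub-windows (illustrative only)."""
    step = max(context_frames - context_overlap, 1)
    return [
        list(range(start, min(start + context_frames, time_size)))
        for start in range(0, time_size, step)
    ]


# Mode 1: visual-conditioned parallel denoising, one long window.
print(total_frames(n_batch=1, time_size=60))                       # 61 frames in one pass
print(parallel_denoise_windows(60, context_frames=12, context_overlap=4))

# Mode 2: traditional head-to-tail concatenation of short clips; error grows with n_batch.
print(total_frames(n_batch=5, time_size=12))
```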
+
+
+ **Model parameters**:
+ Supports `referencenet`, `IPAdapter`, `IPAdapterFaceID`, and `Facein`.
+ - `referencenet_model_name`: `referencenet` model name.
+ - `ImageClipVisionFeatureExtractor`: name of the `ImageEmbExtractor`, which extracts visual features in `IPAdapter`.
+ - `vision_clip_model_path`: model path of `ImageClipVisionFeatureExtractor`.
+ - `ip_adapter_model_name`: from `IPAdapter`; it is the `ImagePromptEmbProj`, used together with `ImageEmbExtractor`.
+ - `ip_adapter_face_model_name`: `IPAdapterFaceID`, from `IPAdapter`; `face_image_path` should be set.
+
+ **Some parameters that affect the motion range and the generation result**:
+ - `video_guidance_scale`: similar to text2image, it controls the balance between cond and uncond and has a large influence. Default `3.5`.
+ - `use_condition_image`: whether to use the given first frame for video generation. Default `True`.
+ - `redraw_condition_image`: whether to redraw the given first-frame image.
+ - `video_negative_prompt`: abbreviation of the full `negative_prompt` in the config file. Default `V2`.
+
+
+ #### Video generation from an input video
+ ```bash
+ python scripts/inference/video2video.py --sd_model_name majicmixRealv6Fp16 --unet_model_name musev_referencenet --referencenet_model_name musev_referencenet --ip_adapter_model_name musev_referencenet -test_data_path ./configs/tasks/example.yaml --vision_clip_extractor_class_name ImageClipVisionFeatureExtractor --vision_clip_model_path ./checkpoints/IP-Adapter/models/image_encoder --output_dir ./output --n_batch 1 --controlnet_name dwpose_body_hand --which2video "video_middle" --target_datas dance1 --fps 12 --time_size 12
+ ```
+ **Some important parameters**
+
+ Most parameters are the same as for `musev_text2video`. The parameters specific to `video2video` are:
+ 1. `video_path` needs to be set in `test_data`. `rgb video` and `controlnet_middle_video` are now supported.
+ - `which2video`: type of the reference video. If `video_middle`, only the `video_middle` (such as `pose` or `depth`) is used; if `video`, the video itself also takes part in the video noise initialization, similar to `img2image`.
+ - `controlnet_name`: whether to use a `controlnet condition`, e.g. `dwpose,depth`; for pose, `dwpose_body_hand` is recommended.
+ - `video_is_middle`: whether `video_path` is an `rgb video` or a `controlnet_middle_video`. It can be set for each `test_data` in `test_data_path`.
+ - `video_has_condition`: whether `condtion_images` is aligned with the first frame of `video_path`. If not, `condition_images` is generated first and then concatenated with the reference video for alignment. Currently only reference videos with `video_is_middle=True` are supported; it can be set per `test_data`.
+
+ All `controlnet_names` are maintained in [mmcm](https://github.com/TMElyralab/MMCM/blob/main/mmcm/vision/feature_extractor/controlnet.py#L513)
+ ```python
+ ['pose', 'pose_body', 'pose_hand', 'pose_face', 'pose_hand_body', 'pose_hand_face', 'dwpose', 'dwpose_face', 'dwpose_hand', 'dwpose_body', 'dwpose_body_hand', 'canny', 'tile', 'hed', 'hed_scribble', 'depth', 'pidi', 'normal_bae', 'lineart', 'lineart_anime', 'zoe', 'sam', 'mobile_sam', 'leres', 'content', 'face_detector']
+ ```
+
+ ### musev_referencenet_pose
+ Only used for `pose2video`.
+ Trained on top of `musev_referencenet`, fixing `referencenet`, `pose-controlnet`, and `T2I`, while training the `motion` module and `IPAdapter`.
+ ```bash
+ python scripts/inference/video2video.py --sd_model_name majicmixRealv6Fp16 --unet_model_name musev_referencenet_pose --referencenet_model_name musev_referencenet --ip_adapter_model_name musev_referencenet_pose -test_data_path ./configs/tasks/example.yaml --vision_clip_extractor_class_name ImageClipVisionFeatureExtractor --vision_clip_model_path ./checkpoints/IP-Adapter/models/image_encoder --output_dir ./output --n_batch 1 --controlnet_name dwpose_body_hand --which2video "video_middle" --target_datas dance1 --fps 12 --time_size 12
+ ```
+
+ ### musev
+ Only the motion module, without `referencenet`; it requires less GPU memory.
+ #### text2video
+ ```bash
+ python scripts/inference/text2video.py --sd_model_name majicmixRealv6Fp16 --unet_model_name musev -test_data_path ./configs/tasks/example.yaml --output_dir ./output --n_batch 1 --target_datas yongen --time_size 12 --fps 12
+ ```
+ #### video2video
+ ```bash
+ python scripts/inference/video2video.py --sd_model_name majicmixRealv6Fp16 --unet_model_name musev -test_data_path ./configs/tasks/example.yaml --output_dir ./output --n_batch 1 --controlnet_name dwpose_body_hand --which2video "video_middle" --target_datas dance1 --fps 12 --time_size 12
+ ```
+
+ ### Gradio demo
+ MuseV provides a gradio script that builds a GUI on a local machine for convenient video generation.
+
+ ```bash
+ cd scripts/gradio
+ python app.py
+ ```
+
+ # Acknowledgements
+ 1. During development, MuseV referred to and learned from many open-source works: [TuneAVideo](https://github.com/showlab/Tune-A-Video), [diffusers](https://github.com/huggingface/diffusers), [Moore-AnimateAnyone](https://github.com/MooreThreads/Moore-AnimateAnyone/tree/master/src/pipelines), [animatediff](https://github.com/guoyww/AnimateDiff), [IP-Adapter](https://github.com/tencent-ailab/IP-Adapter), [AnimateAnyone](https://arxiv.org/abs/2311.17117), [VideoFusion](https://arxiv.org/abs/2303.08320), and [insightface](https://github.com/deepinsight/insightface).
+ 2. MuseV is built on the `ucf101` and `webvid` datasets.
+
+ Thanks to the open-source community for its contributions!
+
+ # Limitations
+
+ `MuseV` still has many items to optimize, including:
+
+ 1. Limited generalization ability. It is sensitive to the visual condition frame: some condition images work well, others poorly; some pretrained t2i models work well, others poorly.
+ 1. Limited types of video generation and a limited motion range, partly because of limited training data types. The released `MuseV` was trained on about 60K human text-video pairs at `512*320` resolution. At lower resolutions `MuseV` has a larger motion range but lower video quality; at higher resolutions the image quality is good but the motion range is smaller. Training on a larger, higher-resolution, higher-quality text-video dataset is likely to make `MuseV` better.
+ 1. Watermark issues caused by training on `webvid`. A cleaner dataset without watermarks would probably solve this.
+ 1. Limited types of long-video generation. Visual-conditioned parallel denoising removes the accumulated error of video generation, but the current method only suits relatively fixed-camera scenes.
+ 1. Undertrained referencenet and IP-Adapter, because of limited time and resources.
+ 1. The code structure is not ideal. `MuseV` supports rich and dynamic features, but the code is complex and has not been refactored; it takes time to become familiar with it.
+
+
+ <!-- # Contribution: no organized open-source co-construction for now -->
+ # Citation
+ ```bib
+ @article{musev,
+   title={MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising},
+   author={Xia, Zhiqiang and Chen, Zhaokang and Wu, Bin and Li, Chao and Hung, Kwok-Wai and Zhan, Chao and He, Yingjie and Zhou, Wenjiang},
+   journal={arxiv},
+   year={2024}
+ }
+ ```
+ # Disclaimer/License
+ 1. `code`: the `MuseV` code is released under the `MIT` license; both academic and commercial use are allowed.
+ 1. `model`: the trained models are for non-commercial research purposes only.
+ 1. `other open-source models`: other open-source models used must comply with their own licenses, such as `insightface`, `IP-Adapter`, `ft-mse-vae`, etc.
+ 1. The test data are collected from the internet and are for non-commercial research purposes only.
+ 1. `AIGC`: this project aims to have a positive impact on AI-based video generation. Users are granted the freedom to create videos with this tool, but they should comply with local laws and use it responsibly. The developers assume no liability for potential misuse by users.
README.md ADDED
@@ -0,0 +1,37 @@
+ ---
+ title: MuseV Demo
+ emoji: 🎥
+ colorFrom: blue
+ colorTo: red
+ sdk: gradio
+ sdk_version: 3.50.2
+ app_file: scripts/gradio/app_gradio_space.py
+ pinned: false
+ ---
+
+ # MuseV Demo
+
+ This is a Hugging Face Space for MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising.
+
+ ## Features
+
+ - Text-to-Video generation
+ - Visual condition support
+ - High-quality video generation
+
+ For more details, visit the [GitHub repository](https://github.com/TMElyralab/MuseV).
+
+ ## Usage
+
+ 1. Enter your prompt describing the video you want to generate
+ 2. Upload a reference image
+ 3. Adjust parameters like seed, FPS, dimensions, etc.
+ 4. Click generate and wait for the results
+
+ ## Model Details
+
+ The model will be automatically downloaded when you first run the demo.
+
+ ## Credits
+
+ Created by Lyra Lab, Tencent Music Entertainment
configs/model/T2I_all_model.py ADDED
@@ -0,0 +1,15 @@
+ import os
+
+
+ T2IDir = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "../../checkpoints", "t2i"
+ )
+
+ MODEL_CFG = {
+     "majicmixRealv6Fp16": {
+         "sd": os.path.join(T2IDir, "sd1.5/majicmixRealv6Fp16"),
+     },
+     "fantasticmix_v10": {
+         "sd": os.path.join(T2IDir, "sd1.5/fantasticmix_v10"),
+     },
+ }
configs/model/ip_adapter.py ADDED
@@ -0,0 +1,66 @@
+ import os
+
+ IPAdapterModelDir = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "../../checkpoints", "IP-Adapter"
+ )
+
+
+ MotionDir = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "../../checkpoints", "motion"
+ )
+
+
+ MODEL_CFG = {
+     "IPAdapter": {
+         "ip_image_encoder": os.path.join(IPAdapterModelDir, "models/image_encoder"),
+         "ip_ckpt": os.path.join(IPAdapterModelDir, "ip-adapter_sd15.bin"),
+         "ip_scale": 1.0,
+         "clip_extra_context_tokens": 4,
+         "clip_embeddings_dim": 1024,
+         "desp": "",
+     },
+     "IPAdapterPlus": {
+         "ip_image_encoder": os.path.join(IPAdapterModelDir, "image_encoder"),
+         "ip_ckpt": os.path.join(IPAdapterModelDir, "ip-adapter-plus_sd15.bin"),
+         "ip_scale": 1.0,
+         "clip_extra_context_tokens": 16,
+         "clip_embeddings_dim": 1024,
+         "desp": "",
+     },
+     "IPAdapterPlus-face": {
+         "ip_image_encoder": os.path.join(IPAdapterModelDir, "image_encoder"),
+         "ip_ckpt": os.path.join(IPAdapterModelDir, "ip-adapter-plus-face_sd15.bin"),
+         "ip_scale": 1.0,
+         "clip_extra_context_tokens": 16,
+         "clip_embeddings_dim": 1024,
+         "desp": "",
+     },
+     "IPAdapterFaceID": {
+         "ip_image_encoder": os.path.join(IPAdapterModelDir, "image_encoder"),
+         "ip_ckpt": os.path.join(IPAdapterModelDir, "ip-adapter-faceid_sd15.bin"),
+         "ip_scale": 1.0,
+         "clip_extra_context_tokens": 4,
+         "clip_embeddings_dim": 512,
+         "desp": "",
+     },
+     "musev_referencenet": {
+         "ip_image_encoder": os.path.join(IPAdapterModelDir, "image_encoder"),
+         "ip_ckpt": os.path.join(
+             MotionDir, "musev_referencenet/ip_adapter_image_proj.bin"
+         ),
+         "ip_scale": 1.0,
+         "clip_extra_context_tokens": 4,
+         "clip_embeddings_dim": 1024,
+         "desp": "",
+     },
+     "musev_referencenet_pose": {
+         "ip_image_encoder": os.path.join(IPAdapterModelDir, "image_encoder"),
+         "ip_ckpt": os.path.join(
+             MotionDir, "musev_referencenet_pose/ip_adapter_image_proj.bin"
+         ),
+         "ip_scale": 1.0,
+         "clip_extra_context_tokens": 4,
+         "clip_embeddings_dim": 1024,
+         "desp": "",
+     },
+ }
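
The `ip_image_encoder` entries above point at the `image_encoder` folder downloaded from the IP-Adapter repository (see the README's download section). A minimal sketch of extracting a reference-image embedding with Hugging Face `transformers`, as an assumption about how such a CLIP-style vision checkpoint is typically consumed rather than MuseV's own `ImageClipVisionFeatureExtractor`:

```python
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

# Path follows the checkpoint layout described in the README (assumption).
encoder_path = "./checkpoints/IP-Adapter/models/image_encoder"
processor = CLIPImageProcessor()  # default CLIP preprocessing, for simplicity
encoder = CLIPVisionModelWithProjection.from_pretrained(encoder_path)

image = Image.open("./data/images/yongen.jpeg").convert("RGB")
inputs = processor(images=image, return_tensors="pt")
# image_embeds should have the dimensionality referred to by clip_embeddings_dim above.
image_embeds = encoder(**inputs).image_embeds
print(image_embeds.shape)
```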
configs/model/lcm_model.py ADDED
@@ -0,0 +1,17 @@
+ import os
+
+
+ LCMDir = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "../../checkpoints", "lcm"
+ )
+
+
+ MODEL_CFG = {
+     "lcm": {
+         os.path.join(LCMDir, "lcm-lora-sdv1-5/pytorch_lora_weights.safetensors"): {
+             "strength": 1.0,
+             "lora_block_weight": "ALL",
+             "strength_offset": 0,
+         },
+     },
+ }
configs/model/motion_model.py ADDED
@@ -0,0 +1,22 @@
+ import os
+
+
+ MotionDIr = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "../../checkpoints", "motion"
+ )
+
+
+ MODEL_CFG = {
+     "musev": {
+         "unet": os.path.join(MotionDIr, "musev"),
+         "desp": "only train unet motion module, fix t2i",
+     },
+     "musev_referencenet": {
+         "unet": os.path.join(MotionDIr, "musev_referencenet"),
+         "desp": "train referencenet, IPAdapter and unet motion module, fix t2i",
+     },
+     "musev_referencenet_pose": {
+         "unet": os.path.join(MotionDIr, "musev_referencenet_pose"),
+         "desp": "train unet motion module and IPAdapter, fix t2i and referencenet",
+     },
+ }
configs/model/negative_prompt.py ADDED
@@ -0,0 +1,32 @@
+ Negative_Prompt_CFG = {
+     "Empty": {
+         "base_model": "",
+         "prompt": "",
+         "refer": "",
+     },
+     "V1": {
+         "base_model": "",
+         "prompt": "nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry, tail, watermarks",
+         "refer": "",
+     },
+     "V2": {
+         "base_model": "",
+         "prompt": "badhandv4, ng_deepnegative_v1_75t, (((multiple heads))), (((bad body))), (((two people))), ((extra arms)), ((deformed body)), (((sexy))), paintings,(((two heads))), ((big head)),sketches, (worst quality:2), (low quality:2), (normal quality:2), lowres, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, glans, (((nsfw))), nipples, extra fingers, (extra legs), (long neck), mutated hands, (fused fingers), (too many fingers)",
+         "refer": "Weiban",
+     },
+     "V3": {
+         "base_model": "",
+         "prompt": "badhandv4, ng_deepnegative_v1_75t, bad quality",
+         "refer": "",
+     },
+     "V4": {
+         "base_model": "",
+         "prompt": "badhandv4,ng_deepnegative_v1_75t,EasyNegativeV2,bad_prompt_version2-neg,bad quality",
+         "refer": "",
+     },
+     "V5": {
+         "base_model": "",
+         "prompt": "(((multiple heads))), (((bad body))), (((two people))), ((extra arms)), ((deformed body)), (((sexy))), paintings,(((two heads))), ((big head)),sketches, (worst quality:2), (low quality:2), (normal quality:2), lowres, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, glans, (((nsfw))), nipples, extra fingers, (extra legs), (long neck), mutated hands, (fused fingers), (too many fingers)",
+         "refer": "Weiban",
+     },
+ }
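
A minimal sketch of how a shorthand such as the `--video_negative_prompt V2` default mentioned in the READMEs could be resolved against this table; the helper below is illustrative, not the project's loader, and it assumes the repository root is on `sys.path` so `configs` is importable:

```python
from configs.model.negative_prompt import Negative_Prompt_CFG


def get_negative_prompt(name: str) -> str:
    # Fall back to the empty prompt if the shorthand is unknown.
    return Negative_Prompt_CFG.get(name, Negative_Prompt_CFG["Empty"])["prompt"]


print(get_negative_prompt("V2")[:60], "...")
```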
configs/model/referencenet.py ADDED
@@ -0,0 +1,14 @@
+ import os
+
+
+ MotionDIr = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "../../checkpoints", "motion"
+ )
+
+
+ MODEL_CFG = {
+     "musev_referencenet": {
+         "net": os.path.join(MotionDIr, "musev_referencenet"),
+         "desp": "",
+     },
+ }
configs/tasks/example.yaml ADDED
@@ -0,0 +1,210 @@
+ # - name: task_name
+ #   condition_images: vision condition images path
+ #   video_path: str, default null, used for video2video
+ #   prompt: text to guide image generation
+ #   ipadapter_image: image_path for IP-Apdater
+ #   refer_image: image_path for referencenet, generally speaking, same as ipadapter_image
+ #   height: int # The shorter the image size, the larger the motion amplitude, and the lower video quality.
+ #   width: int # The longer the W&H, the smaller the motion amplitude, and the higher video quality.
+ #   img_length_ratio: float, generation video size is (height, width) * img_length_ratio
+
+ # text/image2video
+ - condition_images: ./data/images/yongen.jpeg
+   eye_blinks_factor: 1.8
+   height: 1308
+   img_length_ratio: 0.957
+   ipadapter_image: ${.condition_images}
+   name: yongen
+   prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 736
+ - condition_images: ./data/images/jinkesi2.jpeg
+   eye_blinks_factor: 1.8
+   height: 714
+   img_length_ratio: 1.25
+   ipadapter_image: ${.condition_images}
+   name: jinkesi2
+   prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
+     soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 563
+ - condition_images: ./data/images/seaside4.jpeg
+   eye_blinks_factor: 1.8
+   height: 317
+   img_length_ratio: 2.221
+   ipadapter_image: ${.condition_images}
+   name: seaside4
+   prompt: (masterpiece, best quality, highres:1), peaceful beautiful sea scene
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 564
+ - condition_images: ./data/images/seaside_girl.jpeg
+   eye_blinks_factor: 1.8
+   height: 736
+   img_length_ratio: 0.957
+   ipadapter_image: ${.condition_images}
+   name: seaside_girl
+   prompt: (masterpiece, best quality, highres:1), peaceful beautiful sea scene
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 736
+ - condition_images: ./data/images/boy_play_guitar.jpeg
+   eye_blinks_factor: 1.8
+   height: 846
+   img_length_ratio: 1.248
+   ipadapter_image: ${.condition_images}
+   name: boy_play_guitar
+   prompt: (masterpiece, best quality, highres:1), playing guitar
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 564
+ - condition_images: ./data/images/girl_play_guitar2.jpeg
+   eye_blinks_factor: 1.8
+   height: 1002
+   img_length_ratio: 1.248
+   ipadapter_image: ${.condition_images}
+   name: girl_play_guitar2
+   prompt: (masterpiece, best quality, highres:1), playing guitar
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 564
+ - condition_images: ./data/images/boy_play_guitar2.jpeg
+   eye_blinks_factor: 1.8
+   height: 630
+   img_length_ratio: 1.676
+   ipadapter_image: ${.condition_images}
+   name: boy_play_guitar2
+   prompt: (masterpiece, best quality, highres:1), playing guitar
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 420
+ - condition_images: ./data/images/girl_play_guitar4.jpeg
+   eye_blinks_factor: 1.8
+   height: 846
+   img_length_ratio: 1.248
+   ipadapter_image: ${.condition_images}
+   name: girl_play_guitar4
+   prompt: (masterpiece, best quality, highres:1), playing guitar
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 564
+ - condition_images: ./data/images/dufu.jpeg
+   eye_blinks_factor: 1.8
+   height: 500
+   img_length_ratio: 1.495
+   ipadapter_image: ${.condition_images}
+   name: dufu
+   prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3),Chinese ink painting style
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 471
+ - condition_images: ./data/images/Mona_Lisa..jpg
+   eye_blinks_factor: 1.8
+   height: 894
+   img_length_ratio: 1.173
+   ipadapter_image: ${.condition_images}
+   name: Mona_Lisa.
+   prompt: (masterpiece, best quality, highres:1),(1girl, solo:1),(beautiful face,
+     soft skin, costume:1),(eye blinks:{eye_blinks_factor}),(head wave:1.3)
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 600
+ - condition_images: ./data/images/Portrait-of-Dr.-Gachet.jpg
+   eye_blinks_factor: 1.8
+   height: 985
+   img_length_ratio: 0.88
+   ipadapter_image: ${.condition_images}
+   name: Portrait-of-Dr.-Gachet
+   prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 800
+ - condition_images: ./data/images/Self-Portrait-with-Cropped-Hair.jpg
+   eye_blinks_factor: 1.8
+   height: 565
+   img_length_ratio: 1.246
+   ipadapter_image: ${.condition_images}
+   name: Self-Portrait-with-Cropped-Hair
+   prompt: (masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3), animate
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 848
+ - condition_images: ./data/images/The-Laughing-Cavalier.jpg
+   eye_blinks_factor: 1.8
+   height: 1462
+   img_length_ratio: 0.587
+   ipadapter_image: ${.condition_images}
+   name: The-Laughing-Cavalier
+   prompt: (masterpiece, best quality, highres:1),(1man, solo:1),(eye blinks:1.8),(head wave:1.3)
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 1200
+
+ # scene
+ - condition_images: ./data/images/waterfall4.jpeg
+   eye_blinks_factor: 1.8
+   height: 846
+   img_length_ratio: 1.248
+   ipadapter_image: ${.condition_images}
+   name: waterfall4
+   prompt: (masterpiece, best quality, highres:1), peaceful beautiful waterfall, an
+     endless waterfall
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 564
+ - condition_images: ./data/images/river.jpeg
+   eye_blinks_factor: 1.8
+   height: 736
+   img_length_ratio: 0.957
+   ipadapter_image: ${.condition_images}
+   name: river
+   prompt: (masterpiece, best quality, highres:1), peaceful beautiful river
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 736
+ - condition_images: ./data/images/seaside2.jpeg
+   eye_blinks_factor: 1.8
+   height: 1313
+   img_length_ratio: 0.957
+   ipadapter_image: ${.condition_images}
+   name: seaside2
+   prompt: (masterpiece, best quality, highres:1), peaceful beautiful sea scene
+   refer_image: ${.condition_images}
+   video_path: null
+   width: 736
+
+ # video2video
+ - name: "dance1"
+   prompt: "(masterpiece, best quality, highres:1) , a girl is dancing, wearing a dress made of stars, animation"
+   video_path: ./data/source_video/video1_girl_poseseq.mp4
+   condition_images: ./data/images/spark_girl.png
+   refer_image: ${.condition_images}
+   ipadapter_image: ${.condition_images}
+   height: 960
+   width: 512
+   img_length_ratio: 1.0
+   video_is_middle: True # if true, means video_path is controlnet condition, not natural rgb video
+
+ - name: "dance2"
+   prompt: "(best quality), ((masterpiece)), (highres), illustration, original, extremely detailed wallpaper"
+   video_path: ./data/source_video/video1_girl_poseseq.mp4
+   condition_images: ./data/images/cyber_girl.png
+   refer_image: ${.condition_images}
+   ipadapter_image: ${.condition_images}
+   height: 960
+   width: 512
+   img_length_ratio: 1.0
+   video_is_middle: True # if true, means video_path is controlnet condition, not natural rgb video
+
+ - name: "duffy"
+   prompt: "(best quality), ((masterpiece)), (highres), illustration, original, extremely detailed wallpaper"
+   video_path: ./data/source_video/pose-for-Duffy-4.mp4
+   condition_images: ./data/images/duffy.png
+   refer_image: ${.condition_images}
+   ipadapter_image: ${.condition_images}
+   height: 1280
+   width: 704
+   img_length_ratio: 1.0
+   video_is_middle: True # if true, means video_path is controlnet condition, not natural rgb video
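
The `${.condition_images}` entries above look like OmegaConf-style relative interpolation, and the README describes filtering the sub-tasks by `--target_datas` name. A minimal sketch of reading this file under that assumption; this is not the project's actual loading code:

```python
from omegaconf import OmegaConf

tasks = OmegaConf.load("./configs/tasks/example.yaml")

# Keep only the sub-tasks whose `name` is listed in --target_datas ("," separated).
target_datas = "yongen,dance1".split(",")
selected = [task for task in tasks if task.name in target_datas]

for task in selected:
    # Interpolations such as `refer_image: ${.condition_images}` resolve on access.
    print(task.name, task.condition_images, task.refer_image)
```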
controlnet_aux ADDED
@@ -0,0 +1 @@
+ Subproject commit 54c6c49baf68bff290679f5bb896715f25932133
data/models/musev_structure.png ADDED
data/models/parallel_denoise.png ADDED
diffusers ADDED
@@ -0,0 +1 @@
1
+ Subproject commit abf2f8bf698a895cecc30a73c6ff4abb92fdce1c
environment.yml ADDED
@@ -0,0 +1,312 @@
1
+ name: musev
2
+ channels:
3
+ - https://repo.anaconda.com/pkgs/main
4
+ - defaults
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=main
7
+ - _openmp_mutex=5.1=1_gnu
8
+ - bzip2=1.0.8=h7b6447c_0
9
+ - ca-certificates=2023.12.12=h06a4308_0
10
+ - ld_impl_linux-64=2.38=h1181459_1
11
+ - libffi=3.3=he6710b0_2
12
+ - libgcc-ng=11.2.0=h1234567_1
13
+ - libgomp=11.2.0=h1234567_1
14
+ - libstdcxx-ng=11.2.0=h1234567_1
15
+ - libuuid=1.41.5=h5eee18b_0
16
+ - ncurses=6.4=h6a678d5_0
17
+ - openssl=1.1.1w=h7f8727e_0
18
+ - python=3.10.6=haa1d7c7_1
19
+ - readline=8.2=h5eee18b_0
20
+ - sqlite=3.41.2=h5eee18b_0
21
+ - tk=8.6.12=h1ccaba5_0
22
+ - xz=5.4.5=h5eee18b_0
23
+ - zlib=1.2.13=h5eee18b_0
24
+ - pip:
25
+ - absl-py==2.1.0
26
+ - accelerate==0.22.0
27
+ - addict==2.4.0
28
+ - aiofiles==23.2.1
29
+ - aiohttp==3.9.1
30
+ - aiosignal==1.3.1
31
+ - albumentations==1.3.1
32
+ - aliyun-python-sdk-core==2.14.0
33
+ - aliyun-python-sdk-kms==2.16.2
34
+ - altair==5.2.0
35
+ - antlr4-python3-runtime==4.9.3
36
+ - anyio==4.2.0
37
+ - appdirs==1.4.4
38
+ - argparse==1.4.0
39
+ - asttokens==2.4.1
40
+ - astunparse==1.6.3
41
+ - async-timeout==4.0.3
42
+ - attrs==23.2.0
43
+ - audioread==3.0.1
44
+ - basicsr==1.4.2
45
+ - beautifulsoup4==4.12.2
46
+ - bitsandbytes==0.41.1
47
+ - black==23.12.1
48
+ - blinker==1.7.0
49
+ - braceexpand==0.1.7
50
+ - cachetools==5.3.2
51
+ - certifi==2023.11.17
52
+ - cffi==1.16.0
53
+ - charset-normalizer==3.3.2
54
+ - chumpy==0.70
55
+ - click==8.1.7
56
+ - cmake==3.28.1
57
+ - colorama==0.4.6
58
+ - coloredlogs==15.0.1
59
+ - comm==0.2.1
60
+ - contourpy==1.2.0
61
+ - cos-python-sdk-v5==1.9.22
62
+ - coscmd==1.8.6.30
63
+ - crcmod==1.7
64
+ - cryptography==41.0.7
65
+ - cycler==0.12.1
66
+ - cython==3.0.2
67
+ - datetime==5.4
68
+ - debugpy==1.8.0
69
+ - decorator==4.4.2
70
+ - decord==0.6.0
71
+ - dill==0.3.7
72
+ - docker-pycreds==0.4.0
73
+ - dulwich==0.21.7
74
+ - easydict==1.11
75
+ - einops==0.7.0
76
+ - exceptiongroup==1.2.0
77
+ - executing==2.0.1
78
+ - fastapi==0.109.0
79
+ - ffmpeg==1.4
80
+ - ffmpeg-python==0.2.0
81
+ - ffmpy==0.3.1
82
+ - filelock==3.13.1
83
+ - flatbuffers==23.5.26
84
+ - fonttools==4.47.2
85
+ - frozenlist==1.4.1
86
+ - fsspec==2023.12.2
87
+ - ftfy==6.1.1
88
+ - future==0.18.3
89
+ - fuzzywuzzy==0.18.0
90
+ - fvcore==0.1.5.post20221221
91
+ - gast==0.4.0
92
+ - gdown==4.5.3
93
+ - gitdb==4.0.11
94
+ - gitpython==3.1.41
95
+ - google-auth==2.26.2
96
+ - google-auth-oauthlib==0.4.6
97
+ - google-pasta==0.2.0
98
+ - gradio==3.43.2
99
+ - gradio-client==0.5.0
100
+ - grpcio==1.60.0
101
+ - h11==0.14.0
102
+ - h5py==3.10.0
103
+ - httpcore==1.0.2
104
+ - httpx==0.26.0
105
+ - huggingface-hub==0.20.2
106
+ - humanfriendly==10.0
107
+ - idna==3.6
108
+ - imageio==2.31.1
109
+ - imageio-ffmpeg==0.4.8
110
+ - importlib-metadata==7.0.1
111
+ - importlib-resources==6.1.1
112
+ - iniconfig==2.0.0
113
+ - insightface==0.7.3
114
+ - invisible-watermark==0.1.5
115
+ - iopath==0.1.10
116
+ - ip-adapter==0.1.0
117
+ - iprogress==0.4
118
+ - ipykernel==6.29.0
119
+ - ipython==8.20.0
120
+ - ipywidgets==8.0.3
121
+ - jax==0.4.23
122
+ - jedi==0.19.1
123
+ - jinja2==3.1.3
124
+ - jmespath==0.10.0
125
+ - joblib==1.3.2
126
+ - json-tricks==3.17.3
127
+ - jsonschema==4.21.0
128
+ - jsonschema-specifications==2023.12.1
129
+ - jupyter-client==8.6.0
130
+ - jupyter-core==5.7.1
131
+ - jupyterlab-widgets==3.0.9
132
+ - keras==2.12.0
133
+ - kiwisolver==1.4.5
134
+ - kornia==0.7.0
135
+ - lazy-loader==0.3
136
+ - libclang==16.0.6
137
+ - librosa==0.10.1
138
+ - lightning-utilities==0.10.0
139
+ - lit==17.0.6
140
+ - llvmlite==0.41.1
141
+ - lmdb==1.4.1
142
+ - loguru==0.6.0
143
+ - markdown==3.5.2
144
+ - markdown-it-py==3.0.0
145
+ - markupsafe==2.0.1
146
+ - matplotlib==3.6.2
147
+ - matplotlib-inline==0.1.6
148
+ - mdurl==0.1.2
149
+ - mediapipe==0.10.3
150
+ - ml-dtypes==0.3.2
151
+ - model-index==0.1.11
152
+ - modelcards==0.1.6
153
+ - moviepy==1.0.3
154
+ - mpmath==1.3.0
155
+ - msgpack==1.0.7
156
+ - multidict==6.0.4
157
+ - munkres==1.1.4
158
+ - mypy-extensions==1.0.0
159
+ - nest-asyncio==1.5.9
160
+ - networkx==3.2.1
161
+ - ninja==1.11.1
162
+ - numba==0.58.1
163
+ - numpy==1.23.5
164
+ - oauthlib==3.2.2
165
+ - omegaconf==2.3.0
166
+ - onnx==1.14.1
167
+ - onnxruntime==1.15.1
168
+ - onnxsim==0.4.33
169
+ - open-clip-torch==2.20.0
170
+ - opencv-contrib-python==4.8.0.76
171
+ - opencv-python==4.9.0.80
172
+ - opencv-python-headless==4.9.0.80
173
+ - opendatalab==0.0.10
174
+ - openmim==0.3.9
175
+ - openxlab==0.0.34
176
+ - opt-einsum==3.3.0
177
+ - ordered-set==4.1.0
178
+ - orjson==3.9.10
179
+ - oss2==2.17.0
180
+ - packaging==23.2
181
+ - pandas==2.1.4
182
+ - parso==0.8.3
183
+ - pathspec==0.12.1
184
+ - pathtools==0.1.2
185
+ - pexpect==4.9.0
186
+ - pillow==10.2.0
187
+ - pip==23.3.1
188
+ - platformdirs==4.1.0
189
+ - pluggy==1.3.0
190
+ - pooch==1.8.0
191
+ - portalocker==2.8.2
192
+ - prettytable==3.9.0
193
+ - proglog==0.1.10
194
+ - prompt-toolkit==3.0.43
195
+ - protobuf==3.20.3
196
+ - psutil==5.9.7
197
+ - ptyprocess==0.7.0
198
+ - pure-eval==0.2.2
199
+ - pyarrow==14.0.2
200
+ - pyasn1==0.5.1
201
+ - pyasn1-modules==0.3.0
202
+ - pycocotools==2.0.7
203
+ - pycparser==2.21
204
+ - pycryptodome==3.20.0
205
+ - pydantic==1.10.2
206
+ - pydeck==0.8.1b0
207
+ - pydub==0.25.1
208
+ - pygments==2.17.2
209
+ - pynvml==11.5.0
210
+ - pyparsing==3.1.1
211
+ - pysocks==1.7.1
212
+ - pytest==7.4.4
213
+ - python-dateutil==2.8.2
214
+ - python-dotenv==1.0.0
215
+ - python-multipart==0.0.6
216
+ - pytorch-lightning==2.0.8
217
+ - pytube==15.0.0
218
+ - pytz==2023.3.post1
219
+ - pywavelets==1.5.0
220
+ - pyyaml==6.0.1
221
+ - pyzmq==25.1.2
222
+ - qudida==0.0.4
223
+ - redis==4.5.1
224
+ - referencing==0.32.1
225
+ - regex==2023.12.25
226
+ - requests==2.28.2
227
+ - requests-oauthlib==1.3.1
228
+ - rich==13.4.2
229
+ - rpds-py==0.17.1
230
+ - rsa==4.9
231
+ - safetensors==0.3.3
232
+ - scikit-image==0.22.0
233
+ - scikit-learn==1.3.2
234
+ - scipy==1.11.4
235
+ - semantic-version==2.10.0
236
+ - sentencepiece==0.1.99
237
+ - sentry-sdk==1.39.2
238
+ - setproctitle==1.3.3
239
+ - setuptools==60.2.0
240
+ - shapely==2.0.2
241
+ - six==1.16.0
242
+ - smmap==5.0.1
243
+ - sniffio==1.3.0
244
+ - sounddevice==0.4.6
245
+ - soundfile==0.12.1
246
+ - soupsieve==2.5
247
+ - soxr==0.3.7
248
+ - stack-data==0.6.3
249
+ - starlette==0.35.1
250
+ - streamlit==1.30.0
251
+ - streamlit-drawable-canvas==0.9.3
252
+ - sympy==1.12
253
+ - tabulate==0.9.0
254
+ - tb-nightly==2.11.0a20220906
255
+ - tenacity==8.2.3
256
+ - tensorboard==2.12.0
257
+ - tensorboard-data-server==0.6.1
258
+ - tensorboard-plugin-wit==1.8.1
259
+ - tensorflow==2.12.0
260
+ - tensorflow-estimator==2.12.0
261
+ - tensorflow-io-gcs-filesystem==0.35.0
262
+ - termcolor==2.4.0
263
+ - terminaltables==3.1.10
264
+ - test-tube==0.7.5
265
+ - threadpoolctl==3.2.0
266
+ - tifffile==2023.12.9
267
+ - timm==0.9.12
268
+ - tokenizers==0.13.3
269
+ - toml==0.10.2
270
+ - tomli==2.0.1
271
+ - toolz==0.12.0
272
+ - torch==2.0.1+cu118
273
+ - torch-tb-profiler==0.4.1
274
+ - torchmetrics==1.1.1
275
+ - torchvision==0.15.2+cu118
276
+ - tornado==6.4
277
+ - tqdm==4.65.2
278
+ - traitlets==5.14.1
279
+ - transformers==4.33.1
280
+ - triton==2.0.0
281
+ - typing-extensions==4.9.0
282
+ - tzdata==2023.4
283
+ - tzlocal==5.2
284
+ - urllib3==1.26.18
285
+ - urwid==2.4.2
286
+ - uvicorn==0.26.0
287
+ - validators==0.22.0
288
+ - wandb==0.15.10
289
+ - watchdog==3.0.0
290
+ - wcwidth==0.2.13
291
+ - webdataset==0.2.86
292
+ - webp==0.3.0
293
+ - websockets==11.0.3
294
+ - werkzeug==3.0.1
295
+ - wget==3.2
296
+ - wheel==0.41.2
297
+ - widgetsnbextension==4.0.9
298
+ - wrapt==1.14.1
299
+ - xformers==0.0.21
300
+ - xmltodict==0.13.0
301
+ - xtcocotools==1.14.3
302
+ - yacs==0.1.8
303
+ - yapf==0.40.2
304
+ - yarl==1.9.4
305
+ - zipp==3.17.0
306
+ - zope-interface==6.1
307
+ - fire==0.6.0
308
+ - cuid
309
+ - git+https://github.com/tencent-ailab/IP-Adapter.git@main
310
+ - git+https://github.com/openai/CLIP.git@main
311
+ prefix: /data/miniconda3/envs/musev
312
+
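The pinned torch==2.0.1+cu118 and torchvision==0.15.2+cu118 wheels imply a CUDA 11.8 build; the environment is presumably recreated with something like `conda env create -f environment.yml` followed by `conda activate musev`, and the machine-specific `prefix: /data/miniconda3/envs/musev` line may need adjusting or removing on other hosts.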
musev/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ import os
2
+ import logging
3
+ import logging.config
4
+
5
+ # load the logging configuration file
6
+ logging.config.fileConfig(os.path.join(os.path.dirname(__file__), "logging.conf"))
7
+
8
+ # create a logger
9
+ logger = logging.getLogger("musev")
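Because the logging configuration is applied at import time, submodules can pick up the same handlers by creating loggers under the "musev" namespace, as later modules do with `logging.getLogger(__name__)`. A minimal sketch (the module name here is hypothetical):

    import logging
    import musev  # importing the package runs fileConfig on musev/logging.conf

    logger = logging.getLogger("musev.my_module")  # hypothetical child logger
    logger.info("handlers and levels come from musev/logging.conf")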
musev/auto_prompt/__init__.py ADDED
File without changes
musev/auto_prompt/attributes/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from ...utils.register import Register
2
+
3
+ AttrRegister = Register(registry_name="attributes")
4
+
5
+ # must import like below to ensure that each class is registered with AttrRegister:
6
+ from .human import *
7
+ from .render import *
8
+ from .style import *
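The wildcard imports exist so that every class decorated with `@AttrRegister.register` ends up in the registry at import time. The `Register` class itself lives in `musev/utils/register.py`, which is outside this slice of the commit; the following is only a sketch of the interface the rest of the package relies on (decorator registration, `in`, and `[]` lookup), not the actual implementation:

    # sketch only; the real Register in musev/utils/register.py may differ in detail
    class Register:
        def __init__(self, registry_name: str):
            self.registry_name = registry_name
            self._registry = {}

        def register(self, cls):
            # used as @AttrRegister.register on attribute classes such as Sex or Render
            self._registry[getattr(cls, "name", cls.__name__)] = cls
            return cls

        def __contains__(self, key):  # supports `word in AttrRegister`
            return key in self._registry

        def __getitem__(self, key):  # supports AttrRegister[word]
            return self._registry[key]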
musev/auto_prompt/attributes/attr2template.py ADDED
@@ -0,0 +1,127 @@
1
+ r"""
2
+ 中文
3
+ 该模块将关键词字典转化为描述文本,生成完整的提词,从而降低对比实验成本、提升控制能力和效率。
4
+ 提词(prompy)对比实验会需要控制关键属性发生变化、其他属性不变的文本对。当需要控制的属性变量发生较大变化时,靠人为复制粘贴进行完成文本撰写工作量会非常大。
5
+ 该模块主要有三种类,分别是:
6
+ 1. `BaseAttribute2Text`: 单属性文本转换类
7
+ 2. `MultiAttr2Text` 多属性文本转化类,输出`List[Tuple[str, str]`。具体如何转换为文本在 `MultiAttr2PromptTemplate`中实现。
8
+ 3. `MultiAttr2PromptTemplate`:先将2生成的多属性文本字典列表转化为完整的文本,然后再使用内置的模板`template`拼接。拼接后的文本作为实际模型输入的提词。
9
+ 1. `template`字段若没有{},且有字符,则认为输入就是完整输入网络的`prompt`;
10
+ 2. `template`字段若含有{key},则认为是带关键词的字符串目标,多个属性由`template`字符串中顺序完全决定。关键词内容由表格中相关列通过`attr2text`转化而来;
11
+ 3. `template`字段有且只含有一个{},如`a portrait of {}`,则相关内容由 `PresetMultiAttr2PromptTemplate`中预定义好的`attrs`列表指定先后顺序;
12
+
13
+ English
14
+ This module converts a keyword dictionary into descriptive text, generating complete prompts to reduce the cost of comparison experiments, and improve control and efficiency.
15
+
16
+ Prompt-based comparison experiments require text pairs where the key attributes are controlled while other attributes remain constant. When the variable attributes to be controlled undergo significant changes, manually copying and pasting to write text can be very time-consuming.
17
+
18
+ This module mainly consists of three classes:
19
+
20
+ BaseAttribute2Text: A class for converting single attribute text.
21
+ MultiAttr2Text: A class for converting multi-attribute text, outputting List[Tuple[str, str]]. The specific implementation of how to convert to text is implemented in MultiAttr2PromptTemplate.
22
+ MultiAttr2PromptTemplate: First, the list of multi-attribute text dictionaries generated by 2 is converted into complete text, and then the built-in template template is used for concatenation. The concatenated text serves as the prompt for the actual model input.
23
+ If the template field does not contain {}, and there are characters, the input is considered the complete prompt for the network.
24
+ If the template field contains {key}, it is considered a string target with keywords, and the order of multiple attributes is completely determined by the template string. The keyword content is generated by attr2text from the relevant columns in the table.
25
+ If the template field contains only one {}, such as a portrait of {}, the relevant content is specified in the order defined by the attrs list predefined in PresetMultiAttr2PromptTemplate.
26
+ """
27
+
28
+ from typing import List, Tuple, Union
29
+
30
+ from mmcm.utils.str_util import (
31
+ has_key_brace,
32
+ merge_near_same_char,
33
+ get_word_from_key_brace_string,
34
+ )
35
+
36
+ from .attributes import MultiAttr2Text, merge_multi_attrtext, AttriributeIsText
37
+ from . import AttrRegister
38
+
39
+
40
+ class MultiAttr2PromptTemplate(object):
41
+ """
42
+ 将多属性转化为模型输入文本的实际类
43
+ The actual class that converts multiple attributes into model input text
44
+ """
45
+
46
+ def __init__(
47
+ self,
48
+ template: str,
49
+ attr2text: MultiAttr2Text,
50
+ name: str,
51
+ ) -> None:
52
+ """
53
+ Args:
54
+ template (str): 提词模板, prompt template.
55
+ 如果`template`含有{key},则根据key来取值。 if the template field contains {key}, it means that the actual value for that part of the prompt will be determined by the corresponding key
56
+ 如果`template`有且只有1个{},则根据先后顺序对texts中的值进行拼接。if the template field in MultiAttr2PromptTemplate contains only one {} placeholder, such as "a portrait of {}", the order of the attributes is determined by the attrs list predefined in PresetMultiAttr2PromptTemplate. The values of the attributes in the texts list are concatenated in the order specified by the attrs list.
57
+ attr2text (MultiAttr2Text): 多属性转换类。Class for converting multiple attributes into text prompt.
58
+ name (str): 该多属性文本模板类的名字,便于记忆. Class Instance name
59
+ """
60
+ self.attr2text = attr2text
61
+ self.name = name
62
+ if template == "":
63
+ template = "{}"
64
+ self.template = template
65
+ self.template_has_key_brace = has_key_brace(template)
66
+
67
+ def __call__(self, attributes: dict) -> Union[str, List[str]]:
68
+ texts = self.attr2text(attributes)
69
+ if not isinstance(texts, list):
70
+ texts = [texts]
71
+ prompts = [merge_multi_attrtext(text, self.template) for text in texts]
72
+ prompts = [merge_near_same_char(prompt) for prompt in prompts]
73
+ if len(prompts) == 1:
74
+ prompts = prompts[0]
75
+ return prompts
76
+
77
+
78
+ class KeywordMultiAttr2PromptTemplate(MultiAttr2PromptTemplate):
79
+ def __init__(self, template: str, name: str = "keywords") -> None:
80
+ """关键词模板属性2文本转化类
81
+ 1. 获取关键词模板字符串中的关键词属性;
82
+ 2. 从import * 存储在locals()中变量中获取对应的类;
83
+ 3. 将集成了多属性转换类的`MultiAttr2Text`
84
+ Args:
85
+ template (str): 含有{key}的模板字符串
86
+ name (str, optional): 该模板字符串名字,暂无实际用处. Defaults to "keywords".
87
+
88
+ class for converting keyword template attributes to text
89
+ 1. Get the keyword attributes in the keyword template string;
90
+ 2. Get the corresponding class from the variables stored in locals() by import *;
91
+ 3. Build a `MultiAttr2Text` that integrates the attribute conversion classes
92
+ Args:
93
+ template (str): template string containing {key}
94
+ name (str, optional): the name of the template string, no actual use. Defaults to "keywords".
95
+ """
96
+ assert has_key_brace(
97
+ template
98
+ ), "template should have key brace, but given {}".format(template)
99
+ keywords = get_word_from_key_brace_string(template)
100
+ funcs = []
101
+ for word in keywords:
102
+ if word in AttrRegister:
103
+ func = AttrRegister[word](name=word)
104
+ else:
105
+ func = AttriributeIsText(name=word)
106
+ funcs.append(func)
107
+ attr2text = MultiAttr2Text(funcs, name=name)
108
+ super().__init__(template, attr2text, name)
109
+
110
+
111
+ class OnlySpacePromptTemplate(MultiAttr2PromptTemplate):
112
+ def __init__(self, template: str, name: str = "space_prompt") -> None:
113
+ """纯空模板,无论输入啥,都只返回空格字符串作为prompt。
114
+ Args:
115
+ template (str): 符合只输出空格字符串的模板,
116
+ name (str, optional): 该模板字符串名字,暂无实际用处. Defaults to "space_prompt".
117
+
118
+ Pure empty template, no matter what the input is, it will only return a space string as the prompt.
119
+ Args:
120
+ template (str): template that only outputs a space string,
121
+ name (str, optional): the name of the template string, no actual use. Defaults to "space_prompt".
122
+ """
123
+ attr2text = None
124
+ super().__init__(template, attr2text, name)
125
+
126
+ def __call__(self, attributes: dict) -> Union[str, List[str]]:
127
+ return ""
musev/auto_prompt/attributes/attributes.py ADDED
@@ -0,0 +1,227 @@
1
+ from copy import deepcopy
2
+ from typing import List, Tuple, Dict
3
+
4
+ from mmcm.utils.str_util import has_key_brace
5
+
6
+
7
+ class BaseAttribute2Text(object):
8
+ """
9
+ 属性转化为文本的基类,该类作用就是输入属性,转化为描述文本。
10
+ Base class for converting attributes to text which converts attributes to prompt text.
11
+ """
12
+
13
+ name = "base_attribute"
14
+
15
+ def __init__(self, name: str = None) -> None:
16
+ """这里类实例初始化设置`name`参数,主要是为了便于一些没有提前实现、通过字符串参数实现的新属性。
17
+ Theses class instances are initialized with the `name` parameter to facilitate the implementation of new attributes that are not implemented in advance and are implemented through string parameters.
18
+
19
+ Args:
20
+ name (str, optional): _description_. Defaults to None.
21
+ """
22
+ if name is not None:
23
+ self.name = name
24
+
25
+ def __call__(self, attributes) -> str:
26
+ raise NotImplementedError
27
+
28
+
29
+ class AttributeIsTextAndName(BaseAttribute2Text):
30
+ """
31
+ 属性文本转换功能类,将key和value拼接在一起作为文本.
32
+ class for converting attributes to text which concatenates the key and value together as text.
33
+ """
34
+
35
+ name = "attribute_is_text_name"
36
+
37
+ def __call__(self, attributes) -> str:
38
+ if attributes == "" or attributes is None:
39
+ return ""
40
+ attributes = attributes.split(",")
41
+ text = ", ".join(
42
+ [
43
+ "{} {}".format(attr, self.name) if attr != "" else ""
44
+ for attr in attributes
45
+ ]
46
+ )
47
+ return text
48
+
49
+
50
+ class AttriributeIsText(BaseAttribute2Text):
51
+ """
52
+ 属性文本转换功能类,将value作为文本.
53
+ class for converting attributes to text which only uses the value as text.
54
+ """
55
+
56
+ name = "attribute_is_text"
57
+
58
+ def __call__(self, attributes: str) -> str:
59
+ if attributes == "" or attributes is None:
60
+ return ""
61
+ attributes = str(attributes)
62
+ attributes = attributes.split(",")
63
+ text = ", ".join(["{}".format(attr) for attr in attributes])
64
+ return text
65
+
66
+
67
+ class MultiAttr2Text(object):
68
+ """将多属性组成的字典转换成完整的文本描述,目前采用简单的前后拼接方式,以`, `作为拼接符号
69
+ class for converting a dictionary of multiple attributes into a complete text description. Currently, a simple front and back splicing method is used, with `, ` as the splicing symbol.
70
+
71
+ Args:
72
+ object (_type_): _description_
73
+ """
74
+
75
+ def __init__(self, funcs: list, name) -> None:
76
+ """
77
+ Args:
78
+ funcs (list): 继承`BaseAttribute2Text`并实现了`__call__`函数的类. Inherited `BaseAttribute2Text` and implemented the `__call__` function of the class.
79
+ name (_type_): 该多属性的一个名字,可通过该类方便了解对应相关属性都是关于啥的。 name of the multi-attribute, which can be used to easily understand what the corresponding related attributes are about.
80
+ """
81
+ if not isinstance(funcs, list):
82
+ funcs = [funcs]
83
+ self.funcs = funcs
84
+ self.name = name
85
+
86
+ def __call__(
87
+ self, dct: dict, ignored_blank_str: bool = False
88
+ ) -> List[Tuple[str, str]]:
89
+ """
90
+ 有时候一个属性可能会返回多个文本,如 style cartoon会返回宫崎骏和皮克斯两种风格,采用外积增殖成多个字典。
91
+ sometimes an attribute may return multiple texts, such as style cartoon will return two styles, Miyazaki and Pixar, which are multiplied into multiple dictionaries by the outer product.
92
+ Args:
93
+ dct (dict): 多属性组成的字典,可能有self.funcs关注的属性也可能没有,self.funcs按照各自的名字按需提取关注的属性和值,并转化成文本.
94
+ Dict of multiple attributes, may or may not have the attributes that self.funcs is concerned with. self.funcs extracts the attributes and values of interest according to their respective names and converts them into text.
95
+ ignored_blank_str (bool): 如果某个attr2text返回的是空字符串,是否要过滤掉该属性。默认`False`.
96
+ If the text returned by an attr2text is an empty string, whether to filter out the attribute. Defaults to `False`.
97
+ Returns:
98
+ Union[List[List[Tuple[str, str]]], List[Tuple[str, str]]: 多组多属性文本字典列表. Multiple sets of multi-attribute text dictionaries.
99
+ """
100
+ attrs_lst = [[]]
101
+ for func in self.funcs:
102
+ if func.name in dct:
103
+ attrs = func(dct[func.name])
104
+ if isinstance(attrs, str):
105
+ for i in range(len(attrs_lst)):
106
+ attrs_lst[i].append((func.name, attrs))
107
+ else:
108
+ # an attribute may return multiple texts
109
+ n_attrs = len(attrs)
110
+ new_attrs_lst = []
111
+ for n in range(n_attrs):
112
+ attrs_lst_cp = deepcopy(attrs_lst)
113
+ for i in range(len(attrs_lst_cp)):
114
+ attrs_lst_cp[i].append((func.name, attrs[n]))
115
+ new_attrs_lst.extend(attrs_lst_cp)
116
+ attrs_lst = new_attrs_lst
117
+
118
+ texts = [
119
+ [
120
+ (attr, text)
121
+ for (attr, text) in attrs
122
+ if not (text == "" and ignored_blank_str)
123
+ ]
124
+ for attrs in attrs_lst
125
+ ]
126
+ return texts
127
+
128
+
129
+ def format_tuple_texts(template: str, texts: Tuple[str, str]) -> str:
130
+ """使用含有"{}" 的模板对多属性文本元组进行拼接,形成新文本
131
+ concatenate multiple attribute text tuples using a template containing "{}" to form a new text
132
+ Args:
133
+ template (str):
134
+ texts (Tuple[str, str]): 多属性文本元组. multiple attribute text tuples
135
+
136
+ Returns:
137
+ str: 拼接后的新文本, merged new text
138
+ """
139
+ merged_text = ", ".join([text[1] for text in texts if text[1] != ""])
140
+ merged_text = template.format(merged_text)
141
+ return merged_text
142
+
143
+
144
+ def format_dct_texts(template: str, texts: Dict[str, str]) -> str:
145
+ """使用含有"{key}" 的模板对多属性文本字典进行拼接,形成新文本
146
+ concatenate multiple attribute text dictionaries using a template containing "{key}" to form a new text
147
+ Args:
148
+ template (str):
149
+ texts (Tuple[str, str]): 多属性文本字典. multiple attribute text dictionaries
150
+
151
+ Returns:
152
+ str: 拼接后的新文本, merged new text
153
+ """
154
+ merged_text = template.format(**texts)
155
+ return merged_text
156
+
157
+
158
+ def merge_multi_attrtext(texts: List[Tuple[str, str]], template: str = None) -> str:
159
+ """对多属性文本元组进行拼接,形成新文本。
160
+ 如果`template`含有{key},则根据key来取值;
161
+ 如果`template`有且只有1个{},则根据先后顺序对texts中的值进行拼接。
162
+
163
+ concatenate multiple attribute text tuples to form a new text.
164
+ if `template` contains {key}, the value is taken according to the key;
165
+ if `template` contains only one {}, the values in texts are concatenated in order.
166
+ Args:
167
+ texts (List[Tuple[str, str]]): Tuple[str, str]第一个str是属性名,第二个str是属性转化的文本.
168
+ Tuple[str, str] The first str is the attribute name, and the second str is the text of the attribute conversion.
169
+ template (str, optional): template . Defaults to None.
170
+
171
+ Returns:
172
+ str: 拼接后的新文本, merged new text
173
+ """
174
+ if not isinstance(texts, List):
175
+ texts = [texts]
176
+ if template is None or template == "":
177
+ template = "{}"
178
+ if has_key_brace(template):
179
+ texts = {k: v for k, v in texts}
180
+ merged_text = format_dct_texts(template, texts)
181
+ else:
182
+ merged_text = format_tuple_texts(template, texts)
183
+ return merged_text
184
+
185
+
186
+ class PresetMultiAttr2Text(MultiAttr2Text):
187
+ """预置了多种关注属性转换的类,方便维护
188
+ class for multiple attribute conversion with multiple attention attributes preset for easy maintenance
189
+
190
+ """
191
+
192
+ preset_attributes = []
193
+
194
+ def __init__(
195
+ self, funcs: List = None, use_preset: bool = True, name: str = "preset"
196
+ ) -> None:
197
+ """虽然预置了关注的属性列表和转换类,但也允许定义示例时,进行更新。
198
+ 注意`self.preset_attributes`的元素只是类名字,以便减少实例化的资源消耗。而funcs是实例化后的属性转换列表。
199
+
200
+ Although the list of attention attributes and conversion classes is preset, it is also allowed to be updated when defining an instance.
201
+ Note that the elements of `self.preset_attributes` are only class names, in order to reduce the resource consumption of instantiation. And funcs is a list of instantiated attribute conversions.
202
+
203
+ Args:
204
+ funcs (List, optional): list of funcs . Defaults to None.
205
+ use_preset (bool, optional): _description_. Defaults to True.
206
+ name (str, optional): _description_. Defaults to "preset".
207
+ """
208
+ if use_preset:
209
+ preset_funcs = self.preset()
210
+ else:
211
+ preset_funcs = []
212
+ if funcs is None:
213
+ funcs = []
214
+ if not isinstance(funcs, list):
215
+ funcs = [funcs]
216
+ funcs_names = [func.name for func in funcs]
217
+ preset_funcs = [
218
+ preset_func
219
+ for preset_func in preset_funcs
220
+ if preset_func.name not in funcs_names
221
+ ]
222
+ funcs = funcs + preset_funcs
223
+ super().__init__(funcs, name)
224
+
225
+ def preset(self):
226
+ funcs = [cls() for cls in self.preset_attributes]
227
+ return funcs
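A short sketch of the two merge styles handled by merge_multi_attrtext above, with invented attribute texts:

    from musev.auto_prompt.attributes.attributes import merge_multi_attrtext

    texts = [("sex", "girl"), ("style", "Miyazaki style")]

    # anonymous template: values are joined in order with ", "
    merge_multi_attrtext(texts, "a portrait of {}")
    # -> "a portrait of girl, Miyazaki style"

    # keyed template: values are placed by attribute name
    merge_multi_attrtext(texts, "a {style} portrait of a {sex}")
    # -> "a Miyazaki style portrait of a girl"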
musev/auto_prompt/attributes/human.py ADDED
@@ -0,0 +1,424 @@
1
+ from copy import deepcopy
2
+ import numpy as np
3
+ import random
4
+ import json
5
+
6
+ from .attributes import (
7
+ MultiAttr2Text,
8
+ AttriributeIsText,
9
+ AttributeIsTextAndName,
10
+ PresetMultiAttr2Text,
11
+ )
12
+ from .style import Style
13
+ from .render import Render
14
+ from . import AttrRegister
15
+
16
+
17
+ __all__ = [
18
+ "Age",
19
+ "Sex",
20
+ "Singing",
21
+ "Country",
22
+ "Lighting",
23
+ "Headwear",
24
+ "Eyes",
25
+ "Irises",
26
+ "Hair",
27
+ "Skin",
28
+ "Face",
29
+ "Smile",
30
+ "Expression",
31
+ "Clothes",
32
+ "Nose",
33
+ "Mouth",
34
+ "Beard",
35
+ "Necklace",
36
+ "KeyWords",
37
+ "InsightFace",
38
+ "Caption",
39
+ "Env",
40
+ "Decoration",
41
+ "Festival",
42
+ "SpringHeadwear",
43
+ "SpringClothes",
44
+ "Animal",
45
+ ]
46
+
47
+
48
+ @AttrRegister.register
49
+ class Sex(AttriributeIsText):
50
+ name = "sex"
51
+
52
+ def __init__(self, name: str = None) -> None:
53
+ super().__init__(name)
54
+
55
+
56
+ @AttrRegister.register
57
+ class Headwear(AttriributeIsText):
58
+ name = "headwear"
59
+
60
+ def __init__(self, name: str = None) -> None:
61
+ super().__init__(name)
62
+
63
+
64
+ @AttrRegister.register
65
+ class Expression(AttriributeIsText):
66
+ name = "expression"
67
+
68
+ def __init__(self, name: str = None) -> None:
69
+ super().__init__(name)
70
+
71
+
72
+ @AttrRegister.register
73
+ class KeyWords(AttriributeIsText):
74
+ name = "keywords"
75
+
76
+ def __init__(self, name: str = None) -> None:
77
+ super().__init__(name)
78
+
79
+
80
+ @AttrRegister.register
81
+ class Singing(AttriributeIsText):
82
+ def __init__(self, name: str = "singing") -> None:
83
+ super().__init__(name)
84
+
85
+
86
+ @AttrRegister.register
87
+ class Country(AttriributeIsText):
88
+ name = "country"
89
+
90
+ def __init__(self, name: str = None) -> None:
91
+ super().__init__(name)
92
+
93
+
94
+ @AttrRegister.register
95
+ class Clothes(AttriributeIsText):
96
+ name = "clothes"
97
+
98
+ def __init__(self, name: str = None) -> None:
99
+ super().__init__(name)
100
+
101
+
102
+ @AttrRegister.register
103
+ class Age(AttributeIsTextAndName):
104
+ name = "age"
105
+
106
+ def __init__(self, name: str = None) -> None:
107
+ super().__init__(name)
108
+
109
+ def __call__(self, attributes: str) -> str:
110
+ if not isinstance(attributes, str):
111
+ attributes = str(attributes)
112
+ attributes = attributes.split(",")
113
+ text = ", ".join(
114
+ ["{}-year-old".format(attr) if attr != "" else "" for attr in attributes]
115
+ )
116
+ return text
117
+
118
+
119
+ @AttrRegister.register
120
+ class Eyes(AttributeIsTextAndName):
121
+ name = "eyes"
122
+
123
+ def __init__(self, name: str = None) -> None:
124
+ super().__init__(name)
125
+
126
+
127
+ @AttrRegister.register
128
+ class Hair(AttributeIsTextAndName):
129
+ name = "hair"
130
+
131
+ def __init__(self, name: str = None) -> None:
132
+ super().__init__(name)
133
+
134
+
135
+ @AttrRegister.register
136
+ class Background(AttributeIsTextAndName):
137
+ name = "background"
138
+
139
+ def __init__(self, name: str = None) -> None:
140
+ super().__init__(name)
141
+
142
+
143
+ @AttrRegister.register
144
+ class Skin(AttributeIsTextAndName):
145
+ name = "skin"
146
+
147
+ def __init__(self, name: str = None) -> None:
148
+ super().__init__(name)
149
+
150
+
151
+ @AttrRegister.register
152
+ class Face(AttributeIsTextAndName):
153
+ name = "face"
154
+
155
+ def __init__(self, name: str = None) -> None:
156
+ super().__init__(name)
157
+
158
+
159
+ @AttrRegister.register
160
+ class Smile(AttributeIsTextAndName):
161
+ name = "smile"
162
+
163
+ def __init__(self, name: str = None) -> None:
164
+ super().__init__(name)
165
+
166
+
167
+ @AttrRegister.register
168
+ class Nose(AttributeIsTextAndName):
169
+ name = "nose"
170
+
171
+ def __init__(self, name: str = None) -> None:
172
+ super().__init__(name)
173
+
174
+
175
+ @AttrRegister.register
176
+ class Mouth(AttributeIsTextAndName):
177
+ name = "mouth"
178
+
179
+ def __init__(self, name: str = None) -> None:
180
+ super().__init__(name)
181
+
182
+
183
+ @AttrRegister.register
184
+ class Beard(AttriributeIsText):
185
+ name = "beard"
186
+
187
+ def __init__(self, name: str = None) -> None:
188
+ super().__init__(name)
189
+
190
+
191
+ @AttrRegister.register
192
+ class Necklace(AttributeIsTextAndName):
193
+ name = "necklace"
194
+
195
+ def __init__(self, name: str = None) -> None:
196
+ super().__init__(name)
197
+
198
+
199
+ @AttrRegister.register
200
+ class Irises(AttributeIsTextAndName):
201
+ name = "irises"
202
+
203
+ def __init__(self, name: str = None) -> None:
204
+ super().__init__(name)
205
+
206
+
207
+ @AttrRegister.register
208
+ class Lighting(AttributeIsTextAndName):
209
+ name = "lighting"
210
+
211
+ def __init__(self, name: str = None) -> None:
212
+ super().__init__(name)
213
+
214
+
215
+ PresetPortraitAttributes = [
216
+ Age,
217
+ Sex,
218
+ Singing,
219
+ Country,
220
+ Lighting,
221
+ Headwear,
222
+ Eyes,
223
+ Irises,
224
+ Hair,
225
+ Skin,
226
+ Face,
227
+ Smile,
228
+ Expression,
229
+ Clothes,
230
+ Nose,
231
+ Mouth,
232
+ Beard,
233
+ Necklace,
234
+ Style,
235
+ KeyWords,
236
+ Render,
237
+ ]
238
+
239
+
240
+ class PortraitMultiAttr2Text(PresetMultiAttr2Text):
241
+ preset_attributes = PresetPortraitAttributes
242
+
243
+ def __init__(self, funcs: list = None, use_preset=True, name="portrait") -> None:
244
+ super().__init__(funcs, use_preset, name)
245
+
246
+
247
+ @AttrRegister.register
248
+ class InsightFace(AttriributeIsText):
249
+ name = "insight_face"
250
+ face_render_dict = {
251
+ "boy": "handsome,elegant",
252
+ "girl": "gorgeous,kawaii,colorful",
253
+ }
254
+ key_words = "delicate face,beautiful eyes"
255
+
256
+ def __call__(self, attributes: str) -> str:
257
+ """将insight faces 检测的结果转化成prompt
258
+ convert the results of insight faces detection to prompt
259
+ Args:
260
+ face_list (_type_): _description_
261
+
262
+ Returns:
263
+ _type_: _description_
264
+ """
265
+ attributes = json.loads(attributes)
266
+ face_list = attributes["info"]
267
+ if len(face_list) == 0:
268
+ return ""
269
+
270
+ if attributes["image_type"] == "body":
271
+ for face in face_list:
272
+ if "black" in face and face["black"]:
273
+ return "african,dark skin"
274
+ return ""
275
+
276
+ gender_dict = {"girl": 0, "boy": 0}
277
+ face_render_list = []
278
+ black = False
279
+
280
+ for face in face_list:
281
+ if face["ratio"] < 0.02:
282
+ continue
283
+
284
+ if face["gender"] == 0:
285
+ gender_dict["girl"] += 1
286
+ face_render_list.append(self.face_render_dict["girl"])
287
+ else:
288
+ gender_dict["boy"] += 1
289
+ face_render_list.append(self.face_render_dict["boy"])
290
+
291
+ if "black" in face and face["black"]:
292
+ black = True
293
+
294
+ if len(face_render_list) == 0:
295
+ return ""
296
+ elif len(face_render_list) == 1:
297
+ solo = True
298
+ else:
299
+ solo = False
300
+
301
+ gender = ""
302
+ for g, num in gender_dict.items():
303
+ if num > 0:
304
+ if gender:
305
+ gender += ", "
306
+ gender += "{}{}".format(num, g)
307
+ if num > 1:
308
+ gender += "s"
309
+
310
+ face_render_list = ",".join(face_render_list)
311
+ face_render_list = face_render_list.split(",")
312
+ face_render = list(set(face_render_list))
313
+ face_render.sort(key=face_render_list.index)
314
+ face_render = ",".join(face_render)
315
+ if gender_dict["girl"] == 0:
316
+ face_render = "male focus," + face_render
317
+
318
+ insightface_prompt = "{},{},{}".format(gender, face_render, self.key_words)
319
+
320
+ if solo:
321
+ insightface_prompt += ",solo"
322
+ if black:
323
+ insightface_prompt = "african,dark skin," + insightface_prompt
324
+
325
+ return insightface_prompt
326
+
327
+
328
+ @AttrRegister.register
329
+ class Caption(AttriributeIsText):
330
+ name = "caption"
331
+
332
+
333
+ @AttrRegister.register
334
+ class Env(AttriributeIsText):
335
+ name = "env"
336
+ envs_list = [
337
+ "east asian architecture",
338
+ "fireworks",
339
+ "snow, snowflakes",
340
+ "snowing, snowflakes",
341
+ ]
342
+
343
+ def __call__(self, attributes: str = None) -> str:
344
+ if attributes != "" and attributes != " " and attributes is not None:
345
+ return attributes
346
+ else:
347
+ return random.choice(self.envs_list)
348
+
349
+
350
+ @AttrRegister.register
351
+ class Decoration(AttriributeIsText):
352
+ name = "decoration"
353
+
354
+ def __init__(self, name: str = None) -> None:
355
+ self.decoration_list = [
356
+ "chinese knot",
357
+ "flowers",
358
+ "food",
359
+ "lanterns",
360
+ "red envelop",
361
+ ]
362
+ super().__init__(name)
363
+
364
+ def __call__(self, attributes: str = None) -> str:
365
+ if attributes != "" and attributes != " " and attributes is not None:
366
+ return attributes
367
+ else:
368
+ return random.choice(self.decoration_list)
369
+
370
+
371
+ @AttrRegister.register
372
+ class Festival(AttriributeIsText):
373
+ name = "festival"
374
+ festival_list = ["new year"]
375
+
376
+ def __init__(self, name: str = None) -> None:
377
+ super().__init__(name)
378
+
379
+ def __call__(self, attributes: str = None) -> str:
380
+ if attributes != "" and attributes != " " and attributes is not None:
381
+ return attributes
382
+ else:
383
+ return random.choice(self.festival_list)
384
+
385
+
386
+ @AttrRegister.register
387
+ class SpringHeadwear(AttriributeIsText):
388
+ name = "spring_headwear"
389
+ headwear_list = ["rabbit ears", "rabbit ears, fur hat"]
390
+
391
+ def __call__(self, attributes: str = None) -> str:
392
+ if attributes != "" and attributes != " " and attributes is not None:
393
+ return attributes
394
+ else:
395
+ return random.choice(self.headwear_list)
396
+
397
+
398
+ @AttrRegister.register
399
+ class SpringClothes(AttriributeIsText):
400
+ name = "spring_clothes"
401
+ clothes_list = [
402
+ "mittens,chinese clothes",
403
+ "mittens,fur trim",
404
+ "mittens,red scarf",
405
+ "mittens,winter clothes",
406
+ ]
407
+
408
+ def __call__(self, attributes: str = None) -> str:
409
+ if attributes != "" and attributes != " " and attributes is not None:
410
+ return attributes
411
+ else:
412
+ return random.choice(self.clothes_list)
413
+
414
+
415
+ @AttrRegister.register
416
+ class Animal(AttriributeIsText):
417
+ name = "animal"
418
+ animal_list = ["rabbit", "holding rabbits"]
419
+
420
+ def __call__(self, attributes: str = None) -> str:
421
+ if attributes != "" and attributes != " " and attributes is not None:
422
+ return attributes
423
+ else:
424
+ return random.choice(self.animal_list)
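PortraitMultiAttr2Text above simply bundles the preset converters; a small sketch of feeding it a task-style dictionary (values invented for illustration):

    from musev.auto_prompt.attributes.human import PortraitMultiAttr2Text

    attr2text = PortraitMultiAttr2Text()
    texts = attr2text({"age": "25", "sex": "girl", "hair": "long black"})
    # texts should be a single-entry list roughly like
    # [[("age", "25-year-old"), ("sex", "girl"), ("hair", "long black hair")]]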
musev/auto_prompt/attributes/render.py ADDED
@@ -0,0 +1,33 @@
1
+ from mmcm.utils.util import flatten
2
+
3
+ from .attributes import BaseAttribute2Text
4
+ from . import AttrRegister
5
+
6
+ __all__ = ["Render"]
7
+
8
+ RenderMap = {
9
+ "Epic": "artstation, epic environment, highly detailed, 8k, HD",
10
+ "HD": "8k, highly detailed",
11
+ "EpicHD": "hyper detailed, beautiful lighting, epic environment, octane render, cinematic, 8k",
12
+ "Digital": "detailed illustration, crisp lines, digital art, 8k, trending on artstation",
13
+ "Unreal1": "artstation, concept art, smooth, sharp focus, illustration, unreal engine 5, 8k",
14
+ "Unreal2": "concept art, octane render, artstation, epic environment, highly detailed, 8k",
15
+ }
16
+
17
+
18
+ @AttrRegister.register
19
+ class Render(BaseAttribute2Text):
20
+ name = "render"
21
+
22
+ def __init__(self, name: str = None) -> None:
23
+ super().__init__(name)
24
+
25
+ def __call__(self, attributes: str) -> str:
26
+ if attributes == "" or attributes is None:
27
+ return ""
28
+ attributes = attributes.split(",")
29
+ render = [RenderMap[attr] for attr in attributes if attr in RenderMap]
30
+ render = flatten(render, ignored_iterable_types=[str])
31
+ if len(render) == 1:
32
+ render = render[0]
33
+ return render
musev/auto_prompt/attributes/style.py ADDED
@@ -0,0 +1,12 @@
1
+ from .attributes import AttriributeIsText
2
+ from . import AttrRegister
3
+
4
+ __all__ = ["Style"]
5
+
6
+
7
+ @AttrRegister.register
8
+ class Style(AttriributeIsText):
9
+ name = "style"
10
+
11
+ def __init__(self, name: str = None) -> None:
12
+ super().__init__(name)
musev/auto_prompt/human.py ADDED
@@ -0,0 +1,40 @@
1
+ """负责按照人相关的属性转化成提词
2
+ """
3
+ from typing import List
4
+
5
+ from .attributes.human import PortraitMultiAttr2Text
6
+ from .attributes.attributes import BaseAttribute2Text
7
+ from .attributes.attr2template import MultiAttr2PromptTemplate
8
+
9
+
10
+ class PortraitAttr2PromptTemplate(MultiAttr2PromptTemplate):
11
+ """可以将任务字典转化为形象提词模板类
12
+ template class for converting task dictionaries into image prompt templates
13
+ Args:
14
+ MultiAttr2PromptTemplate (_type_): _description_
15
+ """
16
+
17
+ templates = "a portrait of {}"
18
+
19
+ def __init__(
20
+ self, templates: str = None, attr2text: List = None, name: str = "portrait"
21
+ ) -> None:
22
+ """
23
+
24
+ Args:
25
+ templates (str, optional): 形象提词模板,若为None,则使用默认的类属性. Defaults to None.
26
+ portrait prompt template, if None, the default class attribute is used.
27
+ attr2text (List, optional): 形象类需要新增、更新的属性列表,默认使用PortraitMultiAttr2Text中定义的形象属性. Defaults to None.
28
+ the list of attributes that need to be added or updated in the image class, by default, the image attributes defined in PortraitMultiAttr2Text are used.
29
+ name (str, optional): 该形象类的名字. Defaults to "portrait".
30
+ class name of this class instance
31
+ """
32
+ if (
33
+ attr2text is None
34
+ or isinstance(attr2text, list)
35
+ or isinstance(attr2text, BaseAttribute2Text)
36
+ ):
37
+ attr2text = PortraitMultiAttr2Text(funcs=attr2text)
38
+ if templates is None:
39
+ templates = self.templates
40
+ super().__init__(templates, attr2text, name=name)
musev/auto_prompt/load_template.py ADDED
@@ -0,0 +1,37 @@
1
+ from mmcm.utils.str_util import has_key_brace
2
+
3
+ from .human import PortraitAttr2PromptTemplate
4
+ from .attributes.attr2template import (
5
+ KeywordMultiAttr2PromptTemplate,
6
+ OnlySpacePromptTemplate,
7
+ )
8
+
9
+
10
+ def get_template_by_name(template: str, name: str = None):
11
+ """根据 template_name 确定 prompt 生成器类
12
+ choose prompt generator class according to template_name
13
+ Args:
14
+ name (str): template 的名字简称,便于指定. template name abbreviation, for easy reference
15
+
16
+ Raises:
17
+ ValueError: ValueError: 如果name不在支持的列表中,则报错. if name is not in the supported list, an error is reported.
18
+
19
+ Returns:
20
+ MultiAttr2PromptTemplate: 能够将任务字典转化为提词的 实现了__call__功能的类. class that can convert task dictionaries into prompts and implements the __call__ function
21
+
22
+ """
23
+ if template == "" or template is None:
24
+ template = OnlySpacePromptTemplate(template=template)
25
+ elif has_key_brace(template):
26
+ # if has_key_brace(template):
27
+ template = KeywordMultiAttr2PromptTemplate(template=template)
28
+ else:
29
+ if name == "portrait":
30
+ template = PortraitAttr2PromptTemplate(templates=template)
31
+ else:
32
+ raise ValueError(
33
+ "PresetAttr2PromptTemplate only support one of [portrait], but given {}".format(
34
+ name
35
+ )
36
+ )
37
+ return template
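The dispatch above means the prompt field of a task can be a finished prompt, a {key} template, or empty. A short sketch of the three branches (comments describe the expected dispatch):

    from musev.auto_prompt.load_template import get_template_by_name

    get_template_by_name("", name=None)                        # OnlySpacePromptTemplate
    get_template_by_name("a {style} portrait of a {sex}")      # KeywordMultiAttr2PromptTemplate
    get_template_by_name("a portrait of {}", name="portrait")  # PortraitAttr2PromptTemplate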
musev/auto_prompt/util.py ADDED
@@ -0,0 +1,25 @@
1
+ from copy import deepcopy
2
+ from typing import Dict, List
3
+
4
+ from .load_template import get_template_by_name
5
+
6
+
7
+ def generate_prompts(tasks: List[Dict]) -> List[Dict]:
8
+ new_tasks = []
9
+ for task in tasks:
10
+ task["origin_prompt"] = deepcopy(task["prompt"])
11
+ # if the prompt is already a plain, non-empty string, keep it; if it contains a {} template or is empty (the default empty template), generate it from a template
12
+ if "{" not in task["prompt"] and len(task["prompt"]) != 0:
13
+ new_tasks.append(task)
14
+ else:
15
+ template = get_template_by_name(
16
+ template=task["prompt"], name=task.get("template_name", None)
17
+ )
18
+ prompts = template(task)
19
+ if not isinstance(prompts, list) and isinstance(prompts, str):
20
+ prompts = [prompts]
21
+ for prompt in prompts:
22
+ task_cp = deepcopy(task)
23
+ task_cp["prompt"] = prompt
24
+ new_tasks.append(task_cp)
25
+ return new_tasks
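generate_prompts is the glue between the task YAML and the template classes: tasks whose prompt is already a plain, non-empty string pass through unchanged, while templated or empty prompts are expanded. A small sketch with invented tasks:

    from musev.auto_prompt.util import generate_prompts

    tasks = [
        {"prompt": "a portrait of a {age} {sex}", "age": "25", "sex": "girl"},
        {"prompt": "(masterpiece, best quality, highres:1), peaceful beautiful river"},
    ]
    new_tasks = generate_prompts(tasks)
    # the first prompt is expanded via KeywordMultiAttr2PromptTemplate, the second
    # is kept as-is; both tasks keep their original text under "origin_prompt"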
musev/data/__init__.py ADDED
File without changes
musev/data/data_util.py ADDED
@@ -0,0 +1,681 @@
1
+ from typing import List, Dict, Literal, Union, Tuple
2
+ import os
3
+ import string
4
+ import logging
5
+
6
+ import torch
7
+ import numpy as np
8
+ from einops import rearrange, repeat
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ def generate_tasks_of_dir(
14
+ path: str,
15
+ output_dir: str,
16
+ exts: Tuple[str],
17
+ same_dir_name: bool = False,
18
+ **kwargs,
19
+ ) -> List[Dict]:
20
+ """covert video directory into tasks
21
+
22
+ Args:
23
+ path (str): _description_
24
+ output_dir (str): _description_
25
+ exts (Tuple[str]): _description_
26
+ same_dir_name (bool, optional): 存储路径是否保留和源视频相同的父文件名. Defaults to False.
27
+ whether keep the same parent dir name as the source video
28
+ Returns:
29
+ List[Dict]: _description_
30
+ """
31
+ tasks = []
32
+ for rootdir, dirs, files in os.walk(path):
33
+ for basename in files:
34
+ if basename.lower().endswith(exts):
35
+ video_path = os.path.join(rootdir, basename)
36
+ filename, ext = basename.split(".")
37
+ rootdir_name = os.path.basename(rootdir)
38
+ if same_dir_name:
39
+ save_path = os.path.join(
40
+ output_dir, rootdir_name, f"{filename}.h5py"
41
+ )
42
+ save_dir = os.path.join(output_dir, rootdir_name)
43
+ else:
44
+ save_path = os.path.join(output_dir, f"{filename}.h5py")
45
+ save_dir = output_dir
46
+ task = {
47
+ "video_path": video_path,
48
+ "output_path": save_path,
49
+ "output_dir": save_dir,
50
+ "filename": filename,
51
+ "ext": ext,
52
+ }
53
+ task.update(kwargs)
54
+ tasks.append(task)
55
+ return tasks
56
+
57
+
58
+ def sample_by_idx(
59
+ T: int,
60
+ n_sample: int,
61
+ sample_rate: int,
62
+ sample_start_idx: int = None,
63
+ change_sample_rate: bool = False,
64
+ seed: int = None,
65
+ whether_random: bool = True,
66
+ n_independent: int = 0,
67
+ ) -> List[int]:
68
+ """given a int to represent candidate list, sample n_sample with sample_rate from the candidate list
69
+
70
+ Args:
71
+ T (int): _description_
72
+ n_sample (int): 目标采样数目. sample number
73
+ sample_rate (int): 采样率, 每隔sample_rate个采样一个. sample interval, pick one per sample_rate number
74
+ sample_start_idx (int, optional): 采样开始位置的选择. start position to sample. Defaults to None.
75
+ change_sample_rate (bool, optional): 是否可以通过降低sample_rate的方式来完成采样. whether allow changing sample_rate to finish sample process. Defaults to False.
76
+ whether_random (bool, optional): 是否最后随机选择开始点. whether randomly choose sample start position. Defaults to True.
77
+
78
+ Raises:
79
+ ValueError: T / sample_rate should be larger than n_sample
80
+ Returns:
81
+ List[int]: 采样的索引位置. sampled index position
82
+ """
83
+ if T < n_sample:
84
+ raise ValueError(f"T({T}) < n_sample({n_sample})")
85
+ else:
86
+ if T / sample_rate < n_sample:
87
+ if not change_sample_rate:
88
+ raise ValueError(
89
+ f"T({T}) / sample_rate({sample_rate}) < n_sample({n_sample})"
90
+ )
91
+ else:
92
+ while T / sample_rate < n_sample:
93
+ sample_rate -= 1
94
+ logger.error(
95
+ f"sample_rate{sample_rate+1} is too large, decrease to {sample_rate}"
96
+ )
97
+ if sample_rate == 0:
98
+ raise ValueError("T / sample_rate < n_sample")
99
+
100
+ if sample_start_idx is None:
101
+ if whether_random:
102
+ sample_start_idx_candidates = np.arange(T - n_sample * sample_rate)
103
+ if seed is not None:
104
+ np.random.seed(seed)
105
+ sample_start_idx = np.random.choice(sample_start_idx_candidates, 1)[0]
106
+
107
+ else:
108
+ sample_start_idx = 0
109
+ sample_end_idx = sample_start_idx + sample_rate * n_sample
110
+ sample = list(range(sample_start_idx, sample_end_idx, sample_rate))
111
+ if n_independent == 0:
112
+ n_independent_sample = None
113
+ else:
114
+ left_candidate = np.array(
115
+ list(range(0, sample_start_idx)) + list(range(sample_end_idx, T))
116
+ )
117
+ if len(left_candidate) >= n_independent:
118
+ # use the remaining space at both ends to sample
119
+ n_independent_sample = np.random.choice(left_candidate, n_independent)
120
+ else:
121
+ # 当两端没有剩余采样空间时,使用任意不是sample中的帧
122
+ # if not enough space to sample, use any frame not in sample
123
+ left_candidate = np.array(list(set(range(T)) - set(sample)))
124
+ n_independent_sample = np.random.choice(left_candidate, n_independent)
125
+
126
+ return sample, sample_rate, n_independent_sample
127
+
128
+
129
+ def sample_tensor_by_idx(
130
+ tensor: Union[torch.Tensor, np.ndarray],
131
+ n_sample: int,
132
+ sample_rate: int,
133
+ sample_start_idx: int = 0,
134
+ change_sample_rate: bool = False,
135
+ seed: int = None,
136
+ dim: int = 0,
137
+ return_type: Literal["numpy", "torch"] = "torch",
138
+ whether_random: bool = True,
139
+ n_independent: int = 0,
140
+ ) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor, torch.Tensor]:
141
+ """sample sub_tensor
142
+
143
+ Args:
144
+ tensor (Union[torch.Tensor, np.ndarray]): _description_
145
+ n_sample (int): _description_
146
+ sample_rate (int): _description_
147
+ sample_start_idx (int, optional): _description_. Defaults to 0.
148
+ change_sample_rate (bool, optional): _description_. Defaults to False.
149
+ seed (int, optional): _description_. Defaults to None.
150
+ dim (int, optional): _description_. Defaults to 0.
151
+ return_type (Literal["numpy", "torch"], optional): _description_. Defaults to "torch".
152
+ whether_random (bool, optional): _description_. Defaults to True.
153
+ n_independent (int, optional): 独立于n_sample的采样数量. Defaults to 0.
154
+ n_independent sample number that is independent of n_sample
155
+
156
+ Returns:
157
+ Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor, torch.Tensor]: sampled tensor
158
+ """
159
+ if isinstance(tensor, np.ndarray):
160
+ tensor = torch.from_numpy(tensor)
161
+ T = tensor.shape[dim]
162
+ sample_idx, sample_rate, independent_sample_idx = sample_by_idx(
163
+ T,
164
+ n_sample,
165
+ sample_rate,
166
+ sample_start_idx,
167
+ change_sample_rate,
168
+ seed,
169
+ whether_random=whether_random,
170
+ n_independent=n_independent,
171
+ )
172
+ sample_idx = torch.LongTensor(sample_idx)
173
+ sample = torch.index_select(tensor, dim, sample_idx)
174
+ if independent_sample_idx is not None:
175
+ independent_sample_idx = torch.LongTensor(independent_sample_idx)
176
+ independent_sample = torch.index_select(tensor, dim, independent_sample_idx)
177
+ else:
178
+ independent_sample = None
179
+ independent_sample_idx = None
180
+ if return_type == "numpy":
181
+ sample = sample.cpu().numpy()
182
+ return sample, sample_idx, sample_rate, independent_sample, independent_sample_idx
183
+
184
+
185
+ def concat_two_tensor(
186
+ data1: torch.Tensor,
187
+ data2: torch.Tensor,
188
+ dim: int,
189
+ method: Literal[
190
+ "first_in_first_out", "first_in_last_out", "intertwine", "index"
191
+ ] = "first_in_first_out",
192
+ data1_index: torch.long = None,
193
+ data2_index: torch.long = None,
194
+ return_index: bool = False,
195
+ ):
196
+ """concat two tensor along dim with given method
197
+
198
+ Args:
199
+ data1 (torch.Tensor): first in data
200
+ data2 (torch.Tensor): last in data
201
+ dim (int): _description_
202
+ method (Literal["first_in_first_out", "first_in_last_out", "intertwine"], optional): _description_. Defaults to "first_in_first_out".
203
+
204
+ Raises:
205
+ NotImplementedError: unsupported method
206
+ ValueError: unsupported method
207
+
208
+ Returns:
209
+ _type_: _description_
210
+ """
211
+ len_data1 = data1.shape[dim]
212
+ len_data2 = data2.shape[dim]
213
+
214
+ if method == "first_in_first_out":
215
+ res = torch.concat([data1, data2], dim=dim)
216
+ data1_index = range(len_data1)
217
+ data2_index = [len_data1 + x for x in range(len_data2)]
218
+ elif method == "first_in_last_out":
219
+ res = torch.concat([data2, data1], dim=dim)
220
+ data2_index = range(len_data2)
221
+ data1_index = [len_data2 + x for x in range(len_data1)]
222
+ elif method == "intertwine":
223
+ raise NotImplementedError("intertwine")
224
+ elif method == "index":
225
+ res = concat_two_tensor_with_index(
226
+ data1=data1,
227
+ data1_index=data1_index,
228
+ data2=data2,
229
+ data2_index=data2_index,
230
+ dim=dim,
231
+ )
232
+ else:
233
+ raise ValueError(
234
+ "only support first_in_first_out, first_in_last_out, intertwine, index"
235
+ )
236
+ if return_index:
237
+ return res, data1_index, data2_index
238
+ else:
239
+ return res
240
+
241
+
242
+ def concat_two_tensor_with_index(
243
+ data1: torch.Tensor,
244
+ data1_index: torch.LongTensor,
245
+ data2: torch.Tensor,
246
+ data2_index: torch.LongTensor,
247
+ dim: int,
248
+ ) -> torch.Tensor:
249
+ """_summary_
250
+
251
+ Args:
252
+ data1 (torch.Tensor): b1*c1*h1*w1*...
253
+ data1_index (torch.LongTensor): N, if dim=1, N=c1
254
+ data2 (torch.Tensor): b2*c2*h2*w2*...
255
+ data2_index (torch.LongTensor): M, if dim=1, M=c2
256
+ dim (int): int
257
+
258
+ Returns:
259
+ torch.Tensor: b*c*h*w*..., if dim=1, b=b1=b2, c=c1+c2, h=h1=h2, w=w1=w2,...
260
+ """
261
+ shape1 = list(data1.shape)
262
+ shape2 = list(data2.shape)
263
+ target_shape = list(shape1)
264
+ target_shape[dim] = shape1[dim] + shape2[dim]
265
+ target = torch.zeros(target_shape, device=data1.device, dtype=data1.dtype)
266
+ target = batch_index_copy(target, dim=dim, index=data1_index, source=data1)
267
+ target = batch_index_copy(target, dim=dim, index=data2_index, source=data2)
268
+ return target
269
+
270
+
271
+ def repeat_index_to_target_size(
272
+ index: torch.LongTensor, target_size: int
273
+ ) -> torch.LongTensor:
274
+ if len(index.shape) == 1:
275
+ index = repeat(index, "n -> b n", b=target_size)
276
+ if len(index.shape) == 2:
277
+ remainder = target_size % index.shape[0]
278
+ assert (
279
+ remainder == 0
280
+ ), f"target_size % index.shape[0] must be zero, but give {target_size % index.shape[0]}"
281
+ index = repeat(index, "b n -> (b c) n", c=int(target_size / index.shape[0]))
282
+ return index
283
+
284
+
285
+ def batch_concat_two_tensor_with_index(
286
+ data1: torch.Tensor,
287
+ data1_index: torch.LongTensor,
288
+ data2: torch.Tensor,
289
+ data2_index: torch.LongTensor,
290
+ dim: int,
291
+ ) -> torch.Tensor:
292
+ return concat_two_tensor_with_index(data1, data1_index, data2, data2_index, dim)
293
+
294
+
295
+ def interwine_two_tensor(
296
+ data1: torch.Tensor,
297
+ data2: torch.Tensor,
298
+ dim: int,
299
+ return_index: bool = False,
300
+ ) -> torch.Tensor:
301
+ shape1 = list(data1.shape)
302
+ shape2 = list(data2.shape)
303
+ target_shape = list(shape1)
304
+ target_shape[dim] = shape1[dim] + shape2[dim]
305
+ target = torch.zeros(target_shape, device=data1.device, dtype=data1.dtype)
306
+ data1_reshape = torch.swapaxes(data1, 0, dim)
307
+ data2_reshape = torch.swapaxes(data2, 0, dim)
308
+ target = torch.swapaxes(target, 0, dim)
309
+ total_index = set(range(target_shape[dim]))
310
+ data1_index = range(0, 2 * shape1[dim], 2)
311
+ data2_index = sorted(list(set(total_index) - set(data1_index)))
312
+ data1_index = torch.LongTensor(data1_index)
313
+ data2_index = torch.LongTensor(data2_index)
314
+ target[data1_index, ...] = data1_reshape
315
+ target[data2_index, ...] = data2_reshape
316
+ target = torch.swapaxes(target, 0, dim)
317
+ if return_index:
318
+ return target, data1_index, data2_index
319
+ else:
320
+ return target
321
+
322
+
323
+ def split_index(
324
+ indexs: torch.Tensor,
325
+ n_first: int = None,
326
+ n_last: int = None,
327
+ method: Literal[
328
+ "first_in_first_out", "first_in_last_out", "intertwine", "index", "random"
329
+ ] = "first_in_first_out",
330
+ ):
331
+ """_summary_
332
+
333
+ Args:
334
+ indexs (List): _description_
335
+ n_first (int): _description_
336
+ n_last (int): _description_
337
+ method (Literal["first_in_first_out", "first_in_last_out", "intertwine", "index"], optional): _description_. Defaults to "first_in_first_out".
338
+
339
+ Raises:
340
+ NotImplementedError: _description_
341
+
342
+ Returns:
343
+ first_index: _description_
344
+ last_index:
345
+ """
346
+ # assert (
347
+ # n_first is None and n_last is None
348
+ # ), "must assign one value for n_first or n_last"
349
+ n_total = len(indexs)
350
+ if n_first is None:
351
+ n_first = n_total - n_last
352
+ if n_last is None:
353
+ n_last = n_total - n_first
354
+ assert len(indexs) == n_first + n_last
355
+ if method == "first_in_first_out":
356
+ first_index = indexs[:n_first]
357
+ last_index = indexs[n_first:]
358
+ elif method == "first_in_last_out":
359
+ first_index = indexs[n_last:]
360
+ last_index = indexs[:n_last]
361
+ elif method == "intertwine":
362
+ raise NotImplementedError
363
+ elif method == "random":
364
+ idx_ = torch.randperm(len(indexs))
365
+ first_index = indexs[idx_[:n_first]]
366
+ last_index = indexs[idx_[n_first:]]
367
+ return first_index, last_index
368
+
369
+
370
+ def split_tensor(
371
+ tensor: torch.Tensor,
372
+ dim: int,
373
+ n_first=None,
374
+ n_last=None,
375
+ method: Literal[
376
+ "first_in_first_out", "first_in_last_out", "intertwine", "index", "random"
377
+ ] = "first_in_first_out",
378
+ need_return_index: bool = False,
379
+ ):
380
+ device = tensor.device
381
+ total = tensor.shape[dim]
382
+ if n_first is None:
383
+ n_first = total - n_last
384
+ if n_last is None:
385
+ n_last = total - n_first
386
+ indexs = torch.arange(
387
+ total,
388
+ dtype=torch.long,
389
+ device=device,
390
+ )
391
+ (
392
+ first_index,
393
+ last_index,
394
+ ) = split_index(
395
+ indexs=indexs,
396
+ n_first=n_first,
397
+ method=method,
398
+ )
399
+ first_tensor = torch.index_select(tensor, dim=dim, index=first_index)
400
+ last_tensor = torch.index_select(tensor, dim=dim, index=last_index)
401
+ if need_return_index:
402
+ return (
403
+ first_tensor,
404
+ last_tensor,
405
+ first_index,
406
+ last_index,
407
+ )
408
+ else:
409
+ return (first_tensor, last_tensor)
410
+
411
+
412
+ # TODO: optimization of batch_index_select still to be determined
413
+ def batch_index_select(
414
+ tensor: torch.Tensor, index: torch.LongTensor, dim: int
415
+ ) -> torch.Tensor:
416
+ """_summary_
417
+
418
+ Args:
419
+ tensor (torch.Tensor): D1*D2*D3*D4...
420
+ index (torch.LongTensor): D1*N or N, N<= tensor.shape[dim]
421
+ dim (int): dim to select
422
+
423
+ Returns:
424
+ torch.Tensor: D1*...*N*...
425
+ """
426
+ # TODO: now only support N same for every d1
427
+ if len(index.shape) == 1:
428
+ return torch.index_select(tensor, dim=dim, index=index)
429
+ else:
430
+ index = repeat_index_to_target_size(index, tensor.shape[0])
431
+ out = []
432
+ for i in torch.arange(tensor.shape[0]):
433
+ sub_tensor = tensor[i]
434
+ sub_index = index[i]
435
+ d = torch.index_select(sub_tensor, dim=dim - 1, index=sub_index)
436
+ out.append(d)
437
+ return torch.stack(out).to(dtype=tensor.dtype)
438
+
439
+
440
+ def batch_index_copy(
441
+ tensor: torch.Tensor, dim: int, index: torch.LongTensor, source: torch.Tensor
442
+ ) -> torch.Tensor:
443
+ """_summary_
444
+
445
+ Args:
446
+ tensor (torch.Tensor): b*c*h
447
+ dim (int):
448
+ index (torch.LongTensor): b*d,
449
+ source (torch.Tensor):
450
+ b*d*h*..., if dim=1
451
+ b*c*d*..., if dim=2
452
+
453
+ Returns:
454
+ torch.Tensor: b*c*d*...
455
+ """
456
+ if len(index.shape) == 1:
457
+ tensor.index_copy_(dim=dim, index=index, source=source)
458
+ else:
459
+ index = repeat_index_to_target_size(index, tensor.shape[0])
460
+
461
+ batch_size = tensor.shape[0]
462
+ for b in torch.arange(batch_size):
463
+ sub_index = index[b]
464
+ sub_source = source[b]
465
+ sub_tensor = tensor[b]
466
+ sub_tensor.index_copy_(dim=dim - 1, index=sub_index, source=sub_source)
467
+ tensor[b] = sub_tensor
468
+ return tensor
469
+
470
+
471
+ def batch_index_fill(
472
+ tensor: torch.Tensor,
473
+ dim: int,
474
+ index: torch.LongTensor,
475
+ value: Literal[torch.Tensor, torch.float],
476
+ ) -> torch.Tensor:
477
+ """_summary_
478
+
479
+ Args:
480
+ tensor (torch.Tensor): b*c*h
481
+ dim (int):
482
+ index (torch.LongTensor): b*d,
483
+ value (torch.Tensor): b
484
+
485
+ Returns:
486
+ torch.Tensor: b*c*d*...
487
+ """
488
+ index = repeat_index_to_target_size(index, tensor.shape[0])
489
+ batch_size = tensor.shape[0]
490
+ for b in torch.arange(batch_size):
491
+ sub_index = index[b]
492
+ sub_value = value[b] if isinstance(value, torch.Tensor) else value
493
+ sub_tensor = tensor[b]
494
+ sub_tensor.index_fill_(dim - 1, sub_index, sub_value)
495
+ tensor[b] = sub_tensor
496
+ return tensor
497
+
498
+
499
+ def adaptive_instance_normalization(
500
+ src: torch.Tensor,
501
+ dst: torch.Tensor,
502
+ eps: float = 1e-6,
503
+ ):
504
+ """
505
+ Args:
506
+ src (torch.Tensor): b c t h w
507
+ dst (torch.Tensor): b c t h w
508
+ """
509
+ ndim = src.ndim
510
+ if ndim == 5:
511
+ dim = (2, 3, 4)
512
+ elif ndim == 4:
513
+ dim = (2, 3)
514
+ elif ndim == 3:
515
+ dim = 2
516
+ else:
517
+ raise ValueError("only support ndim in [3,4,5], but given {ndim}")
518
+ var, mean = torch.var_mean(src, dim=dim, keepdim=True, correction=0)
519
+ std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
520
+ dst = align_repeat_tensor_single_dim(dst, src.shape[0], dim=0)
521
+ # torch.var_mean returns (var, mean); keep the unpacking order consistent with the src statistics above
+ var_acc, mean_acc = torch.var_mean(dst, dim=dim, keepdim=True, correction=0)
522
+ # mean_acc = sum(mean_acc) / float(len(mean_acc))
523
+ # var_acc = sum(var_acc) / float(len(var_acc))
524
+ std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5
525
+ src = (((src - mean) / std) * std_acc) + mean_acc
526
+ return src
527
+
528
+
529
+ def adaptive_instance_normalization_with_ref(
530
+ src: torch.LongTensor,
531
+ dst: torch.LongTensor,
532
+ style_fidelity: float = 0.5,
533
+ do_classifier_free_guidance: bool = True,
534
+ ):
535
+ # logger.debug(
536
+ # f"src={src.shape}, min={src.min()}, max={src.max()}, mean={src.mean()}, \n"
537
+ # f"dst={src.shape}, min={dst.min()}, max={dst.max()}, mean={dst.mean()}"
538
+ # )
539
+ batch_size = src.shape[0] // 2
540
+ uc_mask = torch.Tensor([1] * batch_size + [0] * batch_size).type_as(src).bool()
541
+ src_uc = adaptive_instance_normalization(src, dst)
542
+ src_c = src_uc.clone()
543
+ # TODO: this branch assumes do_classifier_free_guidance and style_fidelity > 0 are both True
544
+ if do_classifier_free_guidance and style_fidelity > 0:
545
+ src_c[uc_mask] = src[uc_mask]
546
+ src = style_fidelity * src_c + (1.0 - style_fidelity) * src_uc
547
+ return src
548
+
549
+
550
+ def batch_adain_conditioned_tensor(
551
+ tensor: torch.Tensor,
552
+ src_index: torch.LongTensor,
553
+ dst_index: torch.LongTensor,
554
+ keep_dim: bool = True,
555
+ num_frames: int = None,
556
+ dim: int = 2,
557
+ style_fidelity: float = 0.5,
558
+ do_classifier_free_guidance: bool = True,
559
+ need_style_fidelity: bool = False,
560
+ ):
561
+ """_summary_
562
+
563
+ Args:
564
+ tensor (torch.Tensor): b c t h w
565
+ src_index (torch.LongTensor): _description_
566
+ dst_index (torch.LongTensor): _description_
567
+ keep_dim (bool, optional): _description_. Defaults to True.
568
+
569
+ Returns:
570
+ _type_: _description_
571
+ """
572
+ ndim = tensor.ndim
573
+ dtype = tensor.dtype
574
+ if ndim == 4 and num_frames is not None:
575
+ tensor = rearrange(tensor, "(b t) c h w-> b c t h w ", t=num_frames)
576
+ src = batch_index_select(tensor, dim=dim, index=src_index).contiguous()
577
+ dst = batch_index_select(tensor, dim=dim, index=dst_index).contiguous()
578
+ if need_style_fidelity:
579
+ src = adaptive_instance_normalization_with_ref(
580
+ src=src,
581
+ dst=dst,
582
+ style_fidelity=style_fidelity,
583
+ do_classifier_free_guidance=do_classifier_free_guidance,
584
+ # note: adaptive_instance_normalization_with_ref does not accept need_style_fidelity, so it is not forwarded
585
+ )
586
+ else:
587
+ src = adaptive_instance_normalization(
588
+ src=src,
589
+ dst=dst,
590
+ )
591
+ if keep_dim:
592
+ src = batch_concat_two_tensor_with_index(
593
+ src.to(dtype=dtype),
594
+ src_index,
595
+ dst.to(dtype=dtype),
596
+ dst_index,
597
+ dim=dim,
598
+ )
599
+
600
+ if ndim == 4 and num_frames is not None:
601
+ src = rearrange(src, "b c t h w ->(b t) c h w")
602
+ return src
603
+
604
+
605
+ def align_repeat_tensor_single_dim(
606
+ src: torch.Tensor,
607
+ target_length: int,
608
+ dim: int = 0,
609
+ n_src_base_length: int = 1,
610
+ src_base_index: List[int] = None,
611
+ ) -> torch.Tensor:
612
+ """沿着 dim 纬度, 补齐 src 的长度到目标 target_length。
613
+ 当 src 长度不如 target_length 时, 取其中 前 n_src_base_length 然后 repeat 到 target_length
614
+
615
+ align length of src to target_length along dim
616
+ when src length is less than target_length, take the first n_src_base_length and repeat to target_length
617
+
618
+ Args:
619
+ src (torch.Tensor): 输入 tensor, input tensor
620
+ target_length (int): 目标长度, target_length
621
+ dim (int, optional): 处理纬度, target dim . Defaults to 0.
622
+ n_src_base_length (int, optional): src 的基本单元长度, basic length of src. Defaults to 1.
623
+
624
+ Returns:
625
+ torch.Tensor: _description_
626
+ """
627
+ src_dim_length = src.shape[dim]
628
+ if target_length > src_dim_length:
629
+ if target_length % src_dim_length == 0:
630
+ new = src.repeat_interleave(
631
+ repeats=target_length // src_dim_length, dim=dim
632
+ )
633
+ else:
634
+ if src_base_index is None and n_src_base_length is not None:
635
+ src_base_index = torch.arange(n_src_base_length)
636
+
637
+ new = src.index_select(
638
+ dim=dim,
639
+ index=torch.LongTensor(src_base_index).to(device=src.device),
640
+ )
641
+ new = new.repeat_interleave(
642
+ repeats=target_length // len(src_base_index),
643
+ dim=dim,
644
+ )
645
+ elif target_length < src_dim_length:
646
+ new = src.index_select(
647
+ dim=dim,
648
+ index=torch.LongTensor(torch.arange(target_length)).to(device=src.device),
649
+ )
650
+ else:
651
+ new = src
652
+ return new
653
+
654
+
655
+ def fuse_part_tensor(
656
+ src: torch.Tensor,
657
+ dst: torch.Tensor,
658
+ overlap: int,
659
+ weight: float = 0.5,
660
+ skip_step: int = 0,
661
+ ) -> torch.Tensor:
662
+ """fuse overstep tensor with weight of src into dst
663
+ out = src_fused_part * weight + dst * (1-weight) for overlap
664
+
665
+ Args:
666
+ src (torch.Tensor): b c t h w
667
+ dst (torch.Tensor): b c t h w
668
+ overlap (int): 1
669
+ weight (float, optional): weight of src tensor part. Defaults to 0.5.
670
+
671
+ Returns:
672
+ torch.Tensor: fused tensor
673
+ """
674
+ if overlap == 0:
675
+ return dst
676
+ else:
677
+ dst[:, :, skip_step : skip_step + overlap] = (
678
+ weight * src[:, :, -overlap:]
679
+ + (1 - weight) * dst[:, :, skip_step : skip_step + overlap]
680
+ )
681
+ return dst
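A minimal usage sketch of the tensor helpers added in musev/data/data_util.py above (interwine_two_tensor, split_tensor, batch_index_select); the shapes below are illustrative assumptions, not values taken from MuseV configs:

import torch
from musev.data.data_util import (
    batch_index_select,
    interwine_two_tensor,
    split_tensor,
)

frames_a = torch.randn(2, 4, 8, 16, 16)  # b c t h w, made-up shapes
frames_b = torch.randn(2, 4, 8, 16, 16)

# interleave the two clips along the time axis (t becomes 16, a/b alternate)
mixed, idx_a, idx_b = interwine_two_tensor(frames_a, frames_b, dim=2, return_index=True)

# split the first 2 frames (e.g. vision-condition frames) from the rest
cond, rest, cond_idx, rest_idx = split_tensor(
    mixed, dim=2, n_first=2, method="first_in_first_out", need_return_index=True
)

# per-sample frame selection with a b x n index tensor
index = torch.tensor([[0, 3], [1, 2]])
picked = batch_index_select(mixed, index=index, dim=2)
print(mixed.shape, cond.shape, rest.shape, picked.shape)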
musev/logging.conf ADDED
@@ -0,0 +1,32 @@
+ [loggers]
+ keys=root,musev
+
+ [handlers]
+ keys=consoleHandler
+
+ [formatters]
+ keys=musevFormatter
+
+ [logger_root]
+ level=INFO
+ handlers=consoleHandler
+
+ # keep the logger level as low as possible
+ [logger_musev]
+ level=DEBUG
+ handlers=consoleHandler
+ qualname=musev
+ propagate=0
+
+ # set the handler level higher than the logger level
+ [handler_consoleHandler]
+ class=StreamHandler
+ level=DEBUG
+ # level=INFO
+
+ formatter=musevFormatter
+ args=(sys.stdout,)
+
+ [formatter_musevFormatter]
+ format=%(asctime)s- %(name)s:%(lineno)d- %(levelname)s- %(message)s
+ datefmt=
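A rough sketch of how a fileConfig-style config like this is typically loaded (the path handling below is an assumption for illustration; the repo presumably wires this up when the musev package is imported):

import logging
import logging.config
import os

# hypothetical path resolution; point this at wherever musev/logging.conf lives in your checkout
conf_path = os.path.join(os.path.dirname(__file__), "musev", "logging.conf")
logging.config.fileConfig(conf_path, disable_existing_loggers=False)

logger = logging.getLogger("musev")  # qualname defined in [logger_musev]
logger.debug("musev logger configured: DEBUG records go to stdout via consoleHandler")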
musev/models/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from ..utils.register import Register
+
+ Model_Register = Register(registry_name="torch_model")
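Model_Register is used as a class decorator by the attention processors later in this commit; a small sketch of that registration pattern, assuming only the decorator behaviour visible in this diff (the lookup API of Register itself is not shown here):

import torch.nn as nn
from musev.models import Model_Register

@Model_Register.register  # same decorator style as BaseIPAttnProcessor below
class MyToyProcessor(nn.Module):
    """Hypothetical processor registered under the "torch_model" registry."""

    def __init__(self):
        super().__init__()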
musev/models/attention.py ADDED
@@ -0,0 +1,431 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/diffusers/blob/64bf5d33b7ef1b1deac256bed7bd99b55020c4e0/src/diffusers/models/attention.py
16
+ from __future__ import annotations
17
+ from copy import deepcopy
18
+
19
+ from typing import Any, Dict, List, Literal, Optional, Callable, Tuple
20
+ import logging
21
+ from einops import rearrange
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from torch import nn
26
+
27
+ from diffusers.models.embeddings import CombinedTimestepLabelEmbeddings
28
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
29
+ from diffusers.models.attention_processor import Attention as DiffusersAttention
30
+ from diffusers.models.attention import (
31
+ BasicTransformerBlock as DiffusersBasicTransformerBlock,
32
+ AdaLayerNormZero,
33
+ AdaLayerNorm,
34
+ FeedForward,
35
+ )
36
+ from diffusers.models.attention_processor import AttnProcessor
37
+
38
+ from .attention_processor import IPAttention, BaseIPAttnProcessor
39
+
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
+ def not_use_xformers_anyway(
45
+ use_memory_efficient_attention_xformers: bool,
46
+ attention_op: Optional[Callable] = None,
47
+ ):
48
+ return None
49
+
50
+
51
+ @maybe_allow_in_graph
52
+ class BasicTransformerBlock(DiffusersBasicTransformerBlock):
53
+ print_idx = 0
54
+
55
+ def __init__(
56
+ self,
57
+ dim: int,
58
+ num_attention_heads: int,
59
+ attention_head_dim: int,
60
+ dropout=0,
61
+ cross_attention_dim: int | None = None,
62
+ activation_fn: str = "geglu",
63
+ num_embeds_ada_norm: int | None = None,
64
+ attention_bias: bool = False,
65
+ only_cross_attention: bool = False,
66
+ double_self_attention: bool = False,
67
+ upcast_attention: bool = False,
68
+ norm_elementwise_affine: bool = True,
69
+ norm_type: str = "layer_norm",
70
+ final_dropout: bool = False,
71
+ attention_type: str = "default",
72
+ allow_xformers: bool = True,
73
+ cross_attn_temporal_cond: bool = False,
74
+ image_scale: float = 1.0,
75
+ processor: AttnProcessor | None = None,
76
+ ip_adapter_cross_attn: bool = False,
77
+ need_t2i_facein: bool = False,
78
+ need_t2i_ip_adapter_face: bool = False,
79
+ ):
80
+ if not only_cross_attention and double_self_attention:
81
+ cross_attention_dim = None
82
+ super().__init__(
83
+ dim,
84
+ num_attention_heads,
85
+ attention_head_dim,
86
+ dropout,
87
+ cross_attention_dim,
88
+ activation_fn,
89
+ num_embeds_ada_norm,
90
+ attention_bias,
91
+ only_cross_attention,
92
+ double_self_attention,
93
+ upcast_attention,
94
+ norm_elementwise_affine,
95
+ norm_type,
96
+ final_dropout,
97
+ attention_type,
98
+ )
99
+
100
+ self.attn1 = IPAttention(
101
+ query_dim=dim,
102
+ heads=num_attention_heads,
103
+ dim_head=attention_head_dim,
104
+ dropout=dropout,
105
+ bias=attention_bias,
106
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
107
+ upcast_attention=upcast_attention,
108
+ cross_attn_temporal_cond=cross_attn_temporal_cond,
109
+ image_scale=image_scale,
110
+ ip_adapter_dim=cross_attention_dim
111
+ if only_cross_attention
112
+ else attention_head_dim,
113
+ facein_dim=cross_attention_dim
114
+ if only_cross_attention
115
+ else attention_head_dim,
116
+ processor=processor,
117
+ )
118
+ # 2. Cross-Attn
119
+ if cross_attention_dim is not None or double_self_attention:
120
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
121
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
122
+ # the second cross attention block.
123
+ self.norm2 = (
124
+ AdaLayerNorm(dim, num_embeds_ada_norm)
125
+ if self.use_ada_layer_norm
126
+ else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
127
+ )
128
+
129
+ self.attn2 = IPAttention(
130
+ query_dim=dim,
131
+ cross_attention_dim=cross_attention_dim
132
+ if not double_self_attention
133
+ else None,
134
+ heads=num_attention_heads,
135
+ dim_head=attention_head_dim,
136
+ dropout=dropout,
137
+ bias=attention_bias,
138
+ upcast_attention=upcast_attention,
139
+ cross_attn_temporal_cond=ip_adapter_cross_attn,
140
+ need_t2i_facein=need_t2i_facein,
141
+ need_t2i_ip_adapter_face=need_t2i_ip_adapter_face,
142
+ image_scale=image_scale,
143
+ ip_adapter_dim=cross_attention_dim
144
+ if not double_self_attention
145
+ else attention_head_dim,
146
+ facein_dim=cross_attention_dim
147
+ if not double_self_attention
148
+ else attention_head_dim,
149
+ ip_adapter_face_dim=cross_attention_dim
150
+ if not double_self_attention
151
+ else attention_head_dim,
152
+ processor=processor,
153
+ ) # is self-attn if encoder_hidden_states is none
154
+ else:
155
+ self.norm2 = None
156
+ self.attn2 = None
157
+ if self.attn1 is not None:
158
+ if not allow_xformers:
159
+ self.attn1.set_use_memory_efficient_attention_xformers = (
160
+ not_use_xformers_anyway
161
+ )
162
+ if self.attn2 is not None:
163
+ if not allow_xformers:
164
+ self.attn2.set_use_memory_efficient_attention_xformers = (
165
+ not_use_xformers_anyway
166
+ )
167
+ self.double_self_attention = double_self_attention
168
+ self.only_cross_attention = only_cross_attention
169
+ self.cross_attn_temporal_cond = cross_attn_temporal_cond
170
+ self.image_scale = image_scale
171
+
172
+ def forward(
173
+ self,
174
+ hidden_states: torch.FloatTensor,
175
+ attention_mask: Optional[torch.FloatTensor] = None,
176
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
177
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
178
+ timestep: Optional[torch.LongTensor] = None,
179
+ cross_attention_kwargs: Dict[str, Any] = None,
180
+ class_labels: Optional[torch.LongTensor] = None,
181
+ self_attn_block_embs: Optional[Tuple[List[torch.Tensor], List[None]]] = None,
182
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
183
+ ) -> torch.FloatTensor:
184
+ # Notice that normalization is always applied before the real computation in the following blocks.
185
+ # 0. Self-Attention
186
+ if self.use_ada_layer_norm:
187
+ norm_hidden_states = self.norm1(hidden_states, timestep)
188
+ elif self.use_ada_layer_norm_zero:
189
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
190
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
191
+ )
192
+ else:
193
+ norm_hidden_states = self.norm1(hidden_states)
194
+
195
+ # 1. Retrieve lora scale.
196
+ lora_scale = (
197
+ cross_attention_kwargs.get("scale", 1.0)
198
+ if cross_attention_kwargs is not None
199
+ else 1.0
200
+ )
201
+
202
+ if cross_attention_kwargs is None:
203
+ cross_attention_kwargs = {}
204
+ # 特殊AttnProcessor需要的入参 在 cross_attention_kwargs 准备
205
+ # special AttnProcessor needs input parameters in cross_attention_kwargs
206
+ original_cross_attention_kwargs = {
207
+ k: v
208
+ for k, v in cross_attention_kwargs.items()
209
+ if k
210
+ not in [
211
+ "num_frames",
212
+ "sample_index",
213
+ "vision_conditon_frames_sample_index",
214
+ "vision_cond",
215
+ "vision_clip_emb",
216
+ "ip_adapter_scale",
217
+ "face_emb",
218
+ "facein_scale",
219
+ "ip_adapter_face_emb",
220
+ "ip_adapter_face_scale",
221
+ "do_classifier_free_guidance",
222
+ ]
223
+ }
224
+
225
+ if "do_classifier_free_guidance" in cross_attention_kwargs:
226
+ do_classifier_free_guidance = cross_attention_kwargs[
227
+ "do_classifier_free_guidance"
228
+ ]
229
+ else:
230
+ do_classifier_free_guidance = False
231
+
232
+ # 2. Prepare GLIGEN inputs
233
+ original_cross_attention_kwargs = (
234
+ original_cross_attention_kwargs.copy()
235
+ if original_cross_attention_kwargs is not None
236
+ else {}
237
+ )
238
+ gligen_kwargs = original_cross_attention_kwargs.pop("gligen", None)
239
+
240
+ # 返回self_attn的结果,适用于referencenet的输出给其他Unet来使用
241
+ # return the result of self_attn, which is suitable for the output of referencenet to be used by other Unet
242
+ if (
243
+ self_attn_block_embs is not None
244
+ and self_attn_block_embs_mode.lower() == "write"
245
+ ):
246
+ # self_attn_block_emb = self.attn1.head_to_batch_dim(attn_output, out_dim=4)
247
+ self_attn_block_emb = norm_hidden_states
248
+ if not hasattr(self, "spatial_self_attn_idx"):
249
+ raise ValueError(
250
+ "must call unet.insert_spatial_self_attn_idx to generate spatial attn index"
251
+ )
252
+ basick_transformer_idx = self.spatial_self_attn_idx
253
+ if self.print_idx == 0:
254
+ logger.debug(
255
+ f"self_attn_block_embs, self_attn_block_embs_mode={self_attn_block_embs_mode}, "
256
+ f"basick_transformer_idx={basick_transformer_idx}, length={len(self_attn_block_embs)}, shape={self_attn_block_emb.shape}, "
257
+ # f"attn1 processor, {type(self.attn1.processor)}"
258
+ )
259
+ self_attn_block_embs[basick_transformer_idx] = self_attn_block_emb
260
+
261
+ # read and put referencenet emb into cross_attention_kwargs, which would be fused into attn_processor
262
+ if (
263
+ self_attn_block_embs is not None
264
+ and self_attn_block_embs_mode.lower() == "read"
265
+ ):
266
+ if not hasattr(self, "spatial_self_attn_idx"):
+ raise ValueError(
+ "must call unet.insert_spatial_self_attn_idx to generate spatial attn index"
+ )
+ basick_transformer_idx = self.spatial_self_attn_idx
271
+ if self.print_idx == 0:
272
+ logger.debug(
273
+ f"refer_self_attn_emb: , self_attn_block_embs_mode={self_attn_block_embs_mode}, "
274
+ f"length={len(self_attn_block_embs)}, idx={basick_transformer_idx}, "
275
+ # f"attn1 processor, {type(self.attn1.processor)}, "
276
+ )
277
+ ref_emb = self_attn_block_embs[basick_transformer_idx]
278
+ cross_attention_kwargs["refer_emb"] = ref_emb
279
+ if self.print_idx == 0:
280
+ logger.debug(
281
+ f"unet attention read, {self.spatial_self_attn_idx}",
282
+ )
283
+ # ------------------------------warning-----------------------
284
+ # 这两行由于使用了ref_emb会导致和checkpoint_train相关的训练错误,具体未知,留在这里作为警示
285
+ # the commented-out code below will cause a training error, keep it here as a warning
286
+ # logger.debug(f"ref_emb shape,{ref_emb.shape}, {ref_emb.mean()}")
287
+ # logger.debug(
288
+ # f"norm_hidden_states shape, {norm_hidden_states.shape}, {norm_hidden_states.mean()}",
289
+ # )
290
+ if self.attn1 is None:
291
+ self.print_idx += 1
292
+ return norm_hidden_states
293
+ attn_output = self.attn1(
294
+ norm_hidden_states,
295
+ encoder_hidden_states=encoder_hidden_states
296
+ if self.only_cross_attention
297
+ else None,
298
+ attention_mask=attention_mask,
299
+ **(
300
+ cross_attention_kwargs
301
+ if isinstance(self.attn1.processor, BaseIPAttnProcessor)
302
+ else original_cross_attention_kwargs
303
+ ),
304
+ )
305
+
306
+ if self.use_ada_layer_norm_zero:
307
+ attn_output = gate_msa.unsqueeze(1) * attn_output
308
+ hidden_states = attn_output + hidden_states
309
+
310
+ # 推断的时候,对于uncondition_部分独立生成,排除掉 refer_emb,
311
+ # 首帧等的影响,避免生成参考了refer_emb、首帧等,又在uncond上去除了
312
+ # at inference time, the unconditional part is regenerated independently so that refer_emb,
+ # the first frame, etc. do not leak into it and then have to be removed again in the pipeline
314
+ # refer to moore-animate anyone
315
+
316
+ # do_classifier_free_guidance = False
317
+ if self.print_idx == 0:
318
+ logger.debug(f"do_classifier_free_guidance={do_classifier_free_guidance},")
319
+ if do_classifier_free_guidance:
320
+ hidden_states_c = attn_output.clone()
321
+ _uc_mask = (
322
+ torch.Tensor(
323
+ [1] * (norm_hidden_states.shape[0] // 2)
324
+ + [0] * (norm_hidden_states.shape[0] // 2)
325
+ )
326
+ .to(norm_hidden_states.device)
327
+ .bool()
328
+ )
329
+ hidden_states_c[_uc_mask] = self.attn1(
330
+ norm_hidden_states[_uc_mask],
331
+ encoder_hidden_states=norm_hidden_states[_uc_mask],
332
+ attention_mask=attention_mask,
333
+ )
334
+ attn_output = hidden_states_c.clone()
335
+
336
+ if "refer_emb" in cross_attention_kwargs:
337
+ del cross_attention_kwargs["refer_emb"]
338
+
339
+ # 2.5 GLIGEN Control
340
+ if gligen_kwargs is not None:
341
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
342
+ # 2.5 ends
343
+
344
+ # 3. Cross-Attention
345
+ if self.attn2 is not None:
346
+ norm_hidden_states = (
347
+ self.norm2(hidden_states, timestep)
348
+ if self.use_ada_layer_norm
349
+ else self.norm2(hidden_states)
350
+ )
351
+
352
+ # 特殊AttnProcessor需要的入参 在 cross_attention_kwargs 准备
353
+ # special AttnProcessor needs input parameters in cross_attention_kwargs
354
+ attn_output = self.attn2(
355
+ norm_hidden_states,
356
+ encoder_hidden_states=encoder_hidden_states
357
+ if not self.double_self_attention
358
+ else None,
359
+ attention_mask=encoder_attention_mask,
360
+ **(
361
+ original_cross_attention_kwargs
362
+ if not isinstance(self.attn2.processor, BaseIPAttnProcessor)
363
+ else cross_attention_kwargs
364
+ ),
365
+ )
366
+ if self.print_idx == 0:
367
+ logger.debug(
368
+ f"encoder_hidden_states, type={type(encoder_hidden_states)}"
369
+ )
370
+ if encoder_hidden_states is not None:
371
+ logger.debug(
372
+ f"encoder_hidden_states, ={encoder_hidden_states.shape}"
373
+ )
374
+
375
+ # encoder_hidden_states_tmp = (
376
+ # encoder_hidden_states
377
+ # if not self.double_self_attention
378
+ # else norm_hidden_states
379
+ # )
380
+ # if do_classifier_free_guidance:
381
+ # hidden_states_c = attn_output.clone()
382
+ # _uc_mask = (
383
+ # torch.Tensor(
384
+ # [1] * (norm_hidden_states.shape[0] // 2)
385
+ # + [0] * (norm_hidden_states.shape[0] // 2)
386
+ # )
387
+ # .to(norm_hidden_states.device)
388
+ # .bool()
389
+ # )
390
+ # hidden_states_c[_uc_mask] = self.attn2(
391
+ # norm_hidden_states[_uc_mask],
392
+ # encoder_hidden_states=encoder_hidden_states_tmp[_uc_mask],
393
+ # attention_mask=attention_mask,
394
+ # )
395
+ # attn_output = hidden_states_c.clone()
396
+ hidden_states = attn_output + hidden_states
397
+ # 4. Feed-forward
398
+ if self.norm3 is not None and self.ff is not None:
399
+ norm_hidden_states = self.norm3(hidden_states)
400
+ if self.use_ada_layer_norm_zero:
401
+ norm_hidden_states = (
402
+ norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
403
+ )
404
+ if self._chunk_size is not None:
405
+ # "feed_forward_chunk_size" can be used to save memory
406
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
407
+ raise ValueError(
408
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
409
+ )
410
+
411
+ num_chunks = (
412
+ norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
413
+ )
414
+ ff_output = torch.cat(
415
+ [
416
+ self.ff(hid_slice, scale=lora_scale)
417
+ for hid_slice in norm_hidden_states.chunk(
418
+ num_chunks, dim=self._chunk_dim
419
+ )
420
+ ],
421
+ dim=self._chunk_dim,
422
+ )
423
+ else:
424
+ ff_output = self.ff(norm_hidden_states, scale=lora_scale)
425
+
426
+ if self.use_ada_layer_norm_zero:
427
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
428
+
429
+ hidden_states = ff_output + hidden_states
430
+ self.print_idx += 1
431
+ return hidden_states
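The classifier-free-guidance branch in BasicTransformerBlock.forward assumes the first half of the batch is the unconditional part; the snippet below is a self-contained illustration of that masking pattern with made-up shapes, not MuseV code:

import torch

hidden = torch.randn(4, 16, 8)  # (2*b, seq, dim): first half uncond, second half cond
uc_mask = (
    torch.Tensor([1] * (hidden.shape[0] // 2) + [0] * (hidden.shape[0] // 2))
    .to(hidden.device)
    .bool()
)

# the unconditional rows can be recomputed in isolation, e.g. re-running self-attention
# without refer_emb / vision-condition frames for those rows only
out = hidden.clone()
out[uc_mask] = hidden[uc_mask] * 0.0  # placeholder for the recomputed values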
musev/models/attention_processor.py ADDED
@@ -0,0 +1,750 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """该模型是自定义的attn_processor,实现特殊功能的 Attn功能。
16
+ 相对而言,开源代码经常会重新定义Attention 类,
17
+
18
+ This module implements special AttnProcessor function with custom attn_processor class.
19
+ By contrast, other open-source code usually redefines the Attention class itself.
20
+ """
21
+ # modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
22
+ from __future__ import annotations
23
+
24
+ import time
25
+ from typing import Any, Callable, Optional
26
+ import logging
27
+
28
+ from einops import rearrange, repeat
29
+ import torch
30
+ import torch.nn as nn
31
+ import torch.nn.functional as F
32
+ import xformers
33
+ from diffusers.models.lora import LoRACompatibleLinear
34
+
35
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
36
+ from diffusers.models.attention_processor import (
37
+ Attention as DiffusersAttention,
38
+ AttnProcessor,
39
+ AttnProcessor2_0,
40
+ )
41
+ from ..data.data_util import (
42
+ batch_concat_two_tensor_with_index,
43
+ batch_index_select,
44
+ align_repeat_tensor_single_dim,
45
+ batch_adain_conditioned_tensor,
46
+ )
47
+
48
+ from . import Model_Register
49
+
50
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
51
+
52
+
53
+ @maybe_allow_in_graph
54
+ class IPAttention(DiffusersAttention):
55
+ r"""
56
+ Modified Attention class with extra IP-Adapter layers such as to_k_ip and to_v_ip.
57
+ """
58
+
59
+ def __init__(
60
+ self,
61
+ query_dim: int,
62
+ cross_attention_dim: int | None = None,
63
+ heads: int = 8,
64
+ dim_head: int = 64,
65
+ dropout: float = 0,
66
+ bias=False,
67
+ upcast_attention: bool = False,
68
+ upcast_softmax: bool = False,
69
+ cross_attention_norm: str | None = None,
70
+ cross_attention_norm_num_groups: int = 32,
71
+ added_kv_proj_dim: int | None = None,
72
+ norm_num_groups: int | None = None,
73
+ spatial_norm_dim: int | None = None,
74
+ out_bias: bool = True,
75
+ scale_qk: bool = True,
76
+ only_cross_attention: bool = False,
77
+ eps: float = 0.00001,
78
+ rescale_output_factor: float = 1,
79
+ residual_connection: bool = False,
80
+ _from_deprecated_attn_block=False,
81
+ processor: AttnProcessor | None = None,
82
+ cross_attn_temporal_cond: bool = False,
83
+ image_scale: float = 1.0,
84
+ ip_adapter_dim: int = None,
85
+ need_t2i_facein: bool = False,
86
+ facein_dim: int = None,
87
+ need_t2i_ip_adapter_face: bool = False,
88
+ ip_adapter_face_dim: int = None,
89
+ ):
90
+ super().__init__(
91
+ query_dim,
92
+ cross_attention_dim,
93
+ heads,
94
+ dim_head,
95
+ dropout,
96
+ bias,
97
+ upcast_attention,
98
+ upcast_softmax,
99
+ cross_attention_norm,
100
+ cross_attention_norm_num_groups,
101
+ added_kv_proj_dim,
102
+ norm_num_groups,
103
+ spatial_norm_dim,
104
+ out_bias,
105
+ scale_qk,
106
+ only_cross_attention,
107
+ eps,
108
+ rescale_output_factor,
109
+ residual_connection,
110
+ _from_deprecated_attn_block,
111
+ processor,
112
+ )
113
+ self.cross_attn_temporal_cond = cross_attn_temporal_cond
114
+ self.image_scale = image_scale
115
+ # ip_adapter conditioned on the first (reference) frame
117
+ if cross_attn_temporal_cond:
118
+ self.to_k_ip = LoRACompatibleLinear(ip_adapter_dim, query_dim, bias=False)
119
+ self.to_v_ip = LoRACompatibleLinear(ip_adapter_dim, query_dim, bias=False)
120
+ # facein
121
+ self.need_t2i_facein = need_t2i_facein
122
+ self.facein_dim = facein_dim
123
+ if need_t2i_facein:
124
+ raise NotImplementedError("facein")
125
+
126
+ # ip_adapter_face
127
+ self.need_t2i_ip_adapter_face = need_t2i_ip_adapter_face
128
+ self.ip_adapter_face_dim = ip_adapter_face_dim
129
+ if need_t2i_ip_adapter_face:
130
+ self.ip_adapter_face_to_k_ip = LoRACompatibleLinear(
131
+ ip_adapter_face_dim, query_dim, bias=False
132
+ )
133
+ self.ip_adapter_face_to_v_ip = LoRACompatibleLinear(
134
+ ip_adapter_face_dim, query_dim, bias=False
135
+ )
136
+
137
+ def set_use_memory_efficient_attention_xformers(
138
+ self,
139
+ use_memory_efficient_attention_xformers: bool,
140
+ attention_op: Callable[..., Any] | None = None,
141
+ ):
142
+ if (
143
+ "XFormers" in self.processor.__class__.__name__
144
+ or "IP" in self.processor.__class__.__name__
145
+ ):
146
+ pass
147
+ else:
148
+ return super().set_use_memory_efficient_attention_xformers(
149
+ use_memory_efficient_attention_xformers, attention_op
150
+ )
151
+
152
+
153
+ @Model_Register.register
154
+ class BaseIPAttnProcessor(nn.Module):
155
+ print_idx = 0
156
+
157
+ def __init__(self, *args, **kwargs) -> None:
158
+ super().__init__(*args, **kwargs)
159
+
160
+
161
+ @Model_Register.register
162
+ class T2IReferencenetIPAdapterXFormersAttnProcessor(BaseIPAttnProcessor):
163
+ r"""
164
+ IPAdapter for the ref_image-facing self_attn
165
+ """
166
+ print_idx = 0
167
+
168
+ def __init__(
169
+ self,
170
+ attention_op: Optional[Callable] = None,
171
+ ):
172
+ super().__init__()
173
+
174
+ self.attention_op = attention_op
175
+
176
+ def __call__(
177
+ self,
178
+ attn: IPAttention,
179
+ hidden_states: torch.FloatTensor,
180
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
181
+ attention_mask: Optional[torch.FloatTensor] = None,
182
+ temb: Optional[torch.FloatTensor] = None,
183
+ scale: float = 1.0,
184
+ num_frames: int = None,
185
+ sample_index: torch.LongTensor = None,
186
+ vision_conditon_frames_sample_index: torch.LongTensor = None,
187
+ refer_emb: torch.Tensor = None,
188
+ vision_clip_emb: torch.Tensor = None,
189
+ ip_adapter_scale: float = 1.0,
190
+ face_emb: torch.Tensor = None,
191
+ facein_scale: float = 1.0,
192
+ ip_adapter_face_emb: torch.Tensor = None,
193
+ ip_adapter_face_scale: float = 1.0,
194
+ do_classifier_free_guidance: bool = False,
195
+ ):
196
+ residual = hidden_states
197
+
198
+ if attn.spatial_norm is not None:
199
+ hidden_states = attn.spatial_norm(hidden_states, temb)
200
+
201
+ input_ndim = hidden_states.ndim
202
+
203
+ if input_ndim == 4:
204
+ batch_size, channel, height, width = hidden_states.shape
205
+ hidden_states = hidden_states.view(
206
+ batch_size, channel, height * width
207
+ ).transpose(1, 2)
208
+
209
+ batch_size, key_tokens, _ = (
210
+ hidden_states.shape
211
+ if encoder_hidden_states is None
212
+ else encoder_hidden_states.shape
213
+ )
214
+
215
+ attention_mask = attn.prepare_attention_mask(
216
+ attention_mask, key_tokens, batch_size
217
+ )
218
+ if attention_mask is not None:
219
+ # expand our mask's singleton query_tokens dimension:
220
+ # [batch*heads, 1, key_tokens] ->
221
+ # [batch*heads, query_tokens, key_tokens]
222
+ # so that it can be added as a bias onto the attention scores that xformers computes:
223
+ # [batch*heads, query_tokens, key_tokens]
224
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
225
+ _, query_tokens, _ = hidden_states.shape
226
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
227
+
228
+ if attn.group_norm is not None:
229
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
230
+ 1, 2
231
+ )
232
+
233
+ query = attn.to_q(hidden_states, scale=scale)
234
+
235
+ if encoder_hidden_states is None:
236
+ encoder_hidden_states = hidden_states
237
+ elif attn.norm_cross:
238
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
239
+ encoder_hidden_states
240
+ )
241
+ encoder_hidden_states = align_repeat_tensor_single_dim(
242
+ encoder_hidden_states, target_length=hidden_states.shape[0], dim=0
243
+ )
244
+ key = attn.to_k(encoder_hidden_states, scale=scale)
245
+ value = attn.to_v(encoder_hidden_states, scale=scale)
246
+
247
+ # for facein
248
+ if self.print_idx == 0:
249
+ logger.debug(
250
+ f"T2IReferencenetIPAdapterXFormersAttnProcessor,type(face_emb)={type(face_emb)}, facein_scale={facein_scale}"
251
+ )
252
+ if facein_scale > 0 and face_emb is not None:
253
+ raise NotImplementedError("facein")
254
+
255
+ query = attn.head_to_batch_dim(query).contiguous()
256
+ key = attn.head_to_batch_dim(key).contiguous()
257
+ value = attn.head_to_batch_dim(value).contiguous()
258
+ hidden_states = xformers.ops.memory_efficient_attention(
259
+ query,
260
+ key,
261
+ value,
262
+ attn_bias=attention_mask,
263
+ op=self.attention_op,
264
+ scale=attn.scale,
265
+ )
266
+
267
+ # ip-adapter start
268
+ if self.print_idx == 0:
269
+ logger.debug(
270
+ f"T2IReferencenetIPAdapterXFormersAttnProcessor,type(vision_clip_emb)={type(vision_clip_emb)}"
271
+ )
272
+ if ip_adapter_scale > 0 and vision_clip_emb is not None:
273
+ if self.print_idx == 0:
274
+ logger.debug(
275
+ f"T2I cross_attn, ipadapter, vision_clip_emb={vision_clip_emb.shape}, hidden_states={hidden_states.shape}, batch_size={batch_size}"
276
+ )
277
+ ip_key = attn.to_k_ip(vision_clip_emb)
278
+ ip_value = attn.to_v_ip(vision_clip_emb)
279
+ ip_key = align_repeat_tensor_single_dim(
280
+ ip_key, target_length=batch_size, dim=0
281
+ )
282
+ ip_value = align_repeat_tensor_single_dim(
283
+ ip_value, target_length=batch_size, dim=0
284
+ )
285
+ ip_key = attn.head_to_batch_dim(ip_key).contiguous()
286
+ ip_value = attn.head_to_batch_dim(ip_value).contiguous()
287
+ if self.print_idx == 0:
288
+ logger.debug(
289
+ f"query={query.shape}, ip_key={ip_key.shape}, ip_value={ip_value.shape}"
290
+ )
291
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
292
+ hidden_states_from_ip = xformers.ops.memory_efficient_attention(
293
+ query,
294
+ ip_key,
295
+ ip_value,
296
+ attn_bias=attention_mask,
297
+ op=self.attention_op,
298
+ scale=attn.scale,
299
+ )
300
+ hidden_states = hidden_states + ip_adapter_scale * hidden_states_from_ip
301
+ # ip-adapter end
302
+
303
+ # ip-adapter face start
304
+ if self.print_idx == 0:
305
+ logger.debug(
306
+ f"T2IReferencenetIPAdapterXFormersAttnProcessor,type(ip_adapter_face_emb)={type(ip_adapter_face_emb)}"
307
+ )
308
+ if ip_adapter_face_scale > 0 and ip_adapter_face_emb is not None:
309
+ if self.print_idx == 0:
310
+ logger.debug(
311
+ f"T2I cross_attn, ipadapter face, ip_adapter_face_emb={vision_clip_emb.shape}, hidden_states={hidden_states.shape}, batch_size={batch_size}"
312
+ )
313
+ ip_key = attn.ip_adapter_face_to_k_ip(ip_adapter_face_emb)
314
+ ip_value = attn.ip_adapter_face_to_v_ip(ip_adapter_face_emb)
315
+ ip_key = align_repeat_tensor_single_dim(
316
+ ip_key, target_length=batch_size, dim=0
317
+ )
318
+ ip_value = align_repeat_tensor_single_dim(
319
+ ip_value, target_length=batch_size, dim=0
320
+ )
321
+ ip_key = attn.head_to_batch_dim(ip_key).contiguous()
322
+ ip_value = attn.head_to_batch_dim(ip_value).contiguous()
323
+ if self.print_idx == 0:
324
+ logger.debug(
325
+ f"query={query.shape}, ip_key={ip_key.shape}, ip_value={ip_value.shape}"
326
+ )
327
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
328
+ hidden_states_from_ip = xformers.ops.memory_efficient_attention(
329
+ query,
330
+ ip_key,
331
+ ip_value,
332
+ attn_bias=attention_mask,
333
+ op=self.attention_op,
334
+ scale=attn.scale,
335
+ )
336
+ hidden_states = (
337
+ hidden_states + ip_adapter_face_scale * hidden_states_from_ip
338
+ )
339
+ # ip-adapter face end
340
+
341
+ hidden_states = hidden_states.to(query.dtype)
342
+ hidden_states = attn.batch_to_head_dim(hidden_states)
343
+
344
+ # linear proj
345
+ hidden_states = attn.to_out[0](hidden_states, scale=scale)
346
+ # dropout
347
+ hidden_states = attn.to_out[1](hidden_states)
348
+
349
+ if input_ndim == 4:
350
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
351
+ batch_size, channel, height, width
352
+ )
353
+
354
+ if attn.residual_connection:
355
+ hidden_states = hidden_states + residual
356
+
357
+ hidden_states = hidden_states / attn.rescale_output_factor
358
+ self.print_idx += 1
359
+ return hidden_states
360
+
361
+
362
+ @Model_Register.register
363
+ class NonParamT2ISelfReferenceXFormersAttnProcessor(BaseIPAttnProcessor):
364
+ r"""
365
+ 面向首帧的 referenceonly attn,适用于 T2I的 self_attn
366
+ referenceonly with vis_cond as key, value, in t2i self_attn.
367
+ """
368
+ print_idx = 0
369
+
370
+ def __init__(
371
+ self,
372
+ attention_op: Optional[Callable] = None,
373
+ ):
374
+ super().__init__()
375
+
376
+ self.attention_op = attention_op
377
+
378
+ def __call__(
379
+ self,
380
+ attn: IPAttention,
381
+ hidden_states: torch.FloatTensor,
382
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
383
+ attention_mask: Optional[torch.FloatTensor] = None,
384
+ temb: Optional[torch.FloatTensor] = None,
385
+ scale: float = 1.0,
386
+ num_frames: int = None,
387
+ sample_index: torch.LongTensor = None,
388
+ vision_conditon_frames_sample_index: torch.LongTensor = None,
389
+ refer_emb: torch.Tensor = None,
390
+ face_emb: torch.Tensor = None,
391
+ vision_clip_emb: torch.Tensor = None,
392
+ ip_adapter_scale: float = 1.0,
393
+ facein_scale: float = 1.0,
394
+ ip_adapter_face_emb: torch.Tensor = None,
395
+ ip_adapter_face_scale: float = 1.0,
396
+ do_classifier_free_guidance: bool = False,
397
+ ):
398
+ residual = hidden_states
399
+
400
+ if attn.spatial_norm is not None:
401
+ hidden_states = attn.spatial_norm(hidden_states, temb)
402
+
403
+ input_ndim = hidden_states.ndim
404
+
405
+ if input_ndim == 4:
406
+ batch_size, channel, height, width = hidden_states.shape
407
+ hidden_states = hidden_states.view(
408
+ batch_size, channel, height * width
409
+ ).transpose(1, 2)
410
+
411
+ batch_size, key_tokens, _ = (
412
+ hidden_states.shape
413
+ if encoder_hidden_states is None
414
+ else encoder_hidden_states.shape
415
+ )
416
+
417
+ attention_mask = attn.prepare_attention_mask(
418
+ attention_mask, key_tokens, batch_size
419
+ )
420
+ if attention_mask is not None:
421
+ # expand our mask's singleton query_tokens dimension:
422
+ # [batch*heads, 1, key_tokens] ->
423
+ # [batch*heads, query_tokens, key_tokens]
424
+ # so that it can be added as a bias onto the attention scores that xformers computes:
425
+ # [batch*heads, query_tokens, key_tokens]
426
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
427
+ _, query_tokens, _ = hidden_states.shape
428
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
429
+
430
+ # vision_cond in same unet attn start
431
+ if (
432
+ vision_conditon_frames_sample_index is not None and num_frames > 1
433
+ ) or refer_emb is not None:
434
+ batchsize_timesize = hidden_states.shape[0]
435
+ if self.print_idx == 0:
436
+ logger.debug(
437
+ f"NonParamT2ISelfReferenceXFormersAttnProcessor 0, hidden_states={hidden_states.shape}, vision_conditon_frames_sample_index={vision_conditon_frames_sample_index}"
438
+ )
439
+ encoder_hidden_states = rearrange(
440
+ hidden_states, "(b t) hw c -> b t hw c", t=num_frames
441
+ )
442
+ # if False:
443
+ if vision_conditon_frames_sample_index is not None and num_frames > 1:
444
+ ip_hidden_states = batch_index_select(
445
+ encoder_hidden_states,
446
+ dim=1,
447
+ index=vision_conditon_frames_sample_index,
448
+ ).contiguous()
449
+ if self.print_idx == 0:
450
+ logger.debug(
451
+ f"NonParamT2ISelfReferenceXFormersAttnProcessor 1, vis_cond referenceonly, encoder_hidden_states={encoder_hidden_states.shape}, ip_hidden_states={ip_hidden_states.shape}"
452
+ )
453
+ #
454
+ ip_hidden_states = rearrange(
455
+ ip_hidden_states, "b t hw c -> b 1 (t hw) c"
456
+ )
457
+ ip_hidden_states = align_repeat_tensor_single_dim(
458
+ ip_hidden_states,
459
+ dim=1,
460
+ target_length=num_frames,
461
+ )
462
+ # b t hw c -> b t hw + hw c
463
+ if self.print_idx == 0:
464
+ logger.debug(
465
+ f"NonParamT2ISelfReferenceXFormersAttnProcessor 2, vis_cond referenceonly, encoder_hidden_states={encoder_hidden_states.shape}, ip_hidden_states={ip_hidden_states.shape}"
466
+ )
467
+ encoder_hidden_states = torch.concat(
468
+ [encoder_hidden_states, ip_hidden_states], dim=2
469
+ )
470
+ if self.print_idx == 0:
471
+ logger.debug(
472
+ f"NonParamT2ISelfReferenceXFormersAttnProcessor 3, hidden_states={hidden_states.shape}, ip_hidden_states={ip_hidden_states.shape}"
473
+ )
474
+ # if False:
475
+ if refer_emb is not None: # and num_frames > 1:
476
+ refer_emb = rearrange(refer_emb, "b c t h w->b 1 (t h w) c")
477
+ refer_emb = align_repeat_tensor_single_dim(
478
+ refer_emb, target_length=num_frames, dim=1
479
+ )
480
+ if self.print_idx == 0:
481
+ logger.debug(
482
+ f"NonParamT2ISelfReferenceXFormersAttnProcessor4, referencenet, encoder_hidden_states={encoder_hidden_states.shape}, refer_emb={refer_emb.shape}"
483
+ )
484
+ encoder_hidden_states = torch.concat(
485
+ [encoder_hidden_states, refer_emb], dim=2
486
+ )
487
+ if self.print_idx == 0:
488
+ logger.debug(
489
+ f"NonParamT2ISelfReferenceXFormersAttnProcessor5, referencenet, encoder_hidden_states={encoder_hidden_states.shape}, refer_emb={refer_emb.shape}"
490
+ )
491
+ encoder_hidden_states = rearrange(
492
+ encoder_hidden_states, "b t hw c -> (b t) hw c"
493
+ )
494
+ # vision_cond in same unet attn end
495
+
496
+ if attn.group_norm is not None:
497
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
498
+ 1, 2
499
+ )
500
+
501
+ query = attn.to_q(hidden_states, scale=scale)
502
+
503
+ if encoder_hidden_states is None:
504
+ encoder_hidden_states = hidden_states
505
+ elif attn.norm_cross:
506
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
507
+ encoder_hidden_states
508
+ )
509
+ encoder_hidden_states = align_repeat_tensor_single_dim(
510
+ encoder_hidden_states, target_length=hidden_states.shape[0], dim=0
511
+ )
512
+ key = attn.to_k(encoder_hidden_states, scale=scale)
513
+ value = attn.to_v(encoder_hidden_states, scale=scale)
514
+
515
+ query = attn.head_to_batch_dim(query).contiguous()
516
+ key = attn.head_to_batch_dim(key).contiguous()
517
+ value = attn.head_to_batch_dim(value).contiguous()
518
+
519
+ hidden_states = xformers.ops.memory_efficient_attention(
520
+ query,
521
+ key,
522
+ value,
523
+ attn_bias=attention_mask,
524
+ op=self.attention_op,
525
+ scale=attn.scale,
526
+ )
527
+ hidden_states = hidden_states.to(query.dtype)
528
+ hidden_states = attn.batch_to_head_dim(hidden_states)
529
+
530
+ # linear proj
531
+ hidden_states = attn.to_out[0](hidden_states, scale=scale)
532
+ # dropout
533
+ hidden_states = attn.to_out[1](hidden_states)
534
+
535
+ if input_ndim == 4:
536
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
537
+ batch_size, channel, height, width
538
+ )
539
+
540
+ if attn.residual_connection:
541
+ hidden_states = hidden_states + residual
542
+
543
+ hidden_states = hidden_states / attn.rescale_output_factor
544
+ self.print_idx += 1
545
+
546
+ return hidden_states
547
+
548
+
549
+ @Model_Register.register
550
+ class NonParamReferenceIPXFormersAttnProcessor(
551
+ NonParamT2ISelfReferenceXFormersAttnProcessor
552
+ ):
553
+ def __init__(self, attention_op: Callable[..., Any] | None = None):
554
+ super().__init__(attention_op)
555
+
556
+
557
+ @maybe_allow_in_graph
558
+ class ReferEmbFuseAttention(IPAttention):
559
+ """使用 attention 融合 refernet 中的 emb 到 unet 对应的 latens 中
560
+ # TODO: 目前只支持 bt hw c 的融合,后续考虑增加对 视频 bhw t c、b thw c的融合
561
+ residual_connection: bool = True, 默认, 从不产生影响开始学习
562
+
563
+ use attention to fuse referencenet emb into unet latents
564
+ # TODO: by now, only support bt hw c, later consider to support bhw t c, b thw c
565
+ residual_connection: bool = True, default, start from no effect
566
+
567
+ Args:
568
+ IPAttention (_type_): _description_
569
+ """
570
+
571
+ print_idx = 0
572
+
573
+ def __init__(
574
+ self,
575
+ query_dim: int,
576
+ cross_attention_dim: int | None = None,
577
+ heads: int = 8,
578
+ dim_head: int = 64,
579
+ dropout: float = 0,
580
+ bias=False,
581
+ upcast_attention: bool = False,
582
+ upcast_softmax: bool = False,
583
+ cross_attention_norm: str | None = None,
584
+ cross_attention_norm_num_groups: int = 32,
585
+ added_kv_proj_dim: int | None = None,
586
+ norm_num_groups: int | None = None,
587
+ spatial_norm_dim: int | None = None,
588
+ out_bias: bool = True,
589
+ scale_qk: bool = True,
590
+ only_cross_attention: bool = False,
591
+ eps: float = 0.00001,
592
+ rescale_output_factor: float = 1,
593
+ residual_connection: bool = True,
594
+ _from_deprecated_attn_block=False,
595
+ processor: AttnProcessor | None = None,
596
+ cross_attn_temporal_cond: bool = False,
597
+ image_scale: float = 1,
598
+ ):
599
+ super().__init__(
600
+ query_dim,
601
+ cross_attention_dim,
602
+ heads,
603
+ dim_head,
604
+ dropout,
605
+ bias,
606
+ upcast_attention,
607
+ upcast_softmax,
608
+ cross_attention_norm,
609
+ cross_attention_norm_num_groups,
610
+ added_kv_proj_dim,
611
+ norm_num_groups,
612
+ spatial_norm_dim,
613
+ out_bias,
614
+ scale_qk,
615
+ only_cross_attention,
616
+ eps,
617
+ rescale_output_factor,
618
+ residual_connection,
619
+ _from_deprecated_attn_block,
620
+ processor,
621
+ cross_attn_temporal_cond,
622
+ image_scale,
623
+ )
624
+ self.processor = None
625
+ # together with the residual connection, zero-init keeps this layer from affecting the original output at the start of training
626
+ nn.init.zeros_(self.to_out[0].weight)
627
+ nn.init.zeros_(self.to_out[0].bias)
628
+
629
+ def forward(
630
+ self,
631
+ hidden_states: torch.FloatTensor,
632
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
633
+ attention_mask: Optional[torch.FloatTensor] = None,
634
+ temb: Optional[torch.FloatTensor] = None,
635
+ scale: float = 1.0,
636
+ num_frames: int = None,
637
+ ) -> torch.Tensor:
638
+ """fuse referencenet emb b c t2 h2 w2 into unet latents b c t1 h1 w1 with attn
639
+ refer to musev/models/attention_processor.py::NonParamT2ISelfReferenceXFormersAttnProcessor
640
+
641
+ Args:
642
+ hidden_states (torch.FloatTensor): unet latents, (b t1) c h1 w1
643
+ encoder_hidden_states (Optional[torch.FloatTensor], optional): referencenet emb b c2 t2 h2 w2. Defaults to None.
644
+ attention_mask (Optional[torch.FloatTensor], optional): _description_. Defaults to None.
645
+ temb (Optional[torch.FloatTensor], optional): _description_. Defaults to None.
646
+ scale (float, optional): _description_. Defaults to 1.0.
647
+ num_frames (int, optional): _description_. Defaults to None.
648
+
649
+ Returns:
650
+ torch.Tensor: _description_
651
+ """
652
+ residual = hidden_states
653
+ # start
654
+ hidden_states = rearrange(
655
+ hidden_states, "(b t) c h w -> b c t h w", t=num_frames
656
+ )
657
+ batch_size, channel, t1, height, width = hidden_states.shape
658
+ if self.print_idx == 0:
659
+ logger.debug(
660
+ f"hidden_states={hidden_states.shape},encoder_hidden_states={encoder_hidden_states.shape}"
661
+ )
662
+ # concat with hidden_states b c t1 h1 w1 in hw channel into bt (t2 + 1)hw c
663
+ encoder_hidden_states = rearrange(
664
+ encoder_hidden_states, " b c t2 h w-> b (t2 h w) c"
665
+ )
666
+ encoder_hidden_states = repeat(
667
+ encoder_hidden_states, " b t2hw c -> (b t) t2hw c", t=t1
668
+ )
669
+ hidden_states = rearrange(hidden_states, " b c t h w-> (b t) (h w) c")
670
+ # bt (t2+1)hw d
671
+ encoder_hidden_states = torch.concat(
672
+ [encoder_hidden_states, hidden_states], dim=1
673
+ )
674
+ # encoder_hidden_states = align_repeat_tensor_single_dim(
675
+ # encoder_hidden_states, target_length=hidden_states.shape[0], dim=0
676
+ # )
677
+ # end
678
+
679
+ if self.spatial_norm is not None:
680
+ hidden_states = self.spatial_norm(hidden_states, temb)
681
+
682
+ _, key_tokens, _ = (
683
+ hidden_states.shape
684
+ if encoder_hidden_states is None
685
+ else encoder_hidden_states.shape
686
+ )
687
+
688
+ attention_mask = self.prepare_attention_mask(
689
+ attention_mask, key_tokens, batch_size
690
+ )
691
+ if attention_mask is not None:
692
+ # expand our mask's singleton query_tokens dimension:
693
+ # [batch*heads, 1, key_tokens] ->
694
+ # [batch*heads, query_tokens, key_tokens]
695
+ # so that it can be added as a bias onto the attention scores that xformers computes:
696
+ # [batch*heads, query_tokens, key_tokens]
697
+ # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
698
+ _, query_tokens, _ = hidden_states.shape
699
+ attention_mask = attention_mask.expand(-1, query_tokens, -1)
700
+
701
+ if self.group_norm is not None:
702
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(
703
+ 1, 2
704
+ )
705
+
706
+ query = self.to_q(hidden_states, scale=scale)
707
+
708
+ if encoder_hidden_states is None:
709
+ encoder_hidden_states = hidden_states
710
+ elif self.norm_cross:
711
+ encoder_hidden_states = self.norm_encoder_hidden_states(
712
+ encoder_hidden_states
713
+ )
714
+
715
+ key = self.to_k(encoder_hidden_states, scale=scale)
716
+ value = self.to_v(encoder_hidden_states, scale=scale)
717
+
718
+ query = self.head_to_batch_dim(query).contiguous()
719
+ key = self.head_to_batch_dim(key).contiguous()
720
+ value = self.head_to_batch_dim(value).contiguous()
721
+
722
+ # query: b t hw d
723
+ # key/value: bt (t1+1)hw d
724
+ hidden_states = xformers.ops.memory_efficient_attention(
725
+ query,
726
+ key,
727
+ value,
728
+ attn_bias=attention_mask,
729
+ scale=self.scale,
730
+ )
731
+ hidden_states = hidden_states.to(query.dtype)
732
+ hidden_states = self.batch_to_head_dim(hidden_states)
733
+
734
+ # linear proj
735
+ hidden_states = self.to_out[0](hidden_states, scale=scale)
736
+ # dropout
737
+ hidden_states = self.to_out[1](hidden_states)
738
+
739
+ hidden_states = rearrange(
740
+ hidden_states,
741
+ "bt (h w) c-> bt c h w",
742
+ h=height,
743
+ w=width,
744
+ )
745
+ if self.residual_connection:
746
+ hidden_states = hidden_states + residual
747
+
748
+ hidden_states = hidden_states / self.rescale_output_factor
749
+ self.print_idx += 1
750
+ return hidden_states
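ReferEmbFuseAttention zero-initialises its output projection so that, together with residual_connection=True, the newly added fusion layer starts as an identity mapping; a toy demonstration of that design choice (independent of the classes above):

import torch
import torch.nn as nn

proj = nn.Linear(8, 8)
nn.init.zeros_(proj.weight)
nn.init.zeros_(proj.bias)

x = torch.randn(2, 8)
out = x + proj(x)  # residual + zero-initialised projection
assert torch.allclose(out, x)  # the fused branch contributes nothing until it is trained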
musev/models/controlnet.py ADDED
@@ -0,0 +1,399 @@
1
+ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
2
+ import warnings
3
+ import os
4
+
5
+
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ import PIL
10
+ from einops import rearrange, repeat
11
+ import numpy as np
12
+ import torch
13
+ import torch.nn.init as init
14
+ from diffusers.models.controlnet import ControlNetModel
15
+ from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
16
+ from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers
17
+ from diffusers.utils.torch_utils import is_compiled_module
18
+
19
+
20
+ class ControlnetPredictor(object):
21
+ def __init__(self, controlnet_model_path: str, *args, **kwargs):
22
+ """Controlnet 推断函数,用于提取 controlnet backbone的emb,避免训练时重复抽取
23
+ Controlnet inference predictor, used to extract the emb of the controlnet backbone to avoid repeated extraction during training
24
+ Args:
25
+ controlnet_model_path (str): controlnet 模型路径. controlnet model path.
26
+ """
27
+ super(ControlnetPredictor, self).__init__(*args, **kwargs)
28
+ self.controlnet = ControlNetModel.from_pretrained(
29
+ controlnet_model_path,
30
+ )
31
+
32
+ def prepare_image(
33
+ self,
34
+ image, # b c t h w
35
+ width,
36
+ height,
37
+ batch_size,
38
+ num_images_per_prompt,
39
+ device,
40
+ dtype,
41
+ do_classifier_free_guidance=False,
42
+ guess_mode=False,
43
+ ):
44
+ if height is None:
45
+ height = image.shape[-2]
46
+ if width is None:
47
+ width = image.shape[-1]
48
+ width, height = (
49
+ x - x % self.control_image_processor.vae_scale_factor
50
+ for x in (width, height)
51
+ )
52
+ image = rearrange(image, "b c t h w-> (b t) c h w")
53
+ image = torch.from_numpy(image).to(dtype=torch.float32) / 255.0
54
+
55
+ image = torch.nn.functional.interpolate(
56
+ image,
57
+ size=(height, width),
58
+ mode="bilinear",
59
+ )
62
+
63
+ do_normalize = self.control_image_processor.config.do_normalize
64
+ if image.min() < 0:
65
+ warnings.warn(
66
+ "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
67
+ f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]",
68
+ FutureWarning,
69
+ )
70
+ do_normalize = False
71
+
72
+ if do_normalize:
73
+ image = self.control_image_processor.normalize(image)
74
+
75
+ image_batch_size = image.shape[0]
76
+
77
+ if image_batch_size == 1:
78
+ repeat_by = batch_size
79
+ else:
80
+ # image batch size is the same as prompt batch size
81
+ repeat_by = num_images_per_prompt
82
+
83
+ image = image.repeat_interleave(repeat_by, dim=0)
84
+
85
+ image = image.to(device=device, dtype=dtype)
86
+
87
+ if do_classifier_free_guidance and not guess_mode:
88
+ image = torch.cat([image] * 2)
89
+
90
+ return image
91
+
92
+ @torch.no_grad()
93
+ def __call__(
94
+ self,
95
+ batch_size: int,
96
+ device: str,
97
+ dtype: torch.dtype,
98
+ timesteps: List[float],
99
+ i: int,
100
+ scheduler: KarrasDiffusionSchedulers,
101
+ prompt_embeds: torch.Tensor,
102
+ do_classifier_free_guidance: bool = False,
103
+ # 2b co t ho wo
104
+ latent_model_input: torch.Tensor = None,
105
+ # b co t ho wo
106
+ latents: torch.Tensor = None,
107
+ # b c t h w
108
+ image: Union[
109
+ torch.FloatTensor,
110
+ PIL.Image.Image,
111
+ np.ndarray,
112
+ List[torch.FloatTensor],
113
+ List[PIL.Image.Image],
114
+ List[np.ndarray],
115
+ ] = None,
116
+ # b c t(1) hi wi
117
+ controlnet_condition_frames: Optional[torch.FloatTensor] = None,
118
+ # b c t ho wo
119
+ controlnet_latents: Union[torch.FloatTensor, np.ndarray] = None,
120
+ # b c t(1) ho wo
121
+ controlnet_condition_latents: Optional[torch.FloatTensor] = None,
122
+ height: Optional[int] = None,
123
+ width: Optional[int] = None,
124
+ num_videos_per_prompt: Optional[int] = 1,
125
+ return_dict: bool = True,
126
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
127
+ guess_mode: bool = False,
128
+ control_guidance_start: Union[float, List[float]] = 0.0,
129
+ control_guidance_end: Union[float, List[float]] = 1.0,
130
+ latent_index: torch.LongTensor = None,
131
+ vision_condition_latent_index: torch.LongTensor = None,
132
+ **kwargs,
133
+ ):
134
+ assert (
135
+ image is not None or controlnet_latents is not None
136
+ ), "should set one of image and controlnet_latents"
137
+
138
+ controlnet = (
139
+ self.controlnet._orig_mod
140
+ if is_compiled_module(self.controlnet)
141
+ else self.controlnet
142
+ )
143
+
144
+ # align format for control guidance
145
+ if not isinstance(control_guidance_start, list) and isinstance(
146
+ control_guidance_end, list
147
+ ):
148
+ control_guidance_start = len(control_guidance_end) * [
149
+ control_guidance_start
150
+ ]
151
+ elif not isinstance(control_guidance_end, list) and isinstance(
152
+ control_guidance_start, list
153
+ ):
154
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
155
+ elif not isinstance(control_guidance_start, list) and not isinstance(
156
+ control_guidance_end, list
157
+ ):
158
+ mult = (
159
+ len(controlnet.nets)
160
+ if isinstance(controlnet, MultiControlNetModel)
161
+ else 1
162
+ )
163
+ control_guidance_start, control_guidance_end = mult * [
164
+ control_guidance_start
165
+ ], mult * [control_guidance_end]
166
+
167
+ if isinstance(controlnet, MultiControlNetModel) and isinstance(
168
+ controlnet_conditioning_scale, float
169
+ ):
170
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(
171
+ controlnet.nets
172
+ )
173
+
174
+ global_pool_conditions = (
175
+ controlnet.config.global_pool_conditions
176
+ if isinstance(controlnet, ControlNetModel)
177
+ else controlnet.nets[0].config.global_pool_conditions
178
+ )
179
+ guess_mode = guess_mode or global_pool_conditions
180
+
181
+ # 4. Prepare image
182
+ if isinstance(controlnet, ControlNetModel):
183
+ if (
184
+ controlnet_latents is not None
185
+ and controlnet_condition_latents is not None
186
+ ):
187
+ if isinstance(controlnet_latents, np.ndarray):
188
+ controlnet_latents = torch.from_numpy(controlnet_latents)
189
+ if isinstance(controlnet_condition_latents, np.ndarray):
190
+ controlnet_condition_latents = torch.from_numpy(
191
+ controlnet_condition_latents
192
+ )
193
+ # TODO: concat with index
194
+ controlnet_latents = torch.concat(
195
+ [controlnet_condition_latents, controlnet_latents], dim=2
196
+ )
197
+ if not guess_mode and do_classifier_free_guidance:
198
+ controlnet_latents = torch.concat([controlnet_latents] * 2, dim=0)
199
+ controlnet_latents = rearrange(
200
+ controlnet_latents, "b c t h w->(b t) c h w"
201
+ )
202
+ controlnet_latents = controlnet_latents.to(device=device, dtype=dtype)
203
+ else:
204
+ # TODO: concat with index
206
+ if controlnet_condition_frames is not None:
207
+ if isinstance(controlnet_condition_frames, np.ndarray):
208
+ image = np.concatenate(
209
+ [controlnet_condition_frames, image], axis=2
210
+ )
211
+ image = self.prepare_image(
212
+ image=image,
213
+ width=width,
214
+ height=height,
215
+ batch_size=batch_size * num_videos_per_prompt,
216
+ num_images_per_prompt=num_videos_per_prompt,
217
+ device=device,
218
+ dtype=controlnet.dtype,
219
+ do_classifier_free_guidance=do_classifier_free_guidance,
220
+ guess_mode=guess_mode,
221
+ )
222
+ height, width = image.shape[-2:]
223
+ elif isinstance(controlnet, MultiControlNetModel):
224
+ images = []
225
+ # TODO: 支持直接使用controlnet_latent而不是frames
226
+ # TODO: support using controlnet_latent directly instead of frames
227
+ if controlnet_latents is not None:
228
+ raise NotImplementedError
229
+ else:
230
+ for i, image_ in enumerate(image):
231
+ if controlnet_condition_frames is not None and isinstance(
232
+ controlnet_condition_frames, list
233
+ ):
234
+ if isinstance(controlnet_condition_frames[i], np.ndarray):
235
+ image_ = np.concatenate(
236
+ [controlnet_condition_frames[i], image_], axis=2
237
+ )
238
+ image_ = self.prepare_image(
239
+ image=image_,
240
+ width=width,
241
+ height=height,
242
+ batch_size=batch_size * num_videos_per_prompt,
243
+ num_images_per_prompt=num_videos_per_prompt,
244
+ device=device,
245
+ dtype=controlnet.dtype,
246
+ do_classifier_free_guidance=do_classifier_free_guidance,
247
+ guess_mode=guess_mode,
248
+ )
249
+
250
+ images.append(image_)
251
+
252
+ image = images
253
+ height, width = image[0].shape[-2:]
254
+ else:
255
+ assert False
256
+
257
+ # 7.1 Create tensor stating which controlnets to keep
258
+ controlnet_keep = []
259
+ for i in range(len(timesteps)):
260
+ keeps = [
261
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
262
+ for s, e in zip(control_guidance_start, control_guidance_end)
263
+ ]
264
+ controlnet_keep.append(
265
+ keeps[0] if isinstance(controlnet, ControlNetModel) else keeps
266
+ )
267
+
268
+ t = timesteps[i]
269
+
270
+ # controlnet(s) inference
271
+ if guess_mode and do_classifier_free_guidance:
272
+ # Infer ControlNet only for the conditional batch.
273
+ control_model_input = latents
274
+ control_model_input = scheduler.scale_model_input(control_model_input, t)
275
+ controlnet_prompt_embeds = prompt_embeds.chunk(2)[1]
276
+ else:
277
+ control_model_input = latent_model_input
278
+ controlnet_prompt_embeds = prompt_embeds
279
+ if isinstance(controlnet_keep[i], list):
280
+ cond_scale = [
281
+ c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])
282
+ ]
283
+ else:
284
+ cond_scale = controlnet_conditioning_scale * controlnet_keep[i]
285
+ control_model_input_reshape = rearrange(
286
+ control_model_input, "b c t h w -> (b t) c h w"
287
+ )
288
+ encoder_hidden_states_repeat = repeat(
289
+ controlnet_prompt_embeds,
290
+ "b n q->(b t) n q",
291
+ t=control_model_input.shape[2],
292
+ )
293
+
294
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
295
+ control_model_input_reshape,
296
+ t,
297
+ encoder_hidden_states_repeat,
298
+ controlnet_cond=image,
299
+ controlnet_cond_latents=controlnet_latents,
300
+ conditioning_scale=cond_scale,
301
+ guess_mode=guess_mode,
302
+ return_dict=False,
303
+ )
304
+
305
+ return down_block_res_samples, mid_block_res_sample
306
+
307
+
308
+ class InflatedConv3d(nn.Conv2d):
309
+ def forward(self, x):
310
+ video_length = x.shape[2]
311
+
312
+ x = rearrange(x, "b c f h w -> (b f) c h w")
313
+ x = super().forward(x)
314
+ x = rearrange(x, "(b f) c h w -> b c f h w", f=video_length)
315
+
316
+ return x
317
+
318
+
319
+ def zero_module(module):
320
+ # Zero out the parameters of a module and return it.
321
+ for p in module.parameters():
322
+ p.detach().zero_()
323
+ return module
324
+
325
+
326
+ class PoseGuider(ModelMixin):
327
+ def __init__(
328
+ self,
329
+ conditioning_embedding_channels: int,
330
+ conditioning_channels: int = 3,
331
+ block_out_channels: Tuple[int] = (16, 32, 64, 128),
332
+ ):
333
+ super().__init__()
334
+ self.conv_in = InflatedConv3d(
335
+ conditioning_channels, block_out_channels[0], kernel_size=3, padding=1
336
+ )
337
+
338
+ self.blocks = nn.ModuleList([])
339
+
340
+ for i in range(len(block_out_channels) - 1):
341
+ channel_in = block_out_channels[i]
342
+ channel_out = block_out_channels[i + 1]
343
+ self.blocks.append(
344
+ InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1)
345
+ )
346
+ self.blocks.append(
347
+ InflatedConv3d(
348
+ channel_in, channel_out, kernel_size=3, padding=1, stride=2
349
+ )
350
+ )
351
+
352
+ self.conv_out = zero_module(
353
+ InflatedConv3d(
354
+ block_out_channels[-1],
355
+ conditioning_embedding_channels,
356
+ kernel_size=3,
357
+ padding=1,
358
+ )
359
+ )
360
+
361
+ def forward(self, conditioning):
362
+ embedding = self.conv_in(conditioning)
363
+ embedding = F.silu(embedding)
364
+
365
+ for block in self.blocks:
366
+ embedding = block(embedding)
367
+ embedding = F.silu(embedding)
368
+
369
+ embedding = self.conv_out(embedding)
370
+
371
+ return embedding
372
+
373
+ @classmethod
374
+ def from_pretrained(
375
+ cls,
376
+ pretrained_model_path,
377
+ conditioning_embedding_channels: int,
378
+ conditioning_channels: int = 3,
379
+ block_out_channels: Tuple[int] = (16, 32, 64, 128),
380
+ ):
381
+ if not os.path.exists(pretrained_model_path):
382
+ print(f"There is no model file in {pretrained_model_path}")
383
+ print(
384
+ f"loaded PoseGuider's pretrained weights from {pretrained_model_path} ..."
385
+ )
386
+
387
+ state_dict = torch.load(pretrained_model_path, map_location="cpu")
388
+ model = PoseGuider(
389
+ conditioning_embedding_channels=conditioning_embedding_channels,
390
+ conditioning_channels=conditioning_channels,
391
+ block_out_channels=block_out_channels,
392
+ )
393
+
394
+ m, u = model.load_state_dict(state_dict, strict=False)
395
+ # print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
396
+ params = [p.numel() for n, p in model.named_parameters()]
397
+ print(f"### PoseGuider's Parameters: {sum(params) / 1e6} M")
398
+
399
+ return model
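
As a quick sanity check of the `PoseGuider` defined above: with the default `block_out_channels=(16, 32, 64, 128)` it applies three stride-2 convolutions, so a pose video is downsampled 8x spatially and projected to `conditioning_embedding_channels`, and the final conv is zero-initialized by `zero_module`. The import path matches the file added in this commit; the tensor sizes below are assumptions for illustration.

```python
import torch

from musev.models.controlnet import PoseGuider  # module path as added in this commit

pose_guider = PoseGuider(conditioning_embedding_channels=320)
pose_video = torch.randn(1, 3, 16, 512, 512)  # b c f h w
with torch.no_grad():
    emb = pose_guider(pose_video)
print(emb.shape)  # torch.Size([1, 320, 16, 64, 64]); all zeros until trained, since conv_out starts at zero
```
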
musev/models/embeddings.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from einops import rearrange
16
+ import torch
17
+ from torch.nn import functional as F
18
+ import numpy as np
19
+
20
+ from diffusers.models.embeddings import get_2d_sincos_pos_embed_from_grid
21
+
22
+
23
+ # ref diffusers.models.embeddings.get_2d_sincos_pos_embed
24
+ def get_2d_sincos_pos_embed(
25
+ embed_dim,
26
+ grid_size_w,
27
+ grid_size_h,
28
+ cls_token=False,
29
+ extra_tokens=0,
30
+ norm_length: bool = False,
31
+ max_length: float = 2048,
32
+ ):
33
+ """
34
+ grid_size_w, grid_size_h: width and height of the grid. return: pos_embed: [grid_size_w*grid_size_h, embed_dim] or
35
+ [extra_tokens+grid_size_w*grid_size_h, embed_dim] (w/ or w/o cls_token)
36
+ """
37
+ if norm_length and grid_size_h <= max_length and grid_size_w <= max_length:
38
+ grid_h = np.linspace(0, max_length, grid_size_h)
39
+ grid_w = np.linspace(0, max_length, grid_size_w)
40
+ else:
41
+ grid_h = np.arange(grid_size_h, dtype=np.float32)
42
+ grid_w = np.arange(grid_size_w, dtype=np.float32)
43
+ grid = np.meshgrid(grid_h, grid_w) # here h goes first
44
+ grid = np.stack(grid, axis=0)
45
+
46
+ grid = grid.reshape([2, 1, grid_size_h, grid_size_w])
47
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
48
+ if cls_token and extra_tokens > 0:
49
+ pos_embed = np.concatenate(
50
+ [np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0
51
+ )
52
+ return pos_embed
53
+
54
+
55
+ def resize_spatial_position_emb(
56
+ emb: torch.Tensor,
57
+ height: int,
58
+ width: int,
59
+ scale: float = None,
60
+ target_height: int = None,
61
+ target_width: int = None,
62
+ ) -> torch.Tensor:
63
+ """_summary_
64
+
65
+ Args:
66
+ emb (torch.Tensor): b ( h w) d
67
+ height (int): _description_
68
+ width (int): _description_
69
+ scale (float, optional): _description_. Defaults to None.
70
+ target_height (int, optional): _description_. Defaults to None.
71
+ target_width (int, optional): _description_. Defaults to None.
72
+
73
+ Returns:
74
+ torch.Tensor: b (target_height target_width) d
75
+ """
76
+ if scale is not None:
77
+ target_height = int(height * scale)
78
+ target_width = int(width * scale)
79
+ emb = rearrange(emb, "(h w) (b d) ->b d h w", h=height, b=1)
80
+ emb = F.interpolate(
81
+ emb,
82
+ size=(target_height, target_width),
83
+ mode="bicubic",
84
+ align_corners=False,
85
+ )
86
+ emb = rearrange(emb, "b d h w-> (h w) (b d)")
87
+ return emb
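
A small usage sketch for the two helpers above (the resolutions are arbitrary assumptions): build a sin-cos table for a 16x16 latent grid, then resize it bicubically to 24x24 tokens when the spatial resolution changes.

```python
import torch

from musev.models.embeddings import (  # module path as added in this commit
    get_2d_sincos_pos_embed,
    resize_spatial_position_emb,
)

pos = get_2d_sincos_pos_embed(embed_dim=320, grid_size_w=16, grid_size_h=16)
pos = torch.from_numpy(pos).float()  # (16 * 16, 320)

resized = resize_spatial_position_emb(
    pos, height=16, width=16, target_height=24, target_width=24
)
print(pos.shape, resized.shape)  # torch.Size([256, 320]) torch.Size([576, 320])
```
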
musev/models/facein_loader.py ADDED
@@ -0,0 +1,120 @@
1
+ import copy
2
+ from typing import Any, Callable, Dict, Iterable, Union
3
+ import PIL
4
+ import cv2
5
+ import torch
6
+ import argparse
7
+ import datetime
8
+ import logging
9
+ import inspect
10
+ import math
11
+ import os
12
+ import shutil
13
+ from typing import Dict, List, Optional, Tuple
14
+ from pprint import pprint
15
+ from collections import OrderedDict
16
+ from dataclasses import dataclass
17
+ import gc
18
+ import time
19
+
20
+ import numpy as np
21
+ from omegaconf import OmegaConf
22
+ from omegaconf import SCMode
23
+ import torch
24
+ from torch import nn
25
+ import torch.nn.functional as F
26
+ import torch.utils.checkpoint
27
+ from einops import rearrange, repeat
28
+ import pandas as pd
29
+ import h5py
30
+ from diffusers.models.modeling_utils import load_state_dict
31
+ from diffusers.utils import (
32
+ logging,
33
+ )
34
+ from diffusers.utils.import_utils import is_xformers_available
35
+
36
+ from mmcm.vision.feature_extractor.clip_vision_extractor import (
37
+ ImageClipVisionFeatureExtractor,
38
+ ImageClipVisionFeatureExtractorV2,
39
+ )
40
+ from mmcm.vision.feature_extractor.insight_face_extractor import InsightFaceExtractor
41
+
42
+ from ip_adapter.resampler import Resampler
43
+ from ip_adapter.ip_adapter import ImageProjModel
44
+
45
+ from .unet_loader import update_unet_with_sd
46
+ from .unet_3d_condition import UNet3DConditionModel
47
+ from .ip_adapter_loader import ip_adapter_keys_list
48
+
49
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
50
+
51
+
52
+ # refer https://github.com/tencent-ailab/IP-Adapter/issues/168#issuecomment-1846771651
53
+ unet_keys_list = [
54
+ "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
55
+ "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
56
+ "down_blocks.0.attentions.1.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
57
+ "down_blocks.0.attentions.1.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
58
+ "down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
59
+ "down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
60
+ "down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
61
+ "down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
62
+ "down_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
63
+ "down_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
64
+ "down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
65
+ "down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
66
+ "up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
67
+ "up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
68
+ "up_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
69
+ "up_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
70
+ "up_blocks.1.attentions.2.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
71
+ "up_blocks.1.attentions.2.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
72
+ "up_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
73
+ "up_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
74
+ "up_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
75
+ "up_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
76
+ "up_blocks.2.attentions.2.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
77
+ "up_blocks.2.attentions.2.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
78
+ "up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
79
+ "up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
80
+ "up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
81
+ "up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
82
+ "up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
83
+ "up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
84
+ "mid_block.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
85
+ "mid_block.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
86
+ ]
87
+
88
+
89
+ UNET2IPAadapter_Keys_MAPIING = {
90
+ k: v for k, v in zip(unet_keys_list, ip_adapter_keys_list)
91
+ }
92
+
93
+
94
+ def load_facein_extractor_and_proj_by_name(
95
+ model_name: str,
96
+ ip_ckpt: Tuple[str, nn.Module],
97
+ ip_image_encoder: Tuple[str, nn.Module] = None,
98
+ cross_attention_dim: int = 768,
99
+ clip_embeddings_dim: int = 512,
100
+ clip_extra_context_tokens: int = 1,
101
+ ip_scale: float = 0.0,
102
+ dtype: torch.dtype = torch.float16,
103
+ device: str = "cuda",
104
+ unet: nn.Module = None,
105
+ ) -> nn.Module:
106
+ pass
107
+
108
+
109
+ def update_unet_facein_cross_attn_param(
110
+ unet: UNet3DConditionModel, ip_adapter_state_dict: Dict
111
+ ) -> None:
112
+ """use independent ip_adapter attn 中的 to_k, to_v in unet
113
+ ip_adapter: like ['1.to_k_ip.weight', '1.to_v_ip.weight', '3.to_k_ip.weight']的字典
114
+
115
+
116
+ Args:
117
+ unet (UNet3DConditionModel): _description_
118
+ ip_adapter_state_dict (Dict): _description_
119
+ """
120
+ pass
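
The loader above revolves around `UNET2IPAadapter_Keys_MAPIING`: FaceIn/IP-Adapter checkpoints store cross-attention weights under numeric keys, which are zipped against the UNet attention-processor paths so each tensor can be copied into the matching `facein_to_k_ip`/`facein_to_v_ip` parameter. A self-contained sketch of that pairing (lists truncated to two keys; no new behavior):

```python
unet_keys = [
    "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.facein_to_k_ip.weight",
    "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.facein_to_v_ip.weight",
]
ip_adapter_keys = ["1.to_k_ip.weight", "1.to_v_ip.weight"]

unet2ip = dict(zip(unet_keys, ip_adapter_keys))
for unet_key, ip_key in unet2ip.items():
    print(f"{ip_key} -> {unet_key}")
```
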
musev/models/ip_adapter_face_loader.py ADDED
@@ -0,0 +1,179 @@
1
+ import copy
2
+ from typing import Any, Callable, Dict, Iterable, Union
3
+ import PIL
4
+ import cv2
5
+ import torch
6
+ import argparse
7
+ import datetime
8
+ import logging
9
+ import inspect
10
+ import math
11
+ import os
12
+ import shutil
13
+ from typing import Dict, List, Optional, Tuple
14
+ from pprint import pprint
15
+ from collections import OrderedDict
16
+ from dataclasses import dataclass
17
+ import gc
18
+ import time
19
+
20
+ import numpy as np
21
+ from omegaconf import OmegaConf
22
+ from omegaconf import SCMode
23
+ import torch
24
+ from torch import nn
25
+ import torch.nn.functional as F
26
+ import torch.utils.checkpoint
27
+ from einops import rearrange, repeat
28
+ import pandas as pd
29
+ import h5py
30
+ from diffusers.models.modeling_utils import load_state_dict
31
+ from diffusers.utils import (
32
+ logging,
33
+ )
34
+ from diffusers.utils.import_utils import is_xformers_available
35
+
36
+ from ip_adapter.resampler import Resampler
37
+ from ip_adapter.ip_adapter import ImageProjModel
38
+ from ip_adapter.ip_adapter_faceid import ProjPlusModel, MLPProjModel
39
+
40
+ from mmcm.vision.feature_extractor.clip_vision_extractor import (
41
+ ImageClipVisionFeatureExtractor,
42
+ ImageClipVisionFeatureExtractorV2,
43
+ )
44
+ from mmcm.vision.feature_extractor.insight_face_extractor import (
45
+ InsightFaceExtractorNormEmb,
46
+ )
47
+
48
+
49
+ from .unet_loader import update_unet_with_sd
50
+ from .unet_3d_condition import UNet3DConditionModel
51
+ from .ip_adapter_loader import ip_adapter_keys_list
52
+
53
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
54
+
55
+
56
+ # refer https://github.com/tencent-ailab/IP-Adapter/issues/168#issuecomment-1846771651
57
+ unet_keys_list = [
58
+ "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
59
+ "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
60
+ "down_blocks.0.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
61
+ "down_blocks.0.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
62
+ "down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
63
+ "down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
64
+ "down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
65
+ "down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
66
+ "down_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
67
+ "down_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
68
+ "down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
69
+ "down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
70
+ "up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
71
+ "up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
72
+ "up_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
73
+ "up_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
74
+ "up_blocks.1.attentions.2.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
75
+ "up_blocks.1.attentions.2.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
76
+ "up_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
77
+ "up_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
78
+ "up_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
79
+ "up_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
80
+ "up_blocks.2.attentions.2.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
81
+ "up_blocks.2.attentions.2.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
82
+ "up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
83
+ "up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
84
+ "up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
85
+ "up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
86
+ "up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
87
+ "up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
88
+ "mid_block.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_k_ip.weight",
89
+ "mid_block.attentions.0.transformer_blocks.0.attn2.processor.ip_adapter_face_to_v_ip.weight",
90
+ ]
91
+
92
+
93
+ UNET2IPAadapter_Keys_MAPIING = {
94
+ k: v for k, v in zip(unet_keys_list, ip_adapter_keys_list)
95
+ }
96
+
97
+
98
+ def load_ip_adapter_face_extractor_and_proj_by_name(
99
+ model_name: str,
100
+ ip_ckpt: Tuple[str, nn.Module],
101
+ ip_image_encoder: Tuple[str, nn.Module] = None,
102
+ cross_attention_dim: int = 768,
103
+ clip_embeddings_dim: int = 1024,
104
+ clip_extra_context_tokens: int = 4,
105
+ ip_scale: float = 0.0,
106
+ dtype: torch.dtype = torch.float16,
107
+ device: str = "cuda",
108
+ unet: nn.Module = None,
109
+ ) -> nn.Module:
110
+ if model_name == "IPAdapterFaceID":
111
+ if ip_image_encoder is not None:
112
+ ip_adapter_face_emb_extractor = InsightFaceExtractorNormEmb(
113
+ pretrained_model_name_or_path=ip_image_encoder,
114
+ dtype=dtype,
115
+ device=device,
116
+ )
117
+ else:
118
+ ip_adapter_face_emb_extractor = None
119
+ ip_adapter_image_proj = MLPProjModel(
120
+ cross_attention_dim=cross_attention_dim,
121
+ id_embeddings_dim=clip_embeddings_dim,
122
+ num_tokens=clip_extra_context_tokens,
123
+ ).to(device, dtype=dtype)
124
+ else:
125
+ raise ValueError(
126
+ f"unsupport model_name={model_name}, only support IPAdapter, IPAdapterPlus, IPAdapterFaceID"
127
+ )
128
+ ip_adapter_state_dict = torch.load(
129
+ ip_ckpt,
130
+ map_location="cpu",
131
+ )
132
+ ip_adapter_image_proj.load_state_dict(ip_adapter_state_dict["image_proj"])
133
+ if unet is not None and "ip_adapter" in ip_adapter_state_dict:
134
+ update_unet_ip_adapter_cross_attn_param(
135
+ unet,
136
+ ip_adapter_state_dict["ip_adapter"],
137
+ )
138
+ logger.info(
139
+ f"update unet.spatial_cross_attn_ip_adapter parameter with {ip_ckpt}"
140
+ )
141
+ return (
142
+ ip_adapter_face_emb_extractor,
143
+ ip_adapter_image_proj,
144
+ )
145
+
146
+
147
+ def update_unet_ip_adapter_cross_attn_param(
148
+ unet: UNet3DConditionModel, ip_adapter_state_dict: Dict
149
+ ) -> None:
150
+ """use independent ip_adapter attn 中的 to_k, to_v in unet
151
+ ip_adapter: like ['1.to_k_ip.weight', '1.to_v_ip.weight', '3.to_k_ip.weight']
152
+
153
+
154
+ Args:
155
+ unet (UNet3DConditionModel): _description_
156
+ ip_adapter_state_dict (Dict): _description_
157
+ """
158
+ unet_spatial_cross_atnns = unet.spatial_cross_attns[0]
159
+ unet_spatial_cross_atnns_dct = {k: v for k, v in unet_spatial_cross_atnns}
160
+ for i, (unet_key_more, ip_adapter_key) in enumerate(
161
+ UNET2IPAadapter_Keys_MAPIING.items()
162
+ ):
163
+ ip_adapter_value = ip_adapter_state_dict[ip_adapter_key]
164
+ unet_key_more_spit = unet_key_more.split(".")
165
+ unet_key = ".".join(unet_key_more_spit[:-3])
166
+ suffix = ".".join(unet_key_more_spit[-3:])
167
+ logger.debug(
168
+ f"{i}: unet_key_more = {unet_key_more}, {unet_key}=unet_key, suffix={suffix}",
169
+ )
170
+ if ".ip_adapter_face_to_k" in suffix:
171
+ with torch.no_grad():
172
+ unet_spatial_cross_atnns_dct[
173
+ unet_key
174
+ ].ip_adapter_face_to_k_ip.weight.copy_(ip_adapter_value.data)
175
+ else:
176
+ with torch.no_grad():
177
+ unet_spatial_cross_atnns_dct[
178
+ unet_key
179
+ ].ip_adapter_face_to_v_ip.weight.copy_(ip_adapter_value.data)
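
`load_ip_adapter_face_extractor_and_proj_by_name` above expects the checkpoint layout used by upstream IP-Adapter releases: a single torch file whose `"image_proj"` entry feeds the projection model and whose `"ip_adapter"` entry holds the per-layer `to_k_ip`/`to_v_ip` weights copied into the UNet. The file path below is a placeholder, not something shipped with this commit.

```python
import torch

# Inspect an IP-Adapter-FaceID checkpoint before passing it as `ip_ckpt`.
ckpt = torch.load("path/to/ip-adapter-faceid.bin", map_location="cpu")  # placeholder path
print(sorted(ckpt.keys()))                  # ['image_proj', 'ip_adapter']
print(list(ckpt["ip_adapter"].keys())[:4])  # e.g. ['1.to_k_ip.weight', '1.to_v_ip.weight', ...]
```
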
musev/models/ip_adapter_loader.py ADDED
@@ -0,0 +1,340 @@
1
+ import copy
2
+ from typing import Any, Callable, Dict, Iterable, Union
3
+ import PIL
4
+ import cv2
5
+ import torch
6
+ import argparse
7
+ import datetime
8
+ import logging
9
+ import inspect
10
+ import math
11
+ import os
12
+ import shutil
13
+ from typing import Dict, List, Optional, Tuple
14
+ from pprint import pprint
15
+ from collections import OrderedDict
16
+ from dataclasses import dataclass
17
+ import gc
18
+ import time
19
+
20
+ import numpy as np
21
+ from omegaconf import OmegaConf
22
+ from omegaconf import SCMode
23
+ import torch
24
+ from torch import nn
25
+ import torch.nn.functional as F
26
+ import torch.utils.checkpoint
27
+ from einops import rearrange, repeat
28
+ import pandas as pd
29
+ import h5py
30
+ from diffusers.models.modeling_utils import load_state_dict
31
+ from diffusers.utils import (
32
+ logging,
33
+ )
34
+ from diffusers.utils.import_utils import is_xformers_available
35
+
36
+ from mmcm.vision.feature_extractor import clip_vision_extractor
37
+ from mmcm.vision.feature_extractor.clip_vision_extractor import (
38
+ ImageClipVisionFeatureExtractor,
39
+ ImageClipVisionFeatureExtractorV2,
40
+ VerstailSDLastHiddenState2ImageEmb,
41
+ )
42
+
43
+ from ip_adapter.resampler import Resampler
44
+ from ip_adapter.ip_adapter import ImageProjModel
45
+
46
+ from .unet_loader import update_unet_with_sd
47
+ from .unet_3d_condition import UNet3DConditionModel
48
+
49
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
50
+
51
+
52
+ def load_vision_clip_encoder_by_name(
53
+ ip_image_encoder: Tuple[str, nn.Module] = None,
54
+ dtype: torch.dtype = torch.float16,
55
+ device: str = "cuda",
56
+ vision_clip_extractor_class_name: str = None,
57
+ ) -> nn.Module:
58
+ if vision_clip_extractor_class_name is not None:
59
+ vision_clip_extractor = getattr(
60
+ clip_vision_extractor, vision_clip_extractor_class_name
61
+ )(
62
+ pretrained_model_name_or_path=ip_image_encoder,
63
+ dtype=dtype,
64
+ device=device,
65
+ )
66
+ else:
67
+ vision_clip_extractor = None
68
+ return vision_clip_extractor
69
+
70
+
71
+ def load_ip_adapter_image_proj_by_name(
72
+ model_name: str,
73
+ ip_ckpt: Tuple[str, nn.Module] = None,
74
+ cross_attention_dim: int = 768,
75
+ clip_embeddings_dim: int = 1024,
76
+ clip_extra_context_tokens: int = 4,
77
+ ip_scale: float = 0.0,
78
+ dtype: torch.dtype = torch.float16,
79
+ device: str = "cuda",
80
+ unet: nn.Module = None,
81
+ vision_clip_extractor_class_name: str = None,
82
+ ip_image_encoder: Tuple[str, nn.Module] = None,
83
+ ) -> nn.Module:
84
+ if model_name in [
85
+ "IPAdapter",
86
+ "musev_referencenet",
87
+ "musev_referencenet_pose",
88
+ ]:
89
+ ip_adapter_image_proj = ImageProjModel(
90
+ cross_attention_dim=cross_attention_dim,
91
+ clip_embeddings_dim=clip_embeddings_dim,
92
+ clip_extra_context_tokens=clip_extra_context_tokens,
93
+ )
94
+
95
+ elif model_name == "IPAdapterPlus":
96
+ vision_clip_extractor = ImageClipVisionFeatureExtractorV2(
97
+ pretrained_model_name_or_path=ip_image_encoder,
98
+ dtype=dtype,
99
+ device=device,
100
+ )
101
+ ip_adapter_image_proj = Resampler(
102
+ dim=cross_attention_dim,
103
+ depth=4,
104
+ dim_head=64,
105
+ heads=12,
106
+ num_queries=clip_extra_context_tokens,
107
+ embedding_dim=vision_clip_extractor.image_encoder.config.hidden_size,
108
+ output_dim=cross_attention_dim,
109
+ ff_mult=4,
110
+ )
111
+ elif model_name in [
112
+ "VerstailSDLastHiddenState2ImageEmb",
113
+ "OriginLastHiddenState2ImageEmbd",
114
+ "OriginLastHiddenState2Poolout",
115
+ ]:
116
+ ip_adapter_image_proj = getattr(
117
+ clip_vision_extractor, model_name
118
+ ).from_pretrained(ip_image_encoder)
119
+ else:
120
+ raise ValueError(
121
+ f"unsupport model_name={model_name}, only support IPAdapter, IPAdapterPlus, VerstailSDLastHiddenState2ImageEmb"
122
+ )
123
+ if ip_ckpt is not None:
124
+ ip_adapter_state_dict = torch.load(
125
+ ip_ckpt,
126
+ map_location="cpu",
127
+ )
128
+ ip_adapter_image_proj.load_state_dict(ip_adapter_state_dict["image_proj"])
129
+ if (
130
+ unet is not None
131
+ and unet.ip_adapter_cross_attn
132
+ and "ip_adapter" in ip_adapter_state_dict
133
+ ):
134
+ update_unet_ip_adapter_cross_attn_param(
135
+ unet, ip_adapter_state_dict["ip_adapter"]
136
+ )
137
+ logger.info(
138
+ f"update unet.spatial_cross_attn_ip_adapter parameter with {ip_ckpt}"
139
+ )
140
+ return ip_adapter_image_proj
141
+
142
+
143
+ def load_ip_adapter_vision_clip_encoder_by_name(
144
+ model_name: str,
145
+ ip_ckpt: Tuple[str, nn.Module],
146
+ ip_image_encoder: Tuple[str, nn.Module] = None,
147
+ cross_attention_dim: int = 768,
148
+ clip_embeddings_dim: int = 1024,
149
+ clip_extra_context_tokens: int = 4,
150
+ ip_scale: float = 0.0,
151
+ dtype: torch.dtype = torch.float16,
152
+ device: str = "cuda",
153
+ unet: nn.Module = None,
154
+ vision_clip_extractor_class_name: str = None,
155
+ ) -> nn.Module:
156
+ if vision_clip_extractor_class_name is not None:
157
+ vision_clip_extractor = getattr(
158
+ clip_vision_extractor, vision_clip_extractor_class_name
159
+ )(
160
+ pretrained_model_name_or_path=ip_image_encoder,
161
+ dtype=dtype,
162
+ device=device,
163
+ )
164
+ else:
165
+ vision_clip_extractor = None
166
+ if model_name in [
167
+ "IPAdapter",
168
+ "musev_referencenet",
169
+ ]:
170
+ if ip_image_encoder is not None:
171
+ if vision_clip_extractor_class_name is None:
172
+ vision_clip_extractor = ImageClipVisionFeatureExtractor(
173
+ pretrained_model_name_or_path=ip_image_encoder,
174
+ dtype=dtype,
175
+ device=device,
176
+ )
177
+ else:
178
+ vision_clip_extractor = None
179
+ ip_adapter_image_proj = ImageProjModel(
180
+ cross_attention_dim=cross_attention_dim,
181
+ clip_embeddings_dim=clip_embeddings_dim,
182
+ clip_extra_context_tokens=clip_extra_context_tokens,
183
+ )
184
+
185
+ elif model_name == "IPAdapterPlus":
186
+ if ip_image_encoder is not None:
187
+ if vision_clip_extractor_class_name is None:
188
+ vision_clip_extractor = ImageClipVisionFeatureExtractorV2(
189
+ pretrained_model_name_or_path=ip_image_encoder,
190
+ dtype=dtype,
191
+ device=device,
192
+ )
193
+ else:
194
+ vision_clip_extractor = None
195
+ ip_adapter_image_proj = Resampler(
196
+ dim=cross_attention_dim,
197
+ depth=4,
198
+ dim_head=64,
199
+ heads=12,
200
+ num_queries=clip_extra_context_tokens,
201
+ embedding_dim=vision_clip_extractor.image_encoder.config.hidden_size,
202
+ output_dim=cross_attention_dim,
203
+ ff_mult=4,
204
+ ).to(dtype=torch.float16)
205
+ else:
206
+ raise ValueError(
207
+ f"unsupport model_name={model_name}, only support IPAdapter, IPAdapterPlus"
208
+ )
209
+ ip_adapter_state_dict = torch.load(
210
+ ip_ckpt,
211
+ map_location="cpu",
212
+ )
213
+ ip_adapter_image_proj.load_state_dict(ip_adapter_state_dict["image_proj"])
214
+ if (
215
+ unet is not None
216
+ and unet.ip_adapter_cross_attn
217
+ and "ip_adapter" in ip_adapter_state_dict
218
+ ):
219
+ update_unet_ip_adapter_cross_attn_param(
220
+ unet, ip_adapter_state_dict["ip_adapter"]
221
+ )
222
+ logger.info(
223
+ f"update unet.spatial_cross_attn_ip_adapter parameter with {ip_ckpt}"
224
+ )
225
+ return (
226
+ vision_clip_extractor,
227
+ ip_adapter_image_proj,
228
+ )
229
+
230
+
231
+ # refer https://github.com/tencent-ailab/IP-Adapter/issues/168#issuecomment-1846771651
232
+ unet_keys_list = [
233
+ "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.to_k_ip.weight",
234
+ "down_blocks.0.attentions.0.transformer_blocks.0.attn2.processor.to_v_ip.weight",
235
+ "down_blocks.0.attentions.1.transformer_blocks.0.attn2.processor.to_k_ip.weight",
236
+ "down_blocks.0.attentions.1.transformer_blocks.0.attn2.processor.to_v_ip.weight",
237
+ "down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.to_k_ip.weight",
238
+ "down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.to_v_ip.weight",
239
+ "down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.to_k_ip.weight",
240
+ "down_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.to_v_ip.weight",
241
+ "down_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.to_k_ip.weight",
242
+ "down_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.to_v_ip.weight",
243
+ "down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.to_k_ip.weight",
244
+ "down_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.to_v_ip.weight",
245
+ "up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.to_k_ip.weight",
246
+ "up_blocks.1.attentions.0.transformer_blocks.0.attn2.processor.to_v_ip.weight",
247
+ "up_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.to_k_ip.weight",
248
+ "up_blocks.1.attentions.1.transformer_blocks.0.attn2.processor.to_v_ip.weight",
249
+ "up_blocks.1.attentions.2.transformer_blocks.0.attn2.processor.to_k_ip.weight",
250
+ "up_blocks.1.attentions.2.transformer_blocks.0.attn2.processor.to_v_ip.weight",
251
+ "up_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.to_k_ip.weight",
252
+ "up_blocks.2.attentions.0.transformer_blocks.0.attn2.processor.to_v_ip.weight",
253
+ "up_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.to_k_ip.weight",
254
+ "up_blocks.2.attentions.1.transformer_blocks.0.attn2.processor.to_v_ip.weight",
255
+ "up_blocks.2.attentions.2.transformer_blocks.0.attn2.processor.to_k_ip.weight",
256
+ "up_blocks.2.attentions.2.transformer_blocks.0.attn2.processor.to_v_ip.weight",
257
+ "up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor.to_k_ip.weight",
258
+ "up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor.to_v_ip.weight",
259
+ "up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor.to_k_ip.weight",
260
+ "up_blocks.3.attentions.1.transformer_blocks.0.attn2.processor.to_v_ip.weight",
261
+ "up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor.to_k_ip.weight",
262
+ "up_blocks.3.attentions.2.transformer_blocks.0.attn2.processor.to_v_ip.weight",
263
+ "mid_block.attentions.0.transformer_blocks.0.attn2.processor.to_k_ip.weight",
264
+ "mid_block.attentions.0.transformer_blocks.0.attn2.processor.to_v_ip.weight",
265
+ ]
266
+
267
+
268
+ ip_adapter_keys_list = [
269
+ "1.to_k_ip.weight",
270
+ "1.to_v_ip.weight",
271
+ "3.to_k_ip.weight",
272
+ "3.to_v_ip.weight",
273
+ "5.to_k_ip.weight",
274
+ "5.to_v_ip.weight",
275
+ "7.to_k_ip.weight",
276
+ "7.to_v_ip.weight",
277
+ "9.to_k_ip.weight",
278
+ "9.to_v_ip.weight",
279
+ "11.to_k_ip.weight",
280
+ "11.to_v_ip.weight",
281
+ "13.to_k_ip.weight",
282
+ "13.to_v_ip.weight",
283
+ "15.to_k_ip.weight",
284
+ "15.to_v_ip.weight",
285
+ "17.to_k_ip.weight",
286
+ "17.to_v_ip.weight",
287
+ "19.to_k_ip.weight",
288
+ "19.to_v_ip.weight",
289
+ "21.to_k_ip.weight",
290
+ "21.to_v_ip.weight",
291
+ "23.to_k_ip.weight",
292
+ "23.to_v_ip.weight",
293
+ "25.to_k_ip.weight",
294
+ "25.to_v_ip.weight",
295
+ "27.to_k_ip.weight",
296
+ "27.to_v_ip.weight",
297
+ "29.to_k_ip.weight",
298
+ "29.to_v_ip.weight",
299
+ "31.to_k_ip.weight",
300
+ "31.to_v_ip.weight",
301
+ ]
302
+
303
+ UNET2IPAadapter_Keys_MAPIING = {
304
+ k: v for k, v in zip(unet_keys_list, ip_adapter_keys_list)
305
+ }
306
+
307
+
308
+ def update_unet_ip_adapter_cross_attn_param(
309
+ unet: UNet3DConditionModel, ip_adapter_state_dict: Dict
310
+ ) -> None:
311
+ """use independent ip_adapter attn 中的 to_k, to_v in unet
312
+ ip_adapter: dict whose keys are ['1.to_k_ip.weight', '1.to_v_ip.weight', '3.to_k_ip.weight']
313
+
314
+
315
+ Args:
316
+ unet (UNet3DConditionModel): _description_
317
+ ip_adapter_state_dict (Dict): _description_
318
+ """
319
+ unet_spatial_cross_atnns = unet.spatial_cross_attns[0]
320
+ unet_spatial_cross_atnns_dct = {k: v for k, v in unet_spatial_cross_atnns}
321
+ for i, (unet_key_more, ip_adapter_key) in enumerate(
322
+ UNET2IPAadapter_Keys_MAPIING.items()
323
+ ):
324
+ ip_adapter_value = ip_adapter_state_dict[ip_adapter_key]
325
+ unet_key_more_spit = unet_key_more.split(".")
326
+ unet_key = ".".join(unet_key_more_spit[:-3])
327
+ suffix = ".".join(unet_key_more_spit[-3:])
328
+ logger.debug(
329
+ f"{i}: unet_key_more = {unet_key_more}, {unet_key}=unet_key, suffix={suffix}",
330
+ )
331
+ if "to_k" in suffix:
332
+ with torch.no_grad():
333
+ unet_spatial_cross_atnns_dct[unet_key].to_k_ip.weight.copy_(
334
+ ip_adapter_value.data
335
+ )
336
+ else:
337
+ with torch.no_grad():
338
+ unet_spatial_cross_atnns_dct[unet_key].to_v_ip.weight.copy_(
339
+ ip_adapter_value.data
340
+ )
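
A hedged usage sketch for `load_ip_adapter_image_proj_by_name` above, using the plain `"IPAdapter"` branch: it builds an `ImageProjModel` that turns one CLIP image embedding into `clip_extra_context_tokens` extra cross-attention tokens. With `ip_ckpt=None` nothing is loaded or patched; passing a real checkpoint path also loads the `"image_proj"` weights and, when a UNet is given, copies its `to_k_ip`/`to_v_ip` parameters. The argument values are assumptions for illustration, and the import requires the repo's MMCM / ip_adapter dependencies to be installed.

```python
import torch

from musev.models.ip_adapter_loader import load_ip_adapter_image_proj_by_name

image_proj = load_ip_adapter_image_proj_by_name(
    model_name="IPAdapter",
    ip_ckpt=None,                 # set to a real IP-Adapter .bin to load weights
    cross_attention_dim=768,
    clip_embeddings_dim=1024,
    clip_extra_context_tokens=4,
)
image_emb = torch.randn(1, 1024)  # a CLIP image embedding
tokens = image_proj(image_emb)
print(tokens.shape)               # torch.Size([1, 4, 768])
```
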
musev/models/referencenet.py ADDED
@@ -0,0 +1,1216 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+ import logging
19
+
20
+ import torch
21
+ from diffusers.models.attention_processor import Attention, AttnProcessor
22
+ from einops import rearrange, repeat
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+ import xformers
26
+ from diffusers.models.lora import LoRACompatibleLinear
27
+ from diffusers.models.unet_2d_condition import (
28
+ UNet2DConditionModel,
29
+ UNet2DConditionOutput,
30
+ )
31
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
32
+ from diffusers.utils.constants import USE_PEFT_BACKEND
33
+ from diffusers.utils.deprecation_utils import deprecate
34
+ from diffusers.utils.peft_utils import scale_lora_layers, unscale_lora_layers
35
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
36
+ from diffusers.models.modeling_utils import ModelMixin, load_state_dict
37
+ from diffusers.loaders import UNet2DConditionLoadersMixin
38
+ from diffusers.utils import (
39
+ USE_PEFT_BACKEND,
40
+ BaseOutput,
41
+ deprecate,
42
+ scale_lora_layers,
43
+ unscale_lora_layers,
44
+ )
45
+ from diffusers.models.activations import get_activation
46
+ from diffusers.models.attention_processor import (
47
+ ADDED_KV_ATTENTION_PROCESSORS,
48
+ CROSS_ATTENTION_PROCESSORS,
49
+ AttentionProcessor,
50
+ AttnAddedKVProcessor,
51
+ AttnProcessor,
52
+ )
53
+ from diffusers.models.embeddings import (
54
+ GaussianFourierProjection,
55
+ ImageHintTimeEmbedding,
56
+ ImageProjection,
57
+ ImageTimeEmbedding,
58
+ PositionNet,
59
+ TextImageProjection,
60
+ TextImageTimeEmbedding,
61
+ TextTimeEmbedding,
62
+ TimestepEmbedding,
63
+ Timesteps,
64
+ )
65
+ from diffusers.models.modeling_utils import ModelMixin
66
+
67
+
68
+ from ..data.data_util import align_repeat_tensor_single_dim
69
+ from .unet_3d_condition import UNet3DConditionModel
70
+ from .attention import BasicTransformerBlock, IPAttention
71
+ from .unet_2d_blocks import (
72
+ UNetMidBlock2D,
73
+ UNetMidBlock2DCrossAttn,
74
+ UNetMidBlock2DSimpleCrossAttn,
75
+ get_down_block,
76
+ get_up_block,
77
+ )
78
+
79
+ from . import Model_Register
80
+
81
+
82
+ logger = logging.getLogger(__name__)
83
+
84
+
85
+ @Model_Register.register
86
+ class ReferenceNet2D(UNet2DConditionModel, nn.Module):
87
+ """继承 UNet2DConditionModel. 新增功能,类似controlnet 返回模型中间特征,用于后续作用
88
+ Inherit Unet2DConditionModel. Add new functions, similar to controlnet, return the intermediate features of the model for subsequent effects
89
+ Args:
90
+ UNet2DConditionModel (_type_): _description_
91
+ """
92
+
93
+ _supports_gradient_checkpointing = True
94
+ print_idx = 0
95
+
96
+ @register_to_config
97
+ def __init__(
98
+ self,
99
+ sample_size: int | None = None,
100
+ in_channels: int = 4,
101
+ out_channels: int = 4,
102
+ center_input_sample: bool = False,
103
+ flip_sin_to_cos: bool = True,
104
+ freq_shift: int = 0,
105
+ down_block_types: Tuple[str] = (
106
+ "CrossAttnDownBlock2D",
107
+ "CrossAttnDownBlock2D",
108
+ "CrossAttnDownBlock2D",
109
+ "DownBlock2D",
110
+ ),
111
+ mid_block_type: str | None = "UNetMidBlock2DCrossAttn",
112
+ up_block_types: Tuple[str] = (
113
+ "UpBlock2D",
114
+ "CrossAttnUpBlock2D",
115
+ "CrossAttnUpBlock2D",
116
+ "CrossAttnUpBlock2D",
117
+ ),
118
+ only_cross_attention: bool | Tuple[bool] = False,
119
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
120
+ layers_per_block: int | Tuple[int] = 2,
121
+ downsample_padding: int = 1,
122
+ mid_block_scale_factor: float = 1,
123
+ dropout: float = 0,
124
+ act_fn: str = "silu",
125
+ norm_num_groups: int | None = 32,
126
+ norm_eps: float = 0.00001,
127
+ cross_attention_dim: int | Tuple[int] = 1280,
128
+ transformer_layers_per_block: int | Tuple[int] | Tuple[Tuple] = 1,
129
+ reverse_transformer_layers_per_block: Tuple[Tuple[int]] | None = None,
130
+ encoder_hid_dim: int | None = None,
131
+ encoder_hid_dim_type: str | None = None,
132
+ attention_head_dim: int | Tuple[int] = 8,
133
+ num_attention_heads: int | Tuple[int] | None = None,
134
+ dual_cross_attention: bool = False,
135
+ use_linear_projection: bool = False,
136
+ class_embed_type: str | None = None,
137
+ addition_embed_type: str | None = None,
138
+ addition_time_embed_dim: int | None = None,
139
+ num_class_embeds: int | None = None,
140
+ upcast_attention: bool = False,
141
+ resnet_time_scale_shift: str = "default",
142
+ resnet_skip_time_act: bool = False,
143
+ resnet_out_scale_factor: int = 1,
144
+ time_embedding_type: str = "positional",
145
+ time_embedding_dim: int | None = None,
146
+ time_embedding_act_fn: str | None = None,
147
+ timestep_post_act: str | None = None,
148
+ time_cond_proj_dim: int | None = None,
149
+ conv_in_kernel: int = 3,
150
+ conv_out_kernel: int = 3,
151
+ projection_class_embeddings_input_dim: int | None = None,
152
+ attention_type: str = "default",
153
+ class_embeddings_concat: bool = False,
154
+ mid_block_only_cross_attention: bool | None = None,
155
+ cross_attention_norm: str | None = None,
156
+ addition_embed_type_num_heads=64,
157
+ need_self_attn_block_embs: bool = False,
158
+ need_block_embs: bool = False,
159
+ ):
160
+ super().__init__()
161
+
162
+ self.sample_size = sample_size
163
+
164
+ if num_attention_heads is not None:
165
+ raise ValueError(
166
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
167
+ )
168
+
169
+ # If `num_attention_heads` is not defined (which is the case for most models)
170
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
171
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
172
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
173
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
174
+ # which is why we correct for the naming here.
175
+ num_attention_heads = num_attention_heads or attention_head_dim
176
+
177
+ # Check inputs
178
+ if len(down_block_types) != len(up_block_types):
179
+ raise ValueError(
180
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
181
+ )
182
+
183
+ if len(block_out_channels) != len(down_block_types):
184
+ raise ValueError(
185
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
186
+ )
187
+
188
+ if not isinstance(only_cross_attention, bool) and len(
189
+ only_cross_attention
190
+ ) != len(down_block_types):
191
+ raise ValueError(
192
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
193
+ )
194
+
195
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(
196
+ down_block_types
197
+ ):
198
+ raise ValueError(
199
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
200
+ )
201
+
202
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(
203
+ down_block_types
204
+ ):
205
+ raise ValueError(
206
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
207
+ )
208
+
209
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(
210
+ down_block_types
211
+ ):
212
+ raise ValueError(
213
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
214
+ )
215
+
216
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(
217
+ down_block_types
218
+ ):
219
+ raise ValueError(
220
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
221
+ )
222
+ if (
223
+ isinstance(transformer_layers_per_block, list)
224
+ and reverse_transformer_layers_per_block is None
225
+ ):
226
+ for layer_number_per_block in transformer_layers_per_block:
227
+ if isinstance(layer_number_per_block, list):
228
+ raise ValueError(
229
+ "Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet."
230
+ )
231
+
232
+ # input
233
+ conv_in_padding = (conv_in_kernel - 1) // 2
234
+ self.conv_in = nn.Conv2d(
235
+ in_channels,
236
+ block_out_channels[0],
237
+ kernel_size=conv_in_kernel,
238
+ padding=conv_in_padding,
239
+ )
240
+
241
+ # time
242
+ if time_embedding_type == "fourier":
243
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
244
+ if time_embed_dim % 2 != 0:
245
+ raise ValueError(
246
+ f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}."
247
+ )
248
+ self.time_proj = GaussianFourierProjection(
249
+ time_embed_dim // 2,
250
+ set_W_to_weight=False,
251
+ log=False,
252
+ flip_sin_to_cos=flip_sin_to_cos,
253
+ )
254
+ timestep_input_dim = time_embed_dim
255
+ elif time_embedding_type == "positional":
256
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
257
+
258
+ self.time_proj = Timesteps(
259
+ block_out_channels[0], flip_sin_to_cos, freq_shift
260
+ )
261
+ timestep_input_dim = block_out_channels[0]
262
+ else:
263
+ raise ValueError(
264
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
265
+ )
266
+
267
+ self.time_embedding = TimestepEmbedding(
268
+ timestep_input_dim,
269
+ time_embed_dim,
270
+ act_fn=act_fn,
271
+ post_act_fn=timestep_post_act,
272
+ cond_proj_dim=time_cond_proj_dim,
273
+ )
274
+
275
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
276
+ encoder_hid_dim_type = "text_proj"
277
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
278
+ logger.info(
279
+ "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined."
280
+ )
281
+
282
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
283
+ raise ValueError(
284
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
285
+ )
286
+
287
+ if encoder_hid_dim_type == "text_proj":
288
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
289
+ elif encoder_hid_dim_type == "text_image_proj":
290
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
291
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
292
+ # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)
293
+ self.encoder_hid_proj = TextImageProjection(
294
+ text_embed_dim=encoder_hid_dim,
295
+ image_embed_dim=cross_attention_dim,
296
+ cross_attention_dim=cross_attention_dim,
297
+ )
298
+ elif encoder_hid_dim_type == "image_proj":
299
+ # Kandinsky 2.2
300
+ self.encoder_hid_proj = ImageProjection(
301
+ image_embed_dim=encoder_hid_dim,
302
+ cross_attention_dim=cross_attention_dim,
303
+ )
304
+ elif encoder_hid_dim_type is not None:
305
+ raise ValueError(
306
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
307
+ )
308
+ else:
309
+ self.encoder_hid_proj = None
310
+
311
+ # class embedding
312
+ if class_embed_type is None and num_class_embeds is not None:
313
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
314
+ elif class_embed_type == "timestep":
315
+ self.class_embedding = TimestepEmbedding(
316
+ timestep_input_dim, time_embed_dim, act_fn=act_fn
317
+ )
318
+ elif class_embed_type == "identity":
319
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
320
+ elif class_embed_type == "projection":
321
+ if projection_class_embeddings_input_dim is None:
322
+ raise ValueError(
323
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
324
+ )
325
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
326
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
327
+ # 2. it projects from an arbitrary input dimension.
328
+ #
329
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
330
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
331
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
332
+ self.class_embedding = TimestepEmbedding(
333
+ projection_class_embeddings_input_dim, time_embed_dim
334
+ )
335
+ elif class_embed_type == "simple_projection":
336
+ if projection_class_embeddings_input_dim is None:
337
+ raise ValueError(
338
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
339
+ )
340
+ self.class_embedding = nn.Linear(
341
+ projection_class_embeddings_input_dim, time_embed_dim
342
+ )
343
+ else:
344
+ self.class_embedding = None
345
+
346
+ if addition_embed_type == "text":
347
+ if encoder_hid_dim is not None:
348
+ text_time_embedding_from_dim = encoder_hid_dim
349
+ else:
350
+ text_time_embedding_from_dim = cross_attention_dim
351
+
352
+ self.add_embedding = TextTimeEmbedding(
353
+ text_time_embedding_from_dim,
354
+ time_embed_dim,
355
+ num_heads=addition_embed_type_num_heads,
356
+ )
357
+ elif addition_embed_type == "text_image":
358
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
359
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
360
+ # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
361
+ self.add_embedding = TextImageTimeEmbedding(
362
+ text_embed_dim=cross_attention_dim,
363
+ image_embed_dim=cross_attention_dim,
364
+ time_embed_dim=time_embed_dim,
365
+ )
366
+ elif addition_embed_type == "text_time":
367
+ self.add_time_proj = Timesteps(
368
+ addition_time_embed_dim, flip_sin_to_cos, freq_shift
369
+ )
370
+ self.add_embedding = TimestepEmbedding(
371
+ projection_class_embeddings_input_dim, time_embed_dim
372
+ )
373
+ elif addition_embed_type == "image":
374
+ # Kandinsky 2.2
375
+ self.add_embedding = ImageTimeEmbedding(
376
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
377
+ )
378
+ elif addition_embed_type == "image_hint":
379
+ # Kandinsky 2.2 ControlNet
380
+ self.add_embedding = ImageHintTimeEmbedding(
381
+ image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim
382
+ )
383
+ elif addition_embed_type is not None:
384
+ raise ValueError(
385
+ f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'."
386
+ )
387
+
388
+ if time_embedding_act_fn is None:
389
+ self.time_embed_act = None
390
+ else:
391
+ self.time_embed_act = get_activation(time_embedding_act_fn)
392
+
393
+ self.down_blocks = nn.ModuleList([])
394
+ self.up_blocks = nn.ModuleList([])
395
+
396
+ if isinstance(only_cross_attention, bool):
397
+ if mid_block_only_cross_attention is None:
398
+ mid_block_only_cross_attention = only_cross_attention
399
+
400
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
401
+
402
+ if mid_block_only_cross_attention is None:
403
+ mid_block_only_cross_attention = False
404
+
405
+ if isinstance(num_attention_heads, int):
406
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
407
+
408
+ if isinstance(attention_head_dim, int):
409
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
410
+
411
+ if isinstance(cross_attention_dim, int):
412
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
413
+
414
+ if isinstance(layers_per_block, int):
415
+ layers_per_block = [layers_per_block] * len(down_block_types)
416
+
417
+ if isinstance(transformer_layers_per_block, int):
418
+ transformer_layers_per_block = [transformer_layers_per_block] * len(
419
+ down_block_types
420
+ )
421
+
422
+ if class_embeddings_concat:
423
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
424
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
425
+ # regular time embeddings
426
+ blocks_time_embed_dim = time_embed_dim * 2
427
+ else:
428
+ blocks_time_embed_dim = time_embed_dim
429
+
430
+ # down
431
+ output_channel = block_out_channels[0]
432
+ for i, down_block_type in enumerate(down_block_types):
433
+ input_channel = output_channel
434
+ output_channel = block_out_channels[i]
435
+ is_final_block = i == len(block_out_channels) - 1
436
+
437
+ down_block = get_down_block(
438
+ down_block_type,
439
+ num_layers=layers_per_block[i],
440
+ transformer_layers_per_block=transformer_layers_per_block[i],
441
+ in_channels=input_channel,
442
+ out_channels=output_channel,
443
+ temb_channels=blocks_time_embed_dim,
444
+ add_downsample=not is_final_block,
445
+ resnet_eps=norm_eps,
446
+ resnet_act_fn=act_fn,
447
+ resnet_groups=norm_num_groups,
448
+ cross_attention_dim=cross_attention_dim[i],
449
+ num_attention_heads=num_attention_heads[i],
450
+ downsample_padding=downsample_padding,
451
+ dual_cross_attention=dual_cross_attention,
452
+ use_linear_projection=use_linear_projection,
453
+ only_cross_attention=only_cross_attention[i],
454
+ upcast_attention=upcast_attention,
455
+ resnet_time_scale_shift=resnet_time_scale_shift,
456
+ attention_type=attention_type,
457
+ resnet_skip_time_act=resnet_skip_time_act,
458
+ resnet_out_scale_factor=resnet_out_scale_factor,
459
+ cross_attention_norm=cross_attention_norm,
460
+ attention_head_dim=attention_head_dim[i]
461
+ if attention_head_dim[i] is not None
462
+ else output_channel,
463
+ dropout=dropout,
464
+ )
465
+ self.down_blocks.append(down_block)
466
+
467
+ # mid
468
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
469
+ self.mid_block = UNetMidBlock2DCrossAttn(
470
+ transformer_layers_per_block=transformer_layers_per_block[-1],
471
+ in_channels=block_out_channels[-1],
472
+ temb_channels=blocks_time_embed_dim,
473
+ dropout=dropout,
474
+ resnet_eps=norm_eps,
475
+ resnet_act_fn=act_fn,
476
+ output_scale_factor=mid_block_scale_factor,
477
+ resnet_time_scale_shift=resnet_time_scale_shift,
478
+ cross_attention_dim=cross_attention_dim[-1],
479
+ num_attention_heads=num_attention_heads[-1],
480
+ resnet_groups=norm_num_groups,
481
+ dual_cross_attention=dual_cross_attention,
482
+ use_linear_projection=use_linear_projection,
483
+ upcast_attention=upcast_attention,
484
+ attention_type=attention_type,
485
+ )
486
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
487
+ self.mid_block = UNetMidBlock2DSimpleCrossAttn(
488
+ in_channels=block_out_channels[-1],
489
+ temb_channels=blocks_time_embed_dim,
490
+ dropout=dropout,
491
+ resnet_eps=norm_eps,
492
+ resnet_act_fn=act_fn,
493
+ output_scale_factor=mid_block_scale_factor,
494
+ cross_attention_dim=cross_attention_dim[-1],
495
+ attention_head_dim=attention_head_dim[-1],
496
+ resnet_groups=norm_num_groups,
497
+ resnet_time_scale_shift=resnet_time_scale_shift,
498
+ skip_time_act=resnet_skip_time_act,
499
+ only_cross_attention=mid_block_only_cross_attention,
500
+ cross_attention_norm=cross_attention_norm,
501
+ )
502
+ elif mid_block_type == "UNetMidBlock2D":
503
+ self.mid_block = UNetMidBlock2D(
504
+ in_channels=block_out_channels[-1],
505
+ temb_channels=blocks_time_embed_dim,
506
+ dropout=dropout,
507
+ num_layers=0,
508
+ resnet_eps=norm_eps,
509
+ resnet_act_fn=act_fn,
510
+ output_scale_factor=mid_block_scale_factor,
511
+ resnet_groups=norm_num_groups,
512
+ resnet_time_scale_shift=resnet_time_scale_shift,
513
+ add_attention=False,
514
+ )
515
+ elif mid_block_type is None:
516
+ self.mid_block = None
517
+ else:
518
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
519
+
520
+ # count how many layers upsample the images
521
+ self.num_upsamplers = 0
522
+
523
+ # up
524
+ reversed_block_out_channels = list(reversed(block_out_channels))
525
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
526
+ reversed_layers_per_block = list(reversed(layers_per_block))
527
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
528
+ reversed_transformer_layers_per_block = (
529
+ list(reversed(transformer_layers_per_block))
530
+ if reverse_transformer_layers_per_block is None
531
+ else reverse_transformer_layers_per_block
532
+ )
533
+ only_cross_attention = list(reversed(only_cross_attention))
534
+
535
+ output_channel = reversed_block_out_channels[0]
536
+ for i, up_block_type in enumerate(up_block_types):
537
+ is_final_block = i == len(block_out_channels) - 1
538
+
539
+ prev_output_channel = output_channel
540
+ output_channel = reversed_block_out_channels[i]
541
+ input_channel = reversed_block_out_channels[
542
+ min(i + 1, len(block_out_channels) - 1)
543
+ ]
544
+
545
+ # add upsample block for all BUT final layer
546
+ if not is_final_block:
547
+ add_upsample = True
548
+ self.num_upsamplers += 1
549
+ else:
550
+ add_upsample = False
551
+
552
+ up_block = get_up_block(
553
+ up_block_type,
554
+ num_layers=reversed_layers_per_block[i] + 1,
555
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
556
+ in_channels=input_channel,
557
+ out_channels=output_channel,
558
+ prev_output_channel=prev_output_channel,
559
+ temb_channels=blocks_time_embed_dim,
560
+ add_upsample=add_upsample,
561
+ resnet_eps=norm_eps,
562
+ resnet_act_fn=act_fn,
563
+ resolution_idx=i,
564
+ resnet_groups=norm_num_groups,
565
+ cross_attention_dim=reversed_cross_attention_dim[i],
566
+ num_attention_heads=reversed_num_attention_heads[i],
567
+ dual_cross_attention=dual_cross_attention,
568
+ use_linear_projection=use_linear_projection,
569
+ only_cross_attention=only_cross_attention[i],
570
+ upcast_attention=upcast_attention,
571
+ resnet_time_scale_shift=resnet_time_scale_shift,
572
+ attention_type=attention_type,
573
+ resnet_skip_time_act=resnet_skip_time_act,
574
+ resnet_out_scale_factor=resnet_out_scale_factor,
575
+ cross_attention_norm=cross_attention_norm,
576
+ attention_head_dim=attention_head_dim[i]
577
+ if attention_head_dim[i] is not None
578
+ else output_channel,
579
+ dropout=dropout,
580
+ )
581
+ self.up_blocks.append(up_block)
582
+ prev_output_channel = output_channel
583
+
584
+ # out
585
+ if norm_num_groups is not None:
586
+ self.conv_norm_out = nn.GroupNorm(
587
+ num_channels=block_out_channels[0],
588
+ num_groups=norm_num_groups,
589
+ eps=norm_eps,
590
+ )
591
+
592
+ self.conv_act = get_activation(act_fn)
593
+
594
+ else:
595
+ self.conv_norm_out = None
596
+ self.conv_act = None
597
+
598
+ conv_out_padding = (conv_out_kernel - 1) // 2
599
+ self.conv_out = nn.Conv2d(
600
+ block_out_channels[0],
601
+ out_channels,
602
+ kernel_size=conv_out_kernel,
603
+ padding=conv_out_padding,
604
+ )
605
+
606
+ if attention_type in ["gated", "gated-text-image"]:
607
+ positive_len = 768
608
+ if isinstance(cross_attention_dim, int):
609
+ positive_len = cross_attention_dim
610
+ elif isinstance(cross_attention_dim, tuple) or isinstance(
611
+ cross_attention_dim, list
612
+ ):
613
+ positive_len = cross_attention_dim[0]
614
+
615
+ feature_type = "text-only" if attention_type == "gated" else "text-image"
616
+ self.position_net = PositionNet(
617
+ positive_len=positive_len,
618
+ out_dim=cross_attention_dim,
619
+ feature_type=feature_type,
620
+ )
621
+ self.need_block_embs = need_block_embs
622
+ self.need_self_attn_block_embs = need_self_attn_block_embs
623
+
624
+ # only keep the referencenet layers that are used; set the other layers to None
625
+ self.conv_norm_out = None
626
+ self.conv_act = None
627
+ self.conv_out = None
628
+
629
+ self.up_blocks[-1].attentions[-1].proj_out = None
630
+ self.up_blocks[-1].attentions[-1].transformer_blocks[-1].attn1 = None
631
+ self.up_blocks[-1].attentions[-1].transformer_blocks[-1].attn2 = None
632
+ self.up_blocks[-1].attentions[-1].transformer_blocks[-1].norm2 = None
633
+ self.up_blocks[-1].attentions[-1].transformer_blocks[-1].ff = None
634
+ self.up_blocks[-1].attentions[-1].transformer_blocks[-1].norm3 = None
635
+ if not self.need_self_attn_block_embs:
636
+ self.up_blocks = None
637
+
638
+ self.insert_spatial_self_attn_idx()
639
+
640
+ def forward(
641
+ self,
642
+ sample: torch.FloatTensor,
643
+ timestep: Union[torch.Tensor, float, int],
644
+ encoder_hidden_states: torch.Tensor,
645
+ class_labels: Optional[torch.Tensor] = None,
646
+ timestep_cond: Optional[torch.Tensor] = None,
647
+ attention_mask: Optional[torch.Tensor] = None,
648
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
649
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
650
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
651
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
652
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
653
+ encoder_attention_mask: Optional[torch.Tensor] = None,
654
+ return_dict: bool = True,
655
+ # newly added parameters start
656
+ num_frames: int = None,
657
+ return_ndim: int = 5,
658
+ # newly added parameters end
659
+ ) -> Union[UNet2DConditionOutput, Tuple]:
660
+ r"""
661
+ The [`UNet2DConditionModel`] forward method.
662
+
663
+ Args:
664
+ sample (`torch.FloatTensor`):
665
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
666
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
667
+ encoder_hidden_states (`torch.FloatTensor`):
668
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
669
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
670
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
671
+ timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
672
+ Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
673
+ through the `self.time_embedding` layer to obtain the timestep embeddings.
674
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
675
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
676
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
677
+ negative values to the attention scores corresponding to "discard" tokens.
678
+ cross_attention_kwargs (`dict`, *optional*):
679
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
680
+ `self.processor` in
681
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
682
+ added_cond_kwargs: (`dict`, *optional*):
683
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
684
+ are passed along to the UNet blocks.
685
+ down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
686
+ A tuple of tensors that if specified are added to the residuals of down unet blocks.
687
+ mid_block_additional_residual: (`torch.Tensor`, *optional*):
688
+ A tensor that if specified is added to the residual of the middle unet block.
689
+ encoder_attention_mask (`torch.Tensor`):
690
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
691
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
692
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
693
+ return_dict (`bool`, *optional*, defaults to `True`):
694
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
695
+ tuple.
696
+ cross_attention_kwargs (`dict`, *optional*):
697
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
698
+ added_cond_kwargs: (`dict`, *optional*):
699
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
700
+ are passed along to the UNet blocks.
701
+ down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
702
+ additional residuals to be added to UNet long skip connections from down blocks to up blocks for
703
+ example from ControlNet side model(s)
704
+ mid_block_additional_residual (`torch.Tensor`, *optional*):
705
+ additional residual to be added to UNet mid block output, for example from ControlNet side model
706
+ down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
707
+ additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
708
+
709
+ Returns:
710
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
711
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
712
+ a `tuple` is returned where the first element is the sample tensor.
713
+ """
714
+
715
+ # By default samples have to be AT least a multiple of the overall upsampling factor.
716
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
717
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
718
+ # on the fly if necessary.
719
+ default_overall_up_factor = 2**self.num_upsamplers
720
+
721
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
722
+ forward_upsample_size = False
723
+ upsample_size = None
724
+
725
+ for dim in sample.shape[-2:]:
726
+ if dim % default_overall_up_factor != 0:
727
+ # Forward upsample size to force interpolation output size.
728
+ forward_upsample_size = True
729
+ break
730
+
731
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
732
+ # expects mask of shape:
733
+ # [batch, key_tokens]
734
+ # adds singleton query_tokens dimension:
735
+ # [batch, 1, key_tokens]
736
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
737
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
738
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
739
+ if attention_mask is not None:
740
+ # assume that mask is expressed as:
741
+ # (1 = keep, 0 = discard)
742
+ # convert mask into a bias that can be added to attention scores:
743
+ # (keep = +0, discard = -10000.0)
744
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
745
+ attention_mask = attention_mask.unsqueeze(1)
746
+
747
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
748
+ if encoder_attention_mask is not None:
749
+ encoder_attention_mask = (
750
+ 1 - encoder_attention_mask.to(sample.dtype)
751
+ ) * -10000.0
752
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
753
+
754
+ # 0. center input if necessary
755
+ if self.config.center_input_sample:
756
+ sample = 2 * sample - 1.0
757
+
758
+ # 1. time
759
+ timesteps = timestep
760
+ if not torch.is_tensor(timesteps):
761
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
762
+ # This would be a good case for the `match` statement (Python 3.10+)
763
+ is_mps = sample.device.type == "mps"
764
+ if isinstance(timestep, float):
765
+ dtype = torch.float32 if is_mps else torch.float64
766
+ else:
767
+ dtype = torch.int32 if is_mps else torch.int64
768
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
769
+ elif len(timesteps.shape) == 0:
770
+ timesteps = timesteps[None].to(sample.device)
771
+
772
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
773
+ timesteps = timesteps.expand(sample.shape[0])
774
+
775
+ t_emb = self.time_proj(timesteps)
776
+
777
+ # `Timesteps` does not contain any weights and will always return f32 tensors
778
+ # but time_embedding might actually be running in fp16. so we need to cast here.
779
+ # there might be better ways to encapsulate this.
780
+ t_emb = t_emb.to(dtype=sample.dtype)
781
+
782
+ emb = self.time_embedding(t_emb, timestep_cond)
783
+ aug_emb = None
784
+
785
+ if self.class_embedding is not None:
786
+ if class_labels is None:
787
+ raise ValueError(
788
+ "class_labels should be provided when num_class_embeds > 0"
789
+ )
790
+
791
+ if self.config.class_embed_type == "timestep":
792
+ class_labels = self.time_proj(class_labels)
793
+
794
+ # `Timesteps` does not contain any weights and will always return f32 tensors
795
+ # there might be better ways to encapsulate this.
796
+ class_labels = class_labels.to(dtype=sample.dtype)
797
+
798
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
799
+
800
+ if self.config.class_embeddings_concat:
801
+ emb = torch.cat([emb, class_emb], dim=-1)
802
+ else:
803
+ emb = emb + class_emb
804
+
805
+ if self.config.addition_embed_type == "text":
806
+ aug_emb = self.add_embedding(encoder_hidden_states)
807
+ elif self.config.addition_embed_type == "text_image":
808
+ # Kandinsky 2.1 - style
809
+ if "image_embeds" not in added_cond_kwargs:
810
+ raise ValueError(
811
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
812
+ )
813
+
814
+ image_embs = added_cond_kwargs.get("image_embeds")
815
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
816
+ aug_emb = self.add_embedding(text_embs, image_embs)
817
+ elif self.config.addition_embed_type == "text_time":
818
+ # SDXL - style
819
+ if "text_embeds" not in added_cond_kwargs:
820
+ raise ValueError(
821
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
822
+ )
823
+ text_embeds = added_cond_kwargs.get("text_embeds")
824
+ if "time_ids" not in added_cond_kwargs:
825
+ raise ValueError(
826
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
827
+ )
828
+ time_ids = added_cond_kwargs.get("time_ids")
829
+ time_embeds = self.add_time_proj(time_ids.flatten())
830
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
831
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
832
+ add_embeds = add_embeds.to(emb.dtype)
833
+ aug_emb = self.add_embedding(add_embeds)
834
+ elif self.config.addition_embed_type == "image":
835
+ # Kandinsky 2.2 - style
836
+ if "image_embeds" not in added_cond_kwargs:
837
+ raise ValueError(
838
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
839
+ )
840
+ image_embs = added_cond_kwargs.get("image_embeds")
841
+ aug_emb = self.add_embedding(image_embs)
842
+ elif self.config.addition_embed_type == "image_hint":
843
+ # Kandinsky 2.2 - style
844
+ if (
845
+ "image_embeds" not in added_cond_kwargs
846
+ or "hint" not in added_cond_kwargs
847
+ ):
848
+ raise ValueError(
849
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
850
+ )
851
+ image_embs = added_cond_kwargs.get("image_embeds")
852
+ hint = added_cond_kwargs.get("hint")
853
+ aug_emb, hint = self.add_embedding(image_embs, hint)
854
+ sample = torch.cat([sample, hint], dim=1)
855
+
856
+ emb = emb + aug_emb if aug_emb is not None else emb
857
+
858
+ if self.time_embed_act is not None:
859
+ emb = self.time_embed_act(emb)
860
+
861
+ if (
862
+ self.encoder_hid_proj is not None
863
+ and self.config.encoder_hid_dim_type == "text_proj"
864
+ ):
865
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
866
+ elif (
867
+ self.encoder_hid_proj is not None
868
+ and self.config.encoder_hid_dim_type == "text_image_proj"
869
+ ):
870
+ # Kandinsky 2.1 - style
871
+ if "image_embeds" not in added_cond_kwargs:
872
+ raise ValueError(
873
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
874
+ )
875
+
876
+ image_embeds = added_cond_kwargs.get("image_embeds")
877
+ encoder_hidden_states = self.encoder_hid_proj(
878
+ encoder_hidden_states, image_embeds
879
+ )
880
+ elif (
881
+ self.encoder_hid_proj is not None
882
+ and self.config.encoder_hid_dim_type == "image_proj"
883
+ ):
884
+ # Kandinsky 2.2 - style
885
+ if "image_embeds" not in added_cond_kwargs:
886
+ raise ValueError(
887
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
888
+ )
889
+ image_embeds = added_cond_kwargs.get("image_embeds")
890
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
891
+ elif (
892
+ self.encoder_hid_proj is not None
893
+ and self.config.encoder_hid_dim_type == "ip_image_proj"
894
+ ):
895
+ if "image_embeds" not in added_cond_kwargs:
896
+ raise ValueError(
897
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
898
+ )
899
+ image_embeds = added_cond_kwargs.get("image_embeds")
900
+ image_embeds = self.encoder_hid_proj(image_embeds).to(
901
+ encoder_hidden_states.dtype
902
+ )
903
+ encoder_hidden_states = torch.cat(
904
+ [encoder_hidden_states, image_embeds], dim=1
905
+ )
906
+
907
+ # need_self_attn_block_embs
908
+ # initialize; during the unet forward pass, self_attn_block_embs keep getting appended and must be cleared after use
910
+ if self.need_self_attn_block_embs:
911
+ self_attn_block_embs = [None] * self.self_attn_num
912
+ else:
913
+ self_attn_block_embs = None
914
+ # 2. pre-process
915
+ sample = self.conv_in(sample)
916
+ if self.print_idx == 0:
917
+ logger.debug(f"after conv in sample={sample.mean()}")
918
+ # 2.5 GLIGEN position net
919
+ if (
920
+ cross_attention_kwargs is not None
921
+ and cross_attention_kwargs.get("gligen", None) is not None
922
+ ):
923
+ cross_attention_kwargs = cross_attention_kwargs.copy()
924
+ gligen_args = cross_attention_kwargs.pop("gligen")
925
+ cross_attention_kwargs["gligen"] = {
926
+ "objs": self.position_net(**gligen_args)
927
+ }
928
+
929
+ # 3. down
930
+ lora_scale = (
931
+ cross_attention_kwargs.get("scale", 1.0)
932
+ if cross_attention_kwargs is not None
933
+ else 1.0
934
+ )
935
+ if USE_PEFT_BACKEND:
936
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
937
+ scale_lora_layers(self, lora_scale)
938
+
939
+ is_controlnet = (
940
+ mid_block_additional_residual is not None
941
+ and down_block_additional_residuals is not None
942
+ )
943
+ # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
944
+ is_adapter = down_intrablock_additional_residuals is not None
945
+ # maintain backward compatibility for legacy usage, where
946
+ # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
947
+ # but can only use one or the other
948
+ if (
949
+ not is_adapter
950
+ and mid_block_additional_residual is None
951
+ and down_block_additional_residuals is not None
952
+ ):
953
+ deprecate(
954
+ "T2I should not use down_block_additional_residuals",
955
+ "1.3.0",
956
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
957
+ and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
958
+ for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
959
+ standard_warn=False,
960
+ )
961
+ down_intrablock_additional_residuals = down_block_additional_residuals
962
+ is_adapter = True
963
+
964
+ down_block_res_samples = (sample,)
965
+ for i_downsample_block, downsample_block in enumerate(self.down_blocks):
966
+ if (
967
+ hasattr(downsample_block, "has_cross_attention")
968
+ and downsample_block.has_cross_attention
969
+ ):
970
+ # For t2i-adapter CrossAttnDownBlock2D
971
+ additional_residuals = {}
972
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
973
+ additional_residuals[
974
+ "additional_residuals"
975
+ ] = down_intrablock_additional_residuals.pop(0)
976
+ if self.print_idx == 0:
977
+ logger.debug(
978
+ f"downsample_block {i_downsample_block} sample={sample.mean()}"
979
+ )
980
+ sample, res_samples = downsample_block(
981
+ hidden_states=sample,
982
+ temb=emb,
983
+ encoder_hidden_states=encoder_hidden_states,
984
+ attention_mask=attention_mask,
985
+ cross_attention_kwargs=cross_attention_kwargs,
986
+ encoder_attention_mask=encoder_attention_mask,
987
+ **additional_residuals,
988
+ self_attn_block_embs=self_attn_block_embs,
989
+ )
990
+ else:
991
+ sample, res_samples = downsample_block(
992
+ hidden_states=sample,
993
+ temb=emb,
994
+ scale=lora_scale,
995
+ self_attn_block_embs=self_attn_block_embs,
996
+ )
997
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
998
+ sample += down_intrablock_additional_residuals.pop(0)
999
+
1000
+ down_block_res_samples += res_samples
1001
+
1002
+ if is_controlnet:
1003
+ new_down_block_res_samples = ()
1004
+
1005
+ for down_block_res_sample, down_block_additional_residual in zip(
1006
+ down_block_res_samples, down_block_additional_residuals
1007
+ ):
1008
+ down_block_res_sample = (
1009
+ down_block_res_sample + down_block_additional_residual
1010
+ )
1011
+ new_down_block_res_samples = new_down_block_res_samples + (
1012
+ down_block_res_sample,
1013
+ )
1014
+
1015
+ down_block_res_samples = new_down_block_res_samples
1016
+
1017
+ # update code start
1018
+ def reshape_return_emb(tmp_emb):
1019
+ if return_ndim == 4:
1020
+ return tmp_emb
1021
+ elif return_ndim == 5:
1022
+ return rearrange(tmp_emb, "(b t) c h w-> b c t h w", t=num_frames)
1023
+ else:
1024
+ raise ValueError(
1025
+ f"reshape_emb only support 4, 5 but given {return_ndim}"
1026
+ )
1027
+
1028
+ if self.need_block_embs:
1029
+ return_down_block_res_samples = [
1030
+ reshape_return_emb(tmp_emb) for tmp_emb in down_block_res_samples
1031
+ ]
1032
+ else:
1033
+ return_down_block_res_samples = None
1034
+ # update code end
1035
+
1036
+ # 4. mid
1037
+ if self.mid_block is not None:
1038
+ if (
1039
+ hasattr(self.mid_block, "has_cross_attention")
1040
+ and self.mid_block.has_cross_attention
1041
+ ):
1042
+ sample = self.mid_block(
1043
+ sample,
1044
+ emb,
1045
+ encoder_hidden_states=encoder_hidden_states,
1046
+ attention_mask=attention_mask,
1047
+ cross_attention_kwargs=cross_attention_kwargs,
1048
+ encoder_attention_mask=encoder_attention_mask,
1049
+ self_attn_block_embs=self_attn_block_embs,
1050
+ )
1051
+ else:
1052
+ sample = self.mid_block(sample, emb)
1053
+
1054
+ # To support T2I-Adapter-XL
1055
+ if (
1056
+ is_adapter
1057
+ and len(down_intrablock_additional_residuals) > 0
1058
+ and sample.shape == down_intrablock_additional_residuals[0].shape
1059
+ ):
1060
+ sample += down_intrablock_additional_residuals.pop(0)
1061
+
1062
+ if is_controlnet:
1063
+ sample = sample + mid_block_additional_residual
1064
+
1065
+ if self.need_block_embs:
1066
+ return_mid_block_res_samples = reshape_return_emb(sample)
1067
+ logger.debug(
1068
+ f"return_mid_block_res_samples, is_leaf={return_mid_block_res_samples.is_leaf}, requires_grad={return_mid_block_res_samples.requires_grad}"
1069
+ )
1070
+ else:
1071
+ return_mid_block_res_samples = None
1072
+
1073
+ if self.up_blocks is not None:
1074
+ # update code end
1075
+
1076
+ # 5. up
1077
+ for i, upsample_block in enumerate(self.up_blocks):
1078
+ is_final_block = i == len(self.up_blocks) - 1
1079
+
1080
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1081
+ down_block_res_samples = down_block_res_samples[
1082
+ : -len(upsample_block.resnets)
1083
+ ]
1084
+
1085
+ # if we have not reached the final block and need to forward the
1086
+ # upsample size, we do it here
1087
+ if not is_final_block and forward_upsample_size:
1088
+ upsample_size = down_block_res_samples[-1].shape[2:]
1089
+
1090
+ if (
1091
+ hasattr(upsample_block, "has_cross_attention")
1092
+ and upsample_block.has_cross_attention
1093
+ ):
1094
+ sample = upsample_block(
1095
+ hidden_states=sample,
1096
+ temb=emb,
1097
+ res_hidden_states_tuple=res_samples,
1098
+ encoder_hidden_states=encoder_hidden_states,
1099
+ cross_attention_kwargs=cross_attention_kwargs,
1100
+ upsample_size=upsample_size,
1101
+ attention_mask=attention_mask,
1102
+ encoder_attention_mask=encoder_attention_mask,
1103
+ self_attn_block_embs=self_attn_block_embs,
1104
+ )
1105
+ else:
1106
+ sample = upsample_block(
1107
+ hidden_states=sample,
1108
+ temb=emb,
1109
+ res_hidden_states_tuple=res_samples,
1110
+ upsample_size=upsample_size,
1111
+ scale=lora_scale,
1112
+ self_attn_block_embs=self_attn_block_embs,
1113
+ )
1114
+
1115
+ # update code start
1116
+ if self.need_block_embs or self.need_self_attn_block_embs:
1117
+ if self_attn_block_embs is not None:
1118
+ self_attn_block_embs = [
1119
+ reshape_return_emb(tmp_emb=tmp_emb)
1120
+ for tmp_emb in self_attn_block_embs
1121
+ ]
1122
+ self.print_idx += 1
1123
+ return (
1124
+ return_down_block_res_samples,
1125
+ return_mid_block_res_samples,
1126
+ self_attn_block_embs,
1127
+ )
1128
+
1129
+ if not self.need_block_embs and not self.need_self_attn_block_embs:
1130
+ # 6. post-process
1131
+ if self.conv_norm_out:
1132
+ sample = self.conv_norm_out(sample)
1133
+ sample = self.conv_act(sample)
1134
+ sample = self.conv_out(sample)
1135
+
1136
+ if USE_PEFT_BACKEND:
1137
+ # remove `lora_scale` from each PEFT layer
1138
+ unscale_lora_layers(self, lora_scale)
1139
+ self.print_idx += 1
1140
+ if not return_dict:
1141
+ return (sample,)
1142
+
1143
+ return UNet2DConditionOutput(sample=sample)
1144
+
1145
+ def insert_spatial_self_attn_idx(self):
1146
+ attns, basic_transformers = self.spatial_self_attns
1147
+ self.self_attn_num = len(attns)
1148
+ for i, (name, layer) in enumerate(attns):
1149
+ logger.debug(f"{self.__class__.__name__}, {i}, {name}, {type(layer)}")
1150
+ if layer is not None:
1151
+ layer.spatial_self_attn_idx = i
1152
+ for i, (name, layer) in enumerate(basic_transformers):
1153
+ logger.debug(f"{self.__class__.__name__}, {i}, {name}, {type(layer)}")
1154
+ if layer is not None:
1155
+ layer.spatial_self_attn_idx = i
1156
+
1157
+ @property
1158
+ def spatial_self_attns(
1159
+ self,
1160
+ ) -> List[Tuple[str, Attention]]:
1161
+ attns, spatial_transformers = self.get_self_attns(
1162
+ include="attentions", exclude="temp_attentions"
1163
+ )
1164
+ attns = sorted(attns)
1165
+ spatial_transformers = sorted(spatial_transformers)
1166
+ return attns, spatial_transformers
1167
+
1168
+ def get_self_attns(
1169
+ self, include: str = None, exclude: str = None
1170
+ ) -> List[Tuple[str, Attention]]:
1171
+ r"""
1172
+ Returns:
1173
+ a tuple of two lists: (name, Attention) pairs for the spatial self-attention layers, and
1174
+ (name, BasicTransformerBlock) pairs for the transformer blocks that contain them, indexed by module name.
1175
+ """
1176
+ # set recursively
1177
+ attns = []
1178
+ spatial_transformers = []
1179
+
1180
+ def fn_recursive_add_attns(
1181
+ name: str,
1182
+ module: torch.nn.Module,
1183
+ attns: List[Tuple[str, Attention]],
1184
+ spatial_transformers: List[Tuple[str, BasicTransformerBlock]],
1185
+ ):
1186
+ is_target = False
1187
+ if isinstance(module, BasicTransformerBlock) and hasattr(module, "attn1"):
1188
+ is_target = True
1189
+ if include is not None:
1190
+ is_target = include in name
1191
+ if exclude is not None:
1192
+ is_target = exclude not in name
1193
+ if is_target:
1194
+ attns.append([f"{name}.attn1", module.attn1])
1195
+ spatial_transformers.append([f"{name}", module])
1196
+ for sub_name, child in module.named_children():
1197
+ fn_recursive_add_attns(
1198
+ f"{name}.{sub_name}", child, attns, spatial_transformers
1199
+ )
1200
+
1201
+ return attns
1202
+
1203
+ for name, module in self.named_children():
1204
+ fn_recursive_add_attns(name, module, attns, spatial_transformers)
1205
+
1206
+ return attns, spatial_transformers
1207
+
1208
+
1209
+ class ReferenceNet3D(UNet3DConditionModel):
1210
+ """继承 UNet3DConditionModel, 用于提取中间emb用于后续作用。
1211
+ Inherit Unet3DConditionModel, used to extract the middle emb for subsequent actions.
1212
+ Args:
1213
+ UNet3DConditionModel (_type_): _description_
1214
+ """
1215
+
1216
+ pass
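
A minimal standalone sketch of the attention-mask handling used in the forward pass above (plain PyTorch, illustrative sizes; not code from this repository): masks given as 1 = keep / 0 = discard are turned into additive biases with a singleton query-token axis before they reach the attention layers.

    import torch

    # mask convention used by the forward pass: 1 = keep, 0 = discard
    attention_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.float32)  # (batch, key_tokens)

    # keep -> 0.0, discard -> -10000.0, then add a singleton query_tokens dimension
    bias = (1 - attention_mask) * -10000.0
    bias = bias.unsqueeze(1)  # (batch, 1, key_tokens), broadcastable over attention scores

    print(bias)  # tensor([[[-0., -0., -0., -10000.]]])
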
musev/models/referencenet_loader.py ADDED
@@ -0,0 +1,124 @@
1
+ import copy
2
+ from typing import Any, Callable, Dict, Iterable, Union
3
+ import PIL
4
+ import cv2
5
+ import torch
6
+ import argparse
7
+ import datetime
8
+ import logging
9
+ import inspect
10
+ import math
11
+ import os
12
+ import shutil
13
+ from typing import Dict, List, Optional, Tuple
14
+ from pprint import pprint
15
+ from collections import OrderedDict
16
+ from dataclasses import dataclass
17
+ import gc
18
+ import time
19
+
20
+ import numpy as np
21
+ from omegaconf import OmegaConf
22
+ from omegaconf import SCMode
23
+ import torch
24
+ from torch import nn
25
+ import torch.nn.functional as F
26
+ import torch.utils.checkpoint
27
+ from einops import rearrange, repeat
28
+ import pandas as pd
29
+ import h5py
30
+ from diffusers.models.modeling_utils import load_state_dict
31
+ from diffusers.utils import (
32
+ logging,
33
+ )
34
+ from diffusers.utils.import_utils import is_xformers_available
35
+
36
+ from .referencenet import ReferenceNet2D
37
+ from .unet_loader import update_unet_with_sd
38
+
39
+
40
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
41
+
42
+
43
+ def load_referencenet(
44
+ sd_referencenet_model: Union[str, nn.Module],
45
+ sd_model: nn.Module = None,
46
+ need_self_attn_block_embs: bool = False,
47
+ need_block_embs: bool = False,
48
+ dtype: torch.dtype = torch.float16,
49
+ cross_attention_dim: int = 768,
50
+ subfolder: str = "unet",
51
+ ):
52
+ """
53
+ Loads the ReferenceNet model.
54
+
55
+ Args:
56
+ sd_referencenet_model (Union[str, nn.Module]): The pretrained ReferenceNet model or the path to the model.
57
+ sd_model (nn.Module, optional): The sd_model to update the ReferenceNet with. Defaults to None.
58
+ need_self_attn_block_embs (bool, optional): Whether to compute self-attention block embeddings. Defaults to False.
59
+ need_block_embs (bool, optional): Whether to compute block embeddings. Defaults to False.
60
+ dtype (torch.dtype, optional): The data type of the tensors. Defaults to torch.float16.
61
+ cross_attention_dim (int, optional): The dimension of the cross-attention. Defaults to 768.
62
+ subfolder (str, optional): The subfolder of the model. Defaults to "unet".
63
+
64
+ Returns:
65
+ nn.Module: The loaded ReferenceNet model.
66
+ """
67
+
68
+ if isinstance(sd_referencenet_model, str):
69
+ referencenet = ReferenceNet2D.from_pretrained(
70
+ sd_referencenet_model,
71
+ subfolder=subfolder,
72
+ need_self_attn_block_embs=need_self_attn_block_embs,
73
+ need_block_embs=need_block_embs,
74
+ torch_dtype=dtype,
75
+ cross_attention_dim=cross_attention_dim,
76
+ )
77
+ elif isinstance(sd_referencenet_model, nn.Module):
78
+ referencenet = sd_referencenet_model
79
+ if sd_model is not None:
80
+ referencenet = update_unet_with_sd(referencenet, sd_model)
81
+ return referencenet
82
+
83
+
84
+ def load_referencenet_by_name(
85
+ model_name: str,
86
+ sd_referencenet_model: Union[str, nn.Module],
87
+ sd_model: nn.Module = None,
88
+ cross_attention_dim: int = 768,
89
+ dtype: torch.dtype = torch.float16,
90
+ ) -> nn.Module:
91
+ """通过模型名字 初始化 referencenet,载入预训练参数,
92
+ 如希望后续通过简单名字就可以使用预训练模型,需要在这里完成定义
93
+ init referencenet with model_name.
94
+ if you want to use pretrained model with simple name, you need to define it here.
95
+ Args:
96
+ model_name (str): _description_
97
+ sd_unet_model (Tuple[str, nn.Module]): _description_
98
+ sd_model (Tuple[str, nn.Module]): _description_
99
+ cross_attention_dim (int, optional): _description_. Defaults to 768.
100
+ dtype (torch.dtype, optional): _description_. Defaults to torch.float16.
101
+
102
+ Raises:
103
+ ValueError: _description_
104
+
105
+ Returns:
106
+ nn.Module: _description_
107
+ """
108
+ if model_name in [
109
+ "musev_referencenet",
110
+ ]:
111
+ unet = load_referencenet(
112
+ sd_referencenet_model=sd_referencenet_model,
113
+ sd_model=sd_model,
114
+ cross_attention_dim=cross_attention_dim,
115
+ dtype=dtype,
116
+ need_self_attn_block_embs=False,
117
+ need_block_embs=True,
118
+ subfolder="referencenet",
119
+ )
120
+ else:
121
+ raise ValueError(
122
+ f"unsupport model_name={model_name}, only support ReferenceNet_V0_block13, ReferenceNet_V1_block13, ReferenceNet_V2_block13, ReferenceNet_V0_sefattn16"
123
+ )
124
+ return unet
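
A minimal usage sketch for the loader above; the checkpoint directory is hypothetical and must contain a `referencenet` subfolder with the MuseV ReferenceNet weights:

    import torch
    from musev.models.referencenet_loader import load_referencenet_by_name

    referencenet = load_referencenet_by_name(
        model_name="musev_referencenet",
        sd_referencenet_model="./checkpoints/musev_referencenet",  # hypothetical local path
        cross_attention_dim=768,
        dtype=torch.float16,
    )
    referencenet = referencenet.to("cuda").eval()  # fp16 weights, so move to GPU before use
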
musev/models/resnet.py ADDED
@@ -0,0 +1,135 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ # `TemporalConvLayer` Copyright 2023 Alibaba DAMO-VILAB, The ModelScope Team and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Adapted from https://github.com/huggingface/diffusers/blob/v0.16.1/src/diffusers/models/resnet.py
17
+ from __future__ import annotations
18
+
19
+ from functools import partial
20
+ from typing import Optional
21
+
22
+ import torch
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+ from einops import rearrange, repeat
26
+
27
+ from diffusers.models.resnet import TemporalConvLayer as DiffusersTemporalConvLayer
28
+ from ..data.data_util import batch_index_fill, batch_index_select
29
+ from . import Model_Register
30
+
31
+
32
+ @Model_Register.register
33
+ class TemporalConvLayer(nn.Module):
34
+ """
35
+ Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
36
+ https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ in_dim,
42
+ out_dim=None,
43
+ dropout=0.0,
44
+ keep_content_condition: bool = False,
45
+ femb_channels: Optional[int] = None,
46
+ need_temporal_weight: bool = True,
47
+ ):
48
+ super().__init__()
49
+ out_dim = out_dim or in_dim
50
+ self.in_dim = in_dim
51
+ self.out_dim = out_dim
52
+ self.keep_content_condition = keep_content_condition
53
+ self.femb_channels = femb_channels
54
+ self.need_temporal_weight = need_temporal_weight
55
+ # conv layers
56
+ self.conv1 = nn.Sequential(
57
+ nn.GroupNorm(32, in_dim),
58
+ nn.SiLU(),
59
+ nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0)),
60
+ )
61
+ self.conv2 = nn.Sequential(
62
+ nn.GroupNorm(32, out_dim),
63
+ nn.SiLU(),
64
+ nn.Dropout(dropout),
65
+ nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
66
+ )
67
+ self.conv3 = nn.Sequential(
68
+ nn.GroupNorm(32, out_dim),
69
+ nn.SiLU(),
70
+ nn.Dropout(dropout),
71
+ nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
72
+ )
73
+ self.conv4 = nn.Sequential(
74
+ nn.GroupNorm(32, out_dim),
75
+ nn.SiLU(),
76
+ nn.Dropout(dropout),
77
+ nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)),
78
+ )
79
+
80
+ # zero out the last layer params,so the conv block is identity
81
+ # nn.init.zeros_(self.conv4[-1].weight)
82
+ # nn.init.zeros_(self.conv4[-1].bias)
83
+ self.temporal_weight = nn.Parameter(
84
+ torch.tensor(
85
+ [
86
+ 1e-5,
87
+ ]
88
+ )
89
+ ) # initialize parameter with 0
90
+ # zero out the last layer params,so the conv block is identity
91
+ nn.init.zeros_(self.conv4[-1].weight)
92
+ nn.init.zeros_(self.conv4[-1].bias)
93
+ self.skip_temporal_layers = False # Whether to skip temporal layer
94
+
95
+ def forward(
96
+ self,
97
+ hidden_states,
98
+ num_frames=1,
99
+ sample_index: torch.LongTensor = None,
100
+ vision_conditon_frames_sample_index: torch.LongTensor = None,
101
+ femb: torch.Tensor = None,
102
+ ):
103
+ if self.skip_temporal_layers is True:
104
+ return hidden_states
105
+ hidden_states_dtype = hidden_states.dtype
106
+ hidden_states = rearrange(
107
+ hidden_states, "(b t) c h w -> b c t h w", t=num_frames
108
+ )
109
+ identity = hidden_states
110
+ hidden_states = self.conv1(hidden_states)
111
+ hidden_states = self.conv2(hidden_states)
112
+ hidden_states = self.conv3(hidden_states)
113
+ hidden_states = self.conv4(hidden_states)
114
+ # keep the frames that correspond to the condition, so preceding content frames are preserved and consistency improves
115
+ if self.keep_content_condition:
116
+ mask = torch.ones_like(hidden_states, device=hidden_states.device)
117
+ mask = batch_index_fill(
118
+ mask, dim=2, index=vision_conditon_frames_sample_index, value=0
119
+ )
120
+ if self.need_temporal_weight:
121
+ hidden_states = (
122
+ identity + torch.abs(self.temporal_weight) * mask * hidden_states
123
+ )
124
+ else:
125
+ hidden_states = identity + mask * hidden_states
126
+ else:
127
+ if self.need_temporal_weight:
128
+ hidden_states = (
129
+ identity + torch.abs(self.temporal_weight) * hidden_states
130
+ )
131
+ else:
132
+ hidden_states = identity + hidden_states
133
+ hidden_states = rearrange(hidden_states, " b c t h w -> (b t) c h w")
134
+ hidden_states = hidden_states.to(dtype=hidden_states_dtype)
135
+ return hidden_states
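
TemporalConvLayer above works on features laid out as `(b t) c h w` and temporarily folds the frame axis out of the batch dimension so the Conv3d stack can mix information across time. A standalone sketch of just that reshaping (illustrative sizes, assuming torch and einops are installed):

    import torch
    from einops import rearrange

    b, t, c, h, w = 2, 8, 4, 16, 16
    x = torch.randn(b * t, c, h, w)                      # (b t) c h w, the per-frame layout the 2D UNet uses
    x5 = rearrange(x, "(b t) c h w -> b c t h w", t=t)   # expose the temporal axis for Conv3d
    assert x5.shape == (b, c, t, h, w)
    x4 = rearrange(x5, "b c t h w -> (b t) c h w")       # fold frames back into the batch dimension
    assert torch.equal(x4, x)
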
musev/models/super_model.py ADDED
@@ -0,0 +1,253 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from typing import Any, Dict, Tuple, Union, Optional
6
+ from einops import rearrange, repeat
7
+ from torch import nn
8
+ import torch
9
+
10
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
11
+ from diffusers.models.modeling_utils import ModelMixin, load_state_dict
12
+
13
+ from ..data.data_util import align_repeat_tensor_single_dim
14
+
15
+ from .unet_3d_condition import UNet3DConditionModel
16
+ from .referencenet import ReferenceNet2D
17
+ from ip_adapter.ip_adapter import ImageProjModel
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class SuperUNet3DConditionModel(nn.Module):
23
+ """封装了各种子模型的超模型,与 diffusers 的 pipeline 很像,只不过这里是模型定义。
24
+ 主要作用
25
+ 1. 将支持controlnet、referencenet等功能的计算封装起来,简洁些;
26
+ 2. 便于 accelerator 的分布式训练;
27
+
28
+ wrap the sub-models, such as unet, referencenet, controlnet, vae, text_encoder, tokenizer, text_emb_extractor, clip_vision_extractor, ip_adapter_image_proj
29
+ 1. support controlnet, referencenet, etc.
30
+ 2. support accelerator distributed training
31
+ """
32
+
33
+ _supports_gradient_checkpointing = True
34
+ print_idx = 0
35
+
36
+ # @register_to_config
37
+ def __init__(
38
+ self,
39
+ unet: nn.Module,
40
+ referencenet: nn.Module = None,
41
+ controlnet: nn.Module = None,
42
+ vae: nn.Module = None,
43
+ text_encoder: nn.Module = None,
44
+ tokenizer: nn.Module = None,
45
+ text_emb_extractor: nn.Module = None,
46
+ clip_vision_extractor: nn.Module = None,
47
+ ip_adapter_image_proj: nn.Module = None,
48
+ ) -> None:
49
+ """_summary_
50
+
51
+ Args:
52
+ unet (nn.Module): _description_
53
+ referencenet (nn.Module, optional): _description_. Defaults to None.
54
+ controlnet (nn.Module, optional): _description_. Defaults to None.
55
+ vae (nn.Module, optional): _description_. Defaults to None.
56
+ text_encoder (nn.Module, optional): _description_. Defaults to None.
57
+ tokenizer (nn.Module, optional): _description_. Defaults to None.
58
+ text_emb_extractor (nn.Module, optional): wrap text_encoder and tokenizer for str2emb. Defaults to None.
59
+ clip_vision_extractor (nn.Module, optional): _description_. Defaults to None.
60
+ """
61
+ super().__init__()
62
+ self.unet = unet
63
+ self.referencenet = referencenet
64
+ self.controlnet = controlnet
65
+ self.vae = vae
66
+ self.text_encoder = text_encoder
67
+ self.tokenizer = tokenizer
68
+ self.text_emb_extractor = text_emb_extractor
69
+ self.clip_vision_extractor = clip_vision_extractor
70
+ self.ip_adapter_image_proj = ip_adapter_image_proj
71
+
72
+ def forward(
73
+ self,
74
+ unet_params: Dict,
75
+ encoder_hidden_states: torch.Tensor,
76
+ referencenet_params: Dict = None,
77
+ controlnet_params: Dict = None,
78
+ controlnet_scale: float = 1.0,
79
+ vision_clip_emb: Union[torch.Tensor, None] = None,
80
+ prompt_only_use_image_prompt: bool = False,
81
+ ):
82
+ """_summary_
83
+
84
+ Args:
85
+ unet_params (Dict): _description_
86
+ encoder_hidden_states (torch.Tensor): b t n d
87
+ referencenet_params (Dict, optional): _description_. Defaults to None.
88
+ controlnet_params (Dict, optional): _description_. Defaults to None.
89
+ controlnet_scale (float, optional): _description_. Defaults to 1.0.
90
+ vision_clip_emb (Union[torch.Tensor, None], optional): b t d. Defaults to None.
91
+ prompt_only_use_image_prompt (bool, optional): _description_. Defaults to False.
92
+
93
+ Returns:
94
+ _type_: _description_
95
+ """
96
+ batch_size = unet_params["sample"].shape[0]
97
+ time_size = unet_params["sample"].shape[2]
98
+
99
+ # ip_adapter_cross_attn, prepare image prompt
100
+ if vision_clip_emb is not None:
101
+ # b t n d -> b t n d
102
+ if self.print_idx == 0:
103
+ logger.debug(
104
+ f"vision_clip_emb, before ip_adapter_image_proj, shape={vision_clip_emb.shape} mean={torch.mean(vision_clip_emb)}"
105
+ )
106
+ if vision_clip_emb.ndim == 3:
107
+ vision_clip_emb = rearrange(vision_clip_emb, "b t d-> b t 1 d")
108
+ if self.ip_adapter_image_proj is not None:
109
+ vision_clip_emb = rearrange(vision_clip_emb, "b t n d ->(b t) n d")
110
+ vision_clip_emb = self.ip_adapter_image_proj(vision_clip_emb)
111
+ if self.print_idx == 0:
112
+ logger.debug(
113
+ f"vision_clip_emb, after ip_adapter_image_proj shape={vision_clip_emb.shape} mean={torch.mean(vision_clip_emb)}"
114
+ )
115
+ if vision_clip_emb.ndim == 2:
116
+ vision_clip_emb = rearrange(vision_clip_emb, "b d-> b 1 d")
117
+ vision_clip_emb = rearrange(
118
+ vision_clip_emb, "(b t) n d -> b t n d", b=batch_size
119
+ )
120
+ vision_clip_emb = align_repeat_tensor_single_dim(
121
+ vision_clip_emb, target_length=time_size, dim=1
122
+ )
123
+ if self.print_idx == 0:
124
+ logger.debug(
125
+ f"vision_clip_emb, after reshape shape={vision_clip_emb.shape} mean={torch.mean(vision_clip_emb)}"
126
+ )
127
+
128
+ if vision_clip_emb is None and encoder_hidden_states is not None:
129
+ vision_clip_emb = encoder_hidden_states
130
+ if vision_clip_emb is not None and encoder_hidden_states is None:
131
+ encoder_hidden_states = vision_clip_emb
132
+ # 当 prompt_only_use_image_prompt 为True时,
133
+ # 1. referencenet 都使用 vision_clip_emb
134
+ # 2. unet 如果没有dual_cross_attn,使用vision_clip_emb,有时不更新
135
+ # 3. controlnet 当前使用 text_prompt
136
+
137
+ # when prompt_only_use_image_prompt True,
138
+ # 1. referencenet use vision_clip_emb
139
+ # 2. unet use vision_clip_emb if no dual_cross_attn, sometimes not update
140
+ # 3. controlnet use text_prompt
141
+
142
+ # extract referencenet emb
143
+ if self.referencenet is not None and referencenet_params is not None:
144
+ referencenet_encoder_hidden_states = align_repeat_tensor_single_dim(
145
+ vision_clip_emb,
146
+ target_length=referencenet_params["num_frames"],
147
+ dim=1,
148
+ )
149
+ referencenet_params["encoder_hidden_states"] = rearrange(
150
+ referencenet_encoder_hidden_states, "b t n d->(b t) n d"
151
+ )
152
+ referencenet_out = self.referencenet(**referencenet_params)
153
+ (
154
+ down_block_refer_embs,
155
+ mid_block_refer_emb,
156
+ refer_self_attn_emb,
157
+ ) = referencenet_out
158
+ if down_block_refer_embs is not None:
159
+ if self.print_idx == 0:
160
+ logger.debug(
161
+ f"len(down_block_refer_embs)={len(down_block_refer_embs)}"
162
+ )
163
+ for i, down_emb in enumerate(down_block_refer_embs):
164
+ if self.print_idx == 0:
165
+ logger.debug(
166
+ f"down_emb, {i}, {down_emb.shape}, mean={down_emb.mean()}"
167
+ )
168
+ else:
169
+ if self.print_idx == 0:
170
+ logger.debug(f"down_block_refer_embs is None")
171
+ if mid_block_refer_emb is not None:
172
+ if self.print_idx == 0:
173
+ logger.debug(
174
+ f"mid_block_refer_emb, {mid_block_refer_emb.shape}, mean={mid_block_refer_emb.mean()}"
175
+ )
176
+ else:
177
+ if self.print_idx == 0:
178
+ logger.debug(f"mid_block_refer_emb is None")
179
+ if refer_self_attn_emb is not None:
180
+ if self.print_idx == 0:
181
+ logger.debug(f"refer_self_attn_emb, num={len(refer_self_attn_emb)}")
182
+ for i, self_attn_emb in enumerate(refer_self_attn_emb):
183
+ if self.print_idx == 0:
184
+ logger.debug(
185
+ f"referencenet, self_attn_emb, {i}th, shape={self_attn_emb.shape}, mean={self_attn_emb.mean()}"
186
+ )
187
+ else:
188
+ if self.print_idx == 0:
189
+ logger.debug(f"refer_self_attn_emb is None")
190
+ else:
191
+ down_block_refer_embs, mid_block_refer_emb, refer_self_attn_emb = (
192
+ None,
193
+ None,
194
+ None,
195
+ )
196
+
197
+ # extract controlnet emb
198
+ if self.controlnet is not None and controlnet_params is not None:
199
+ controlnet_encoder_hidden_states = align_repeat_tensor_single_dim(
200
+ encoder_hidden_states,
201
+ target_length=unet_params["sample"].shape[2],
202
+ dim=1,
203
+ )
204
+ controlnet_params["encoder_hidden_states"] = rearrange(
205
+ controlnet_encoder_hidden_states, " b t n d -> (b t) n d"
206
+ )
207
+ (
208
+ down_block_additional_residuals,
209
+ mid_block_additional_residual,
210
+ ) = self.controlnet(**controlnet_params)
211
+ if controlnet_scale != 1.0:
212
+ down_block_additional_residuals = [
213
+ x * controlnet_scale for x in down_block_additional_residuals
214
+ ]
215
+ mid_block_additional_residual = (
216
+ mid_block_additional_residual * controlnet_scale
217
+ )
218
+ for i, down_block_additional_residual in enumerate(
219
+ down_block_additional_residuals
220
+ ):
221
+ if self.print_idx == 0:
222
+ logger.debug(
223
+ f"{i}, down_block_additional_residual mean={torch.mean(down_block_additional_residual)}"
224
+ )
225
+
226
+ if self.print_idx == 0:
227
+ logger.debug(
228
+ f"mid_block_additional_residual mean={torch.mean(mid_block_additional_residual)}"
229
+ )
230
+ else:
231
+ down_block_additional_residuals = None
232
+ mid_block_additional_residual = None
233
+
234
+ if prompt_only_use_image_prompt and vision_clip_emb is not None:
235
+ encoder_hidden_states = vision_clip_emb
236
+
237
+ # run unet
238
+ out = self.unet(
239
+ **unet_params,
240
+ down_block_refer_embs=down_block_refer_embs,
241
+ mid_block_refer_emb=mid_block_refer_emb,
242
+ refer_self_attn_emb=refer_self_attn_emb,
243
+ down_block_additional_residuals=down_block_additional_residuals,
244
+ mid_block_additional_residual=mid_block_additional_residual,
245
+ encoder_hidden_states=encoder_hidden_states,
246
+ vision_clip_emb=vision_clip_emb,
247
+ )
248
+ self.print_idx += 1
249
+ return out
250
+
251
+ def _set_gradient_checkpointing(self, module, value=False):
252
+ if isinstance(module, (UNet3DConditionModel, ReferenceNet2D)):
253
+ module.gradient_checkpointing = value
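A minimal sketch (illustrative only, not part of the diff) of the prompt-fallback logic in the forward above: whichever of the text/image conditions is missing is backfilled from the other, and when prompt_only_use_image_prompt is set the unet conditions on the image embedding. The function name and tensor shapes below are assumptions made for this example.

# illustrative sketch only; mirrors the fallback logic in the super-model forward above
from typing import Optional, Tuple
import torch

def resolve_prompt_embeddings(
    encoder_hidden_states: Optional[torch.Tensor],
    vision_clip_emb: Optional[torch.Tensor],
    prompt_only_use_image_prompt: bool,
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
    # backfill: if only one of the two conditions is given, reuse it for the other
    if vision_clip_emb is None and encoder_hidden_states is not None:
        vision_clip_emb = encoder_hidden_states
    if vision_clip_emb is not None and encoder_hidden_states is None:
        encoder_hidden_states = vision_clip_emb
    # optionally force the unet to condition on the image prompt only
    if prompt_only_use_image_prompt and vision_clip_emb is not None:
        encoder_hidden_states = vision_clip_emb
    return encoder_hidden_states, vision_clip_emb

text_emb = torch.randn(2, 16, 77, 768)  # hypothetical (b, t, n, d) text embedding
text_emb, image_emb = resolve_prompt_embeddings(text_emb, None, prompt_only_use_image_prompt=False)
assert torch.equal(text_emb, image_emb)  # image prompt was backfilled from the text prompt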
musev/models/temporal_transformer.py ADDED
@@ -0,0 +1,308 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Adapted from https://github.com/huggingface/diffusers/blob/v0.16.1/src/diffusers/models/transformer_temporal.py
16
+ from __future__ import annotations
17
+ from copy import deepcopy
18
+ from dataclasses import dataclass
19
+ from typing import List, Literal, Optional
20
+ import logging
21
+
22
+ import torch
23
+ from torch import nn
24
+ from einops import rearrange, repeat
25
+
26
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
27
+ from diffusers.utils import BaseOutput
28
+ from diffusers.models.modeling_utils import ModelMixin
29
+ from diffusers.models.transformer_temporal import (
30
+ TransformerTemporalModelOutput,
31
+ TransformerTemporalModel as DiffusersTransformerTemporalModel,
32
+ )
33
+ from diffusers.models.attention_processor import AttnProcessor
34
+
35
+ from mmcm.utils.gpu_util import get_gpu_status
36
+ from ..data.data_util import (
37
+ batch_concat_two_tensor_with_index,
38
+ batch_index_fill,
39
+ batch_index_select,
40
+ concat_two_tensor,
41
+ align_repeat_tensor_single_dim,
42
+ )
43
+ from ..utils.attention_util import generate_sparse_causcal_attn_mask
44
+ from .attention import BasicTransformerBlock
45
+ from .attention_processor import (
46
+ BaseIPAttnProcessor,
47
+ )
48
+ from . import Model_Register
49
+
50
+ # https://github.com/facebookresearch/xformers/issues/845
51
+ # if bs*n_frames*w*h is too large, xformers raises an error, so allow_xformers is disabled for transformer_temporal
53
+ logger = logging.getLogger(__name__)
54
+
55
+
56
+ @Model_Register.register
57
+ class TransformerTemporalModel(ModelMixin, ConfigMixin):
58
+ """
59
+ Transformer model for video-like data.
60
+
61
+ Parameters:
62
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
63
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
64
+ in_channels (`int`, *optional*):
65
+ Pass if the input is continuous. The number of channels in the input and output.
66
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
67
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
68
+ cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
69
+ sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
70
+ Note that this is fixed at training time as it is used for learning a number of position embeddings. See
71
+ `ImagePositionalEmbeddings`.
72
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
73
+ attention_bias (`bool`, *optional*):
74
+ Configure if the TransformerBlocks' attention should contain a bias parameter.
75
+ double_self_attention (`bool`, *optional*):
76
+ Configure if each TransformerBlock should contain two self-attention layers
77
+ """
78
+
79
+ @register_to_config
80
+ def __init__(
81
+ self,
82
+ num_attention_heads: int = 16,
83
+ attention_head_dim: int = 88,
84
+ in_channels: Optional[int] = None,
85
+ out_channels: Optional[int] = None,
86
+ num_layers: int = 1,
87
+ femb_channels: Optional[int] = None,
88
+ dropout: float = 0.0,
89
+ norm_num_groups: int = 32,
90
+ cross_attention_dim: Optional[int] = None,
91
+ attention_bias: bool = False,
92
+ sample_size: Optional[int] = None,
93
+ activation_fn: str = "geglu",
94
+ norm_elementwise_affine: bool = True,
95
+ double_self_attention: bool = True,
96
+ allow_xformers: bool = False,
97
+ only_cross_attention: bool = False,
98
+ keep_content_condition: bool = False,
99
+ need_spatial_position_emb: bool = False,
100
+ need_temporal_weight: bool = True,
101
+ self_attn_mask: str = None,
102
+ # TODO: 运行参数,有待改到forward里面去
103
+ # TODO: running parameters, need to be moved to forward
104
+ image_scale: float = 1.0,
105
+ processor: AttnProcessor | None = None,
106
+ remove_femb_non_linear: bool = False,
107
+ ):
108
+ super().__init__()
109
+
110
+ self.num_attention_heads = num_attention_heads
111
+ self.attention_head_dim = attention_head_dim
112
+
113
+ inner_dim = num_attention_heads * attention_head_dim
114
+ self.inner_dim = inner_dim
115
+ self.in_channels = in_channels
116
+
117
+ self.norm = torch.nn.GroupNorm(
118
+ num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True
119
+ )
120
+
121
+ self.proj_in = nn.Linear(in_channels, inner_dim)
122
+
123
+ # 2. Define temporal positional embedding
124
+ self.frame_emb_proj = torch.nn.Linear(femb_channels, inner_dim)
125
+ self.remove_femb_non_linear = remove_femb_non_linear
126
+ if not remove_femb_non_linear:
127
+ self.nonlinearity = nn.SiLU()
128
+
129
+ # spatial_position_emb uses the same parameter configuration as femb (femb_channels)
130
+ self.need_spatial_position_emb = need_spatial_position_emb
131
+ if need_spatial_position_emb:
132
+ self.spatial_position_emb_proj = torch.nn.Linear(femb_channels, inner_dim)
133
+ # 3. Define transformers blocks
134
+ # TODO: this implementation is not ideal and should be optimized
136
+ self.need_ipadapter = False
137
+ self.cross_attn_temporal_cond = False
138
+ self.allow_xformers = allow_xformers
139
+ if processor is not None and isinstance(processor, BaseIPAttnProcessor):
140
+ self.cross_attn_temporal_cond = True
141
+ self.allow_xformers = False
142
+ if "NonParam" not in processor.__class__.__name__:
143
+ self.need_ipadapter = True
144
+
145
+ self.transformer_blocks = nn.ModuleList(
146
+ [
147
+ BasicTransformerBlock(
148
+ inner_dim,
149
+ num_attention_heads,
150
+ attention_head_dim,
151
+ dropout=dropout,
152
+ cross_attention_dim=cross_attention_dim,
153
+ activation_fn=activation_fn,
154
+ attention_bias=attention_bias,
155
+ double_self_attention=double_self_attention,
156
+ norm_elementwise_affine=norm_elementwise_affine,
157
+ allow_xformers=allow_xformers,
158
+ only_cross_attention=only_cross_attention,
159
+ cross_attn_temporal_cond=self.need_ipadapter,
160
+ image_scale=image_scale,
161
+ processor=processor,
162
+ )
163
+ for d in range(num_layers)
164
+ ]
165
+ )
166
+
167
+ self.proj_out = nn.Linear(inner_dim, in_channels)
168
+
169
+ self.need_temporal_weight = need_temporal_weight
170
+ if need_temporal_weight:
171
+ self.temporal_weight = nn.Parameter(
172
+ torch.tensor(
173
+ [
174
+ 1e-5,
175
+ ]
176
+ )
177
+ ) # initialize the temporal weight near zero (1e-5)
178
+ self.skip_temporal_layers = False # Whether to skip temporal layer
179
+ self.keep_content_condition = keep_content_condition
180
+ self.self_attn_mask = self_attn_mask
181
+ self.only_cross_attention = only_cross_attention
182
+ self.double_self_attention = double_self_attention
183
+ self.cross_attention_dim = cross_attention_dim
184
+ self.image_scale = image_scale
185
+ # zero out the last layer params so the block is initialized as an identity mapping
186
+ nn.init.zeros_(self.proj_out.weight)
187
+ nn.init.zeros_(self.proj_out.bias)
188
+
189
+ def forward(
190
+ self,
191
+ hidden_states,
192
+ femb,
193
+ encoder_hidden_states=None,
194
+ timestep=None,
195
+ class_labels=None,
196
+ num_frames=1,
197
+ cross_attention_kwargs=None,
198
+ sample_index: torch.LongTensor = None,
199
+ vision_conditon_frames_sample_index: torch.LongTensor = None,
200
+ spatial_position_emb: torch.Tensor = None,
201
+ return_dict: bool = True,
202
+ ):
203
+ """
204
+ Args:
205
+ hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
206
+ When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
207
+ hidden_states
208
+ encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
209
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
210
+ self-attention.
211
+ timestep ( `torch.long`, *optional*):
212
+ Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
213
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
214
+ Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
215
+ conditioning.
216
+ return_dict (`bool`, *optional*, defaults to `True`):
217
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
218
+
219
+ Returns:
220
+ [`~models.transformer_2d.TransformerTemporalModelOutput`] or `tuple`:
221
+ [`~models.transformer_2d.TransformerTemporalModelOutput`] if `return_dict` is True, otherwise a `tuple`.
222
+ When returning a tuple, the first element is the sample tensor.
223
+ """
224
+ if self.skip_temporal_layers is True:
225
+ if not return_dict:
226
+ return (hidden_states,)
227
+
228
+ return TransformerTemporalModelOutput(sample=hidden_states)
229
+
230
+ # 1. Input
231
+ batch_frames, channel, height, width = hidden_states.shape
232
+ batch_size = batch_frames // num_frames
233
+
234
+ hidden_states = rearrange(
235
+ hidden_states, "(b t) c h w -> b c t h w", b=batch_size
236
+ )
237
+ residual = hidden_states
238
+
239
+ hidden_states = self.norm(hidden_states)
240
+
241
+ hidden_states = rearrange(hidden_states, "b c t h w -> (b h w) t c")
242
+
243
+ hidden_states = self.proj_in(hidden_states)
244
+
245
+ # 2 Positional embedding
246
+ # adapted from https://github.com/huggingface/diffusers/blob/v0.16.1/src/diffusers/models/resnet.py#L574
247
+ if not self.remove_femb_non_linear:
248
+ femb = self.nonlinearity(femb)
249
+ femb = self.frame_emb_proj(femb)
250
+ femb = align_repeat_tensor_single_dim(femb, hidden_states.shape[0], dim=0)
251
+ hidden_states = hidden_states + femb
252
+
253
+ # 3. Blocks
254
+ if (
255
+ (self.only_cross_attention or not self.double_self_attention)
256
+ and self.cross_attention_dim is not None
257
+ and encoder_hidden_states is not None
258
+ ):
259
+ encoder_hidden_states = align_repeat_tensor_single_dim(
260
+ encoder_hidden_states,
261
+ hidden_states.shape[0],
262
+ dim=0,
263
+ n_src_base_length=batch_size,
264
+ )
265
+
266
+ for i, block in enumerate(self.transformer_blocks):
267
+ hidden_states = block(
268
+ hidden_states,
269
+ encoder_hidden_states=encoder_hidden_states,
270
+ timestep=timestep,
271
+ cross_attention_kwargs=cross_attention_kwargs,
272
+ class_labels=class_labels,
273
+ )
274
+
275
+ # 4. Output
276
+ hidden_states = self.proj_out(hidden_states)
277
+ hidden_states = rearrange(
278
+ hidden_states, "(b h w) t c -> b c t h w", b=batch_size, h=height, w=width
279
+ ).contiguous()
280
+
281
+ # keep the frames corresponding to the condition so preceding content frames are preserved, improving consistency
283
+ if (
284
+ vision_conditon_frames_sample_index is not None
285
+ and self.keep_content_condition
286
+ ):
287
+ mask = torch.ones_like(hidden_states, device=hidden_states.device)
288
+ mask = batch_index_fill(
289
+ mask, dim=2, index=vision_conditon_frames_sample_index, value=0
290
+ )
291
+ if self.need_temporal_weight:
292
+ output = (
293
+ residual + torch.abs(self.temporal_weight) * mask * hidden_states
294
+ )
295
+ else:
296
+ output = residual + mask * hidden_states
297
+ else:
298
+ if self.need_temporal_weight:
299
+ output = residual + torch.abs(self.temporal_weight) * hidden_states
300
+ else:
301
+ output = residual + hidden_states
302
+
303
+ # output = torch.abs(self.temporal_weight) * hidden_states + residual
304
+ output = rearrange(output, "b c t h w -> (b t) c h w")
305
+ if not return_dict:
306
+ return (output,)
307
+
308
+ return TransformerTemporalModelOutput(sample=output)
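A hedged usage sketch for the temporal transformer added above. It assumes the musev package and its submodules (MMCM, the TME diffusers fork) are installed; the hyperparameter values are illustrative, not the values used in the released configs.

import torch
from musev.models.temporal_transformer import TransformerTemporalModel

# inner_dim = num_attention_heads * attention_head_dim = 320
model = TransformerTemporalModel(
    num_attention_heads=8,
    attention_head_dim=40,
    in_channels=320,           # must be divisible by norm_num_groups (32)
    femb_channels=1280,        # width of the per-frame (temporal position) embedding
    cross_attention_dim=None,  # temporal self-attention only
)

b, t, c, h, w = 1, 4, 320, 8, 8
hidden_states = torch.randn(b * t, c, h, w)  # forward expects (b*t, c, h, w)
femb = torch.randn(b, t, 1280)               # projected to inner_dim inside forward
sample = model(hidden_states, femb=femb, num_frames=t, return_dict=False)[0]
print(sample.shape)                          # expected: (b*t, c, h, w)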
musev/models/text_model.py ADDED
@@ -0,0 +1,40 @@
1
+ from typing import Any, Dict
2
+ from torch import nn
3
+
4
+
5
+ class TextEmbExtractor(nn.Module):
6
+ def __init__(self, tokenizer, text_encoder) -> None:
7
+ super(TextEmbExtractor, self).__init__()
8
+ self.tokenizer = tokenizer
9
+ self.text_encoder = text_encoder
10
+
11
+ def forward(
12
+ self,
13
+ texts,
14
+ text_params: Dict = None,
15
+ ):
16
+ if text_params is None:
17
+ text_params = {}
18
+ special_prompt_input = self.tokenizer(
19
+ texts,
20
+ max_length=self.tokenizer.model_max_length,
21
+ padding="max_length",
22
+ truncation=True,
23
+ return_tensors="pt",
24
+ )
25
+ if (
26
+ hasattr(self.text_encoder.config, "use_attention_mask")
27
+ and self.text_encoder.config.use_attention_mask
28
+ ):
29
+ attention_mask = special_prompt_input.attention_mask.to(
30
+ self.text_encoder.device
31
+ )
32
+ else:
33
+ attention_mask = None
34
+
35
+ embeddings = self.text_encoder(
36
+ special_prompt_input.input_ids.to(self.text_encoder.device),
37
+ attention_mask=attention_mask,
38
+ **text_params
39
+ )
40
+ return embeddings
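A hedged usage sketch for TextEmbExtractor. The checkpoint id below is an assumption chosen for illustration; any tokenizer/text_encoder pair compatible with the pipelines would work the same way.

from transformers import CLIPTextModel, CLIPTokenizer
from musev.models.text_model import TextEmbExtractor

pretrained = "runwayml/stable-diffusion-v1-5"  # hypothetical choice for the example
tokenizer = CLIPTokenizer.from_pretrained(pretrained, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(pretrained, subfolder="text_encoder")

extractor = TextEmbExtractor(tokenizer=tokenizer, text_encoder=text_encoder)
embeddings = extractor(["a girl, dancing on the beach"])
print(embeddings[0].shape)  # last_hidden_state: (batch, model_max_length, hidden_size)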
musev/models/transformer_2d.py ADDED
@@ -0,0 +1,445 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from __future__ import annotations
15
+ from dataclasses import dataclass
16
+ from typing import Any, Dict, List, Literal, Optional
17
+ import logging
18
+
19
+ from einops import rearrange
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ from torch import nn
24
+
25
+ from diffusers.models.transformer_2d import (
26
+ Transformer2DModelOutput,
27
+ Transformer2DModel as DiffusersTransformer2DModel,
28
+ )
29
+
30
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
31
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
32
+ from diffusers.utils import BaseOutput, deprecate
33
+ from diffusers.models.attention import (
34
+ BasicTransformerBlock as DiffusersBasicTransformerBlock,
35
+ )
36
+ from diffusers.models.embeddings import PatchEmbed
37
+ from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
38
+ from diffusers.models.modeling_utils import ModelMixin
39
+ from diffusers.utils.constants import USE_PEFT_BACKEND
40
+
41
+ from .attention import BasicTransformerBlock
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # this module is almost identical to diffusers/models/transformer_2d.py; the changes are:
46
+ # 1. the custom BasicTransformerBlock from .attention replaces the diffusers one
47
+ # 2. self_attn_block_embs is added to forward to extract the embeddings from self_attn
53
+
54
+
55
+ class Transformer2DModel(DiffusersTransformer2DModel):
56
+ """
57
+ A 2D Transformer model for image-like data.
58
+
59
+ Parameters:
60
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
61
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
62
+ in_channels (`int`, *optional*):
63
+ The number of channels in the input and output (specify if the input is **continuous**).
64
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
65
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
66
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
67
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
68
+ This is fixed during training since it is used to learn a number of position embeddings.
69
+ num_vector_embeds (`int`, *optional*):
70
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
71
+ Includes the class for the masked latent pixel.
72
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
73
+ num_embeds_ada_norm ( `int`, *optional*):
74
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
75
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
76
+ added to the hidden states.
77
+
78
+ During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
79
+ attention_bias (`bool`, *optional*):
80
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
81
+ """
82
+
83
+ @register_to_config
84
+ def __init__(
85
+ self,
86
+ num_attention_heads: int = 16,
87
+ attention_head_dim: int = 88,
88
+ in_channels: int | None = None,
89
+ out_channels: int | None = None,
90
+ num_layers: int = 1,
91
+ dropout: float = 0,
92
+ norm_num_groups: int = 32,
93
+ cross_attention_dim: int | None = None,
94
+ attention_bias: bool = False,
95
+ sample_size: int | None = None,
96
+ num_vector_embeds: int | None = None,
97
+ patch_size: int | None = None,
98
+ activation_fn: str = "geglu",
99
+ num_embeds_ada_norm: int | None = None,
100
+ use_linear_projection: bool = False,
101
+ only_cross_attention: bool = False,
102
+ double_self_attention: bool = False,
103
+ upcast_attention: bool = False,
104
+ norm_type: str = "layer_norm",
105
+ norm_elementwise_affine: bool = True,
106
+ attention_type: str = "default",
107
+ cross_attn_temporal_cond: bool = False,
108
+ ip_adapter_cross_attn: bool = False,
109
+ need_t2i_facein: bool = False,
110
+ need_t2i_ip_adapter_face: bool = False,
111
+ image_scale: float = 1.0,
112
+ ):
113
+ super().__init__(
114
+ num_attention_heads,
115
+ attention_head_dim,
116
+ in_channels,
117
+ out_channels,
118
+ num_layers,
119
+ dropout,
120
+ norm_num_groups,
121
+ cross_attention_dim,
122
+ attention_bias,
123
+ sample_size,
124
+ num_vector_embeds,
125
+ patch_size,
126
+ activation_fn,
127
+ num_embeds_ada_norm,
128
+ use_linear_projection,
129
+ only_cross_attention,
130
+ double_self_attention,
131
+ upcast_attention,
132
+ norm_type,
133
+ norm_elementwise_affine,
134
+ attention_type,
135
+ )
136
+ inner_dim = num_attention_heads * attention_head_dim
137
+ self.transformer_blocks = nn.ModuleList(
138
+ [
139
+ BasicTransformerBlock(
140
+ inner_dim,
141
+ num_attention_heads,
142
+ attention_head_dim,
143
+ dropout=dropout,
144
+ cross_attention_dim=cross_attention_dim,
145
+ activation_fn=activation_fn,
146
+ num_embeds_ada_norm=num_embeds_ada_norm,
147
+ attention_bias=attention_bias,
148
+ only_cross_attention=only_cross_attention,
149
+ double_self_attention=double_self_attention,
150
+ upcast_attention=upcast_attention,
151
+ norm_type=norm_type,
152
+ norm_elementwise_affine=norm_elementwise_affine,
153
+ attention_type=attention_type,
154
+ cross_attn_temporal_cond=cross_attn_temporal_cond,
155
+ ip_adapter_cross_attn=ip_adapter_cross_attn,
156
+ need_t2i_facein=need_t2i_facein,
157
+ need_t2i_ip_adapter_face=need_t2i_ip_adapter_face,
158
+ image_scale=image_scale,
159
+ )
160
+ for d in range(num_layers)
161
+ ]
162
+ )
163
+ self.num_layers = num_layers
164
+ self.cross_attn_temporal_cond = cross_attn_temporal_cond
165
+ self.ip_adapter_cross_attn = ip_adapter_cross_attn
166
+
167
+ self.need_t2i_facein = need_t2i_facein
168
+ self.need_t2i_ip_adapter_face = need_t2i_ip_adapter_face
169
+ self.image_scale = image_scale
170
+ self.print_idx = 0
171
+
172
+ def forward(
173
+ self,
174
+ hidden_states: torch.Tensor,
175
+ encoder_hidden_states: Optional[torch.Tensor] = None,
176
+ timestep: Optional[torch.LongTensor] = None,
177
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
178
+ class_labels: Optional[torch.LongTensor] = None,
179
+ cross_attention_kwargs: Dict[str, Any] = None,
180
+ attention_mask: Optional[torch.Tensor] = None,
181
+ encoder_attention_mask: Optional[torch.Tensor] = None,
182
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
183
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
184
+ return_dict: bool = True,
185
+ ):
186
+ """
187
+ The [`Transformer2DModel`] forward method.
188
+
189
+ Args:
190
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
191
+ Input `hidden_states`.
192
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
193
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
194
+ self-attention.
195
+ timestep ( `torch.LongTensor`, *optional*):
196
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
197
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
198
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
199
+ `AdaLayerZeroNorm`.
200
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
201
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
202
+ `self.processor` in
203
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
204
+ attention_mask ( `torch.Tensor`, *optional*):
205
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
206
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
207
+ negative values to the attention scores corresponding to "discard" tokens.
208
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
209
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
210
+
211
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
212
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
213
+
214
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
215
+ above. This bias will be added to the cross-attention scores.
216
+ return_dict (`bool`, *optional*, defaults to `True`):
217
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
218
+ tuple.
219
+
220
+ Returns:
221
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
222
+ `tuple` where the first element is the sample tensor.
223
+ """
224
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
225
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
226
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
227
+ # expects mask of shape:
228
+ # [batch, key_tokens]
229
+ # adds singleton query_tokens dimension:
230
+ # [batch, 1, key_tokens]
231
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
232
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
233
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
234
+ if attention_mask is not None and attention_mask.ndim == 2:
235
+ # assume that mask is expressed as:
236
+ # (1 = keep, 0 = discard)
237
+ # convert mask into a bias that can be added to attention scores:
238
+ # (keep = +0, discard = -10000.0)
239
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
240
+ attention_mask = attention_mask.unsqueeze(1)
241
+
242
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
243
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
244
+ encoder_attention_mask = (
245
+ 1 - encoder_attention_mask.to(hidden_states.dtype)
246
+ ) * -10000.0
247
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
248
+
249
+ # Retrieve lora scale.
250
+ lora_scale = (
251
+ cross_attention_kwargs.get("scale", 1.0)
252
+ if cross_attention_kwargs is not None
253
+ else 1.0
254
+ )
255
+
256
+ # 1. Input
257
+ if self.is_input_continuous:
258
+ batch, _, height, width = hidden_states.shape
259
+ residual = hidden_states
260
+
261
+ hidden_states = self.norm(hidden_states)
262
+ if not self.use_linear_projection:
263
+ hidden_states = (
264
+ self.proj_in(hidden_states, scale=lora_scale)
265
+ if not USE_PEFT_BACKEND
266
+ else self.proj_in(hidden_states)
267
+ )
268
+ inner_dim = hidden_states.shape[1]
269
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(
270
+ batch, height * width, inner_dim
271
+ )
272
+ else:
273
+ inner_dim = hidden_states.shape[1]
274
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(
275
+ batch, height * width, inner_dim
276
+ )
277
+ hidden_states = (
278
+ self.proj_in(hidden_states, scale=lora_scale)
279
+ if not USE_PEFT_BACKEND
280
+ else self.proj_in(hidden_states)
281
+ )
282
+
283
+ elif self.is_input_vectorized:
284
+ hidden_states = self.latent_image_embedding(hidden_states)
285
+ elif self.is_input_patches:
286
+ height, width = (
287
+ hidden_states.shape[-2] // self.patch_size,
288
+ hidden_states.shape[-1] // self.patch_size,
289
+ )
290
+ hidden_states = self.pos_embed(hidden_states)
291
+
292
+ if self.adaln_single is not None:
293
+ if self.use_additional_conditions and added_cond_kwargs is None:
294
+ raise ValueError(
295
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
296
+ )
297
+ batch_size = hidden_states.shape[0]
298
+ timestep, embedded_timestep = self.adaln_single(
299
+ timestep,
300
+ added_cond_kwargs,
301
+ batch_size=batch_size,
302
+ hidden_dtype=hidden_states.dtype,
303
+ )
304
+
305
+ # 2. Blocks
306
+ if self.caption_projection is not None:
307
+ batch_size = hidden_states.shape[0]
308
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states)
309
+ encoder_hidden_states = encoder_hidden_states.view(
310
+ batch_size, -1, hidden_states.shape[-1]
311
+ )
312
+
313
+ for block in self.transformer_blocks:
314
+ if self.training and self.gradient_checkpointing:
315
+ hidden_states = torch.utils.checkpoint.checkpoint(
316
+ block,
317
+ hidden_states,
318
+ attention_mask,
319
+ encoder_hidden_states,
320
+ encoder_attention_mask,
321
+ timestep,
322
+ cross_attention_kwargs,
323
+ class_labels,
324
+ self_attn_block_embs,
325
+ self_attn_block_embs_mode,
326
+ use_reentrant=False,
327
+ )
328
+ else:
329
+ hidden_states = block(
330
+ hidden_states,
331
+ attention_mask=attention_mask,
332
+ encoder_hidden_states=encoder_hidden_states,
333
+ encoder_attention_mask=encoder_attention_mask,
334
+ timestep=timestep,
335
+ cross_attention_kwargs=cross_attention_kwargs,
336
+ class_labels=class_labels,
337
+ self_attn_block_embs=self_attn_block_embs,
338
+ self_attn_block_embs_mode=self_attn_block_embs_mode,
339
+ )
340
+ # reshape the written self_attn_emb from (b*t, h*w, c) to (b*t, c, h, w)
341
+ if (
342
+ self_attn_block_embs is not None
343
+ and self_attn_block_embs_mode.lower() == "write"
344
+ ):
345
+ self_attn_idx = block.spatial_self_attn_idx
346
+ if self.print_idx == 0:
347
+ logger.debug(
348
+ f"self_attn_block_embs, num={len(self_attn_block_embs)}, before, shape={self_attn_block_embs[self_attn_idx].shape}, height={height}, width={width}"
349
+ )
350
+ self_attn_block_embs[self_attn_idx] = rearrange(
351
+ self_attn_block_embs[self_attn_idx],
352
+ "bt (h w) c->bt c h w",
353
+ h=height,
354
+ w=width,
355
+ )
356
+ if self.print_idx == 0:
357
+ logger.debug(
358
+ f"self_attn_block_embs, num={len(self_attn_block_embs)}, after ,shape={self_attn_block_embs[self_attn_idx].shape}, height={height}, width={width}"
359
+ )
360
+
361
+ if self.proj_out is None:
362
+ return hidden_states
363
+
364
+ # 3. Output
365
+ if self.is_input_continuous:
366
+ if not self.use_linear_projection:
367
+ hidden_states = (
368
+ hidden_states.reshape(batch, height, width, inner_dim)
369
+ .permute(0, 3, 1, 2)
370
+ .contiguous()
371
+ )
372
+ hidden_states = (
373
+ self.proj_out(hidden_states, scale=lora_scale)
374
+ if not USE_PEFT_BACKEND
375
+ else self.proj_out(hidden_states)
376
+ )
377
+ else:
378
+ hidden_states = (
379
+ self.proj_out(hidden_states, scale=lora_scale)
380
+ if not USE_PEFT_BACKEND
381
+ else self.proj_out(hidden_states)
382
+ )
383
+ hidden_states = (
384
+ hidden_states.reshape(batch, height, width, inner_dim)
385
+ .permute(0, 3, 1, 2)
386
+ .contiguous()
387
+ )
388
+
389
+ output = hidden_states + residual
390
+ elif self.is_input_vectorized:
391
+ hidden_states = self.norm_out(hidden_states)
392
+ logits = self.out(hidden_states)
393
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
394
+ logits = logits.permute(0, 2, 1)
395
+
396
+ # log(p(x_0))
397
+ output = F.log_softmax(logits.double(), dim=1).float()
398
+
399
+ if self.is_input_patches:
400
+ if self.config.norm_type != "ada_norm_single":
401
+ conditioning = self.transformer_blocks[0].norm1.emb(
402
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
403
+ )
404
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
405
+ hidden_states = (
406
+ self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
407
+ )
408
+ hidden_states = self.proj_out_2(hidden_states)
409
+ elif self.config.norm_type == "ada_norm_single":
410
+ shift, scale = (
411
+ self.scale_shift_table[None] + embedded_timestep[:, None]
412
+ ).chunk(2, dim=1)
413
+ hidden_states = self.norm_out(hidden_states)
414
+ # Modulation
415
+ hidden_states = hidden_states * (1 + scale) + shift
416
+ hidden_states = self.proj_out(hidden_states)
417
+ hidden_states = hidden_states.squeeze(1)
418
+
419
+ # unpatchify
420
+ if self.adaln_single is None:
421
+ height = width = int(hidden_states.shape[1] ** 0.5)
422
+ hidden_states = hidden_states.reshape(
423
+ shape=(
424
+ -1,
425
+ height,
426
+ width,
427
+ self.patch_size,
428
+ self.patch_size,
429
+ self.out_channels,
430
+ )
431
+ )
432
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
433
+ output = hidden_states.reshape(
434
+ shape=(
435
+ -1,
436
+ self.out_channels,
437
+ height * self.patch_size,
438
+ width * self.patch_size,
439
+ )
440
+ )
441
+ self.print_idx += 1
442
+ if not return_dict:
443
+ return (output,)
444
+
445
+ return Transformer2DModelOutput(sample=output)
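A small standalone sketch of the reshaping applied to each entry written into self_attn_block_embs above: the attention blocks store (b*t, h*w, c) tensors, and this module rearranges them to (b*t, c, h, w) so that referencenet-style consumers can read them back. Shapes are illustrative.

import torch
from einops import rearrange

height, width, channels = 8, 8, 320
# one slot per spatial self-attention block; a single hypothetical entry here
self_attn_block_embs = [torch.randn(4, height * width, channels)]
self_attn_block_embs[0] = rearrange(
    self_attn_block_embs[0], "bt (h w) c -> bt c h w", h=height, w=width
)
print(self_attn_block_embs[0].shape)  # torch.Size([4, 320, 8, 8])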
musev/models/unet_2d_blocks.py ADDED
@@ -0,0 +1,1537 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Literal, Optional, Tuple, Union, List
15
+
16
+ import numpy as np
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+
21
+ from diffusers.utils import is_torch_version, logging
22
+ from diffusers.utils.torch_utils import apply_freeu
23
+ from diffusers.models.activations import get_activation
24
+ from diffusers.models.attention_processor import (
25
+ Attention,
26
+ AttnAddedKVProcessor,
27
+ AttnAddedKVProcessor2_0,
28
+ )
29
+ from diffusers.models.dual_transformer_2d import DualTransformer2DModel
30
+ from diffusers.models.normalization import AdaGroupNorm
31
+ from diffusers.models.resnet import (
32
+ Downsample2D,
33
+ FirDownsample2D,
34
+ FirUpsample2D,
35
+ KDownsample2D,
36
+ KUpsample2D,
37
+ ResnetBlock2D,
38
+ Upsample2D,
39
+ )
40
+ from diffusers.models.unet_2d_blocks import (
41
+ AttnDownBlock2D,
42
+ AttnDownEncoderBlock2D,
43
+ AttnSkipDownBlock2D,
44
+ AttnSkipUpBlock2D,
45
+ AttnUpBlock2D,
46
+ AttnUpDecoderBlock2D,
47
+ DownEncoderBlock2D,
48
+ KCrossAttnDownBlock2D,
49
+ KCrossAttnUpBlock2D,
50
+ KDownBlock2D,
51
+ KUpBlock2D,
52
+ ResnetDownsampleBlock2D,
53
+ ResnetUpsampleBlock2D,
54
+ SimpleCrossAttnDownBlock2D,
55
+ SimpleCrossAttnUpBlock2D,
56
+ SkipDownBlock2D,
57
+ SkipUpBlock2D,
58
+ UpDecoderBlock2D,
59
+ )
60
+
61
+ from .transformer_2d import Transformer2DModel
62
+
63
+
64
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
65
+
66
+
67
+ def get_down_block(
68
+ down_block_type: str,
69
+ num_layers: int,
70
+ in_channels: int,
71
+ out_channels: int,
72
+ temb_channels: int,
73
+ add_downsample: bool,
74
+ resnet_eps: float,
75
+ resnet_act_fn: str,
76
+ transformer_layers_per_block: int = 1,
77
+ num_attention_heads: Optional[int] = None,
78
+ resnet_groups: Optional[int] = None,
79
+ cross_attention_dim: Optional[int] = None,
80
+ downsample_padding: Optional[int] = None,
81
+ dual_cross_attention: bool = False,
82
+ use_linear_projection: bool = False,
83
+ only_cross_attention: bool = False,
84
+ upcast_attention: bool = False,
85
+ resnet_time_scale_shift: str = "default",
86
+ attention_type: str = "default",
87
+ resnet_skip_time_act: bool = False,
88
+ resnet_out_scale_factor: float = 1.0,
89
+ cross_attention_norm: Optional[str] = None,
90
+ attention_head_dim: Optional[int] = None,
91
+ downsample_type: Optional[str] = None,
92
+ dropout: float = 0.0,
93
+ ):
94
+ # If attn head dim is not defined, we default it to the number of heads
95
+ if attention_head_dim is None:
96
+ logger.warn(
97
+ f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
98
+ )
99
+ attention_head_dim = num_attention_heads
100
+
101
+ down_block_type = (
102
+ down_block_type[7:]
103
+ if down_block_type.startswith("UNetRes")
104
+ else down_block_type
105
+ )
106
+ if down_block_type == "DownBlock2D":
107
+ return DownBlock2D(
108
+ num_layers=num_layers,
109
+ in_channels=in_channels,
110
+ out_channels=out_channels,
111
+ temb_channels=temb_channels,
112
+ dropout=dropout,
113
+ add_downsample=add_downsample,
114
+ resnet_eps=resnet_eps,
115
+ resnet_act_fn=resnet_act_fn,
116
+ resnet_groups=resnet_groups,
117
+ downsample_padding=downsample_padding,
118
+ resnet_time_scale_shift=resnet_time_scale_shift,
119
+ )
120
+ elif down_block_type == "ResnetDownsampleBlock2D":
121
+ return ResnetDownsampleBlock2D(
122
+ num_layers=num_layers,
123
+ in_channels=in_channels,
124
+ out_channels=out_channels,
125
+ temb_channels=temb_channels,
126
+ dropout=dropout,
127
+ add_downsample=add_downsample,
128
+ resnet_eps=resnet_eps,
129
+ resnet_act_fn=resnet_act_fn,
130
+ resnet_groups=resnet_groups,
131
+ resnet_time_scale_shift=resnet_time_scale_shift,
132
+ skip_time_act=resnet_skip_time_act,
133
+ output_scale_factor=resnet_out_scale_factor,
134
+ )
135
+ elif down_block_type == "AttnDownBlock2D":
136
+ if add_downsample is False:
137
+ downsample_type = None
138
+ else:
139
+ downsample_type = downsample_type or "conv" # default to 'conv'
140
+ return AttnDownBlock2D(
141
+ num_layers=num_layers,
142
+ in_channels=in_channels,
143
+ out_channels=out_channels,
144
+ temb_channels=temb_channels,
145
+ dropout=dropout,
146
+ resnet_eps=resnet_eps,
147
+ resnet_act_fn=resnet_act_fn,
148
+ resnet_groups=resnet_groups,
149
+ downsample_padding=downsample_padding,
150
+ attention_head_dim=attention_head_dim,
151
+ resnet_time_scale_shift=resnet_time_scale_shift,
152
+ downsample_type=downsample_type,
153
+ )
154
+ elif down_block_type == "CrossAttnDownBlock2D":
155
+ if cross_attention_dim is None:
156
+ raise ValueError(
157
+ "cross_attention_dim must be specified for CrossAttnDownBlock2D"
158
+ )
159
+ return CrossAttnDownBlock2D(
160
+ num_layers=num_layers,
161
+ transformer_layers_per_block=transformer_layers_per_block,
162
+ in_channels=in_channels,
163
+ out_channels=out_channels,
164
+ temb_channels=temb_channels,
165
+ dropout=dropout,
166
+ add_downsample=add_downsample,
167
+ resnet_eps=resnet_eps,
168
+ resnet_act_fn=resnet_act_fn,
169
+ resnet_groups=resnet_groups,
170
+ downsample_padding=downsample_padding,
171
+ cross_attention_dim=cross_attention_dim,
172
+ num_attention_heads=num_attention_heads,
173
+ dual_cross_attention=dual_cross_attention,
174
+ use_linear_projection=use_linear_projection,
175
+ only_cross_attention=only_cross_attention,
176
+ upcast_attention=upcast_attention,
177
+ resnet_time_scale_shift=resnet_time_scale_shift,
178
+ attention_type=attention_type,
179
+ )
180
+ elif down_block_type == "SimpleCrossAttnDownBlock2D":
181
+ if cross_attention_dim is None:
182
+ raise ValueError(
183
+ "cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D"
184
+ )
185
+ return SimpleCrossAttnDownBlock2D(
186
+ num_layers=num_layers,
187
+ in_channels=in_channels,
188
+ out_channels=out_channels,
189
+ temb_channels=temb_channels,
190
+ dropout=dropout,
191
+ add_downsample=add_downsample,
192
+ resnet_eps=resnet_eps,
193
+ resnet_act_fn=resnet_act_fn,
194
+ resnet_groups=resnet_groups,
195
+ cross_attention_dim=cross_attention_dim,
196
+ attention_head_dim=attention_head_dim,
197
+ resnet_time_scale_shift=resnet_time_scale_shift,
198
+ skip_time_act=resnet_skip_time_act,
199
+ output_scale_factor=resnet_out_scale_factor,
200
+ only_cross_attention=only_cross_attention,
201
+ cross_attention_norm=cross_attention_norm,
202
+ )
203
+ elif down_block_type == "SkipDownBlock2D":
204
+ return SkipDownBlock2D(
205
+ num_layers=num_layers,
206
+ in_channels=in_channels,
207
+ out_channels=out_channels,
208
+ temb_channels=temb_channels,
209
+ dropout=dropout,
210
+ add_downsample=add_downsample,
211
+ resnet_eps=resnet_eps,
212
+ resnet_act_fn=resnet_act_fn,
213
+ downsample_padding=downsample_padding,
214
+ resnet_time_scale_shift=resnet_time_scale_shift,
215
+ )
216
+ elif down_block_type == "AttnSkipDownBlock2D":
217
+ return AttnSkipDownBlock2D(
218
+ num_layers=num_layers,
219
+ in_channels=in_channels,
220
+ out_channels=out_channels,
221
+ temb_channels=temb_channels,
222
+ dropout=dropout,
223
+ add_downsample=add_downsample,
224
+ resnet_eps=resnet_eps,
225
+ resnet_act_fn=resnet_act_fn,
226
+ attention_head_dim=attention_head_dim,
227
+ resnet_time_scale_shift=resnet_time_scale_shift,
228
+ )
229
+ elif down_block_type == "DownEncoderBlock2D":
230
+ return DownEncoderBlock2D(
231
+ num_layers=num_layers,
232
+ in_channels=in_channels,
233
+ out_channels=out_channels,
234
+ dropout=dropout,
235
+ add_downsample=add_downsample,
236
+ resnet_eps=resnet_eps,
237
+ resnet_act_fn=resnet_act_fn,
238
+ resnet_groups=resnet_groups,
239
+ downsample_padding=downsample_padding,
240
+ resnet_time_scale_shift=resnet_time_scale_shift,
241
+ )
242
+ elif down_block_type == "AttnDownEncoderBlock2D":
243
+ return AttnDownEncoderBlock2D(
244
+ num_layers=num_layers,
245
+ in_channels=in_channels,
246
+ out_channels=out_channels,
247
+ dropout=dropout,
248
+ add_downsample=add_downsample,
249
+ resnet_eps=resnet_eps,
250
+ resnet_act_fn=resnet_act_fn,
251
+ resnet_groups=resnet_groups,
252
+ downsample_padding=downsample_padding,
253
+ attention_head_dim=attention_head_dim,
254
+ resnet_time_scale_shift=resnet_time_scale_shift,
255
+ )
256
+ elif down_block_type == "KDownBlock2D":
257
+ return KDownBlock2D(
258
+ num_layers=num_layers,
259
+ in_channels=in_channels,
260
+ out_channels=out_channels,
261
+ temb_channels=temb_channels,
262
+ dropout=dropout,
263
+ add_downsample=add_downsample,
264
+ resnet_eps=resnet_eps,
265
+ resnet_act_fn=resnet_act_fn,
266
+ )
267
+ elif down_block_type == "KCrossAttnDownBlock2D":
268
+ return KCrossAttnDownBlock2D(
269
+ num_layers=num_layers,
270
+ in_channels=in_channels,
271
+ out_channels=out_channels,
272
+ temb_channels=temb_channels,
273
+ dropout=dropout,
274
+ add_downsample=add_downsample,
275
+ resnet_eps=resnet_eps,
276
+ resnet_act_fn=resnet_act_fn,
277
+ cross_attention_dim=cross_attention_dim,
278
+ attention_head_dim=attention_head_dim,
279
+ add_self_attention=True if not add_downsample else False,
280
+ )
281
+ raise ValueError(f"{down_block_type} does not exist.")
282
+
283
+
284
+ def get_up_block(
285
+ up_block_type: str,
286
+ num_layers: int,
287
+ in_channels: int,
288
+ out_channels: int,
289
+ prev_output_channel: int,
290
+ temb_channels: int,
291
+ add_upsample: bool,
292
+ resnet_eps: float,
293
+ resnet_act_fn: str,
294
+ resolution_idx: Optional[int] = None,
295
+ transformer_layers_per_block: int = 1,
296
+ num_attention_heads: Optional[int] = None,
297
+ resnet_groups: Optional[int] = None,
298
+ cross_attention_dim: Optional[int] = None,
299
+ dual_cross_attention: bool = False,
300
+ use_linear_projection: bool = False,
301
+ only_cross_attention: bool = False,
302
+ upcast_attention: bool = False,
303
+ resnet_time_scale_shift: str = "default",
304
+ attention_type: str = "default",
305
+ resnet_skip_time_act: bool = False,
306
+ resnet_out_scale_factor: float = 1.0,
307
+ cross_attention_norm: Optional[str] = None,
308
+ attention_head_dim: Optional[int] = None,
309
+ upsample_type: Optional[str] = None,
310
+ dropout: float = 0.0,
311
+ ) -> nn.Module:
312
+ # If attn head dim is not defined, we default it to the number of heads
313
+ if attention_head_dim is None:
314
+ logger.warn(
315
+ f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}."
316
+ )
317
+ attention_head_dim = num_attention_heads
318
+
319
+ up_block_type = (
320
+ up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
321
+ )
322
+ if up_block_type == "UpBlock2D":
323
+ return UpBlock2D(
324
+ num_layers=num_layers,
325
+ in_channels=in_channels,
326
+ out_channels=out_channels,
327
+ prev_output_channel=prev_output_channel,
328
+ temb_channels=temb_channels,
329
+ resolution_idx=resolution_idx,
330
+ dropout=dropout,
331
+ add_upsample=add_upsample,
332
+ resnet_eps=resnet_eps,
333
+ resnet_act_fn=resnet_act_fn,
334
+ resnet_groups=resnet_groups,
335
+ resnet_time_scale_shift=resnet_time_scale_shift,
336
+ )
337
+ elif up_block_type == "ResnetUpsampleBlock2D":
338
+ return ResnetUpsampleBlock2D(
339
+ num_layers=num_layers,
340
+ in_channels=in_channels,
341
+ out_channels=out_channels,
342
+ prev_output_channel=prev_output_channel,
343
+ temb_channels=temb_channels,
344
+ resolution_idx=resolution_idx,
345
+ dropout=dropout,
346
+ add_upsample=add_upsample,
347
+ resnet_eps=resnet_eps,
348
+ resnet_act_fn=resnet_act_fn,
349
+ resnet_groups=resnet_groups,
350
+ resnet_time_scale_shift=resnet_time_scale_shift,
351
+ skip_time_act=resnet_skip_time_act,
352
+ output_scale_factor=resnet_out_scale_factor,
353
+ )
354
+ elif up_block_type == "CrossAttnUpBlock2D":
355
+ if cross_attention_dim is None:
356
+ raise ValueError(
357
+ "cross_attention_dim must be specified for CrossAttnUpBlock2D"
358
+ )
359
+ return CrossAttnUpBlock2D(
360
+ num_layers=num_layers,
361
+ transformer_layers_per_block=transformer_layers_per_block,
362
+ in_channels=in_channels,
363
+ out_channels=out_channels,
364
+ prev_output_channel=prev_output_channel,
365
+ temb_channels=temb_channels,
366
+ resolution_idx=resolution_idx,
367
+ dropout=dropout,
368
+ add_upsample=add_upsample,
369
+ resnet_eps=resnet_eps,
370
+ resnet_act_fn=resnet_act_fn,
371
+ resnet_groups=resnet_groups,
372
+ cross_attention_dim=cross_attention_dim,
373
+ num_attention_heads=num_attention_heads,
374
+ dual_cross_attention=dual_cross_attention,
375
+ use_linear_projection=use_linear_projection,
376
+ only_cross_attention=only_cross_attention,
377
+ upcast_attention=upcast_attention,
378
+ resnet_time_scale_shift=resnet_time_scale_shift,
379
+ attention_type=attention_type,
380
+ )
381
+ elif up_block_type == "SimpleCrossAttnUpBlock2D":
382
+ if cross_attention_dim is None:
383
+ raise ValueError(
384
+ "cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D"
385
+ )
386
+ return SimpleCrossAttnUpBlock2D(
387
+ num_layers=num_layers,
388
+ in_channels=in_channels,
389
+ out_channels=out_channels,
390
+ prev_output_channel=prev_output_channel,
391
+ temb_channels=temb_channels,
392
+ resolution_idx=resolution_idx,
393
+ dropout=dropout,
394
+ add_upsample=add_upsample,
395
+ resnet_eps=resnet_eps,
396
+ resnet_act_fn=resnet_act_fn,
397
+ resnet_groups=resnet_groups,
398
+ cross_attention_dim=cross_attention_dim,
399
+ attention_head_dim=attention_head_dim,
400
+ resnet_time_scale_shift=resnet_time_scale_shift,
401
+ skip_time_act=resnet_skip_time_act,
402
+ output_scale_factor=resnet_out_scale_factor,
403
+ only_cross_attention=only_cross_attention,
404
+ cross_attention_norm=cross_attention_norm,
405
+ )
406
+ elif up_block_type == "AttnUpBlock2D":
407
+ if add_upsample is False:
408
+ upsample_type = None
409
+ else:
410
+ upsample_type = upsample_type or "conv" # default to 'conv'
411
+
412
+ return AttnUpBlock2D(
413
+ num_layers=num_layers,
414
+ in_channels=in_channels,
415
+ out_channels=out_channels,
416
+ prev_output_channel=prev_output_channel,
417
+ temb_channels=temb_channels,
418
+ resolution_idx=resolution_idx,
419
+ dropout=dropout,
420
+ resnet_eps=resnet_eps,
421
+ resnet_act_fn=resnet_act_fn,
422
+ resnet_groups=resnet_groups,
423
+ attention_head_dim=attention_head_dim,
424
+ resnet_time_scale_shift=resnet_time_scale_shift,
425
+ upsample_type=upsample_type,
426
+ )
427
+ elif up_block_type == "SkipUpBlock2D":
428
+ return SkipUpBlock2D(
429
+ num_layers=num_layers,
430
+ in_channels=in_channels,
431
+ out_channels=out_channels,
432
+ prev_output_channel=prev_output_channel,
433
+ temb_channels=temb_channels,
434
+ resolution_idx=resolution_idx,
435
+ dropout=dropout,
436
+ add_upsample=add_upsample,
437
+ resnet_eps=resnet_eps,
438
+ resnet_act_fn=resnet_act_fn,
439
+ resnet_time_scale_shift=resnet_time_scale_shift,
440
+ )
441
+ elif up_block_type == "AttnSkipUpBlock2D":
442
+ return AttnSkipUpBlock2D(
443
+ num_layers=num_layers,
444
+ in_channels=in_channels,
445
+ out_channels=out_channels,
446
+ prev_output_channel=prev_output_channel,
447
+ temb_channels=temb_channels,
448
+ resolution_idx=resolution_idx,
449
+ dropout=dropout,
450
+ add_upsample=add_upsample,
451
+ resnet_eps=resnet_eps,
452
+ resnet_act_fn=resnet_act_fn,
453
+ attention_head_dim=attention_head_dim,
454
+ resnet_time_scale_shift=resnet_time_scale_shift,
455
+ )
456
+ elif up_block_type == "UpDecoderBlock2D":
457
+ return UpDecoderBlock2D(
458
+ num_layers=num_layers,
459
+ in_channels=in_channels,
460
+ out_channels=out_channels,
461
+ resolution_idx=resolution_idx,
462
+ dropout=dropout,
463
+ add_upsample=add_upsample,
464
+ resnet_eps=resnet_eps,
465
+ resnet_act_fn=resnet_act_fn,
466
+ resnet_groups=resnet_groups,
467
+ resnet_time_scale_shift=resnet_time_scale_shift,
468
+ temb_channels=temb_channels,
469
+ )
470
+ elif up_block_type == "AttnUpDecoderBlock2D":
471
+ return AttnUpDecoderBlock2D(
472
+ num_layers=num_layers,
473
+ in_channels=in_channels,
474
+ out_channels=out_channels,
475
+ resolution_idx=resolution_idx,
476
+ dropout=dropout,
477
+ add_upsample=add_upsample,
478
+ resnet_eps=resnet_eps,
479
+ resnet_act_fn=resnet_act_fn,
480
+ resnet_groups=resnet_groups,
481
+ attention_head_dim=attention_head_dim,
482
+ resnet_time_scale_shift=resnet_time_scale_shift,
483
+ temb_channels=temb_channels,
484
+ )
485
+ elif up_block_type == "KUpBlock2D":
486
+ return KUpBlock2D(
487
+ num_layers=num_layers,
488
+ in_channels=in_channels,
489
+ out_channels=out_channels,
490
+ temb_channels=temb_channels,
491
+ resolution_idx=resolution_idx,
492
+ dropout=dropout,
493
+ add_upsample=add_upsample,
494
+ resnet_eps=resnet_eps,
495
+ resnet_act_fn=resnet_act_fn,
496
+ )
497
+ elif up_block_type == "KCrossAttnUpBlock2D":
498
+ return KCrossAttnUpBlock2D(
499
+ num_layers=num_layers,
500
+ in_channels=in_channels,
501
+ out_channels=out_channels,
502
+ temb_channels=temb_channels,
503
+ resolution_idx=resolution_idx,
504
+ dropout=dropout,
505
+ add_upsample=add_upsample,
506
+ resnet_eps=resnet_eps,
507
+ resnet_act_fn=resnet_act_fn,
508
+ cross_attention_dim=cross_attention_dim,
509
+ attention_head_dim=attention_head_dim,
510
+ )
511
+
512
+ raise ValueError(f"{up_block_type} does not exist.")
513
+
514
+
515
+ class UNetMidBlock2D(nn.Module):
516
+ """
517
+ A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
518
+
519
+ Args:
520
+ in_channels (`int`): The number of input channels.
521
+ temb_channels (`int`): The number of temporal embedding channels.
522
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
523
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
524
+ resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
525
+ resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
526
+ The type of normalization to apply to the time embeddings. This can help to improve the performance of the
527
+ model on tasks with long-range temporal dependencies.
528
+ resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
529
+ resnet_groups (`int`, *optional*, defaults to 32):
530
+ The number of groups to use in the group normalization layers of the resnet blocks.
531
+ attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
532
+ resnet_pre_norm (`bool`, *optional*, defaults to `True`):
533
+ Whether to use pre-normalization for the resnet blocks.
534
+ add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
535
+ attention_head_dim (`int`, *optional*, defaults to 1):
536
+ Dimension of a single attention head. The number of attention heads is determined based on this value and
537
+ the number of input channels.
538
+ output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
539
+
540
+ Returns:
541
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
542
+ in_channels, height, width)`.
543
+
544
+ """
545
+
546
+ def __init__(
547
+ self,
548
+ in_channels: int,
549
+ temb_channels: int,
550
+ dropout: float = 0.0,
551
+ num_layers: int = 1,
552
+ resnet_eps: float = 1e-6,
553
+ resnet_time_scale_shift: str = "default", # default, spatial
554
+ resnet_act_fn: str = "swish",
555
+ resnet_groups: int = 32,
556
+ attn_groups: Optional[int] = None,
557
+ resnet_pre_norm: bool = True,
558
+ add_attention: bool = True,
559
+ attention_head_dim: int = 1,
560
+ output_scale_factor: float = 1.0,
561
+ ):
562
+ super().__init__()
563
+ resnet_groups = (
564
+ resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
565
+ )
566
+ self.add_attention = add_attention
567
+
568
+ if attn_groups is None:
569
+ attn_groups = (
570
+ resnet_groups if resnet_time_scale_shift == "default" else None
571
+ )
572
+
573
+ # there is always at least one resnet
574
+ resnets = [
575
+ ResnetBlock2D(
576
+ in_channels=in_channels,
577
+ out_channels=in_channels,
578
+ temb_channels=temb_channels,
579
+ eps=resnet_eps,
580
+ groups=resnet_groups,
581
+ dropout=dropout,
582
+ time_embedding_norm=resnet_time_scale_shift,
583
+ non_linearity=resnet_act_fn,
584
+ output_scale_factor=output_scale_factor,
585
+ pre_norm=resnet_pre_norm,
586
+ )
587
+ ]
588
+ attentions = []
589
+
590
+ if attention_head_dim is None:
591
+ logger.warning(
592
+ f"It is not recommended to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}."
593
+ )
594
+ attention_head_dim = in_channels
595
+
596
+ for _ in range(num_layers):
597
+ if self.add_attention:
598
+ attentions.append(
599
+ Attention(
600
+ in_channels,
601
+ heads=in_channels // attention_head_dim,
602
+ dim_head=attention_head_dim,
603
+ rescale_output_factor=output_scale_factor,
604
+ eps=resnet_eps,
605
+ norm_num_groups=attn_groups,
606
+ spatial_norm_dim=temb_channels
607
+ if resnet_time_scale_shift == "spatial"
608
+ else None,
609
+ residual_connection=True,
610
+ bias=True,
611
+ upcast_softmax=True,
612
+ _from_deprecated_attn_block=True,
613
+ )
614
+ )
615
+ else:
616
+ attentions.append(None)
617
+
618
+ resnets.append(
619
+ ResnetBlock2D(
620
+ in_channels=in_channels,
621
+ out_channels=in_channels,
622
+ temb_channels=temb_channels,
623
+ eps=resnet_eps,
624
+ groups=resnet_groups,
625
+ dropout=dropout,
626
+ time_embedding_norm=resnet_time_scale_shift,
627
+ non_linearity=resnet_act_fn,
628
+ output_scale_factor=output_scale_factor,
629
+ pre_norm=resnet_pre_norm,
630
+ )
631
+ )
632
+
633
+ self.attentions = nn.ModuleList(attentions)
634
+ self.resnets = nn.ModuleList(resnets)
635
+
636
+ def forward(
637
+ self,
638
+ hidden_states: torch.FloatTensor,
639
+ temb: Optional[torch.FloatTensor] = None,
640
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
641
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
642
+ ) -> torch.FloatTensor:
643
+ hidden_states = self.resnets[0](hidden_states, temb)
644
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
645
+ if attn is not None:
646
+ hidden_states = attn(
647
+ hidden_states,
648
+ temb=temb,
649
+ self_attn_block_embs=self_attn_block_embs,
650
+ self_attn_block_embs_mode=self_attn_block_embs_mode,
651
+ )
652
+ hidden_states = resnet(hidden_states, temb)
653
+
654
+ return hidden_states
655
+
656
+
657
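A minimal usage sketch for `UNetMidBlock2D` as defined above (illustrative only, not part of this commit): it assumes the package is importable as `musev`, matching this repo's `musev/models/unet_2d_blocks.py`, and the channel sizes are arbitrary choices that satisfy the group-norm (divisible by 32) and attention-head-dim constraints.

    import torch
    from musev.models.unet_2d_blocks import UNetMidBlock2D  # assumed import path

    mid_block = UNetMidBlock2D(
        in_channels=64,        # divisible by the default resnet_groups (32)
        temb_channels=128,
        attention_head_dim=8,  # 64 // 8 = 8 attention heads
    )
    sample = torch.randn(2, 64, 32, 32)  # (batch, channels, height, width)
    temb = torch.randn(2, 128)           # time embedding
    out = mid_block(sample, temb)        # same shape as `sample`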
+ class UNetMidBlock2DCrossAttn(nn.Module):
658
+ def __init__(
659
+ self,
660
+ in_channels: int,
661
+ temb_channels: int,
662
+ dropout: float = 0.0,
663
+ num_layers: int = 1,
664
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
665
+ resnet_eps: float = 1e-6,
666
+ resnet_time_scale_shift: str = "default",
667
+ resnet_act_fn: str = "swish",
668
+ resnet_groups: int = 32,
669
+ resnet_pre_norm: bool = True,
670
+ num_attention_heads: int = 1,
671
+ output_scale_factor: float = 1.0,
672
+ cross_attention_dim: int = 1280,
673
+ dual_cross_attention: bool = False,
674
+ use_linear_projection: bool = False,
675
+ upcast_attention: bool = False,
676
+ attention_type: str = "default",
677
+ ):
678
+ super().__init__()
679
+
680
+ self.has_cross_attention = True
681
+ self.num_attention_heads = num_attention_heads
682
+ resnet_groups = (
683
+ resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
684
+ )
685
+
686
+ # support for variable transformer layers per block
687
+ if isinstance(transformer_layers_per_block, int):
688
+ transformer_layers_per_block = [transformer_layers_per_block] * num_layers
689
+
690
+ # there is always at least one resnet
691
+ resnets = [
692
+ ResnetBlock2D(
693
+ in_channels=in_channels,
694
+ out_channels=in_channels,
695
+ temb_channels=temb_channels,
696
+ eps=resnet_eps,
697
+ groups=resnet_groups,
698
+ dropout=dropout,
699
+ time_embedding_norm=resnet_time_scale_shift,
700
+ non_linearity=resnet_act_fn,
701
+ output_scale_factor=output_scale_factor,
702
+ pre_norm=resnet_pre_norm,
703
+ )
704
+ ]
705
+ attentions = []
706
+
707
+ for i in range(num_layers):
708
+ if not dual_cross_attention:
709
+ attentions.append(
710
+ Transformer2DModel(
711
+ num_attention_heads,
712
+ in_channels // num_attention_heads,
713
+ in_channels=in_channels,
714
+ num_layers=transformer_layers_per_block[i],
715
+ cross_attention_dim=cross_attention_dim,
716
+ norm_num_groups=resnet_groups,
717
+ use_linear_projection=use_linear_projection,
718
+ upcast_attention=upcast_attention,
719
+ attention_type=attention_type,
720
+ )
721
+ )
722
+ else:
723
+ attentions.append(
724
+ DualTransformer2DModel(
725
+ num_attention_heads,
726
+ in_channels // num_attention_heads,
727
+ in_channels=in_channels,
728
+ num_layers=1,
729
+ cross_attention_dim=cross_attention_dim,
730
+ norm_num_groups=resnet_groups,
731
+ )
732
+ )
733
+ resnets.append(
734
+ ResnetBlock2D(
735
+ in_channels=in_channels,
736
+ out_channels=in_channels,
737
+ temb_channels=temb_channels,
738
+ eps=resnet_eps,
739
+ groups=resnet_groups,
740
+ dropout=dropout,
741
+ time_embedding_norm=resnet_time_scale_shift,
742
+ non_linearity=resnet_act_fn,
743
+ output_scale_factor=output_scale_factor,
744
+ pre_norm=resnet_pre_norm,
745
+ )
746
+ )
747
+
748
+ self.attentions = nn.ModuleList(attentions)
749
+ self.resnets = nn.ModuleList(resnets)
750
+
751
+ self.gradient_checkpointing = False
752
+
753
+ def forward(
754
+ self,
755
+ hidden_states: torch.FloatTensor,
756
+ temb: Optional[torch.FloatTensor] = None,
757
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
758
+ attention_mask: Optional[torch.FloatTensor] = None,
759
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
760
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
761
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
762
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
763
+ ) -> torch.FloatTensor:
764
+ lora_scale = (
765
+ cross_attention_kwargs.get("scale", 1.0)
766
+ if cross_attention_kwargs is not None
767
+ else 1.0
768
+ )
769
+ hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
770
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
771
+ if self.training and self.gradient_checkpointing:
772
+
773
+ def create_custom_forward(module, return_dict=None):
774
+ def custom_forward(*inputs):
775
+ if return_dict is not None:
776
+ return module(*inputs, return_dict=return_dict)
777
+ else:
778
+ return module(*inputs)
779
+
780
+ return custom_forward
781
+
782
+ ckpt_kwargs: Dict[str, Any] = (
783
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
784
+ )
785
+ hidden_states = attn(
786
+ hidden_states,
787
+ encoder_hidden_states=encoder_hidden_states,
788
+ cross_attention_kwargs=cross_attention_kwargs,
789
+ attention_mask=attention_mask,
790
+ encoder_attention_mask=encoder_attention_mask,
791
+ return_dict=False,
792
+ self_attn_block_embs=self_attn_block_embs,
793
+ self_attn_block_embs_mode=self_attn_block_embs_mode,
794
+ )[0]
795
+ hidden_states = torch.utils.checkpoint.checkpoint(
796
+ create_custom_forward(resnet),
797
+ hidden_states,
798
+ temb,
799
+ **ckpt_kwargs,
800
+ )
801
+ else:
802
+ hidden_states = attn(
803
+ hidden_states,
804
+ encoder_hidden_states=encoder_hidden_states,
805
+ cross_attention_kwargs=cross_attention_kwargs,
806
+ attention_mask=attention_mask,
807
+ encoder_attention_mask=encoder_attention_mask,
808
+ return_dict=False,
809
+ self_attn_block_embs=self_attn_block_embs,
810
+ )[0]
811
+ hidden_states = resnet(hidden_states, temb, scale=lora_scale)
812
+
813
+ return hidden_states
814
+
815
+
816
+ class UNetMidBlock2DSimpleCrossAttn(nn.Module):
817
+ def __init__(
818
+ self,
819
+ in_channels: int,
820
+ temb_channels: int,
821
+ dropout: float = 0.0,
822
+ num_layers: int = 1,
823
+ resnet_eps: float = 1e-6,
824
+ resnet_time_scale_shift: str = "default",
825
+ resnet_act_fn: str = "swish",
826
+ resnet_groups: int = 32,
827
+ resnet_pre_norm: bool = True,
828
+ attention_head_dim: int = 1,
829
+ output_scale_factor: float = 1.0,
830
+ cross_attention_dim: int = 1280,
831
+ skip_time_act: bool = False,
832
+ only_cross_attention: bool = False,
833
+ cross_attention_norm: Optional[str] = None,
834
+ ):
835
+ super().__init__()
836
+
837
+ self.has_cross_attention = True
838
+
839
+ self.attention_head_dim = attention_head_dim
840
+ resnet_groups = (
841
+ resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
842
+ )
843
+
844
+ self.num_heads = in_channels // self.attention_head_dim
845
+
846
+ # there is always at least one resnet
847
+ resnets = [
848
+ ResnetBlock2D(
849
+ in_channels=in_channels,
850
+ out_channels=in_channels,
851
+ temb_channels=temb_channels,
852
+ eps=resnet_eps,
853
+ groups=resnet_groups,
854
+ dropout=dropout,
855
+ time_embedding_norm=resnet_time_scale_shift,
856
+ non_linearity=resnet_act_fn,
857
+ output_scale_factor=output_scale_factor,
858
+ pre_norm=resnet_pre_norm,
859
+ skip_time_act=skip_time_act,
860
+ )
861
+ ]
862
+ attentions = []
863
+
864
+ for _ in range(num_layers):
865
+ processor = (
866
+ AttnAddedKVProcessor2_0()
867
+ if hasattr(F, "scaled_dot_product_attention")
868
+ else AttnAddedKVProcessor()
869
+ )
870
+
871
+ attentions.append(
872
+ Attention(
873
+ query_dim=in_channels,
874
+ cross_attention_dim=in_channels,
875
+ heads=self.num_heads,
876
+ dim_head=self.attention_head_dim,
877
+ added_kv_proj_dim=cross_attention_dim,
878
+ norm_num_groups=resnet_groups,
879
+ bias=True,
880
+ upcast_softmax=True,
881
+ only_cross_attention=only_cross_attention,
882
+ cross_attention_norm=cross_attention_norm,
883
+ processor=processor,
884
+ )
885
+ )
886
+ resnets.append(
887
+ ResnetBlock2D(
888
+ in_channels=in_channels,
889
+ out_channels=in_channels,
890
+ temb_channels=temb_channels,
891
+ eps=resnet_eps,
892
+ groups=resnet_groups,
893
+ dropout=dropout,
894
+ time_embedding_norm=resnet_time_scale_shift,
895
+ non_linearity=resnet_act_fn,
896
+ output_scale_factor=output_scale_factor,
897
+ pre_norm=resnet_pre_norm,
898
+ skip_time_act=skip_time_act,
899
+ )
900
+ )
901
+
902
+ self.attentions = nn.ModuleList(attentions)
903
+ self.resnets = nn.ModuleList(resnets)
904
+
905
+ def forward(
906
+ self,
907
+ hidden_states: torch.FloatTensor,
908
+ temb: Optional[torch.FloatTensor] = None,
909
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
910
+ attention_mask: Optional[torch.FloatTensor] = None,
911
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
912
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
913
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
914
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
915
+ ) -> torch.FloatTensor:
916
+ cross_attention_kwargs = (
917
+ cross_attention_kwargs if cross_attention_kwargs is not None else {}
918
+ )
919
+ lora_scale = cross_attention_kwargs.get("scale", 1.0)
920
+
921
+ if attention_mask is None:
922
+ # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
923
+ mask = None if encoder_hidden_states is None else encoder_attention_mask
924
+ else:
925
+ # when attention_mask is defined: we don't even check for encoder_attention_mask.
926
+ # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks.
927
+ # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask.
928
+ # then we can simplify this whole if/else block to:
929
+ # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
930
+ mask = attention_mask
931
+
932
+ hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
933
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
934
+ # attn
935
+ hidden_states = attn(
936
+ hidden_states,
937
+ encoder_hidden_states=encoder_hidden_states,
938
+ attention_mask=mask,
939
+ **cross_attention_kwargs,
940
+ self_attn_block_embs=self_attn_block_embs,
941
+ self_attn_block_embs_mode=self_attn_block_embs_mode,
942
+ )
943
+
944
+ # resnet
945
+ hidden_states = resnet(hidden_states, temb, scale=lora_scale)
946
+
947
+ return hidden_states
948
+
949
+
950
+ class CrossAttnDownBlock2D(nn.Module):
951
+ print_idx = 0
952
+
953
+ def __init__(
954
+ self,
955
+ in_channels: int,
956
+ out_channels: int,
957
+ temb_channels: int,
958
+ dropout: float = 0.0,
959
+ num_layers: int = 1,
960
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
961
+ resnet_eps: float = 1e-6,
962
+ resnet_time_scale_shift: str = "default",
963
+ resnet_act_fn: str = "swish",
964
+ resnet_groups: int = 32,
965
+ resnet_pre_norm: bool = True,
966
+ num_attention_heads: int = 1,
967
+ cross_attention_dim: int = 1280,
968
+ output_scale_factor: float = 1.0,
969
+ downsample_padding: int = 1,
970
+ add_downsample: bool = True,
971
+ dual_cross_attention: bool = False,
972
+ use_linear_projection: bool = False,
973
+ only_cross_attention: bool = False,
974
+ upcast_attention: bool = False,
975
+ attention_type: str = "default",
976
+ ):
977
+ super().__init__()
978
+ resnets = []
979
+ attentions = []
980
+
981
+ self.has_cross_attention = True
982
+ self.num_attention_heads = num_attention_heads
983
+ if isinstance(transformer_layers_per_block, int):
984
+ transformer_layers_per_block = [transformer_layers_per_block] * num_layers
985
+
986
+ for i in range(num_layers):
987
+ in_channels = in_channels if i == 0 else out_channels
988
+ resnets.append(
989
+ ResnetBlock2D(
990
+ in_channels=in_channels,
991
+ out_channels=out_channels,
992
+ temb_channels=temb_channels,
993
+ eps=resnet_eps,
994
+ groups=resnet_groups,
995
+ dropout=dropout,
996
+ time_embedding_norm=resnet_time_scale_shift,
997
+ non_linearity=resnet_act_fn,
998
+ output_scale_factor=output_scale_factor,
999
+ pre_norm=resnet_pre_norm,
1000
+ )
1001
+ )
1002
+ if not dual_cross_attention:
1003
+ attentions.append(
1004
+ Transformer2DModel(
1005
+ num_attention_heads,
1006
+ out_channels // num_attention_heads,
1007
+ in_channels=out_channels,
1008
+ num_layers=transformer_layers_per_block[i],
1009
+ cross_attention_dim=cross_attention_dim,
1010
+ norm_num_groups=resnet_groups,
1011
+ use_linear_projection=use_linear_projection,
1012
+ only_cross_attention=only_cross_attention,
1013
+ upcast_attention=upcast_attention,
1014
+ attention_type=attention_type,
1015
+ )
1016
+ )
1017
+ else:
1018
+ attentions.append(
1019
+ DualTransformer2DModel(
1020
+ num_attention_heads,
1021
+ out_channels // num_attention_heads,
1022
+ in_channels=out_channels,
1023
+ num_layers=1,
1024
+ cross_attention_dim=cross_attention_dim,
1025
+ norm_num_groups=resnet_groups,
1026
+ )
1027
+ )
1028
+ self.attentions = nn.ModuleList(attentions)
1029
+ self.resnets = nn.ModuleList(resnets)
1030
+
1031
+ if add_downsample:
1032
+ self.downsamplers = nn.ModuleList(
1033
+ [
1034
+ Downsample2D(
1035
+ out_channels,
1036
+ use_conv=True,
1037
+ out_channels=out_channels,
1038
+ padding=downsample_padding,
1039
+ name="op",
1040
+ )
1041
+ ]
1042
+ )
1043
+ else:
1044
+ self.downsamplers = None
1045
+
1046
+ self.gradient_checkpointing = False
1047
+
1048
+ def forward(
1049
+ self,
1050
+ hidden_states: torch.FloatTensor,
1051
+ temb: Optional[torch.FloatTensor] = None,
1052
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1053
+ attention_mask: Optional[torch.FloatTensor] = None,
1054
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1055
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1056
+ additional_residuals: Optional[torch.FloatTensor] = None,
1057
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
1058
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
1059
+ ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
1060
+ output_states = ()
1061
+
1062
+ lora_scale = (
1063
+ cross_attention_kwargs.get("scale", 1.0)
1064
+ if cross_attention_kwargs is not None
1065
+ else 1.0
1066
+ )
1067
+
1068
+ blocks = list(zip(self.resnets, self.attentions))
1069
+
1070
+ for i, (resnet, attn) in enumerate(blocks):
1071
+ if self.training and self.gradient_checkpointing:
1072
+
1073
+ def create_custom_forward(module, return_dict=None):
1074
+ def custom_forward(*inputs):
1075
+ if return_dict is not None:
1076
+ return module(*inputs, return_dict=return_dict)
1077
+ else:
1078
+ return module(*inputs)
1079
+
1080
+ return custom_forward
1081
+
1082
+ ckpt_kwargs: Dict[str, Any] = (
1083
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1084
+ )
1085
+ hidden_states = torch.utils.checkpoint.checkpoint(
1086
+ create_custom_forward(resnet),
1087
+ hidden_states,
1088
+ temb,
1089
+ **ckpt_kwargs,
1090
+ )
1091
+ if self.print_idx == 0:
1092
+ logger.debug(f"unet3d after resnet {hidden_states.mean()}")
1093
+
1094
+ hidden_states = attn(
1095
+ hidden_states,
1096
+ encoder_hidden_states=encoder_hidden_states,
1097
+ cross_attention_kwargs=cross_attention_kwargs,
1098
+ attention_mask=attention_mask,
1099
+ encoder_attention_mask=encoder_attention_mask,
1100
+ return_dict=False,
1101
+ self_attn_block_embs=self_attn_block_embs,
1102
+ self_attn_block_embs_mode=self_attn_block_embs_mode,
1103
+ )[0]
1104
+ else:
1105
+ hidden_states = resnet(hidden_states, temb, scale=lora_scale)
1106
+ if self.print_idx == 0:
1107
+ logger.debug(f"unet3d after resnet {hidden_states.mean()}")
1108
+ hidden_states = attn(
1109
+ hidden_states,
1110
+ encoder_hidden_states=encoder_hidden_states,
1111
+ cross_attention_kwargs=cross_attention_kwargs,
1112
+ attention_mask=attention_mask,
1113
+ encoder_attention_mask=encoder_attention_mask,
1114
+ return_dict=False,
1115
+ self_attn_block_embs=self_attn_block_embs,
1116
+ self_attn_block_embs_mode=self_attn_block_embs_mode,
1117
+ )[0]
1118
+
1119
+ # apply additional residuals to the output of the last pair of resnet and attention blocks
1120
+ if i == len(blocks) - 1 and additional_residuals is not None:
1121
+ hidden_states = hidden_states + additional_residuals
1122
+
1123
+ output_states = output_states + (hidden_states,)
1124
+
1125
+ if self.downsamplers is not None:
1126
+ for downsampler in self.downsamplers:
1127
+ hidden_states = downsampler(hidden_states, scale=lora_scale)
1128
+
1129
+ output_states = output_states + (hidden_states,)
1130
+
1131
+ self.print_idx += 1
1132
+ return hidden_states, output_states
1133
+
1134
+
1135
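A sketch of how `CrossAttnDownBlock2D` above is typically consumed (illustrative only): it returns the downsampled features together with the per-layer residual states that the up blocks later reuse as skip connections. Channel counts and the text-embedding shape are assumptions.

    import torch
    from musev.models.unet_2d_blocks import CrossAttnDownBlock2D  # assumed import path

    down_block = CrossAttnDownBlock2D(
        in_channels=320,
        out_channels=640,
        temb_channels=1280,
        num_layers=2,
        num_attention_heads=8,
        cross_attention_dim=768,
    )
    sample = torch.randn(1, 320, 32, 32)
    temb = torch.randn(1, 1280)
    text_emb = torch.randn(1, 77, 768)

    hidden, skips = down_block(sample, temb, encoder_hidden_states=text_emb)
    # `skips` holds one tensor per resnet/attention pair plus the downsampled
    # output (3 tensors here); the up blocks consume them in reverse order.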
+ class DownBlock2D(nn.Module):
1136
+ def __init__(
1137
+ self,
1138
+ in_channels: int,
1139
+ out_channels: int,
1140
+ temb_channels: int,
1141
+ dropout: float = 0.0,
1142
+ num_layers: int = 1,
1143
+ resnet_eps: float = 1e-6,
1144
+ resnet_time_scale_shift: str = "default",
1145
+ resnet_act_fn: str = "swish",
1146
+ resnet_groups: int = 32,
1147
+ resnet_pre_norm: bool = True,
1148
+ output_scale_factor: float = 1.0,
1149
+ add_downsample: bool = True,
1150
+ downsample_padding: int = 1,
1151
+ ):
1152
+ super().__init__()
1153
+ resnets = []
1154
+
1155
+ for i in range(num_layers):
1156
+ in_channels = in_channels if i == 0 else out_channels
1157
+ resnets.append(
1158
+ ResnetBlock2D(
1159
+ in_channels=in_channels,
1160
+ out_channels=out_channels,
1161
+ temb_channels=temb_channels,
1162
+ eps=resnet_eps,
1163
+ groups=resnet_groups,
1164
+ dropout=dropout,
1165
+ time_embedding_norm=resnet_time_scale_shift,
1166
+ non_linearity=resnet_act_fn,
1167
+ output_scale_factor=output_scale_factor,
1168
+ pre_norm=resnet_pre_norm,
1169
+ )
1170
+ )
1171
+
1172
+ self.resnets = nn.ModuleList(resnets)
1173
+
1174
+ if add_downsample:
1175
+ self.downsamplers = nn.ModuleList(
1176
+ [
1177
+ Downsample2D(
1178
+ out_channels,
1179
+ use_conv=True,
1180
+ out_channels=out_channels,
1181
+ padding=downsample_padding,
1182
+ name="op",
1183
+ )
1184
+ ]
1185
+ )
1186
+ else:
1187
+ self.downsamplers = None
1188
+
1189
+ self.gradient_checkpointing = False
1190
+
1191
+ def forward(
1192
+ self,
1193
+ hidden_states: torch.FloatTensor,
1194
+ temb: Optional[torch.FloatTensor] = None,
1195
+ scale: float = 1.0,
1196
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
1197
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
1198
+ ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]:
1199
+ output_states = ()
1200
+
1201
+ for resnet in self.resnets:
1202
+ if self.training and self.gradient_checkpointing:
1203
+
1204
+ def create_custom_forward(module):
1205
+ def custom_forward(*inputs):
1206
+ return module(*inputs)
1207
+
1208
+ return custom_forward
1209
+
1210
+ if is_torch_version(">=", "1.11.0"):
1211
+ hidden_states = torch.utils.checkpoint.checkpoint(
1212
+ create_custom_forward(resnet),
1213
+ hidden_states,
1214
+ temb,
1215
+ use_reentrant=False,
1216
+ )
1217
+ else:
1218
+ hidden_states = torch.utils.checkpoint.checkpoint(
1219
+ create_custom_forward(resnet), hidden_states, temb
1220
+ )
1221
+ else:
1222
+ hidden_states = resnet(hidden_states, temb, scale=scale)
1223
+
1224
+ output_states = output_states + (hidden_states,)
1225
+
1226
+ if self.downsamplers is not None:
1227
+ for downsampler in self.downsamplers:
1228
+ hidden_states = downsampler(hidden_states, scale=scale)
1229
+
1230
+ output_states = output_states + (hidden_states,)
1231
+
1232
+ return hidden_states, output_states
1233
+
1234
+
1235
+ class CrossAttnUpBlock2D(nn.Module):
1236
+ def __init__(
1237
+ self,
1238
+ in_channels: int,
1239
+ out_channels: int,
1240
+ prev_output_channel: int,
1241
+ temb_channels: int,
1242
+ resolution_idx: Optional[int] = None,
1243
+ dropout: float = 0.0,
1244
+ num_layers: int = 1,
1245
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
1246
+ resnet_eps: float = 1e-6,
1247
+ resnet_time_scale_shift: str = "default",
1248
+ resnet_act_fn: str = "swish",
1249
+ resnet_groups: int = 32,
1250
+ resnet_pre_norm: bool = True,
1251
+ num_attention_heads: int = 1,
1252
+ cross_attention_dim: int = 1280,
1253
+ output_scale_factor: float = 1.0,
1254
+ add_upsample: bool = True,
1255
+ dual_cross_attention: bool = False,
1256
+ use_linear_projection: bool = False,
1257
+ only_cross_attention: bool = False,
1258
+ upcast_attention: bool = False,
1259
+ attention_type: str = "default",
1260
+ ):
1261
+ super().__init__()
1262
+ resnets = []
1263
+ attentions = []
1264
+
1265
+ self.has_cross_attention = True
1266
+ self.num_attention_heads = num_attention_heads
1267
+
1268
+ if isinstance(transformer_layers_per_block, int):
1269
+ transformer_layers_per_block = [transformer_layers_per_block] * num_layers
1270
+
1271
+ for i in range(num_layers):
1272
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1273
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1274
+
1275
+ resnets.append(
1276
+ ResnetBlock2D(
1277
+ in_channels=resnet_in_channels + res_skip_channels,
1278
+ out_channels=out_channels,
1279
+ temb_channels=temb_channels,
1280
+ eps=resnet_eps,
1281
+ groups=resnet_groups,
1282
+ dropout=dropout,
1283
+ time_embedding_norm=resnet_time_scale_shift,
1284
+ non_linearity=resnet_act_fn,
1285
+ output_scale_factor=output_scale_factor,
1286
+ pre_norm=resnet_pre_norm,
1287
+ )
1288
+ )
1289
+ if not dual_cross_attention:
1290
+ attentions.append(
1291
+ Transformer2DModel(
1292
+ num_attention_heads,
1293
+ out_channels // num_attention_heads,
1294
+ in_channels=out_channels,
1295
+ num_layers=transformer_layers_per_block[i],
1296
+ cross_attention_dim=cross_attention_dim,
1297
+ norm_num_groups=resnet_groups,
1298
+ use_linear_projection=use_linear_projection,
1299
+ only_cross_attention=only_cross_attention,
1300
+ upcast_attention=upcast_attention,
1301
+ attention_type=attention_type,
1302
+ )
1303
+ )
1304
+ else:
1305
+ attentions.append(
1306
+ DualTransformer2DModel(
1307
+ num_attention_heads,
1308
+ out_channels // num_attention_heads,
1309
+ in_channels=out_channels,
1310
+ num_layers=1,
1311
+ cross_attention_dim=cross_attention_dim,
1312
+ norm_num_groups=resnet_groups,
1313
+ )
1314
+ )
1315
+ self.attentions = nn.ModuleList(attentions)
1316
+ self.resnets = nn.ModuleList(resnets)
1317
+
1318
+ if add_upsample:
1319
+ self.upsamplers = nn.ModuleList(
1320
+ [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
1321
+ )
1322
+ else:
1323
+ self.upsamplers = None
1324
+
1325
+ self.gradient_checkpointing = False
1326
+ self.resolution_idx = resolution_idx
1327
+
1328
+ def forward(
1329
+ self,
1330
+ hidden_states: torch.FloatTensor,
1331
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
1332
+ temb: Optional[torch.FloatTensor] = None,
1333
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1334
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1335
+ upsample_size: Optional[int] = None,
1336
+ attention_mask: Optional[torch.FloatTensor] = None,
1337
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1338
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
1339
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
1340
+ ) -> torch.FloatTensor:
1341
+ lora_scale = (
1342
+ cross_attention_kwargs.get("scale", 1.0)
1343
+ if cross_attention_kwargs is not None
1344
+ else 1.0
1345
+ )
1346
+ is_freeu_enabled = (
1347
+ getattr(self, "s1", None)
1348
+ and getattr(self, "s2", None)
1349
+ and getattr(self, "b1", None)
1350
+ and getattr(self, "b2", None)
1351
+ )
1352
+
1353
+ for resnet, attn in zip(self.resnets, self.attentions):
1354
+ # pop res hidden states
1355
+ res_hidden_states = res_hidden_states_tuple[-1]
1356
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1357
+
1358
+ # FreeU: Only operate on the first two stages
1359
+ if is_freeu_enabled:
1360
+ hidden_states, res_hidden_states = apply_freeu(
1361
+ self.resolution_idx,
1362
+ hidden_states,
1363
+ res_hidden_states,
1364
+ s1=self.s1,
1365
+ s2=self.s2,
1366
+ b1=self.b1,
1367
+ b2=self.b2,
1368
+ )
1369
+
1370
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1371
+
1372
+ if self.training and self.gradient_checkpointing:
1373
+
1374
+ def create_custom_forward(module, return_dict=None):
1375
+ def custom_forward(*inputs):
1376
+ if return_dict is not None:
1377
+ return module(*inputs, return_dict=return_dict)
1378
+ else:
1379
+ return module(*inputs)
1380
+
1381
+ return custom_forward
1382
+
1383
+ ckpt_kwargs: Dict[str, Any] = (
1384
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
1385
+ )
1386
+ hidden_states = torch.utils.checkpoint.checkpoint(
1387
+ create_custom_forward(resnet),
1388
+ hidden_states,
1389
+ temb,
1390
+ **ckpt_kwargs,
1391
+ )
1392
+ hidden_states = attn(
1393
+ hidden_states,
1394
+ encoder_hidden_states=encoder_hidden_states,
1395
+ cross_attention_kwargs=cross_attention_kwargs,
1396
+ attention_mask=attention_mask,
1397
+ encoder_attention_mask=encoder_attention_mask,
1398
+ return_dict=False,
1399
+ self_attn_block_embs=self_attn_block_embs,
1400
+ self_attn_block_embs_mode=self_attn_block_embs_mode,
1401
+ )[0]
1402
+ else:
1403
+ hidden_states = resnet(hidden_states, temb, scale=lora_scale)
1404
+ hidden_states = attn(
1405
+ hidden_states,
1406
+ encoder_hidden_states=encoder_hidden_states,
1407
+ cross_attention_kwargs=cross_attention_kwargs,
1408
+ attention_mask=attention_mask,
1409
+ encoder_attention_mask=encoder_attention_mask,
1410
+ return_dict=False,
1411
+ self_attn_block_embs=self_attn_block_embs,
1412
+ )[0]
1413
+
1414
+ if self.upsamplers is not None:
1415
+ for upsampler in self.upsamplers:
1416
+ hidden_states = upsampler(
1417
+ hidden_states, upsample_size, scale=lora_scale
1418
+ )
1419
+
1420
+ return hidden_states
1421
+
1422
+
1423
+ class UpBlock2D(nn.Module):
1424
+ def __init__(
1425
+ self,
1426
+ in_channels: int,
1427
+ prev_output_channel: int,
1428
+ out_channels: int,
1429
+ temb_channels: int,
1430
+ resolution_idx: Optional[int] = None,
1431
+ dropout: float = 0.0,
1432
+ num_layers: int = 1,
1433
+ resnet_eps: float = 1e-6,
1434
+ resnet_time_scale_shift: str = "default",
1435
+ resnet_act_fn: str = "swish",
1436
+ resnet_groups: int = 32,
1437
+ resnet_pre_norm: bool = True,
1438
+ output_scale_factor: float = 1.0,
1439
+ add_upsample: bool = True,
1440
+ ):
1441
+ super().__init__()
1442
+ resnets = []
1443
+
1444
+ for i in range(num_layers):
1445
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1446
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1447
+
1448
+ resnets.append(
1449
+ ResnetBlock2D(
1450
+ in_channels=resnet_in_channels + res_skip_channels,
1451
+ out_channels=out_channels,
1452
+ temb_channels=temb_channels,
1453
+ eps=resnet_eps,
1454
+ groups=resnet_groups,
1455
+ dropout=dropout,
1456
+ time_embedding_norm=resnet_time_scale_shift,
1457
+ non_linearity=resnet_act_fn,
1458
+ output_scale_factor=output_scale_factor,
1459
+ pre_norm=resnet_pre_norm,
1460
+ )
1461
+ )
1462
+
1463
+ self.resnets = nn.ModuleList(resnets)
1464
+
1465
+ if add_upsample:
1466
+ self.upsamplers = nn.ModuleList(
1467
+ [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]
1468
+ )
1469
+ else:
1470
+ self.upsamplers = None
1471
+
1472
+ self.gradient_checkpointing = False
1473
+ self.resolution_idx = resolution_idx
1474
+
1475
+ def forward(
1476
+ self,
1477
+ hidden_states: torch.FloatTensor,
1478
+ res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
1479
+ temb: Optional[torch.FloatTensor] = None,
1480
+ upsample_size: Optional[int] = None,
1481
+ scale: float = 1.0,
1482
+ self_attn_block_embs: Optional[List[torch.Tensor]] = None,
1483
+ self_attn_block_embs_mode: Literal["read", "write"] = "write",
1484
+ ) -> torch.FloatTensor:
1485
+ is_freeu_enabled = (
1486
+ getattr(self, "s1", None)
1487
+ and getattr(self, "s2", None)
1488
+ and getattr(self, "b1", None)
1489
+ and getattr(self, "b2", None)
1490
+ )
1491
+
1492
+ for resnet in self.resnets:
1493
+ # pop res hidden states
1494
+ res_hidden_states = res_hidden_states_tuple[-1]
1495
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1496
+
1497
+ # FreeU: Only operate on the first two stages
1498
+ if is_freeu_enabled:
1499
+ hidden_states, res_hidden_states = apply_freeu(
1500
+ self.resolution_idx,
1501
+ hidden_states,
1502
+ res_hidden_states,
1503
+ s1=self.s1,
1504
+ s2=self.s2,
1505
+ b1=self.b1,
1506
+ b2=self.b2,
1507
+ )
1508
+
1509
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1510
+
1511
+ if self.training and self.gradient_checkpointing:
1512
+
1513
+ def create_custom_forward(module):
1514
+ def custom_forward(*inputs):
1515
+ return module(*inputs)
1516
+
1517
+ return custom_forward
1518
+
1519
+ if is_torch_version(">=", "1.11.0"):
1520
+ hidden_states = torch.utils.checkpoint.checkpoint(
1521
+ create_custom_forward(resnet),
1522
+ hidden_states,
1523
+ temb,
1524
+ use_reentrant=False,
1525
+ )
1526
+ else:
1527
+ hidden_states = torch.utils.checkpoint.checkpoint(
1528
+ create_custom_forward(resnet), hidden_states, temb
1529
+ )
1530
+ else:
1531
+ hidden_states = resnet(hidden_states, temb, scale=scale)
1532
+
1533
+ if self.upsamplers is not None:
1534
+ for upsampler in self.upsamplers:
1535
+ hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
1536
+
1537
+ return hidden_states
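To close the loop, a sketch of how `UpBlock2D` above consumes the residual tuple gathered on the way down (illustrative only): skips are popped from the end of the tuple, so the most recently produced residual is concatenated first. All sizes are assumptions chosen to satisfy the channel math of the two resnets.

    import torch
    from musev.models.unet_2d_blocks import UpBlock2D  # assumed import path

    up_block = UpBlock2D(
        in_channels=320,          # channels of the matching down-block input
        prev_output_channel=640,  # channels arriving from the block below
        out_channels=640,
        temb_channels=1280,
        num_layers=2,
    )
    hidden = torch.randn(1, 640, 16, 16)
    temb = torch.randn(1, 1280)
    # The last element is used first: the 640-ch skip feeds the first resnet
    # (640 + 640 in), then the 320-ch skip feeds the second (640 + 320 in).
    skips = (torch.randn(1, 320, 16, 16), torch.randn(1, 640, 16, 16))
    out = up_block(hidden, skips, temb)  # upsampled to (1, 640, 32, 32)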