diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..33ebd88872c415c78009505c9e0bc7af74b76deb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,47 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/1593596b99e2dde9.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/15b93cbe9fc5220d.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/2b4f6fdcabf53d59.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/2bd7cee1fa9c8996.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/2bff9ec89ca982c9.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/407eefe8017f6070.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/4118895a33890c5a.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/56f35362431e18dd.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/57132c374cc1ce2d.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/6b1091f5eb05783e.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/6b6d20c6a46b9fe9.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/80d21c1f8300db84.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/8135673a5a3e3d17.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/9696c0d0a01d2fd4.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/ab3d616a3d001515.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/c0bdeae8f2b84b7f.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/camera_D.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/camera_I.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/camera_L.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/camera_O.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/camera_R.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/camera_U.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/d5cddd204a805bad.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/d6849f92207aa171.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/ebf9eb32e850ea81.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/ec5c53a3d68fe3e7.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/ec8ee53e2d07e6ba.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/ffa95c3b40609c76.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/Pan_Left_90.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/Pan_Right_90.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/Spin_AntiClockwise_90.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/Spin_Clockwise_90.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/Tilt_Down_90.png filter=lfs diff=lfs merge=lfs -text +assets/cam_trajectory/Tilt_Up_90.png filter=lfs diff=lfs merge=lfs -text +assets/example_image/input1.png filter=lfs diff=lfs merge=lfs -text +assets/example_image/input2.png filter=lfs diff=lfs merge=lfs -text +assets/pages/res1.mp4 filter=lfs diff=lfs merge=lfs -text +assets/pages/res2.mp4 filter=lfs diff=lfs merge=lfs -text +assets/pages/res3.mp4 filter=lfs diff=lfs merge=lfs -text +assets/pages/teaser.png filter=lfs diff=lfs merge=lfs -text +results/generated_videos/A_chef_in_a_white_coat_and_gla_1593596b99e2dde9.txt.mp4 filter=lfs diff=lfs merge=lfs -text 
+results/generated_videos/A_stunning_and_untouched_coast_6b6d20c6a46b9fe9.txt.mp4 filter=lfs diff=lfs merge=lfs -text +tools/caption/assests/CogVLM2-Caption-example.png filter=lfs diff=lfs merge=lfs -text +tools/caption/assests/cogvlm2-video-example.png filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f98e413cf55b56dbabdf693559055d79800f63c9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2024 CogVideo Model Team @ Zhipu AI + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 1e833686d7588c830d2d502d21cccf7f610691df..8bdd7daee1db0a07db0978fb39eca748763092de 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,142 @@ ---- -title: FloVD -emoji: 👀 -colorFrom: red -colorTo: blue -sdk: gradio -sdk_version: 5.35.0 -app_file: app.py -pinned: false -license: mit ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# FloVD: Optical Flow Meets Video Diffusion Model for Enhanced Camera-Controlled Video Synthesis (CogVideoX-based FloVD)
+ +![Teaser image 1](./assets/pages/teaser.png) + +[\[Project Page\]](https://jinwonjoon.github.io/flovd_site/) +[\[arXiv\]](https://arxiv.org/abs/2502.08244/) + +**FloVD: Optical Flow Meets Video Diffusion Model for Enhanced Camera-Controlled Video Synthesis**
+Wonjoon Jin, Qi Dai, Chong Luo, Seung-Hwan Baek, Sunghyun Cho
+POSTECH, Microsoft Research Asia +
+ +## Gallery + +### FloVD-CogVideoX-5B + +<!-- gallery video table -->
+ +## Project Updates + +- **News**: ```2025/05/02```: We have updated the code for `FloVD-CogVideoX`. We will release the dataset preprocessing and training code soon. + +- **News**: ```2025/02/26```: Our paper has been accepted to CVPR 2025. + + +## Quick Start + +### Prompt Optimization + +As mentioned in [CogVideoX](https://github.com/THUDM/CogVideo), we recommend using long, detailed text prompts for better results. Our FloVD-CogVideoX model is trained with text captions extracted by [CogVLM2](https://github.com/THUDM/CogVLM2). + +### Environment + +**Please make sure your Python version is between 3.10 and 3.12 (inclusive).** + +``` +pip install -r requirements.txt +``` + +### Optical flow normalization +As described in the FloVD paper, we normalize optical flow following [Generative Image Dynamics](https://generative-dynamics.github.io/). For this, we use scale factors (s_x, s_y) = (60, 36) for both FVSM and OMSM (a normalization sketch is given at the end of this README). + +### Pre-trained checkpoints +Download the FloVD-CogVideoX
+FVSM and OMSM (Curated)
+[\[Google Drive\]](https://drive.google.com/drive/folders/1Y7Fha8QKX6bg_0YEOxQf0M6uaPJ9SfgB?usp=sharing) +In addition, we use an off-the-shelf depth estimation model (Depth Anything V2, metric depth). +For this model, please refer to the link below.
+[\[Depth_anything_v2_metric\]](https://github.com/DepthAnything/Depth-Anything-V2/tree/main/metric_depth) +
+Then, place these checkpoints in the ./ckpt directory: +```shell +# File tree +./ckpt/ +├── FVSM +│ ├ FloVD_FVSM_Controlnet.pt +├── OMSM +│ ├ selected_blocks.safetensors +│ ├ pytorch_lora_weights.safetensors +├── others +│ ├ depth_anything_v2_metric_hypersim_vitb.pth +``` + +### Pre-defined camera trajectory +We provide several example camera trajectories for quick inference. +Refer to "./assets/cam_trajectory/" for a visualization of each camera trajectory (a reader sketch for the pose-file format is given at the end of this README). +```shell +# File tree +./assets/ +├── manual_poses +│ ├ ... +├── re10k_poses +│ ├ ... +├── manual_poses_PanTiltSpin +│ ├ ... +``` + +### Inference Settings +At inference time, we recommend using the same settings as in training: ++ The number of frames: 49 + ++ FPS: 16 + ++ Flow scale factor: (s_x, s_y) = (60, 36) + ++ CONTROLNET_GUIDANCE_END: 0.4 for better camera controllability, 0.1 for more natural object motion. This argument is the fraction of denoising timesteps during which ControlNet features are injected into the pre-trained model. + + +### Inference + ++ [flovd_demo](inference/flovd_demo.py): Use this to synthesize videos with the desired camera trajectory and natural object motion. For a more detailed explanation of the inference code, including the meaning of common parameters, refer to [flovd_demo_script](inference/inference_scripts/flovd_demo.sh). + ++ [flovd_fvsm_demo](inference/flovd_fvsm_demo.py): Use this to run the FVSM model alone for more accurate camera control with little object motion; it omits OMSM entirely. (The script will be released soon.) + ++ [flovd_ddp_demo](inference/flovd_ddp_demo.py): Use this to sample a large number of videos. Note that you need to prepare the dataset in advance following our dataset preprocessing pipeline. (The preprocessing pipeline will be released.) + +### Tools + +This folder contains tools for camera trajectory generation, visualization, etc. + ++ [generate_camparam](tools/generate_camparam.py): Generates manual camera parameters such as zoom-in, zoom-out, etc. + ++ [visualize trajectory](tools/visualize_trajectory.py): Visualizes camera trajectories. + + + +## Citation + +🌟 If you find our work helpful, please leave us a star and cite our paper. + +``` +@article{jin2025flovd, + title={FloVD: Optical Flow Meets Video Diffusion Model for Enhanced Camera-Controlled Video Synthesis}, + author={Jin, Wonjoon and Dai, Qi and Luo, Chong and Baek, Seung-Hwan and Cho, Sunghyun}, + journal={arXiv preprint arXiv:2502.08244}, + year={2025} +} +``` + +## Reference +We thank [CogVideoX](https://github.com/THUDM/CogVideo) for open-sourcing their work. + +## Model-License + +The CogVideoX-5B model (Transformers module, including I2V and T2V) is released under +the [CogVideoX LICENSE](https://huggingface.co/THUDM/CogVideoX-5b/blob/main/LICENSE). 
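Below is a minimal sketch of the optical flow normalization mentioned in the Quick Start section, assuming the flow components are simply divided per axis by the scale factors (s_x, s_y) = (60, 36). The helper names (`normalize_flow`, `unnormalize_flow`) and the clamping to [-1, 1] are illustrative assumptions, not the repository's exact implementation.

```python
import torch

def normalize_flow(flow: torch.Tensor, s_x: float = 60.0, s_y: float = 36.0) -> torch.Tensor:
    # flow: (..., 2, H, W), channel 0 = horizontal (u), channel 1 = vertical (v), in pixels.
    scale = torch.tensor([s_x, s_y], dtype=flow.dtype, device=flow.device).view(2, 1, 1)
    # Divide each axis by its scale factor; clamping to [-1, 1] is an assumption here.
    return (flow / scale).clamp(-1.0, 1.0)

def unnormalize_flow(flow_norm: torch.Tensor, s_x: float = 60.0, s_y: float = 36.0) -> torch.Tensor:
    # Inverse mapping: recover pixel-space flow from the normalized representation.
    scale = torch.tensor([s_x, s_y], dtype=flow_norm.dtype, device=flow_norm.device).view(2, 1, 1)
    return flow_norm * scale
```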
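The pose files under ./assets/manual_poses store one camera per line (49 lines per trajectory, one per output frame). Judging from the provided files, each line holds 18 floats: normalized intrinsics (fx, fy, cx, cy), two unused zeros, and a row-major 3x4 extrinsic [R | t]. The reader below is a hypothetical sketch based on that reading; whether the matrix is world-to-camera or camera-to-world is not asserted here, and `load_camera_trajectory` is not a function from this repository.

```python
import numpy as np

def load_camera_trajectory(path: str):
    """Read one pose file; return per-frame intrinsics (N, 4) and 3x4 extrinsics (N, 3, 4)."""
    intrinsics, extrinsics = [], []
    with open(path) as f:
        for line in f:
            vals = [float(v) for v in line.split()]
            if len(vals) != 18:
                continue  # skip empty or malformed lines
            fx, fy, cx, cy = vals[:4]                  # assumed normalized by image width/height
            pose = np.array(vals[6:18]).reshape(3, 4)  # assumed row-major [R | t]
            intrinsics.append((fx, fy, cx, cy))
            extrinsics.append(pose)
    return np.array(intrinsics), np.stack(extrinsics)

# For example, camera_D.txt keeps the rotation at identity and linearly increases the
# y-translation over the 49 frames, i.e. the camera moves downward.
```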
diff --git a/assets/cam_trajectory/1593596b99e2dde9.png b/assets/cam_trajectory/1593596b99e2dde9.png new file mode 100644 index 0000000000000000000000000000000000000000..01977800c23fb4df669fb7e5ff681e6f74d4fdd5 --- /dev/null +++ b/assets/cam_trajectory/1593596b99e2dde9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825e0e1ed7e8906569e7730610d794fc94791610ec421a75bdcb5a552f52d1ef +size 137713 diff --git a/assets/cam_trajectory/15b93cbe9fc5220d.png b/assets/cam_trajectory/15b93cbe9fc5220d.png new file mode 100644 index 0000000000000000000000000000000000000000..f16450881cf39fbfd9b31d8ce5638092d78e9b25 --- /dev/null +++ b/assets/cam_trajectory/15b93cbe9fc5220d.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd862eb11a6cdd0dc4da3b740525f2b264f0bfb977da42b06b16b505d4d2221b +size 135966 diff --git a/assets/cam_trajectory/2b4f6fdcabf53d59.png b/assets/cam_trajectory/2b4f6fdcabf53d59.png new file mode 100644 index 0000000000000000000000000000000000000000..bffc9b564fbaab01db1d13cbab6a29decc7de3ca --- /dev/null +++ b/assets/cam_trajectory/2b4f6fdcabf53d59.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0524fd4bfe9456b80f85939382324c07affe26f7973608af906cb82020a6fcfe +size 136932 diff --git a/assets/cam_trajectory/2bd7cee1fa9c8996.png b/assets/cam_trajectory/2bd7cee1fa9c8996.png new file mode 100644 index 0000000000000000000000000000000000000000..09b5d099211eadba81c2802ea741eb1c7e90cd34 --- /dev/null +++ b/assets/cam_trajectory/2bd7cee1fa9c8996.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3de7e72136476ec45de2ae91852fb06a29ce63936afc183c12e7edd4324e2a3 +size 136546 diff --git a/assets/cam_trajectory/2bff9ec89ca982c9.png b/assets/cam_trajectory/2bff9ec89ca982c9.png new file mode 100644 index 0000000000000000000000000000000000000000..95d57565263c8d9a51434f639e772468caa0c1aa --- /dev/null +++ b/assets/cam_trajectory/2bff9ec89ca982c9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5c07ce18f48b62d43e7d20412443f0537ca5403dfdc3db4750a5f81a521819b +size 137491 diff --git a/assets/cam_trajectory/407eefe8017f6070.png b/assets/cam_trajectory/407eefe8017f6070.png new file mode 100644 index 0000000000000000000000000000000000000000..dcfdfcf029268be3f280d76396681199125a1713 --- /dev/null +++ b/assets/cam_trajectory/407eefe8017f6070.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86b7d834683d895a8e2c8875799344d66cf17d70fc27570c700a1d586de921a2 +size 136339 diff --git a/assets/cam_trajectory/4118895a33890c5a.png b/assets/cam_trajectory/4118895a33890c5a.png new file mode 100644 index 0000000000000000000000000000000000000000..9888474080f44362475c78e52a373a2a8be48559 --- /dev/null +++ b/assets/cam_trajectory/4118895a33890c5a.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d470b7592929c2c88f619814ba583c64307fc2ca2bebfcf30472041054372f0b +size 136939 diff --git a/assets/cam_trajectory/56f35362431e18dd.png b/assets/cam_trajectory/56f35362431e18dd.png new file mode 100644 index 0000000000000000000000000000000000000000..83b581dfb70c441efa430e5326ecb7a87238c58b --- /dev/null +++ b/assets/cam_trajectory/56f35362431e18dd.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3a65154c17cd887129d2a7d45c63ef0d795128b3b65901c06811025ea0445b +size 139526 diff --git a/assets/cam_trajectory/57132c374cc1ce2d.png b/assets/cam_trajectory/57132c374cc1ce2d.png new file mode 100644 index 
0000000000000000000000000000000000000000..6744ded106f26566d0dbdeaec6cd33ab3aef81b9 --- /dev/null +++ b/assets/cam_trajectory/57132c374cc1ce2d.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e62e4514701eb322e1d40989188e67fbb15afd5d8f624a9f8dd5f2cf7b4514 +size 135718 diff --git a/assets/cam_trajectory/6b1091f5eb05783e.png b/assets/cam_trajectory/6b1091f5eb05783e.png new file mode 100644 index 0000000000000000000000000000000000000000..221bff4fc20c5a4a2ba6950119e6dfb9768dfaab --- /dev/null +++ b/assets/cam_trajectory/6b1091f5eb05783e.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4af1b607c26324dd44e9f469c01baa93f752ee8e073b227fd7f6c185d31709a1 +size 135849 diff --git a/assets/cam_trajectory/6b6d20c6a46b9fe9.png b/assets/cam_trajectory/6b6d20c6a46b9fe9.png new file mode 100644 index 0000000000000000000000000000000000000000..903428c19ab61cb31de0ce862beaa96ed7c4ff3c --- /dev/null +++ b/assets/cam_trajectory/6b6d20c6a46b9fe9.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0532a45af977ef4ab23e48e60aad147e897b2f4be4796e6454d75b9aea62fdc +size 138764 diff --git a/assets/cam_trajectory/80d21c1f8300db84.png b/assets/cam_trajectory/80d21c1f8300db84.png new file mode 100644 index 0000000000000000000000000000000000000000..30e140e3820d8382fec965bbd271051d2069af8a --- /dev/null +++ b/assets/cam_trajectory/80d21c1f8300db84.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:712da6e4070e48821d0bc342a54e68f2b47c0235a4074445edc05655e79dca38 +size 138012 diff --git a/assets/cam_trajectory/8135673a5a3e3d17.png b/assets/cam_trajectory/8135673a5a3e3d17.png new file mode 100644 index 0000000000000000000000000000000000000000..8a3e4e509cc6782b8087d07f752b4093b13c3d3a --- /dev/null +++ b/assets/cam_trajectory/8135673a5a3e3d17.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84807b369bebbd097a7e7ece4ec8165ebe7343f3c307031bf33c553925bb2b3a +size 137552 diff --git a/assets/cam_trajectory/9696c0d0a01d2fd4.png b/assets/cam_trajectory/9696c0d0a01d2fd4.png new file mode 100644 index 0000000000000000000000000000000000000000..90363a7d40a6ce5b12a1da0e41311cfab9a567fd --- /dev/null +++ b/assets/cam_trajectory/9696c0d0a01d2fd4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab02e327771fc53e3827153ebf073eab688459f37cff52e1d838f8502839920b +size 136165 diff --git a/assets/cam_trajectory/Pan_Left_90.png b/assets/cam_trajectory/Pan_Left_90.png new file mode 100644 index 0000000000000000000000000000000000000000..9d4545b5268641545137750de0c80820f2e17c77 --- /dev/null +++ b/assets/cam_trajectory/Pan_Left_90.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20261d133a4d51083fafca906feceef95aa036d9216138baf3f748cd6b44a1b6 +size 132663 diff --git a/assets/cam_trajectory/Pan_Right_90.png b/assets/cam_trajectory/Pan_Right_90.png new file mode 100644 index 0000000000000000000000000000000000000000..20e4d56bc0cf6bbe81d1a0eb45c8fd0e8a09f7d5 --- /dev/null +++ b/assets/cam_trajectory/Pan_Right_90.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:570987ea62c35bc21ff37a19ef2a4e948fb54ecc63546864b3190f6b84471b3c +size 132620 diff --git a/assets/cam_trajectory/Spin_AntiClockwise_90.png b/assets/cam_trajectory/Spin_AntiClockwise_90.png new file mode 100644 index 0000000000000000000000000000000000000000..a8595f259672e6d83c3befad6bb3669d808dd905 --- /dev/null +++ b/assets/cam_trajectory/Spin_AntiClockwise_90.png @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3408def14dae3877a22526dbe0690ab1f00d7a0a858489e65dae3d1ab819a169 +size 131721 diff --git a/assets/cam_trajectory/Spin_Clockwise_90.png b/assets/cam_trajectory/Spin_Clockwise_90.png new file mode 100644 index 0000000000000000000000000000000000000000..6afadc3153653dfd36d63d74bb64f7e1b6ee960f --- /dev/null +++ b/assets/cam_trajectory/Spin_Clockwise_90.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:106a04caf0655981fc98f08c11e36a2db973a8525c4b71f23083a92ecec112b6 +size 131736 diff --git a/assets/cam_trajectory/Tilt_Down_90.png b/assets/cam_trajectory/Tilt_Down_90.png new file mode 100644 index 0000000000000000000000000000000000000000..edf74d6313c2f24d2abdc4267fc31e476e2225dd --- /dev/null +++ b/assets/cam_trajectory/Tilt_Down_90.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a242cb6b0ebc00f7f61c4dd12a4f9f5421b259a30ff99eb27ad91dcf8c0460ff +size 132408 diff --git a/assets/cam_trajectory/Tilt_Up_90.png b/assets/cam_trajectory/Tilt_Up_90.png new file mode 100644 index 0000000000000000000000000000000000000000..d513b419e013735bf195eaf6748d306f80ae0c55 --- /dev/null +++ b/assets/cam_trajectory/Tilt_Up_90.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fee4dd410348d25fc97a12152d33c5491b9bbb18fc6981260e3245a29ccb9872 +size 132672 diff --git a/assets/cam_trajectory/ab3d616a3d001515.png b/assets/cam_trajectory/ab3d616a3d001515.png new file mode 100644 index 0000000000000000000000000000000000000000..26850ca6a2fa914320c8f7fb99a5bb15a30e396c --- /dev/null +++ b/assets/cam_trajectory/ab3d616a3d001515.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5534f91fad58171597b1a571afac355d5d2f465e313c20e4b71115db516502c1 +size 136755 diff --git a/assets/cam_trajectory/c0bdeae8f2b84b7f.png b/assets/cam_trajectory/c0bdeae8f2b84b7f.png new file mode 100644 index 0000000000000000000000000000000000000000..548fa83e8b0e442326a499e9562b7605aa342896 --- /dev/null +++ b/assets/cam_trajectory/c0bdeae8f2b84b7f.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65c108c7bf4fe5fa49b0eccd16c77a5d13c783f7bf63aa513e8dc02373594e96 +size 136823 diff --git a/assets/cam_trajectory/camera_D.png b/assets/cam_trajectory/camera_D.png new file mode 100644 index 0000000000000000000000000000000000000000..649b72b34b025fe58ed64fd08468ba4b6b55413f --- /dev/null +++ b/assets/cam_trajectory/camera_D.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2fe33bb8f73465362650bfd8dfbf4efd14e85aebd9063f0af2caa24b6e27d8b +size 138457 diff --git a/assets/cam_trajectory/camera_I.png b/assets/cam_trajectory/camera_I.png new file mode 100644 index 0000000000000000000000000000000000000000..6a87f8bf3d5c31a65bc4c3b57acb9062aa92be08 --- /dev/null +++ b/assets/cam_trajectory/camera_I.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47f7a06b3731e90c45d2d3b40b5b54e09a0d0685ce3dfeb44880dce9361f0cf3 +size 137990 diff --git a/assets/cam_trajectory/camera_L.png b/assets/cam_trajectory/camera_L.png new file mode 100644 index 0000000000000000000000000000000000000000..654640d269a498c6b793d8d3785f98ef2eb9c2cf --- /dev/null +++ b/assets/cam_trajectory/camera_L.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c4103b51c44a1bf35fe42a7c79a57c9968a58337b60e31c5d79f9df4781aec4 +size 141593 diff --git a/assets/cam_trajectory/camera_O.png b/assets/cam_trajectory/camera_O.png new file mode 100644 index 
0000000000000000000000000000000000000000..0956686fef4303978d565f21172fbc8c96a0694a --- /dev/null +++ b/assets/cam_trajectory/camera_O.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82a1a1e82b082389d620e7063fe93b862bb3483ecd10a3d247d0fe6fe3f1d472 +size 139455 diff --git a/assets/cam_trajectory/camera_R.png b/assets/cam_trajectory/camera_R.png new file mode 100644 index 0000000000000000000000000000000000000000..ccf168448b321c6a11e549223e4bb089bad7827e --- /dev/null +++ b/assets/cam_trajectory/camera_R.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de4741724e5bf78501988bd0ddccda16b26a88d75a4c0c4e1b4a557d1b104958 +size 139911 diff --git a/assets/cam_trajectory/camera_U.png b/assets/cam_trajectory/camera_U.png new file mode 100644 index 0000000000000000000000000000000000000000..eff659745a26c6e83ade44d18a123485944ddcb2 --- /dev/null +++ b/assets/cam_trajectory/camera_U.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b948d707ffca069e38bf1c7e5382d413411e21b6abb9e60684f987f7b01a3a +size 140032 diff --git a/assets/cam_trajectory/d5cddd204a805bad.png b/assets/cam_trajectory/d5cddd204a805bad.png new file mode 100644 index 0000000000000000000000000000000000000000..e0a92d81f38ddd1ef5ce4eda8cec2c940cd5d5e6 --- /dev/null +++ b/assets/cam_trajectory/d5cddd204a805bad.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27d2fcf1dc6c3a6fdae692f463f0d692f4f415925fddd746645497d0c371b5f4 +size 139242 diff --git a/assets/cam_trajectory/d6849f92207aa171.png b/assets/cam_trajectory/d6849f92207aa171.png new file mode 100644 index 0000000000000000000000000000000000000000..afd58aac5e080d2f4d54c3242fa6f0e23d292c13 --- /dev/null +++ b/assets/cam_trajectory/d6849f92207aa171.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:498c92a8e7643a8efe9773aa3e90bda61e0a3e81449e43438391cd5ecc0a8e44 +size 135950 diff --git a/assets/cam_trajectory/ebf9eb32e850ea81.png b/assets/cam_trajectory/ebf9eb32e850ea81.png new file mode 100644 index 0000000000000000000000000000000000000000..17d029655634b2fe7a74ac62bf0f4869363a65be --- /dev/null +++ b/assets/cam_trajectory/ebf9eb32e850ea81.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6086e83e2a8cf6827dfc2dad940bbfd9361dfbd8b5be4cc06c408b38793f9983 +size 137074 diff --git a/assets/cam_trajectory/ec5c53a3d68fe3e7.png b/assets/cam_trajectory/ec5c53a3d68fe3e7.png new file mode 100644 index 0000000000000000000000000000000000000000..b520fd5b56fe2a550850c4e464d4fe291a939508 --- /dev/null +++ b/assets/cam_trajectory/ec5c53a3d68fe3e7.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef3412a94a1c0ec319d9d640339c7912f93c3b8b5460f0bfe99ec89a0727074f +size 139281 diff --git a/assets/cam_trajectory/ec8ee53e2d07e6ba.png b/assets/cam_trajectory/ec8ee53e2d07e6ba.png new file mode 100644 index 0000000000000000000000000000000000000000..8e0210b1daf808d0625dd662894cb4940db3c357 --- /dev/null +++ b/assets/cam_trajectory/ec8ee53e2d07e6ba.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81b64aaf2fac6932b0b8d63dbf18724481fbd6773e0f11a880848b1a472040fd +size 136525 diff --git a/assets/cam_trajectory/ffa95c3b40609c76.png b/assets/cam_trajectory/ffa95c3b40609c76.png new file mode 100644 index 0000000000000000000000000000000000000000..51eddeca61da12d16b445d0a4678830a67d5d34d --- /dev/null +++ b/assets/cam_trajectory/ffa95c3b40609c76.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:558ee35bb7842629cbf5c69beded0f2ec54eebbf8b7e136ef47af4c6cfa14ea5 +size 135988 diff --git a/assets/example_image/input1.png b/assets/example_image/input1.png new file mode 100644 index 0000000000000000000000000000000000000000..81a03860ac648c9be5c676cd4dfa1298f6e55a5c --- /dev/null +++ b/assets/example_image/input1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aba6ec272885b7f285ffe97847bf38749f67b62f189396dd387353760471cf4b +size 429458 diff --git a/assets/example_image/input2.png b/assets/example_image/input2.png new file mode 100644 index 0000000000000000000000000000000000000000..aab20613eedad1067619ab395be58907dbd4b610 --- /dev/null +++ b/assets/example_image/input2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64569ff36ffef56c5df098c43b2caf67b1d3d993a2103fe5614d2564d4fdb09b +size 605938 diff --git a/assets/manual_poses/camera_D.txt b/assets/manual_poses/camera_D.txt new file mode 100644 index 0000000000000000000000000000000000000000..78fd541d94f9c89031c3d95bf3c1a21980cda81e --- /dev/null +++ b/assets/manual_poses/camera_D.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.031250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.062500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.093750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.125000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.156250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.187500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.218750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.250000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.281250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.312500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.343750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.375000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.406250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.437500 0.000000 0.000000 
1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.468750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.500000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.531250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.562500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.593750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.625000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.656250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.687500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.718750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.750000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.781250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.812500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.843750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.875000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.906250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.937500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.968750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.031250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.062500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.093750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 
0.000000 1.125000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.156250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.187500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.218750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.250000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.281250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.312500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.343750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.375000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.406250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.437500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.468750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.500000 0.000000 0.000000 1.000000 0.000000 diff --git a/assets/manual_poses/camera_I.txt b/assets/manual_poses/camera_I.txt new file mode 100644 index 0000000000000000000000000000000000000000..94d7c83857206fcc4802992461b78ac6faafb4ec --- /dev/null +++ b/assets/manual_poses/camera_I.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.031250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.062500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.093750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.125000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.156250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.187500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 
0.000000 1.000000 0.218750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.250000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.281250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.312500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.343750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.375000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.406250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.437500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.468750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.500000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.531250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.562500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.593750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.625000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.656250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.687500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.718750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.750000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.781250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.812500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.843750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.875000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 
1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.906250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.937500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.968750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.031250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.062500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.093750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.125000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.156250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.187500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.218750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.250000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.281250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.312500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.343750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.375000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.406250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.437500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.468750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.500000 diff --git a/assets/manual_poses/camera_L.txt b/assets/manual_poses/camera_L.txt new file mode 100644 index 0000000000000000000000000000000000000000..749f89322b9e53553fd5cff15f8750f1e900b1ed --- /dev/null +++ b/assets/manual_poses/camera_L.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 0.000000 
0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.031250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.062500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.093750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.125000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.156250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.187500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.218750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.250000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.281250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.312500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.343750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.375000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.406250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.437500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.468750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.500000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.531250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.562500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.593750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.625000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.656250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 
0.000000 -0.687500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.718750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.750000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.781250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.812500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.843750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.875000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.906250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.937500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.968750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.031250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.062500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.093750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.125000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.156250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.187500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.218750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.250000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.281250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.312500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.343750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 
0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.375000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.406250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.437500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.468750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.500000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 diff --git a/assets/manual_poses/camera_O.txt b/assets/manual_poses/camera_O.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6de034cd4d0b501d0b68eb3bd6a78d54245f7df --- /dev/null +++ b/assets/manual_poses/camera_O.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.031250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.062500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.093750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.125000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.156250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.187500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.218750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.250000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.281250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.312500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.343750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.375000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.406250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.437500 +0.474812 0.844111 0.500000 
0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.468750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.500000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.531250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.562500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.593750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.625000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.656250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.687500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.718750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.750000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.781250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.812500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.843750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.875000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.906250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.937500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.968750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.031250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.062500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.093750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 
0.000000 1.000000 -1.125000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.156250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.187500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.218750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.250000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.281250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.312500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.343750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.375000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.406250 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.437500 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.468750 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -1.500000 diff --git a/assets/manual_poses/camera_R.txt b/assets/manual_poses/camera_R.txt new file mode 100644 index 0000000000000000000000000000000000000000..72320f6ac15f3615730df769c2107268119a8a48 --- /dev/null +++ b/assets/manual_poses/camera_R.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.031250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.062500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.093750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.125000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.156250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.187500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.218750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 
0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.250000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.281250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.312500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.343750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.375000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.406250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.437500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.468750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.500000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.531250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.562500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.593750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.625000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.656250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.687500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.718750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.750000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.781250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.812500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.843750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.875000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.906250 0.000000 1.000000 0.000000 
0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.937500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.968750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.031250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.062500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.093750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.125000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.156250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.187500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.218750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.250000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.281250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.312500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.343750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.375000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.406250 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.437500 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.468750 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 1.500000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 diff --git a/assets/manual_poses/camera_U.txt b/assets/manual_poses/camera_U.txt new file mode 100644 index 0000000000000000000000000000000000000000..1452d13666c4d61ee3b2d0dddd7228f2f49afd94 --- /dev/null +++ b/assets/manual_poses/camera_U.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 0.000000 
1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.031250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.062500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.093750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.125000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.156250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.187500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.218750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.250000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.281250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.312500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.343750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.375000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.406250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.437500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.468750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.500000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.531250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.562500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.593750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.625000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.656250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 
0.000000 1.000000 0.000000 -0.687500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.718750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.750000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.781250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.812500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.843750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.875000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.906250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.937500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -0.968750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.031250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.062500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.093750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.125000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.156250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.187500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.218750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.250000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.281250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.312500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.343750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 
0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.375000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.406250 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.437500 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.468750 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 -1.500000 0.000000 0.000000 1.000000 0.000000 diff --git a/assets/manual_poses_PanTiltSpin/Pan_Left_90.000000.txt b/assets/manual_poses_PanTiltSpin/Pan_Left_90.000000.txt new file mode 100644 index 0000000000000000000000000000000000000000..6686a25755dc0bb57bd727652c23a317cc1121ca --- /dev/null +++ b/assets/manual_poses_PanTiltSpin/Pan_Left_90.000000.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.999465 0.000000 -0.032719 0.000000 0.000000 1.000000 0.000000 0.000000 0.032719 0.000000 0.999465 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.997859 0.000000 -0.065403 0.000000 0.000000 1.000000 0.000000 0.000000 0.065403 0.000000 0.997859 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.995185 0.000000 -0.098017 0.000000 0.000000 1.000000 0.000000 0.000000 0.098017 0.000000 0.995185 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.991445 0.000000 -0.130526 0.000000 0.000000 1.000000 0.000000 0.000000 0.130526 0.000000 0.991445 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.986643 0.000000 -0.162895 0.000000 0.000000 1.000000 0.000000 0.000000 0.162895 0.000000 0.986643 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.980785 0.000000 -0.195090 0.000000 0.000000 1.000000 0.000000 0.000000 0.195090 0.000000 0.980785 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.973877 0.000000 -0.227076 0.000000 0.000000 1.000000 0.000000 0.000000 0.227076 0.000000 0.973877 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.965926 0.000000 -0.258819 0.000000 0.000000 1.000000 0.000000 0.000000 0.258819 0.000000 0.965926 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.956940 0.000000 -0.290285 0.000000 0.000000 1.000000 0.000000 0.000000 0.290285 0.000000 0.956940 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.946930 0.000000 -0.321439 0.000000 0.000000 1.000000 0.000000 0.000000 0.321439 0.000000 0.946930 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.935906 0.000000 -0.352250 0.000000 0.000000 1.000000 0.000000 0.000000 0.352250 0.000000 0.935906 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.923880 0.000000 -0.382683 0.000000 0.000000 1.000000 0.000000 0.000000 0.382683 0.000000 0.923880 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.910864 0.000000 -0.412707 0.000000 0.000000 1.000000 0.000000 0.000000 0.412707 0.000000 0.910864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.896873 0.000000 -0.442289 0.000000 0.000000 1.000000 0.000000 0.000000 0.442289 0.000000 
0.896873 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.881921 0.000000 -0.471397 0.000000 0.000000 1.000000 0.000000 0.000000 0.471397 0.000000 0.881921 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.866025 0.000000 -0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.500000 0.000000 0.866025 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.849202 0.000000 -0.528068 0.000000 0.000000 1.000000 0.000000 0.000000 0.528068 0.000000 0.849202 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.831470 0.000000 -0.555570 0.000000 0.000000 1.000000 0.000000 0.000000 0.555570 0.000000 0.831470 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.812847 0.000000 -0.582478 0.000000 0.000000 1.000000 0.000000 0.000000 0.582478 0.000000 0.812847 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.793353 0.000000 -0.608761 0.000000 0.000000 1.000000 0.000000 0.000000 0.608761 0.000000 0.793353 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.773010 0.000000 -0.634393 0.000000 0.000000 1.000000 0.000000 0.000000 0.634393 0.000000 0.773010 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.751840 0.000000 -0.659346 0.000000 0.000000 1.000000 0.000000 0.000000 0.659346 0.000000 0.751840 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.729864 0.000000 -0.683592 0.000000 0.000000 1.000000 0.000000 0.000000 0.683592 0.000000 0.729864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.707107 0.000000 -0.707107 0.000000 0.000000 1.000000 0.000000 0.000000 0.707107 0.000000 0.707107 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.683592 0.000000 -0.729864 0.000000 0.000000 1.000000 0.000000 0.000000 0.729864 0.000000 0.683592 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.659346 0.000000 -0.751840 0.000000 0.000000 1.000000 0.000000 0.000000 0.751840 0.000000 0.659346 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.634393 0.000000 -0.773010 0.000000 0.000000 1.000000 0.000000 0.000000 0.773010 0.000000 0.634393 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.608761 0.000000 -0.793353 0.000000 0.000000 1.000000 0.000000 0.000000 0.793353 0.000000 0.608761 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.582478 0.000000 -0.812847 0.000000 0.000000 1.000000 0.000000 0.000000 0.812847 0.000000 0.582478 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.555570 0.000000 -0.831470 0.000000 0.000000 1.000000 0.000000 0.000000 0.831470 0.000000 0.555570 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.528068 0.000000 -0.849202 0.000000 0.000000 1.000000 0.000000 0.000000 0.849202 0.000000 0.528068 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.500000 0.000000 -0.866025 0.000000 0.000000 1.000000 0.000000 0.000000 0.866025 0.000000 0.500000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.471397 0.000000 -0.881921 0.000000 0.000000 1.000000 0.000000 0.000000 0.881921 0.000000 0.471397 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.442289 0.000000 -0.896873 0.000000 0.000000 1.000000 0.000000 0.000000 0.896873 0.000000 0.442289 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.412707 0.000000 -0.910864 0.000000 0.000000 1.000000 0.000000 0.000000 0.910864 0.000000 0.412707 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.382683 0.000000 -0.923880 0.000000 
0.000000 1.000000 0.000000 0.000000 0.923880 0.000000 0.382683 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.352250 0.000000 -0.935906 0.000000 0.000000 1.000000 0.000000 0.000000 0.935906 0.000000 0.352250 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.321439 0.000000 -0.946930 0.000000 0.000000 1.000000 0.000000 0.000000 0.946930 0.000000 0.321439 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.290285 0.000000 -0.956940 0.000000 0.000000 1.000000 0.000000 0.000000 0.956940 0.000000 0.290285 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.258819 0.000000 -0.965926 0.000000 0.000000 1.000000 0.000000 0.000000 0.965926 0.000000 0.258819 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.227076 0.000000 -0.973877 0.000000 0.000000 1.000000 0.000000 0.000000 0.973877 0.000000 0.227076 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.195090 0.000000 -0.980785 0.000000 0.000000 1.000000 0.000000 0.000000 0.980785 0.000000 0.195090 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.162895 0.000000 -0.986643 0.000000 0.000000 1.000000 0.000000 0.000000 0.986643 0.000000 0.162895 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.130526 0.000000 -0.991445 0.000000 0.000000 1.000000 0.000000 0.000000 0.991445 0.000000 0.130526 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.098017 0.000000 -0.995185 0.000000 0.000000 1.000000 0.000000 0.000000 0.995185 0.000000 0.098017 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.065403 0.000000 -0.997859 0.000000 0.000000 1.000000 0.000000 0.000000 0.997859 0.000000 0.065403 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.032719 0.000000 -0.999465 0.000000 0.000000 1.000000 0.000000 0.000000 0.999465 0.000000 0.032719 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.000000 0.000000 -1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 diff --git a/assets/manual_poses_PanTiltSpin/Pan_Right_90.000000.txt b/assets/manual_poses_PanTiltSpin/Pan_Right_90.000000.txt new file mode 100644 index 0000000000000000000000000000000000000000..afe02ebf883d5c84f0583f21ddfd54707adb28f9 --- /dev/null +++ b/assets/manual_poses_PanTiltSpin/Pan_Right_90.000000.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.999465 0.000000 0.032719 0.000000 0.000000 1.000000 0.000000 0.000000 -0.032719 0.000000 0.999465 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.997859 0.000000 0.065403 0.000000 0.000000 1.000000 0.000000 0.000000 -0.065403 0.000000 0.997859 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.995185 0.000000 0.098017 0.000000 0.000000 1.000000 0.000000 0.000000 -0.098017 0.000000 0.995185 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.991445 0.000000 0.130526 0.000000 0.000000 1.000000 0.000000 0.000000 -0.130526 0.000000 0.991445 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.986643 0.000000 0.162895 0.000000 0.000000 1.000000 0.000000 0.000000 -0.162895 0.000000 0.986643 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.980785 0.000000 0.195090 0.000000 0.000000 1.000000 0.000000 0.000000 -0.195090 0.000000 0.980785 0.000000 +0.474812 0.844111 0.500000 0.500000 
0.000000 0.000000 0.973877 0.000000 0.227076 0.000000 0.000000 1.000000 0.000000 0.000000 -0.227076 0.000000 0.973877 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.965926 0.000000 0.258819 0.000000 0.000000 1.000000 0.000000 0.000000 -0.258819 0.000000 0.965926 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.956940 0.000000 0.290285 0.000000 0.000000 1.000000 0.000000 0.000000 -0.290285 0.000000 0.956940 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.946930 0.000000 0.321439 0.000000 0.000000 1.000000 0.000000 0.000000 -0.321439 0.000000 0.946930 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.935906 0.000000 0.352250 0.000000 0.000000 1.000000 0.000000 0.000000 -0.352250 0.000000 0.935906 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.923880 0.000000 0.382683 0.000000 0.000000 1.000000 0.000000 0.000000 -0.382683 0.000000 0.923880 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.910864 0.000000 0.412707 0.000000 0.000000 1.000000 0.000000 0.000000 -0.412707 0.000000 0.910864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.896873 0.000000 0.442289 0.000000 0.000000 1.000000 0.000000 0.000000 -0.442289 0.000000 0.896873 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.881921 0.000000 0.471397 0.000000 0.000000 1.000000 0.000000 0.000000 -0.471397 0.000000 0.881921 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.866025 0.000000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.500000 0.000000 0.866025 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.849202 0.000000 0.528068 0.000000 0.000000 1.000000 0.000000 0.000000 -0.528068 0.000000 0.849202 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.831470 0.000000 0.555570 0.000000 0.000000 1.000000 0.000000 0.000000 -0.555570 0.000000 0.831470 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.812847 0.000000 0.582478 0.000000 0.000000 1.000000 0.000000 0.000000 -0.582478 0.000000 0.812847 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.793353 0.000000 0.608761 0.000000 0.000000 1.000000 0.000000 0.000000 -0.608761 0.000000 0.793353 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.773010 0.000000 0.634393 0.000000 0.000000 1.000000 0.000000 0.000000 -0.634393 0.000000 0.773010 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.751840 0.000000 0.659346 0.000000 0.000000 1.000000 0.000000 0.000000 -0.659346 0.000000 0.751840 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.729864 0.000000 0.683592 0.000000 0.000000 1.000000 0.000000 0.000000 -0.683592 0.000000 0.729864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.707107 0.000000 0.707107 0.000000 0.000000 1.000000 0.000000 0.000000 -0.707107 0.000000 0.707107 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.683592 0.000000 0.729864 0.000000 0.000000 1.000000 0.000000 0.000000 -0.729864 0.000000 0.683592 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.659346 0.000000 0.751840 0.000000 0.000000 1.000000 0.000000 0.000000 -0.751840 0.000000 0.659346 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.634393 0.000000 0.773010 0.000000 0.000000 1.000000 0.000000 0.000000 -0.773010 0.000000 0.634393 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.608761 0.000000 0.793353 0.000000 0.000000 1.000000 0.000000 0.000000 -0.793353 0.000000 
0.608761 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.582478 0.000000 0.812847 0.000000 0.000000 1.000000 0.000000 0.000000 -0.812847 0.000000 0.582478 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.555570 0.000000 0.831470 0.000000 0.000000 1.000000 0.000000 0.000000 -0.831470 0.000000 0.555570 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.528068 0.000000 0.849202 0.000000 0.000000 1.000000 0.000000 0.000000 -0.849202 0.000000 0.528068 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.500000 0.000000 0.866025 0.000000 0.000000 1.000000 0.000000 0.000000 -0.866025 0.000000 0.500000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.471397 0.000000 0.881921 0.000000 0.000000 1.000000 0.000000 0.000000 -0.881921 0.000000 0.471397 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.442289 0.000000 0.896873 0.000000 0.000000 1.000000 0.000000 0.000000 -0.896873 0.000000 0.442289 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.412707 0.000000 0.910864 0.000000 0.000000 1.000000 0.000000 0.000000 -0.910864 0.000000 0.412707 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.382683 0.000000 0.923880 0.000000 0.000000 1.000000 0.000000 0.000000 -0.923880 0.000000 0.382683 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.352250 0.000000 0.935906 0.000000 0.000000 1.000000 0.000000 0.000000 -0.935906 0.000000 0.352250 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.321439 0.000000 0.946930 0.000000 0.000000 1.000000 0.000000 0.000000 -0.946930 0.000000 0.321439 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.290285 0.000000 0.956940 0.000000 0.000000 1.000000 0.000000 0.000000 -0.956940 0.000000 0.290285 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.258819 0.000000 0.965926 0.000000 0.000000 1.000000 0.000000 0.000000 -0.965926 0.000000 0.258819 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.227076 0.000000 0.973877 0.000000 0.000000 1.000000 0.000000 0.000000 -0.973877 0.000000 0.227076 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.195090 0.000000 0.980785 0.000000 0.000000 1.000000 0.000000 0.000000 -0.980785 0.000000 0.195090 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.162895 0.000000 0.986643 0.000000 0.000000 1.000000 0.000000 0.000000 -0.986643 0.000000 0.162895 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.130526 0.000000 0.991445 0.000000 0.000000 1.000000 0.000000 0.000000 -0.991445 0.000000 0.130526 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.098017 0.000000 0.995185 0.000000 0.000000 1.000000 0.000000 0.000000 -0.995185 0.000000 0.098017 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.065403 0.000000 0.997859 0.000000 0.000000 1.000000 0.000000 0.000000 -0.997859 0.000000 0.065403 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.032719 0.000000 0.999465 0.000000 0.000000 1.000000 0.000000 0.000000 -0.999465 0.000000 0.032719 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.000000 0.000000 0.000000 0.000000 diff --git a/assets/manual_poses_PanTiltSpin/Spin_AntiClockwise_90.000000.txt b/assets/manual_poses_PanTiltSpin/Spin_AntiClockwise_90.000000.txt new file mode 100644 index 0000000000000000000000000000000000000000..765471623bb78a3a5f03aad9d963e4a09524537d 
--- /dev/null +++ b/assets/manual_poses_PanTiltSpin/Spin_AntiClockwise_90.000000.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 -0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.999465 0.032719 0.000000 0.000000 -0.032719 0.999465 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.997859 0.065403 0.000000 0.000000 -0.065403 0.997859 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.995185 0.098017 0.000000 0.000000 -0.098017 0.995185 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.991445 0.130526 0.000000 0.000000 -0.130526 0.991445 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.986643 0.162895 0.000000 0.000000 -0.162895 0.986643 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.980785 0.195090 0.000000 0.000000 -0.195090 0.980785 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.973877 0.227076 0.000000 0.000000 -0.227076 0.973877 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.965926 0.258819 0.000000 0.000000 -0.258819 0.965926 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.956940 0.290285 0.000000 0.000000 -0.290285 0.956940 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.946930 0.321439 0.000000 0.000000 -0.321439 0.946930 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.935906 0.352250 0.000000 0.000000 -0.352250 0.935906 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.923880 0.382683 0.000000 0.000000 -0.382683 0.923880 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.910864 0.412707 0.000000 0.000000 -0.412707 0.910864 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.896873 0.442289 0.000000 0.000000 -0.442289 0.896873 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.881921 0.471397 0.000000 0.000000 -0.471397 0.881921 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.866025 0.500000 0.000000 0.000000 -0.500000 0.866025 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.849202 0.528068 0.000000 0.000000 -0.528068 0.849202 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.831470 0.555570 0.000000 0.000000 -0.555570 0.831470 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.812847 0.582478 0.000000 0.000000 -0.582478 0.812847 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.793353 0.608761 0.000000 0.000000 -0.608761 0.793353 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 
+0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.773010 0.634393 0.000000 0.000000 -0.634393 0.773010 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.751840 0.659346 0.000000 0.000000 -0.659346 0.751840 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.729864 0.683592 0.000000 0.000000 -0.683592 0.729864 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.707107 0.707107 0.000000 0.000000 -0.707107 0.707107 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.683592 0.729864 0.000000 0.000000 -0.729864 0.683592 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.659346 0.751840 0.000000 0.000000 -0.751840 0.659346 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.634393 0.773010 0.000000 0.000000 -0.773010 0.634393 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.608761 0.793353 0.000000 0.000000 -0.793353 0.608761 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.582478 0.812847 0.000000 0.000000 -0.812847 0.582478 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.555570 0.831470 0.000000 0.000000 -0.831470 0.555570 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.528068 0.849202 0.000000 0.000000 -0.849202 0.528068 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.500000 0.866025 0.000000 0.000000 -0.866025 0.500000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.471397 0.881921 0.000000 0.000000 -0.881921 0.471397 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.442289 0.896873 0.000000 0.000000 -0.896873 0.442289 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.412707 0.910864 0.000000 0.000000 -0.910864 0.412707 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.382683 0.923880 0.000000 0.000000 -0.923880 0.382683 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.352250 0.935906 0.000000 0.000000 -0.935906 0.352250 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.321439 0.946930 0.000000 0.000000 -0.946930 0.321439 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.290285 0.956940 0.000000 0.000000 -0.956940 0.290285 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.258819 0.965926 0.000000 0.000000 -0.965926 0.258819 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.227076 0.973877 0.000000 0.000000 -0.973877 0.227076 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.195090 0.980785 0.000000 0.000000 -0.980785 0.195090 
0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.162895 0.986643 0.000000 0.000000 -0.986643 0.162895 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.130526 0.991445 0.000000 0.000000 -0.991445 0.130526 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.098017 0.995185 0.000000 0.000000 -0.995185 0.098017 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.065403 0.997859 0.000000 0.000000 -0.997859 0.065403 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.032719 0.999465 0.000000 0.000000 -0.999465 0.032719 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 diff --git a/assets/manual_poses_PanTiltSpin/Spin_Clockwise_90.000000.txt b/assets/manual_poses_PanTiltSpin/Spin_Clockwise_90.000000.txt new file mode 100644 index 0000000000000000000000000000000000000000..c271a0cab7402e41fc0143eae07993027eb8a2f0 --- /dev/null +++ b/assets/manual_poses_PanTiltSpin/Spin_Clockwise_90.000000.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.999465 -0.032719 0.000000 0.000000 0.032719 0.999465 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.997859 -0.065403 0.000000 0.000000 0.065403 0.997859 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.995185 -0.098017 0.000000 0.000000 0.098017 0.995185 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.991445 -0.130526 0.000000 0.000000 0.130526 0.991445 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.986643 -0.162895 0.000000 0.000000 0.162895 0.986643 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.980785 -0.195090 0.000000 0.000000 0.195090 0.980785 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.973877 -0.227076 0.000000 0.000000 0.227076 0.973877 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.965926 -0.258819 0.000000 0.000000 0.258819 0.965926 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.956940 -0.290285 0.000000 0.000000 0.290285 0.956940 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.946930 -0.321439 0.000000 0.000000 0.321439 0.946930 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.935906 -0.352250 0.000000 0.000000 0.352250 0.935906 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.923880 -0.382683 0.000000 0.000000 0.382683 0.923880 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 
0.000000 0.000000 0.910864 -0.412707 0.000000 0.000000 0.412707 0.910864 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.896873 -0.442289 0.000000 0.000000 0.442289 0.896873 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.881921 -0.471397 0.000000 0.000000 0.471397 0.881921 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.866025 -0.500000 0.000000 0.000000 0.500000 0.866025 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.849202 -0.528068 0.000000 0.000000 0.528068 0.849202 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.831470 -0.555570 0.000000 0.000000 0.555570 0.831470 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.812847 -0.582478 0.000000 0.000000 0.582478 0.812847 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.793353 -0.608761 0.000000 0.000000 0.608761 0.793353 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.773010 -0.634393 0.000000 0.000000 0.634393 0.773010 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.751840 -0.659346 0.000000 0.000000 0.659346 0.751840 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.729864 -0.683592 0.000000 0.000000 0.683592 0.729864 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.707107 -0.707107 0.000000 0.000000 0.707107 0.707107 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.683592 -0.729864 0.000000 0.000000 0.729864 0.683592 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.659346 -0.751840 0.000000 0.000000 0.751840 0.659346 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.634393 -0.773010 0.000000 0.000000 0.773010 0.634393 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.608761 -0.793353 0.000000 0.000000 0.793353 0.608761 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.582478 -0.812847 0.000000 0.000000 0.812847 0.582478 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.555570 -0.831470 0.000000 0.000000 0.831470 0.555570 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.528068 -0.849202 0.000000 0.000000 0.849202 0.528068 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.500000 -0.866025 0.000000 0.000000 0.866025 0.500000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.471397 -0.881921 0.000000 0.000000 0.881921 0.471397 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.442289 -0.896873 0.000000 0.000000 0.896873 0.442289 0.000000 0.000000 0.000000 0.000000 
1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.412707 -0.910864 0.000000 0.000000 0.910864 0.412707 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.382683 -0.923880 0.000000 0.000000 0.923880 0.382683 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.352250 -0.935906 0.000000 0.000000 0.935906 0.352250 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.321439 -0.946930 0.000000 0.000000 0.946930 0.321439 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.290285 -0.956940 0.000000 0.000000 0.956940 0.290285 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.258819 -0.965926 0.000000 0.000000 0.965926 0.258819 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.227076 -0.973877 0.000000 0.000000 0.973877 0.227076 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.195090 -0.980785 0.000000 0.000000 0.980785 0.195090 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.162895 -0.986643 0.000000 0.000000 0.986643 0.162895 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.130526 -0.991445 0.000000 0.000000 0.991445 0.130526 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.098017 -0.995185 0.000000 0.000000 0.995185 0.098017 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.065403 -0.997859 0.000000 0.000000 0.997859 0.065403 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.032719 -0.999465 0.000000 0.000000 0.999465 0.032719 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 0.000000 -1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 diff --git a/assets/manual_poses_PanTiltSpin/Tilt_Down_90.000000.txt b/assets/manual_poses_PanTiltSpin/Tilt_Down_90.000000.txt new file mode 100644 index 0000000000000000000000000000000000000000..dee0092ad6997ff3b72652554fac37241e527678 --- /dev/null +++ b/assets/manual_poses_PanTiltSpin/Tilt_Down_90.000000.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 -0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.999465 0.032719 0.000000 0.000000 -0.032719 0.999465 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.997859 0.065403 0.000000 0.000000 -0.065403 0.997859 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.995185 0.098017 0.000000 0.000000 -0.098017 0.995185 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.991445 0.130526 0.000000 0.000000 -0.130526 0.991445 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 
0.000000 0.986643 0.162895 0.000000 0.000000 -0.162895 0.986643 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.980785 0.195090 0.000000 0.000000 -0.195090 0.980785 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.973877 0.227076 0.000000 0.000000 -0.227076 0.973877 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.965926 0.258819 0.000000 0.000000 -0.258819 0.965926 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.956940 0.290285 0.000000 0.000000 -0.290285 0.956940 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.946930 0.321439 0.000000 0.000000 -0.321439 0.946930 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.935906 0.352250 0.000000 0.000000 -0.352250 0.935906 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.923880 0.382683 0.000000 0.000000 -0.382683 0.923880 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.910864 0.412707 0.000000 0.000000 -0.412707 0.910864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.896873 0.442289 0.000000 0.000000 -0.442289 0.896873 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.881921 0.471397 0.000000 0.000000 -0.471397 0.881921 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.866025 0.500000 0.000000 0.000000 -0.500000 0.866025 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.849202 0.528068 0.000000 0.000000 -0.528068 0.849202 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.831470 0.555570 0.000000 0.000000 -0.555570 0.831470 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.812847 0.582478 0.000000 0.000000 -0.582478 0.812847 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.793353 0.608761 0.000000 0.000000 -0.608761 0.793353 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.773010 0.634393 0.000000 0.000000 -0.634393 0.773010 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.751840 0.659346 0.000000 0.000000 -0.659346 0.751840 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.729864 0.683592 0.000000 0.000000 -0.683592 0.729864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.707107 0.707107 0.000000 0.000000 -0.707107 0.707107 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.683592 0.729864 0.000000 0.000000 -0.729864 0.683592 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.659346 0.751840 0.000000 0.000000 -0.751840 0.659346 0.000000 +0.474812 0.844111 0.500000 0.500000 
0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.634393 0.773010 0.000000 0.000000 -0.773010 0.634393 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.608761 0.793353 0.000000 0.000000 -0.793353 0.608761 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.582478 0.812847 0.000000 0.000000 -0.812847 0.582478 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.555570 0.831470 0.000000 0.000000 -0.831470 0.555570 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.528068 0.849202 0.000000 0.000000 -0.849202 0.528068 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.500000 0.866025 0.000000 0.000000 -0.866025 0.500000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.471397 0.881921 0.000000 0.000000 -0.881921 0.471397 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.442289 0.896873 0.000000 0.000000 -0.896873 0.442289 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.412707 0.910864 0.000000 0.000000 -0.910864 0.412707 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.382683 0.923880 0.000000 0.000000 -0.923880 0.382683 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.352250 0.935906 0.000000 0.000000 -0.935906 0.352250 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.321439 0.946930 0.000000 0.000000 -0.946930 0.321439 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.290285 0.956940 0.000000 0.000000 -0.956940 0.290285 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.258819 0.965926 0.000000 0.000000 -0.965926 0.258819 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.227076 0.973877 0.000000 0.000000 -0.973877 0.227076 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.195090 0.980785 0.000000 0.000000 -0.980785 0.195090 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.162895 0.986643 0.000000 0.000000 -0.986643 0.162895 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.130526 0.991445 0.000000 0.000000 -0.991445 0.130526 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.098017 0.995185 0.000000 0.000000 -0.995185 0.098017 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.065403 0.997859 0.000000 0.000000 -0.997859 0.065403 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.032719 0.999465 0.000000 0.000000 -0.999465 0.032719 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.000000 
0.000000 0.000000 diff --git a/assets/manual_poses_PanTiltSpin/Tilt_Up_90.000000.txt b/assets/manual_poses_PanTiltSpin/Tilt_Up_90.000000.txt new file mode 100644 index 0000000000000000000000000000000000000000..dde6afdc39c3ea43d9801425a85ad2e36a61dcee --- /dev/null +++ b/assets/manual_poses_PanTiltSpin/Tilt_Up_90.000000.txt @@ -0,0 +1,49 @@ +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.999465 -0.032719 0.000000 0.000000 0.032719 0.999465 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.997859 -0.065403 0.000000 0.000000 0.065403 0.997859 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.995185 -0.098017 0.000000 0.000000 0.098017 0.995185 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.991445 -0.130526 0.000000 0.000000 0.130526 0.991445 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.986643 -0.162895 0.000000 0.000000 0.162895 0.986643 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.980785 -0.195090 0.000000 0.000000 0.195090 0.980785 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.973877 -0.227076 0.000000 0.000000 0.227076 0.973877 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.965926 -0.258819 0.000000 0.000000 0.258819 0.965926 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.956940 -0.290285 0.000000 0.000000 0.290285 0.956940 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.946930 -0.321439 0.000000 0.000000 0.321439 0.946930 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.935906 -0.352250 0.000000 0.000000 0.352250 0.935906 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.923880 -0.382683 0.000000 0.000000 0.382683 0.923880 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.910864 -0.412707 0.000000 0.000000 0.412707 0.910864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.896873 -0.442289 0.000000 0.000000 0.442289 0.896873 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.881921 -0.471397 0.000000 0.000000 0.471397 0.881921 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.866025 -0.500000 0.000000 0.000000 0.500000 0.866025 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.849202 -0.528068 0.000000 0.000000 0.528068 0.849202 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.831470 -0.555570 0.000000 0.000000 0.555570 0.831470 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 
0.000000 0.812847 -0.582478 0.000000 0.000000 0.582478 0.812847 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.793353 -0.608761 0.000000 0.000000 0.608761 0.793353 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.773010 -0.634393 0.000000 0.000000 0.634393 0.773010 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.751840 -0.659346 0.000000 0.000000 0.659346 0.751840 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.729864 -0.683592 0.000000 0.000000 0.683592 0.729864 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.707107 -0.707107 0.000000 0.000000 0.707107 0.707107 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.683592 -0.729864 0.000000 0.000000 0.729864 0.683592 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.659346 -0.751840 0.000000 0.000000 0.751840 0.659346 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.634393 -0.773010 0.000000 0.000000 0.773010 0.634393 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.608761 -0.793353 0.000000 0.000000 0.793353 0.608761 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.582478 -0.812847 0.000000 0.000000 0.812847 0.582478 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.555570 -0.831470 0.000000 0.000000 0.831470 0.555570 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.528068 -0.849202 0.000000 0.000000 0.849202 0.528068 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.500000 -0.866025 0.000000 0.000000 0.866025 0.500000 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.471397 -0.881921 0.000000 0.000000 0.881921 0.471397 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.442289 -0.896873 0.000000 0.000000 0.896873 0.442289 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.412707 -0.910864 0.000000 0.000000 0.910864 0.412707 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.382683 -0.923880 0.000000 0.000000 0.923880 0.382683 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.352250 -0.935906 0.000000 0.000000 0.935906 0.352250 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.321439 -0.946930 0.000000 0.000000 0.946930 0.321439 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.290285 -0.956940 0.000000 0.000000 0.956940 0.290285 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.258819 -0.965926 0.000000 0.000000 0.965926 0.258819 0.000000 +0.474812 0.844111 0.500000 0.500000 
0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.227076 -0.973877 0.000000 0.000000 0.973877 0.227076 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.195090 -0.980785 0.000000 0.000000 0.980785 0.195090 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.162895 -0.986643 0.000000 0.000000 0.986643 0.162895 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.130526 -0.991445 0.000000 0.000000 0.991445 0.130526 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.098017 -0.995185 0.000000 0.000000 0.995185 0.098017 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.065403 -0.997859 0.000000 0.000000 0.997859 0.065403 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.032719 -0.999465 0.000000 0.000000 0.999465 0.032719 0.000000 +0.474812 0.844111 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 diff --git a/assets/pages/res1.mp4 b/assets/pages/res1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9610aafc1cfe1aef135c780916d92d0d0dfaae66 --- /dev/null +++ b/assets/pages/res1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:116cbfd8343a28ed06c52dd05ac2394568ee1a653c8bfb5302b6139ab6ca8253 +size 1144994 diff --git a/assets/pages/res2.mp4 b/assets/pages/res2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c40e4351c47952b6201b618e2559ca15081f3e6d --- /dev/null +++ b/assets/pages/res2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690f048e29e2f1abc418bbd6df252e8cbbf96a2bc4888e2922b1ef4423129890 +size 370357 diff --git a/assets/pages/res3.mp4 b/assets/pages/res3.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5e25cae22116538d88c48b4bae1c65d2ae6ed11d --- /dev/null +++ b/assets/pages/res3.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f7285e7625c118520d4beee47912aa594ced88d39c6765841523ee42cb4928 +size 384597 diff --git a/assets/pages/teaser.png b/assets/pages/teaser.png new file mode 100644 index 0000000000000000000000000000000000000000..2041055a5800454736a195ae180204702d36df35 --- /dev/null +++ b/assets/pages/teaser.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18a0e6ca8f2c4c9f7f9ad4704bbffbf6ed166af6ce7380659b3d8327607fb5dd +size 1717648 diff --git a/assets/re10k_poses/1593596b99e2dde9.txt b/assets/re10k_poses/1593596b99e2dde9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ab87cb62139cd221393b67c4e88e36b0aebd539 --- /dev/null +++ b/assets/re10k_poses/1593596b99e2dde9.txt @@ -0,0 +1,49 @@ +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 -0.000000 -0.000000 1.000000 0.000000 0.000000 0.000000 -0.000000 1.000000 0.000000 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.999919 -0.002368 0.012528 0.008206 0.002365 0.999997 0.000246 -0.011939 -0.012529 -0.000217 0.999921 0.051175 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.999686 -0.004823 0.024583 0.016946 0.004804 0.999988 0.000826 -0.025159 -0.024587 -0.000707 0.999697 0.101218 +0.497111 0.883753 0.500000 0.500000 
0.000000 0.000000 0.999288 -0.007402 0.036991 0.023794 0.007405 0.999973 0.000076 -0.038454 -0.036991 0.000198 0.999316 0.149872 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.999057 -0.008505 0.042582 0.028704 0.008500 0.999964 0.000312 -0.045036 -0.042583 0.000050 0.999093 0.174403 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.998453 -0.011093 0.054492 0.035880 0.011066 0.999938 0.000794 -0.057561 -0.054498 -0.000190 0.998514 0.221341 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.997665 -0.013375 0.066974 0.042560 0.013351 0.999911 0.000810 -0.066974 -0.066979 0.000086 0.997754 0.269508 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.996692 -0.015736 0.079728 0.047727 0.015744 0.999876 0.000531 -0.073962 -0.079726 0.000726 0.996817 0.318513 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.995563 -0.018279 0.092301 0.053255 0.018256 0.999833 0.001097 -0.079132 -0.092306 0.000593 0.995731 0.369250 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.994184 -0.020887 0.105650 0.057289 0.020868 0.999781 0.001285 -0.081813 -0.105653 0.000928 0.994403 0.421510 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.992504 -0.023668 0.119901 0.061269 0.023679 0.999719 0.001329 -0.084911 -0.119899 0.001520 0.992785 0.474738 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.990563 -0.026417 0.134490 0.062838 0.026464 0.999649 0.001439 -0.089874 -0.134481 0.002133 0.990914 0.528024 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.989484 -0.028075 0.141893 0.064270 0.028100 0.999603 0.001828 -0.093571 -0.141888 0.002178 0.989880 0.553787 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.987024 -0.031474 0.157456 0.067278 0.031501 0.999501 0.002325 -0.100735 -0.157451 0.002666 0.987523 0.604510 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.984146 -0.035028 0.173868 0.071251 0.035139 0.999379 0.002443 -0.109026 -0.173845 0.003705 0.984766 0.652562 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.980502 -0.038320 0.192735 0.075929 0.038497 0.999255 0.002826 -0.118217 -0.192699 0.004649 0.981247 0.699022 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.976189 -0.042152 0.212788 0.082536 0.042270 0.999098 0.003996 -0.127224 -0.212765 0.005094 0.977090 0.742683 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.971148 -0.046722 0.233855 0.091725 0.046912 0.998888 0.004755 -0.134028 -0.233817 0.006353 0.972260 0.784194 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.965263 -0.051337 0.256188 0.103690 0.051612 0.998651 0.005653 -0.139397 -0.256133 0.007766 0.966610 0.823475 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.958488 -0.055717 0.279638 0.117716 0.056239 0.998398 0.006162 -0.141251 -0.279533 0.009820 0.960086 0.862623 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.954801 -0.057889 0.291554 0.125688 0.058606 0.998261 0.006280 -0.142330 -0.291410 0.011090 0.956534 0.881584 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.947141 -0.062612 0.314649 0.144799 0.063754 0.997943 0.006671 -0.144135 -0.314420 0.013742 0.949184 0.918354 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.938776 -0.067538 0.337843 0.164626 0.068917 0.997591 0.007924 -0.146242 -0.337564 0.015844 0.941169 0.954138 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.929633 -0.071565 0.361470 0.186040 0.073238 0.997273 0.009087 -0.145504 -0.361135 0.018025 0.932339 0.990309 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.919796 -0.075441 0.385076 0.210412 0.077419 
0.996945 0.010389 -0.149022 -0.384684 0.020256 0.922826 1.026103 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.909773 -0.079613 0.407399 0.238390 0.082160 0.996555 0.011270 -0.153513 -0.406893 0.023218 0.913181 1.060644 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.899202 -0.084401 0.429316 0.267439 0.086985 0.996116 0.013641 -0.160531 -0.428800 0.025078 0.903051 1.092495 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.887792 -0.089028 0.451552 0.295610 0.091841 0.995649 0.015734 -0.167618 -0.450988 0.027502 0.892106 1.124068 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.881656 -0.091542 0.462928 0.309812 0.094520 0.995381 0.016817 -0.172125 -0.462329 0.028929 0.886237 1.139258 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.869221 -0.096173 0.484980 0.338161 0.099158 0.994879 0.019568 -0.182767 -0.484378 0.031081 0.874306 1.167749 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.855955 -0.100601 0.507170 0.366942 0.104017 0.994339 0.021684 -0.192308 -0.506480 0.034194 0.861573 1.195232 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.842439 -0.105119 0.528437 0.395102 0.108793 0.993769 0.024246 -0.202301 -0.527693 0.037065 0.848626 1.221788 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.827179 -0.109905 0.551086 0.423258 0.113645 0.993141 0.027484 -0.212104 -0.550327 0.039894 0.833996 1.246251 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.811120 -0.114244 0.573614 0.451580 0.118499 0.992498 0.030107 -0.221700 -0.572750 0.043552 0.818572 1.270168 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.794061 -0.118795 0.596116 0.481666 0.123250 0.991811 0.033474 -0.229374 -0.595211 0.046891 0.802200 1.293764 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.776022 -0.122838 0.618628 0.510776 0.128401 0.991079 0.035725 -0.233784 -0.617498 0.051709 0.784871 1.318889 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.766754 -0.125112 0.629632 0.526259 0.130578 0.990715 0.037846 -0.238206 -0.628521 0.053197 0.775971 1.330786 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.747430 -0.129443 0.651608 0.555359 0.135117 0.989953 0.041669 -0.244782 -0.650455 0.056899 0.757411 1.355215 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.727256 -0.133493 0.673259 0.584176 0.139273 0.989199 0.045694 -0.253335 -0.672087 0.060536 0.737993 1.377998 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.706488 -0.137376 0.694264 0.611819 0.143655 0.988395 0.049392 -0.262496 -0.692992 0.064840 0.718023 1.401850 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.685792 -0.141096 0.713990 0.638342 0.147714 0.987594 0.053285 -0.275585 -0.712650 0.068923 0.698125 1.423374 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.665148 -0.144278 0.732641 0.662227 0.151785 0.986796 0.056526 -0.287814 -0.731122 0.073606 0.678264 1.443285 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.643099 -0.148078 0.751330 0.687133 0.155931 0.985893 0.060839 -0.300538 -0.749739 0.078030 0.657117 1.457629 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.620575 -0.151364 0.769400 0.711715 0.159799 0.985014 0.064892 -0.312400 -0.767692 0.082679 0.635463 1.471619 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.609202 -0.152482 0.778217 0.724442 0.161842 0.984592 0.066226 -0.318479 -0.776325 0.085603 0.624494 1.477000 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.586464 -0.154258 0.795150 0.749440 0.164675 0.983902 0.069420 -0.327781 -0.793058 0.090229 0.602426 1.487681 
+0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.562944 -0.158017 0.811249 0.777268 0.167692 0.982975 0.075101 -0.338037 -0.809304 0.093762 0.579858 1.495192 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.538750 -0.160670 0.827003 0.812024 0.170949 0.982073 0.079433 -0.344134 -0.824939 0.098581 0.556558 1.505713 +0.497111 0.883753 0.500000 0.500000 0.000000 0.000000 0.516293 -0.162830 0.840790 0.843954 0.173682 0.981265 0.083385 -0.349199 -0.838615 0.102979 0.534901 1.513592 diff --git a/assets/re10k_poses/15b93cbe9fc5220d.txt b/assets/re10k_poses/15b93cbe9fc5220d.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef215983574f36106a091fc874d5f942540a2eb --- /dev/null +++ b/assets/re10k_poses/15b93cbe9fc5220d.txt @@ -0,0 +1,49 @@ +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 -0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999999 0.000036 0.001697 0.003251 -0.000034 1.000000 -0.001011 -0.004629 -0.001697 0.001011 0.999998 0.048252 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999993 0.000400 0.003595 0.003805 -0.000394 0.999998 -0.001737 -0.007293 -0.003596 0.001735 0.999992 0.097305 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999981 0.000469 0.006087 0.001856 -0.000455 0.999997 -0.002403 -0.009899 -0.006088 0.002400 0.999979 0.147761 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999970 0.000438 0.007790 0.000256 -0.000414 0.999995 -0.003018 -0.010644 -0.007791 0.003014 0.999965 0.172868 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999928 0.000250 0.011978 -0.003168 -0.000201 0.999992 -0.004099 -0.013879 -0.011979 0.004096 0.999920 0.223462 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999855 -0.000162 0.017020 -0.010207 0.000252 0.999986 -0.005311 -0.017362 -0.017019 0.005315 0.999841 0.274201 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999760 0.000188 0.021886 -0.018542 -0.000053 0.999981 -0.006180 -0.022313 -0.021887 0.006178 0.999741 0.326529 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999641 0.000673 0.026799 -0.028358 -0.000477 0.999973 -0.007303 -0.026431 -0.026804 0.007287 0.999614 0.377895 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999480 0.000925 0.032231 -0.038202 -0.000654 0.999964 -0.008410 -0.030493 -0.032238 0.008385 0.999445 0.426334 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999250 0.001639 0.038680 -0.050652 -0.001284 0.999957 -0.009189 -0.033700 -0.038693 0.009133 0.999209 0.473359 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.999005 0.002214 0.044533 -0.060744 -0.001803 0.999955 -0.009270 -0.039215 -0.044552 0.009181 0.998965 0.518502 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.998859 0.002359 0.047707 -0.067015 -0.001868 0.999945 -0.010353 -0.038549 -0.047729 0.010252 0.998808 0.540905 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.998585 0.003134 0.053083 -0.076162 -0.002578 0.999941 -0.010548 -0.043275 -0.053113 0.010396 0.998534 0.583681 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.998270 0.003700 0.058683 -0.085776 -0.003069 0.999937 -0.010843 -0.047342 -0.058719 0.010645 0.998218 0.627145 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.997819 0.004039 0.065887 -0.096865 -0.003246 0.999921 -0.012137 -0.048826 -0.065931 0.011897 0.997753 0.670760 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.997295 0.004131 0.073392 -0.105819 
-0.003248 0.999921 -0.012150 -0.055345 -0.073436 0.011879 0.997229 0.712151 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.996735 0.004351 0.080630 -0.114296 -0.003216 0.999894 -0.014199 -0.057606 -0.080683 0.013894 0.996643 0.754061 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.996146 0.004477 0.087599 -0.122782 -0.003272 0.999898 -0.013896 -0.067687 -0.087652 0.013555 0.996059 0.793076 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.995514 0.004893 0.094484 -0.131388 -0.003343 0.999858 -0.016548 -0.069442 -0.094552 0.016158 0.995389 0.833858 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.995138 0.005530 0.098330 -0.136563 -0.003946 0.999860 -0.016291 -0.074711 -0.098406 0.015824 0.995020 0.853119 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.994358 0.005901 0.105914 -0.144598 -0.004170 0.999854 -0.016561 -0.082281 -0.105997 0.016026 0.994237 0.890677 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.993310 0.006090 0.115315 -0.154643 -0.003998 0.999823 -0.018361 -0.084488 -0.115406 0.017777 0.993159 0.928200 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.992222 0.005798 0.124343 -0.161694 -0.003434 0.999810 -0.019214 -0.089514 -0.124430 0.018638 0.992053 0.964938 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.990899 0.005441 0.134497 -0.172528 -0.002732 0.999790 -0.020323 -0.093789 -0.134579 0.019771 0.990706 1.003175 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.989477 0.005218 0.144594 -0.183960 -0.002174 0.999773 -0.021201 -0.097472 -0.144671 0.020663 0.989264 1.042127 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.988016 0.005151 0.154265 -0.194398 -0.001661 0.999740 -0.022744 -0.098495 -0.154342 0.022215 0.987768 1.080648 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.986536 0.004666 0.163480 -0.203514 -0.000877 0.999729 -0.023244 -0.102155 -0.163544 0.022788 0.986273 1.116974 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.985708 0.004739 0.168396 -0.208835 -0.000707 0.999712 -0.023996 -0.101844 -0.168461 0.023534 0.985427 1.135975 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.983998 0.004567 0.178118 -0.218586 -0.000164 0.999694 -0.024724 -0.102076 -0.178177 0.024299 0.983698 1.172261 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.982281 0.004238 0.187368 -0.224151 0.000597 0.999668 -0.025742 -0.101096 -0.187415 0.025398 0.981953 1.206498 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.980452 0.003816 0.196720 -0.228080 0.001448 0.999645 -0.026608 -0.099425 -0.196752 0.026373 0.980098 1.240007 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.978412 0.003292 0.206636 -0.232664 0.002347 0.999632 -0.027038 -0.097719 -0.206649 0.026939 0.978044 1.273694 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.976458 0.002254 0.215695 -0.233548 0.003710 0.999622 -0.027242 -0.096199 -0.215675 0.027401 0.976081 1.306537 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.974292 0.001561 0.225286 -0.234757 0.004757 0.999611 -0.027500 -0.093172 -0.225241 0.027864 0.973905 1.339746 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.971990 0.000502 0.235023 -0.235709 0.006099 0.999607 -0.027356 -0.089577 -0.234944 0.028023 0.971605 1.372923 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.970833 0.000188 0.239758 -0.235527 0.006581 0.999602 -0.027434 -0.086784 -0.239668 0.028212 0.970445 1.389073 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.968387 -0.000816 0.249451 -0.235524 0.007822 0.999602 -0.027096 
-0.081261 -0.249330 0.028190 0.968008 1.419631 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.965979 -0.001936 0.258613 -0.233372 0.009201 0.999596 -0.026885 -0.073222 -0.258457 0.028349 0.965607 1.447913 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.963616 -0.003024 0.267272 -0.230123 0.010605 0.999581 -0.026926 -0.063013 -0.267079 0.028781 0.963245 1.472872 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.961027 -0.003968 0.276425 -0.227916 0.011866 0.999568 -0.026904 -0.051422 -0.276199 0.029136 0.960659 1.496958 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.958526 -0.005060 0.284962 -0.223746 0.013176 0.999560 -0.026569 -0.039264 -0.284702 0.029221 0.958171 1.519233 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.955819 -0.006137 0.293891 -0.220222 0.014536 0.999546 -0.026403 -0.025538 -0.293596 0.029508 0.955474 1.539423 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.952971 -0.007035 0.302980 -0.216524 0.015648 0.999539 -0.026009 -0.011267 -0.302657 0.029527 0.952642 1.558276 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.951465 -0.007442 0.307668 -0.214925 0.016172 0.999535 -0.025835 -0.003861 -0.307333 0.029557 0.951143 1.566910 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.948365 -0.008160 0.317076 -0.211457 0.017117 0.999529 -0.025473 0.011136 -0.316718 0.029585 0.948058 1.582754 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.945262 -0.008890 0.326190 -0.206312 0.018098 0.999518 -0.025205 0.025217 -0.325809 0.029729 0.944968 1.595381 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.941983 -0.009645 0.335521 -0.200899 0.019081 0.999509 -0.024840 0.039427 -0.335117 0.029801 0.941705 1.605713 +0.479135 0.851796 0.500000 0.500000 0.000000 0.000000 0.938726 -0.010132 0.344516 -0.193746 0.019980 0.999487 -0.025046 0.055169 -0.344085 0.030395 0.938446 1.615121 diff --git a/assets/re10k_poses/2b4f6fdcabf53d59.txt b/assets/re10k_poses/2b4f6fdcabf53d59.txt new file mode 100644 index 0000000000000000000000000000000000000000..795f799f36d3c75199950fd8893d8222d87a4b0f --- /dev/null +++ b/assets/re10k_poses/2b4f6fdcabf53d59.txt @@ -0,0 +1,49 @@ +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.999983 0.000274 -0.005820 -0.010876 -0.000278 1.000000 -0.000593 -0.003817 0.005820 0.000595 0.999983 0.040582 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.999926 0.000534 -0.012176 -0.019437 -0.000545 0.999999 -0.000898 -0.008671 0.012176 0.000905 0.999925 0.081364 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.999822 0.000108 -0.018846 -0.028347 -0.000129 0.999999 -0.001076 -0.012120 0.018846 0.001079 0.999822 0.122045 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.999751 0.000015 -0.022334 -0.033046 -0.000036 1.000000 -0.000940 -0.014129 0.022334 0.000940 0.999750 0.143528 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.999575 -0.000153 -0.029159 -0.042116 0.000127 1.000000 -0.000898 -0.018864 0.029159 0.000894 0.999574 0.185059 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.999345 -0.000734 -0.036191 -0.050278 0.000668 0.999998 -0.001843 -0.022280 0.036192 0.001818 0.999343 0.225983 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.999067 -0.000979 -0.043170 -0.058234 0.000874 0.999997 -0.002430 -0.029651 0.043172 0.002390 0.999065 0.266512 +0.474328 0.843250 0.500000 0.500000 
0.000000 0.000000 0.998729 -0.001345 -0.050375 -0.066190 0.001132 0.999990 -0.004264 -0.033386 0.050381 0.004201 0.998721 0.306336 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.998341 -0.001310 -0.057564 -0.073777 0.000977 0.999983 -0.005825 -0.039559 0.057571 0.005760 0.998325 0.345022 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.997932 -0.000652 -0.064271 -0.081812 0.000184 0.999973 -0.007293 -0.044863 0.064274 0.007266 0.997906 0.383226 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.997438 0.000066 -0.071537 -0.089258 -0.000667 0.999965 -0.008381 -0.048706 0.071534 0.008407 0.997403 0.420917 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.997189 0.000309 -0.074932 -0.094145 -0.000982 0.999960 -0.008935 -0.050036 0.074926 0.008983 0.997149 0.440981 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.996598 0.001350 -0.082401 -0.102035 -0.002121 0.999955 -0.009264 -0.055781 0.082385 0.009408 0.996556 0.479971 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.996011 0.002504 -0.089197 -0.112845 -0.003399 0.999945 -0.009893 -0.061481 0.089167 0.010157 0.995965 0.520078 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.995354 0.003582 -0.096218 -0.123308 -0.004788 0.999913 -0.012308 -0.063034 0.096165 0.012711 0.995284 0.560508 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.994614 0.004645 -0.103543 -0.132509 -0.006053 0.999893 -0.013284 -0.070962 0.103470 0.013840 0.994536 0.602346 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.993914 0.006202 -0.109981 -0.144892 -0.007918 0.999853 -0.015177 -0.077985 0.109870 0.015956 0.993818 0.643666 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.993085 0.007922 -0.117127 -0.155186 -0.009972 0.999807 -0.016923 -0.087038 0.116970 0.017974 0.992973 0.684931 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.992204 0.009800 -0.124242 -0.164305 -0.012255 0.999744 -0.019011 -0.095056 0.124024 0.020385 0.992070 0.724264 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.991732 0.010834 -0.127867 -0.169222 -0.013491 0.999710 -0.019930 -0.098860 0.127614 0.021490 0.991591 0.742714 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.990685 0.012564 -0.135592 -0.180059 -0.015629 0.999645 -0.021563 -0.104111 0.135273 0.023481 0.990530 0.780910 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.989349 0.013878 -0.144900 -0.188385 -0.017309 0.999598 -0.022451 -0.109013 0.144530 0.024720 0.989192 0.818593 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.987884 0.015008 -0.154467 -0.196309 -0.018861 0.999546 -0.023508 -0.113338 0.154044 0.026137 0.987718 0.857002 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.986414 0.016079 -0.163488 -0.203636 -0.020370 0.999490 -0.024603 -0.118104 0.163009 0.027599 0.986239 0.895057 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.984840 0.017126 -0.172617 -0.210246 -0.021903 0.999427 -0.025806 -0.123364 0.172076 0.029195 0.984651 0.933214 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.983147 0.018235 -0.181906 -0.216250 -0.023596 0.999347 -0.027356 -0.127467 0.181289 0.031187 0.982935 0.973157 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.981386 0.019164 -0.191088 -0.223159 -0.025060 0.999280 -0.028487 -0.132498 0.190404 0.032746 0.981160 1.012769 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.980324 0.019446 -0.196433 -0.225475 -0.025616 0.999254 -0.028918 -0.135483 0.195724 0.033381 0.980091 1.032790 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.978132 
0.019942 -0.207028 -0.231276 -0.026722 0.999193 -0.030004 -0.140903 0.206263 0.034880 0.977875 1.072695 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.975769 0.020721 -0.217822 -0.236454 -0.028211 0.999111 -0.031330 -0.146610 0.216979 0.036716 0.975486 1.111645 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.973231 0.021154 -0.228853 -0.240282 -0.029603 0.998998 -0.033552 -0.150778 0.227914 0.039429 0.972883 1.149310 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.970598 0.021509 -0.239745 -0.243270 -0.030956 0.998883 -0.035710 -0.155488 0.238709 0.042082 0.970179 1.184824 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.967855 0.021903 -0.250552 -0.245002 -0.032389 0.998760 -0.037807 -0.159955 0.249414 0.044707 0.967365 1.219566 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.965093 0.021967 -0.260986 -0.246690 -0.033300 0.998681 -0.039080 -0.163435 0.259783 0.046407 0.964551 1.254090 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.962360 0.021377 -0.270938 -0.248889 -0.033284 0.998668 -0.039429 -0.166546 0.269734 0.046962 0.961789 1.288966 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.960998 0.020914 -0.275764 -0.250165 -0.033079 0.998671 -0.039533 -0.167941 0.274571 0.047113 0.960412 1.306411 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.958197 0.020245 -0.285391 -0.252838 -0.032770 0.998695 -0.039181 -0.172319 0.284226 0.046895 0.957610 1.343018 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.955430 0.019858 -0.294548 -0.257970 -0.033016 0.998663 -0.039769 -0.175176 0.293364 0.047721 0.954809 1.380689 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.952584 0.019546 -0.303647 -0.263939 -0.033208 0.998652 -0.039894 -0.180444 0.302458 0.048085 0.951949 1.419289 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.949803 0.019950 -0.312213 -0.271473 -0.034074 0.998625 -0.039848 -0.187992 0.310989 0.048486 0.949176 1.456800 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.946814 0.020163 -0.321148 -0.278371 -0.034912 0.998580 -0.040232 -0.195439 0.319881 0.049304 0.946174 1.494038 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.943923 0.020716 -0.329516 -0.286073 -0.036195 0.998507 -0.040909 -0.202381 0.328177 0.050541 0.943263 1.529263 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.940929 0.020827 -0.337962 -0.292081 -0.037339 0.998401 -0.042431 -0.207094 0.336538 0.052544 0.940203 1.563252 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.939401 0.020845 -0.342187 -0.294775 -0.037899 0.998346 -0.043226 -0.209022 0.340720 0.053575 0.938637 1.579924 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.936352 0.021441 -0.350407 -0.300334 -0.039546 0.998222 -0.044593 -0.212743 0.348828 0.055612 0.935535 1.612898 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.933391 0.021923 -0.358190 -0.304990 -0.040857 0.998134 -0.045376 -0.216936 0.356527 0.056988 0.932545 1.645313 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.930548 0.021771 -0.365521 -0.311132 -0.041592 0.998055 -0.046441 -0.219298 0.363799 0.058419 0.929644 1.678555 +0.474328 0.843250 0.500000 0.500000 0.000000 0.000000 0.927420 0.021146 -0.373423 -0.315755 -0.041579 0.998041 -0.046746 -0.223595 0.371703 0.058879 0.926483 1.711262 diff --git a/assets/re10k_poses/2bd7cee1fa9c8996.txt b/assets/re10k_poses/2bd7cee1fa9c8996.txt new file mode 100644 index 0000000000000000000000000000000000000000..533a193090dbd5a50d2f8b94e96a5ccfc5481dc8 --- /dev/null +++ 
b/assets/re10k_poses/2bd7cee1fa9c8996.txt @@ -0,0 +1,49 @@ +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 -0.000000 1.000000 0.000000 -0.000000 -0.000000 -0.000000 1.000000 -0.000000 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999985 0.000330 -0.005409 0.000656 -0.000329 1.000000 0.000158 -0.000149 0.005409 -0.000156 0.999985 -0.006829 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999941 0.000838 -0.010843 0.001956 -0.000836 1.000000 0.000164 0.000236 0.010844 -0.000155 0.999941 -0.013323 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999864 0.001295 -0.016443 0.002872 -0.001291 0.999999 0.000237 0.000231 0.016443 -0.000216 0.999865 -0.019596 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999813 0.001580 -0.019284 0.003783 -0.001575 0.999999 0.000225 0.000377 0.019284 -0.000195 0.999814 -0.022641 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999689 0.002138 -0.024837 0.004775 -0.002133 0.999998 0.000217 0.000684 0.024837 -0.000164 0.999691 -0.029130 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999533 0.002591 -0.030444 0.005806 -0.002582 0.999997 0.000330 0.000845 0.030445 -0.000251 0.999536 -0.035927 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999344 0.003030 -0.036081 0.006767 -0.003015 0.999995 0.000466 0.000740 0.036082 -0.000357 0.999349 -0.042292 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.999126 0.003447 -0.041650 0.007893 -0.003427 0.999994 0.000554 0.000947 0.041651 -0.000411 0.999132 -0.048768 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.998871 0.003943 -0.047345 0.009014 -0.003921 0.999992 0.000554 0.001219 0.047347 -0.000368 0.998878 -0.054999 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.998594 0.004469 -0.052826 0.009600 -0.004441 0.999990 0.000647 0.001250 0.052828 -0.000411 0.998604 -0.062316 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.998273 0.005056 -0.058531 0.011051 -0.005023 0.999987 0.000721 0.001533 0.058534 -0.000426 0.998285 -0.068586 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.998109 0.005315 -0.061242 0.011538 -0.005280 0.999986 0.000724 0.001383 0.061245 -0.000400 0.998123 -0.071648 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.997744 0.005773 -0.066882 0.012501 -0.005729 0.999983 0.000850 0.001555 0.066886 -0.000465 0.997761 -0.078355 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.997345 0.006203 -0.072562 0.013411 -0.006146 0.999981 0.001009 0.001468 0.072567 -0.000561 0.997363 -0.084935 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.996910 0.006729 -0.078270 0.014465 -0.006670 0.999977 0.001024 0.001957 0.078275 -0.000499 0.996932 -0.091476 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.996448 0.007355 -0.083887 0.014931 -0.007287 0.999973 0.001123 0.002230 0.083893 -0.000508 0.996475 -0.098135 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.995944 0.007753 -0.089637 0.015976 -0.007668 0.999970 0.001289 0.002097 0.089644 -0.000596 0.995974 -0.104276 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.995416 0.008438 -0.095270 0.016789 -0.008345 0.999964 0.001371 0.002357 0.095278 -0.000570 0.995450 -0.110929 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.994857 0.008791 -0.100909 0.017506 -0.008672 0.999961 0.001620 0.002163 0.100919 -0.000736 0.994894 -0.117502 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.994553 0.009200 -0.103830 0.018042 -0.009082 0.999957 0.001612 0.002439 0.103840 -0.000660 
0.994594 -0.120973 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.993951 0.009666 -0.109400 0.019008 -0.009528 0.999953 0.001781 0.002477 0.109412 -0.000728 0.993996 -0.127339 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.993317 0.010153 -0.114969 0.020210 -0.009992 0.999948 0.001981 0.002454 0.114983 -0.000819 0.993367 -0.134286 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.992685 0.010599 -0.120265 0.020847 -0.010419 0.999943 0.002128 0.002429 0.120281 -0.000859 0.992740 -0.140898 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.991991 0.011158 -0.125815 0.022155 -0.010960 0.999937 0.002263 0.002615 0.125832 -0.000866 0.992051 -0.147497 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.991296 0.011614 -0.131138 0.022814 -0.011401 0.999932 0.002372 0.002912 0.131157 -0.000856 0.991361 -0.154554 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.990568 0.012264 -0.136475 0.023808 -0.012032 0.999924 0.002523 0.002887 0.136496 -0.000857 0.990640 -0.161414 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.989799 0.012753 -0.141901 0.025038 -0.012509 0.999918 0.002615 0.003321 0.141923 -0.000813 0.989877 -0.168118 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.989418 0.012897 -0.144520 0.025519 -0.012635 0.999916 0.002730 0.003340 0.144543 -0.000875 0.989498 -0.171500 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.988626 0.013481 -0.149792 0.026145 -0.013213 0.999909 0.002780 0.003754 0.149815 -0.000769 0.988714 -0.178699 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.987796 0.013826 -0.155140 0.026998 -0.013520 0.999904 0.003027 0.003521 0.155167 -0.000893 0.987888 -0.185325 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.986932 0.014308 -0.160503 0.028147 -0.013962 0.999897 0.003282 0.003311 0.160534 -0.000998 0.987030 -0.191993 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.986051 0.014672 -0.165797 0.028976 -0.014306 0.999892 0.003396 0.003522 0.165829 -0.000977 0.986154 -0.198723 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.985133 0.015068 -0.171129 0.030233 -0.014662 0.999886 0.003636 0.003593 0.171164 -0.001073 0.985242 -0.205194 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.984187 0.015542 -0.176449 0.031151 -0.015120 0.999879 0.003733 0.003879 0.176486 -0.001006 0.984303 -0.211897 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.983218 0.016069 -0.181727 0.032090 -0.015619 0.999870 0.003906 0.003958 0.181766 -0.001002 0.983341 -0.218536 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.982708 0.016351 -0.184438 0.032784 -0.015887 0.999866 0.003994 0.003882 0.184479 -0.000994 0.982836 -0.221906 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.981673 0.016666 -0.189841 0.034037 -0.016181 0.999861 0.004107 0.004143 0.189883 -0.000960 0.981806 -0.228517 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.980591 0.017226 -0.195305 0.035374 -0.016708 0.999851 0.004299 0.004190 0.195350 -0.000952 0.980733 -0.235325 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.979514 0.017622 -0.200603 0.036209 -0.017069 0.999844 0.004485 0.004334 0.200651 -0.000969 0.979662 -0.241731 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.978373 0.018206 -0.206046 0.037437 -0.017623 0.999834 0.004666 0.004522 0.206096 -0.000934 0.978531 -0.248238 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.977211 0.018532 -0.211459 0.037885 -0.017927 0.999828 0.004778 0.004847 0.211511 -0.000879 0.977375 -0.255142 +0.484271 0.860926 0.500000 
0.500000 0.000000 0.000000 0.976012 0.019050 -0.216884 0.039135 -0.018408 0.999818 0.004982 0.004922 0.216939 -0.000870 0.976185 -0.261305 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.974789 0.019679 -0.222261 0.039791 -0.019036 0.999806 0.005032 0.005336 0.222317 -0.000674 0.974974 -0.268106 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.974151 0.019788 -0.225030 0.040594 -0.019098 0.999804 0.005244 0.005056 0.225090 -0.000811 0.974338 -0.271040 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.972891 0.020362 -0.230367 0.041332 -0.019663 0.999792 0.005330 0.005537 0.230428 -0.000655 0.973089 -0.277575 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.971580 0.020817 -0.235793 0.042519 -0.020072 0.999783 0.005560 0.005671 0.235857 -0.000669 0.971787 -0.283897 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.970239 0.021250 -0.241215 0.043585 -0.020484 0.999774 0.005686 0.005986 0.241281 -0.000575 0.970455 -0.290670 +0.484271 0.860926 0.500000 0.500000 0.000000 0.000000 0.968914 0.021827 -0.246432 0.043821 -0.021013 0.999762 0.005934 0.005978 0.246503 -0.000572 0.969142 -0.297111 diff --git a/assets/re10k_poses/2bff9ec89ca982c9.txt b/assets/re10k_poses/2bff9ec89ca982c9.txt new file mode 100644 index 0000000000000000000000000000000000000000..345cf81c8d8e1612a13592206c07fab7e904a94b --- /dev/null +++ b/assets/re10k_poses/2bff9ec89ca982c9.txt @@ -0,0 +1,49 @@ +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 0.000000 1.000000 -0.000000 0.000000 -0.000000 -0.000000 1.000000 0.000000 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.999897 0.002324 -0.014159 0.026021 -0.002323 0.999997 0.000138 -0.009400 0.014159 -0.000105 0.999900 0.059376 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.999532 0.004330 -0.030279 0.051997 -0.004340 0.999991 -0.000262 -0.021375 0.030278 0.000393 0.999541 0.123797 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.998814 0.006311 -0.048278 0.079519 -0.006329 0.999980 -0.000216 -0.033931 0.048276 0.000521 0.998834 0.193079 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.998386 0.007339 -0.056308 0.092190 -0.007353 0.999973 -0.000052 -0.039643 0.056306 0.000466 0.998413 0.226595 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.997311 0.008934 -0.072745 0.118203 -0.008985 0.999960 -0.000374 -0.049467 0.072739 0.001027 0.997350 0.293440 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.995958 0.010773 -0.089176 0.144259 -0.010822 0.999941 -0.000068 -0.057259 0.089170 0.001033 0.996016 0.362029 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.994296 0.012484 -0.105923 0.171418 -0.012573 0.999921 -0.000173 -0.062901 0.105913 0.001504 0.994374 0.434094 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.992399 0.014137 -0.122246 0.199304 -0.014232 0.999899 0.000092 -0.066528 0.122235 0.001649 0.992500 0.509865 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.990195 0.015604 -0.138815 0.227993 -0.015724 0.999876 0.000238 -0.070378 0.138802 0.001947 0.990318 0.588065 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.987766 0.017325 -0.154978 0.256049 -0.017414 0.999848 0.000784 -0.076363 0.154968 0.001924 0.987918 0.666504 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.984948 0.019131 -0.171791 0.282590 -0.019201 0.999815 0.001252 -0.085113 0.171783 0.002066 0.985133 0.744662 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.983436 0.020091 -0.180137 0.295254 -0.020143 0.999796 0.001543 -0.090415 
0.180131 0.002111 0.983640 0.783141 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.980066 0.021941 -0.197457 0.321437 -0.021962 0.999757 0.002084 -0.100742 0.197455 0.002295 0.980309 0.859164 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.976519 0.023850 -0.214108 0.347346 -0.023868 0.999712 0.002500 -0.110018 0.214106 0.002669 0.976807 0.933428 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.972594 0.025588 -0.231096 0.374147 -0.025634 0.999667 0.002808 -0.119008 0.231091 0.003193 0.972927 1.006998 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.968557 0.027407 -0.247277 0.399689 -0.027425 0.999618 0.003369 -0.127224 0.247275 0.003519 0.968939 1.080326 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.964121 0.029075 -0.263867 0.424254 -0.029134 0.999569 0.003690 -0.135122 0.263860 0.004130 0.964552 1.151247 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.959633 0.030781 -0.279567 0.446264 -0.030806 0.999516 0.004303 -0.142286 0.279564 0.004483 0.960117 1.216115 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.954751 0.032547 -0.295619 0.466625 -0.032607 0.999457 0.004728 -0.148820 0.295612 0.005126 0.955294 1.277755 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.952193 0.033358 -0.303671 0.476423 -0.033447 0.999428 0.004909 -0.151214 0.303661 0.005482 0.952764 1.308142 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.946711 0.035018 -0.320176 0.497445 -0.035103 0.999368 0.005509 -0.154191 0.320167 0.006023 0.947342 1.370851 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.941068 0.036618 -0.336230 0.518821 -0.036714 0.999307 0.006073 -0.155649 0.336219 0.006629 0.941760 1.434526 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.935004 0.038245 -0.352568 0.542762 -0.038404 0.999241 0.006547 -0.155317 0.352551 0.007419 0.935763 1.499789 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.928313 0.039847 -0.369658 0.568687 -0.039989 0.999174 0.007282 -0.154440 0.369643 0.008023 0.929139 1.564249 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.920977 0.041568 -0.387394 0.595933 -0.041788 0.999096 0.007859 -0.153736 0.387370 0.008951 0.921881 1.629508 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.912836 0.043256 -0.406029 0.625278 -0.043677 0.999012 0.008234 -0.152654 0.405984 0.010217 0.913823 1.698288 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.903990 0.045021 -0.425176 0.657225 -0.045464 0.998924 0.009111 -0.151581 0.425129 0.011094 0.905065 1.770352 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.899223 0.045919 -0.435074 0.673571 -0.046417 0.998877 0.009487 -0.151491 0.435021 0.011664 0.900345 1.807215 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.888747 0.047817 -0.455898 0.709352 -0.048434 0.998773 0.010336 -0.155933 0.455832 0.012895 0.889972 1.878813 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.876771 0.050338 -0.478267 0.752175 -0.050939 0.998633 0.011724 -0.162468 0.478203 0.014083 0.878136 1.951436 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.864520 0.051994 -0.499902 0.799755 -0.052692 0.998530 0.012730 -0.168717 0.499828 0.015335 0.865989 2.027966 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.850998 0.053820 -0.522404 0.845949 -0.054815 0.998404 0.013566 -0.176977 0.522300 0.017091 0.852590 2.104931 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.836391 0.056476 -0.545216 0.889401 -0.057450 0.998232 0.015270 -0.186996 0.545115 0.018551 0.838156 2.179345 +0.497319 0.884124 0.500000 
0.500000 0.000000 0.000000 0.820821 0.058387 -0.568194 0.928250 -0.059734 0.998082 0.016269 -0.197658 0.568054 0.020587 0.822734 2.250248 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.805151 0.060955 -0.589929 0.958668 -0.062284 0.997894 0.018102 -0.204944 0.589790 0.022168 0.807252 2.316562 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.796637 0.062198 -0.601250 0.973154 -0.063559 0.997797 0.019006 -0.208209 0.601107 0.023074 0.798835 2.351170 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.779352 0.064198 -0.623289 0.998340 -0.065969 0.997616 0.020266 -0.211364 0.623104 0.025323 0.781729 2.420085 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.761591 0.066956 -0.644590 1.019801 -0.068387 0.997398 0.022804 -0.211089 0.644439 0.026714 0.764189 2.488975 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.743650 0.069371 -0.664960 1.039719 -0.070761 0.997183 0.024895 -0.209020 0.664814 0.028540 0.746464 2.557999 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.725058 0.071800 -0.684935 1.059024 -0.073093 0.996956 0.027133 -0.206816 0.684798 0.030391 0.728099 2.628687 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.706432 0.074370 -0.703863 1.075163 -0.075529 0.996707 0.029507 -0.205321 0.703740 0.032317 0.709722 2.703440 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.687673 0.076240 -0.722006 1.085937 -0.077715 0.996487 0.031204 -0.206246 0.721849 0.034652 0.691182 2.780448 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.668181 0.078386 -0.739858 1.093372 -0.079660 0.996255 0.033608 -0.210897 0.739722 0.036481 0.671923 2.857585 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.658419 0.079464 -0.748445 1.096167 -0.080635 0.996135 0.034826 -0.214928 0.748319 0.037421 0.662282 2.895822 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.637973 0.081084 -0.765778 1.103610 -0.082358 0.995922 0.036840 -0.225527 0.765642 0.039565 0.642049 2.969662 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.617274 0.083415 -0.782314 1.112704 -0.084269 0.995653 0.039672 -0.237335 0.782222 0.041436 0.621620 3.043346 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.596393 0.085307 -0.798146 1.119668 -0.086233 0.995391 0.041953 -0.248458 0.798047 0.043806 0.601001 3.116273 +0.497319 0.884124 0.500000 0.500000 0.000000 0.000000 0.574961 0.086951 -0.813547 1.123553 -0.087938 0.995144 0.044211 -0.258951 0.813441 0.046123 0.579816 3.185046 diff --git a/assets/re10k_poses/407eefe8017f6070.txt b/assets/re10k_poses/407eefe8017f6070.txt new file mode 100644 index 0000000000000000000000000000000000000000..05417e49eb2c48196eddf5b2c2d30dbc759b0aa1 --- /dev/null +++ b/assets/re10k_poses/407eefe8017f6070.txt @@ -0,0 +1,49 @@ +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 0.000000 1.000000 -0.000000 0.000000 0.000000 -0.000000 1.000000 0.000000 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 1.000000 0.000192 0.000330 -0.011279 -0.000192 1.000000 -0.000687 -0.002436 -0.000330 0.000687 1.000000 0.027542 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 1.000000 0.000374 0.000662 -0.023512 -0.000374 0.999999 -0.001064 -0.004608 -0.000662 0.001063 0.999999 0.055032 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 1.000000 0.000515 0.000772 -0.036727 -0.000514 0.999999 -0.001358 -0.007364 -0.000773 0.001358 0.999999 0.083311 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 1.000000 0.000498 0.000529 -0.042916 -0.000497 0.999999 -0.001550 -0.008861 -0.000529 
0.001550 0.999999 0.097641 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 1.000000 0.000672 0.000480 -0.055582 -0.000671 0.999997 -0.002302 -0.012520 -0.000482 0.002302 0.999997 0.126017 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999999 0.001042 0.000645 -0.067724 -0.001040 0.999996 -0.002650 -0.016048 -0.000648 0.002649 0.999996 0.154257 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999998 0.001237 0.001382 -0.081251 -0.001232 0.999993 -0.003442 -0.019737 -0.001386 0.003440 0.999993 0.181957 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999999 0.001009 0.001147 -0.094230 -0.001005 0.999992 -0.003810 -0.023564 -0.001151 0.003809 0.999992 0.209621 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999999 0.000963 0.000640 -0.106398 -0.000960 0.999989 -0.004538 -0.026456 -0.000645 0.004537 0.999989 0.236803 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999999 0.000952 0.000592 -0.118676 -0.000949 0.999986 -0.005302 -0.028627 -0.000597 0.005301 0.999986 0.263776 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999999 0.001154 0.000838 -0.130387 -0.001149 0.999981 -0.005999 -0.029535 -0.000845 0.005998 0.999982 0.290513 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999999 0.001132 0.000950 -0.136268 -0.001126 0.999980 -0.006220 -0.029935 -0.000957 0.006218 0.999980 0.303972 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999998 0.001353 0.001183 -0.147770 -0.001344 0.999975 -0.006901 -0.029901 -0.001192 0.006899 0.999976 0.331031 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999997 0.001618 0.001631 -0.159771 -0.001606 0.999973 -0.007207 -0.030368 -0.001643 0.007204 0.999973 0.358456 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999997 0.001698 0.001614 -0.173370 -0.001685 0.999967 -0.007974 -0.030846 -0.001628 0.007971 0.999967 0.386855 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999997 0.001924 0.001180 -0.184553 -0.001915 0.999966 -0.008027 -0.032693 -0.001195 0.008025 0.999967 0.415576 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999996 0.001827 0.002023 -0.197701 -0.001809 0.999959 -0.008892 -0.035309 -0.002039 0.008888 0.999958 0.444552 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999997 0.001721 0.001546 -0.208412 -0.001706 0.999956 -0.009274 -0.039185 -0.001562 0.009271 0.999956 0.473935 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999994 0.001958 0.002831 -0.221277 -0.001930 0.999949 -0.009917 -0.042569 -0.002850 0.009911 0.999947 0.502434 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999993 0.002023 0.003015 -0.227225 -0.001992 0.999946 -0.010231 -0.044746 -0.003035 0.010225 0.999943 0.516686 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999990 0.002099 0.004006 -0.239450 -0.002055 0.999937 -0.011002 -0.047904 -0.004029 0.010994 0.999931 0.544149 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999987 0.002081 0.004633 -0.252782 -0.002027 0.999930 -0.011660 -0.051359 -0.004657 0.011650 0.999921 0.571882 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999984 0.002263 0.005167 -0.265025 -0.002200 0.999923 -0.012254 -0.054227 -0.005195 0.012242 0.999912 0.599599 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999976 0.002285 0.006585 -0.278309 -0.002202 0.999918 -0.012623 -0.057211 -0.006614 0.012608 0.999899 0.627726 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999971 0.002076 0.007380 -0.291958 -0.001976 0.999906 -0.013604 -0.060015 -0.007407 0.013589 0.999880 0.656224 
+0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999963 0.001895 0.008342 -0.305551 -0.001780 0.999902 -0.013863 -0.063016 -0.008368 0.013847 0.999869 0.684694 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999963 0.001904 0.008385 -0.318687 -0.001779 0.999889 -0.014781 -0.066332 -0.008412 0.014766 0.999856 0.712869 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999961 0.001944 0.008578 -0.324241 -0.001815 0.999886 -0.015009 -0.068283 -0.008606 0.014993 0.999851 0.726721 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999940 0.001978 0.010772 -0.335769 -0.001806 0.999871 -0.015973 -0.072369 -0.010802 0.015952 0.999814 0.754293 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999921 0.001622 0.012440 -0.349131 -0.001415 0.999861 -0.016619 -0.076440 -0.012465 0.016600 0.999785 0.781228 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999906 0.001571 0.013630 -0.360545 -0.001331 0.999844 -0.017622 -0.079349 -0.013655 0.017603 0.999752 0.808080 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999891 0.001443 0.014692 -0.372093 -0.001171 0.999828 -0.018526 -0.082157 -0.014716 0.018507 0.999720 0.834599 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999862 0.001325 0.016570 -0.382348 -0.001003 0.999811 -0.019398 -0.083474 -0.016593 0.019378 0.999675 0.860887 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999831 0.001175 0.018352 -0.393599 -0.000802 0.999793 -0.020313 -0.084294 -0.018372 0.020295 0.999625 0.887115 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999796 0.001132 0.020183 -0.404349 -0.000712 0.999783 -0.020814 -0.084635 -0.020202 0.020795 0.999580 0.914220 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999777 0.001123 0.021075 -0.409107 -0.000682 0.999781 -0.020918 -0.084869 -0.021094 0.020899 0.999559 0.927905 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999730 0.001012 0.023207 -0.420579 -0.000518 0.999774 -0.021275 -0.085700 -0.023223 0.021258 0.999504 0.956024 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999702 0.001104 0.024387 -0.431972 -0.000581 0.999770 -0.021433 -0.086826 -0.024405 0.021412 0.999473 0.984514 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999649 0.001490 0.026445 -0.441903 -0.000905 0.999755 -0.022119 -0.089888 -0.026471 0.022087 0.999406 1.013673 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999608 0.001258 0.027971 -0.453844 -0.000623 0.999742 -0.022694 -0.092862 -0.027993 0.022668 0.999351 1.042317 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999567 0.001436 0.029393 -0.464570 -0.000745 0.999724 -0.023501 -0.096638 -0.029419 0.023469 0.999292 1.070822 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999529 0.001655 0.030655 -0.475674 -0.000930 0.999720 -0.023664 -0.099788 -0.030686 0.023624 0.999250 1.098712 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999473 0.001254 0.032432 -0.486086 -0.000467 0.999706 -0.024256 -0.102732 -0.032453 0.024228 0.999180 1.126361 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999430 0.001171 0.033737 -0.491839 -0.000352 0.999705 -0.024281 -0.103756 -0.033755 0.024255 0.999136 1.140245 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999383 0.001258 0.035091 -0.503847 -0.000396 0.999699 -0.024540 -0.106702 -0.035112 0.024511 0.999083 1.167994 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999342 0.001365 0.036241 -0.516362 -0.000455 0.999685 -0.025101 -0.108587 -0.036264 0.025068 0.999028 1.196527 +0.480777 0.854715 
0.500000 0.500000 0.000000 0.000000 0.999304 0.001516 0.037264 -0.528497 -0.000568 0.999676 -0.025440 -0.110041 -0.037290 0.025401 0.998982 1.225300 +0.480777 0.854715 0.500000 0.500000 0.000000 0.000000 0.999254 0.001515 0.038600 -0.541444 -0.000524 0.999670 -0.025669 -0.112626 -0.038626 0.025630 0.998925 1.254329 diff --git a/assets/re10k_poses/4118895a33890c5a.txt b/assets/re10k_poses/4118895a33890c5a.txt new file mode 100644 index 0000000000000000000000000000000000000000..af11e4d594ec43939982532dc9f29311cb600396 --- /dev/null +++ b/assets/re10k_poses/4118895a33890c5a.txt @@ -0,0 +1,49 @@ +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 -0.000000 1.000000 -0.000000 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.999929 -0.002090 0.011700 0.018388 0.002099 0.999997 -0.000809 -0.001126 -0.011698 0.000833 0.999931 0.015357 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.999722 -0.003967 0.023261 0.037211 0.004015 0.999990 -0.002002 -0.002332 -0.023253 0.002095 0.999727 0.030883 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.999383 -0.006283 0.034545 0.054412 0.006421 0.999972 -0.003866 -0.002045 -0.034520 0.004086 0.999396 0.050968 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.999178 -0.007673 0.039795 0.062394 0.007885 0.999956 -0.005159 -0.000939 -0.039754 0.005469 0.999195 0.062129 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.998767 -0.009386 0.048748 0.079698 0.009683 0.999936 -0.005863 -0.003406 -0.048690 0.006328 0.998794 0.085244 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.998263 -0.011477 0.057784 0.095209 0.011889 0.999906 -0.006787 -0.004739 -0.057701 0.007462 0.998306 0.110808 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.997647 -0.012882 0.067341 0.109019 0.013387 0.999886 -0.007048 -0.008924 -0.067243 0.007933 0.997705 0.139561 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.996895 -0.014493 0.077402 0.121617 0.015101 0.999859 -0.007279 -0.013550 -0.077285 0.008425 0.996973 0.171617 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.996028 -0.015797 0.087631 0.133660 0.016501 0.999837 -0.007313 -0.019480 -0.087502 0.008730 0.996126 0.205027 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.995022 -0.016752 0.098240 0.143957 0.017550 0.999820 -0.007267 -0.025990 -0.098100 0.008955 0.995136 0.240096 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.993871 -0.016853 0.109256 0.153035 0.017709 0.999820 -0.006868 -0.034439 -0.109120 0.008760 0.993990 0.274839 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.993239 -0.017229 0.114801 0.157513 0.018176 0.999809 -0.007206 -0.036934 -0.114655 0.009244 0.993362 0.291797 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.991878 -0.017175 0.126025 0.165589 0.018198 0.999810 -0.006975 -0.043814 -0.125881 0.009212 0.992003 0.326197 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.990465 -0.017801 0.136610 0.173699 0.018931 0.999796 -0.006979 -0.047973 -0.136458 0.009499 0.990600 0.360341 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.989135 -0.017853 0.145923 0.180554 0.018829 0.999809 -0.005305 -0.054380 -0.145800 0.007994 0.989282 0.396576 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.987817 -0.018846 0.154475 0.187036 0.019907 0.999788 -0.005324 -0.054942 -0.154342 0.008334 0.987982 0.435335 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.986567 -0.018675 0.162287 0.194698 0.019773 0.999791 
-0.005148 -0.057746 -0.162157 0.008288 0.986730 0.474401 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.985330 -0.018232 0.169683 0.203361 0.019405 0.999798 -0.005260 -0.059204 -0.169553 0.008476 0.985485 0.514097 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.984073 -0.017660 0.176887 0.212695 0.018849 0.999810 -0.005045 -0.060287 -0.176764 0.008299 0.984218 0.555117 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.983424 -0.017209 0.180500 0.217198 0.018396 0.999819 -0.004907 -0.060264 -0.180383 0.008146 0.983563 0.577167 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.982103 -0.016245 0.187640 0.225919 0.017405 0.999838 -0.004536 -0.060257 -0.187536 0.007721 0.982227 0.621457 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.981059 -0.015883 0.193056 0.236523 0.017013 0.999846 -0.004198 -0.059625 -0.192960 0.007403 0.981179 0.665481 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.980182 -0.015960 0.197454 0.246723 0.016993 0.999849 -0.003539 -0.060489 -0.197368 0.006824 0.980306 0.708284 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.979314 -0.015732 0.201732 0.257470 0.016557 0.999860 -0.002401 -0.062629 -0.201666 0.005692 0.979438 0.749880 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.978445 -0.015250 0.205943 0.268956 0.015758 0.999875 -0.000829 -0.066501 -0.205905 0.004057 0.978564 0.790945 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.977558 -0.014954 0.210137 0.280878 0.015200 0.999884 0.000448 -0.070276 -0.210119 0.002756 0.977672 0.831286 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.976688 -0.015041 0.214138 0.294171 0.015083 0.999885 0.001436 -0.074557 -0.214135 0.001828 0.976802 0.871503 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.976274 -0.014769 0.216036 0.300910 0.014636 0.999890 0.002219 -0.077225 -0.216045 0.000995 0.976383 0.892413 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.975696 -0.014897 0.218621 0.315586 0.014356 0.999889 0.004062 -0.081701 -0.218657 -0.000825 0.975801 0.936239 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.975444 -0.015550 0.219700 0.331164 0.014479 0.999874 0.006487 -0.085385 -0.219773 -0.003147 0.975546 0.981525 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.975325 -0.016052 0.220191 0.348039 0.014305 0.999852 0.009526 -0.090879 -0.220312 -0.006141 0.975410 1.027279 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.975131 -0.016550 0.221008 0.364738 0.014184 0.999824 0.012288 -0.097606 -0.221173 -0.008847 0.975195 1.072202 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.975049 -0.017070 0.221333 0.383586 0.014087 0.999787 0.015053 -0.106208 -0.221543 -0.011559 0.975082 1.114022 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.974993 -0.017591 0.221539 0.402182 0.014137 0.999753 0.017166 -0.114338 -0.221786 -0.013605 0.975000 1.151848 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.975218 -0.018421 0.220479 0.421241 0.014596 0.999714 0.018967 -0.121487 -0.220765 -0.015278 0.975207 1.186979 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.975432 -0.018391 0.219531 0.430045 0.014359 0.999698 0.019945 -0.125347 -0.219831 -0.016303 0.975402 1.204347 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.976062 -0.018335 0.216720 0.449412 0.014041 0.999674 0.021340 -0.131833 -0.217041 -0.017786 0.976000 1.237450 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.976850 -0.017452 0.213210 0.470849 0.012954 0.999664 0.022473 -0.137161 -0.213531 -0.019191 0.976748 
1.269373 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.977581 -0.016111 0.209941 0.491550 0.011555 0.999671 0.022911 -0.140656 -0.210241 -0.019972 0.977446 1.302023 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.978572 -0.014703 0.205380 0.517261 0.010115 0.999676 0.023374 -0.144073 -0.205657 -0.020796 0.978403 1.332913 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.979552 -0.013222 0.200757 0.543940 0.008613 0.999679 0.023815 -0.147083 -0.201007 -0.021599 0.979352 1.365068 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.980551 -0.012110 0.195892 0.571768 0.007636 0.999693 0.023576 -0.149039 -0.196118 -0.021622 0.980342 1.397076 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.981599 -0.010543 0.190660 0.599852 0.006168 0.999704 0.023526 -0.153562 -0.190852 -0.021917 0.981374 1.428958 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.982100 -0.009839 0.188101 0.613282 0.005578 0.999716 0.023172 -0.155654 -0.188276 -0.021708 0.981876 1.444590 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.983179 -0.007927 0.182475 0.640218 0.003867 0.999737 0.022595 -0.161715 -0.182606 -0.021509 0.982951 1.474119 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.984252 -0.006040 0.176668 0.666188 0.002318 0.999771 0.021265 -0.167869 -0.176756 -0.020521 0.984041 1.502077 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.985342 -0.003545 0.170552 0.690547 0.000078 0.999793 0.020332 -0.176250 -0.170589 -0.020020 0.985139 1.529668 +0.475744 0.845767 0.500000 0.500000 0.000000 0.000000 0.986393 -0.000655 0.164405 0.712575 -0.002482 0.999819 0.018880 -0.184228 -0.164388 -0.019031 0.986212 1.556736 diff --git a/assets/re10k_poses/56f35362431e18dd.txt b/assets/re10k_poses/56f35362431e18dd.txt new file mode 100644 index 0000000000000000000000000000000000000000..057c431f78ea64412209291d7752e7b61ad801a4 --- /dev/null +++ b/assets/re10k_poses/56f35362431e18dd.txt @@ -0,0 +1,49 @@ +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 -0.000000 -0.000000 -0.000000 1.000000 -0.000000 0.000000 -0.000000 -0.000000 1.000000 0.000000 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.999946 0.000152 -0.010367 -0.010672 -0.000153 1.000000 -0.000067 0.000324 0.010367 0.000069 0.999946 0.004125 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.999785 0.000277 -0.020729 -0.022690 -0.000280 1.000000 -0.000142 0.000744 0.020729 0.000148 0.999785 0.007883 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.999510 0.000412 -0.031305 -0.036088 -0.000414 1.000000 -0.000073 0.001213 0.031305 0.000086 0.999510 0.011557 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.999319 0.000579 -0.036901 -0.042202 -0.000574 1.000000 0.000147 0.000387 0.036901 -0.000126 0.999319 0.013353 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.998850 0.000835 -0.047927 -0.055002 -0.000842 1.000000 -0.000111 -0.000195 0.047927 0.000151 0.998851 0.017723 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.998264 0.001004 -0.058893 -0.068181 -0.001017 0.999999 -0.000206 -0.001439 0.058893 0.000265 0.998264 0.020936 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.997543 0.001239 -0.070049 -0.080672 -0.001234 0.999999 0.000123 -0.002348 0.070049 -0.000036 0.997544 0.024880 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.996701 0.001509 -0.081146 -0.093346 -0.001523 0.999999 -0.000114 -0.003070 0.081146 0.000237 0.996702 0.028525 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.995718 0.001551 -0.092425 
-0.104868 -0.001565 0.999999 -0.000072 -0.003941 0.092425 0.000216 0.995720 0.032257 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.994615 0.001831 -0.103622 -0.117113 -0.001842 0.999998 -0.000017 -0.003979 0.103621 0.000207 0.994617 0.035978 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.993386 0.001901 -0.114810 -0.129917 -0.001919 0.999998 -0.000048 -0.004950 0.114810 0.000268 0.993387 0.038353 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.992676 0.002025 -0.120789 -0.135917 -0.002057 0.999998 -0.000136 -0.005509 0.120788 0.000384 0.992678 0.040057 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.991224 0.002132 -0.132176 -0.150070 -0.002151 0.999998 -0.000002 -0.007246 0.132176 0.000286 0.991226 0.043454 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.989612 0.002193 -0.143750 -0.163620 -0.002218 0.999998 -0.000012 -0.007923 0.143750 0.000330 0.989614 0.047003 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.987889 0.002395 -0.155142 -0.176225 -0.002466 0.999997 -0.000267 -0.008029 0.155141 0.000646 0.987892 0.049921 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.986156 0.002577 -0.165798 -0.188016 -0.002641 0.999997 -0.000167 -0.007926 0.165797 0.000603 0.986160 0.052707 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.984366 0.002643 -0.176114 -0.198641 -0.002727 0.999996 -0.000232 -0.008070 0.176113 0.000709 0.984370 0.055693 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.982475 0.002434 -0.186376 -0.209997 -0.002546 0.999997 -0.000359 -0.008249 0.186375 0.000828 0.982478 0.058889 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.980469 0.002238 -0.196659 -0.221716 -0.002452 0.999997 -0.000840 -0.007515 0.196657 0.001306 0.980472 0.062979 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.979406 0.002314 -0.201889 -0.226813 -0.002508 0.999997 -0.000706 -0.008414 0.201887 0.001197 0.979408 0.064367 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.977276 0.002201 -0.211959 -0.237621 -0.002497 0.999996 -0.001129 -0.008585 0.211956 0.001633 0.977278 0.068416 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.975018 0.002179 -0.222117 -0.248013 -0.002474 0.999996 -0.001050 -0.009815 0.222114 0.001573 0.975020 0.073687 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.972687 0.002136 -0.232112 -0.258671 -0.002492 0.999996 -0.001240 -0.010277 0.232108 0.001784 0.972688 0.079394 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.970320 0.002217 -0.241815 -0.269882 -0.002655 0.999995 -0.001483 -0.009895 0.241810 0.002081 0.970321 0.085599 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.967859 0.002284 -0.251484 -0.280887 -0.002669 0.999996 -0.001192 -0.009967 0.251480 0.001825 0.967861 0.091267 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.965247 0.002593 -0.261327 -0.292274 -0.003009 0.999995 -0.001193 -0.009536 0.261323 0.001938 0.965249 0.098043 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.962650 0.002775 -0.270734 -0.304298 -0.003198 0.999994 -0.001122 -0.007986 0.270730 0.001946 0.962653 0.104694 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.961226 0.002799 -0.275747 -0.309565 -0.003188 0.999994 -0.000963 -0.007488 0.275743 0.001805 0.961230 0.108488 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.958543 0.003038 -0.284930 -0.320939 -0.003636 0.999992 -0.001570 -0.005881 0.284923 0.002541 0.958547 0.116223 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.955726 0.003564 -0.294236 -0.330066 -0.004099 
0.999991 -0.001200 -0.007183 0.294229 0.002353 0.955732 0.125613 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.952865 0.003396 -0.303376 -0.339261 -0.004004 0.999991 -0.001381 -0.006290 0.303368 0.002530 0.952870 0.134009 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.950247 0.003260 -0.311481 -0.351407 -0.003922 0.999991 -0.001501 -0.007812 0.311473 0.002648 0.950251 0.142204 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.947471 0.002956 -0.319828 -0.362367 -0.003817 0.999991 -0.002065 -0.009078 0.319819 0.003177 0.947473 0.150996 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.944806 0.002446 -0.327621 -0.374030 -0.003397 0.999992 -0.002331 -0.010213 0.327613 0.003316 0.944806 0.160064 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.941971 0.002453 -0.335686 -0.381987 -0.003570 0.999990 -0.002711 -0.011336 0.335677 0.003752 0.941970 0.169459 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.940813 0.002133 -0.338919 -0.387737 -0.003463 0.999989 -0.003319 -0.010378 0.338908 0.004296 0.940810 0.172667 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.938495 0.002135 -0.345285 -0.398181 -0.003646 0.999986 -0.003727 -0.009508 0.345272 0.004756 0.938491 0.179618 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.935926 0.001799 -0.352193 -0.407352 -0.003388 0.999987 -0.003894 -0.010269 0.352181 0.004838 0.935919 0.187291 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.933155 0.001600 -0.359470 -0.416199 -0.003340 0.999986 -0.004219 -0.010357 0.359458 0.005138 0.933147 0.195373 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.930465 0.001695 -0.366376 -0.426097 -0.003518 0.999985 -0.004310 -0.010103 0.366363 0.005299 0.930457 0.203127 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.927552 0.001860 -0.373690 -0.435554 -0.003764 0.999983 -0.004364 -0.010133 0.373676 0.005454 0.927543 0.210887 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.924629 0.001727 -0.380866 -0.445834 -0.003797 0.999982 -0.004685 -0.010130 0.380851 0.005778 0.924618 0.218561 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.921902 0.001513 -0.387420 -0.455702 -0.003809 0.999979 -0.005158 -0.011045 0.387404 0.006231 0.921889 0.225590 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.920440 0.001455 -0.390880 -0.460270 -0.003748 0.999980 -0.005103 -0.011090 0.390865 0.006162 0.920427 0.230323 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.917583 0.001481 -0.397541 -0.469537 -0.004022 0.999977 -0.005557 -0.010787 0.397524 0.006698 0.917567 0.240004 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.914962 0.001564 -0.403537 -0.479542 -0.004061 0.999978 -0.005333 -0.010750 0.403520 0.006519 0.914948 0.248781 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.912533 0.001134 -0.409002 -0.489532 -0.003971 0.999974 -0.006086 -0.009150 0.408985 0.007178 0.912513 0.258232 +0.485810 0.863663 0.500000 0.500000 0.000000 0.000000 0.910169 0.001462 -0.414234 -0.499087 -0.004392 0.999972 -0.006121 -0.009433 0.414213 0.007390 0.910150 0.267163 diff --git a/assets/re10k_poses/57132c374cc1ce2d.txt b/assets/re10k_poses/57132c374cc1ce2d.txt new file mode 100644 index 0000000000000000000000000000000000000000..280648b2937678c0c2d27a528ff7b3fb37c18446 --- /dev/null +++ b/assets/re10k_poses/57132c374cc1ce2d.txt @@ -0,0 +1,49 @@ +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 -0.000000 -0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 -0.000000 1.000000 0.000000 +0.479127 
0.851781 0.500000 0.500000 0.000000 0.000000 0.999963 -0.000025 0.008547 -0.006260 0.000029 1.000000 -0.000405 -0.002127 -0.008547 0.000405 0.999963 0.030243 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.999855 0.000186 0.017001 -0.012983 -0.000175 1.000000 -0.000617 -0.004737 -0.017002 0.000614 0.999855 0.061249 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.999682 -0.000273 0.025222 -0.018772 0.000306 0.999999 -0.001303 -0.006165 -0.025221 0.001310 0.999681 0.091734 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.999572 -0.000549 0.029233 -0.021278 0.000608 0.999998 -0.002016 -0.004904 -0.029232 0.002033 0.999571 0.106695 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.999265 -0.001002 0.038309 -0.028415 0.001064 0.999998 -0.001624 -0.006939 -0.038307 0.001663 0.999265 0.135966 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.998866 -0.001333 0.047581 -0.034061 0.001415 0.999998 -0.001696 -0.007782 -0.047579 0.001761 0.998866 0.164817 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.998365 -0.001880 0.057135 -0.039691 0.002005 0.999996 -0.002125 -0.008577 -0.057131 0.002236 0.998364 0.192809 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.997773 -0.002395 0.066653 -0.044265 0.002589 0.999993 -0.002819 -0.009052 -0.066646 0.002985 0.997772 0.220777 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.997044 -0.003354 0.076761 -0.048215 0.003612 0.999988 -0.003215 -0.009459 -0.076749 0.003483 0.997044 0.248224 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.996134 -0.004226 0.087743 -0.052309 0.004489 0.999986 -0.002797 -0.009386 -0.087730 0.003180 0.996139 0.275173 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.995092 -0.005219 0.098814 -0.055074 0.005461 0.999983 -0.002182 -0.009134 -0.098801 0.002711 0.995104 0.302823 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.994528 -0.005702 0.104312 -0.056440 0.005943 0.999980 -0.001998 -0.008658 -0.104298 0.002607 0.994543 0.316480 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.993283 -0.006984 0.115499 -0.058730 0.007186 0.999973 -0.001331 -0.008907 -0.115487 0.002152 0.993307 0.345301 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.991918 -0.008427 0.126601 -0.060195 0.008620 0.999962 -0.000975 -0.009589 -0.126588 0.002059 0.991953 0.375042 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.990404 -0.009554 0.137875 -0.061099 0.009714 0.999953 -0.000484 -0.010523 -0.137863 0.001819 0.990450 0.406239 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.988711 -0.010585 0.149462 -0.060602 0.010635 0.999943 0.000464 -0.012948 -0.149459 0.001131 0.988767 0.437576 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.986776 -0.011300 0.161699 -0.059872 0.011208 0.999936 0.001484 -0.015283 -0.161705 0.000348 0.986839 0.469697 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.984661 -0.011680 0.174088 -0.059371 0.011413 0.999932 0.002535 -0.018491 -0.174105 -0.000510 0.984727 0.500812 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.982435 -0.012575 0.186178 -0.056663 0.012192 0.999921 0.003205 -0.022419 -0.186204 -0.000879 0.982511 0.530740 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.981236 -0.013109 0.192366 -0.055357 0.012757 0.999914 0.003069 -0.023642 -0.192389 -0.000557 0.981319 0.545473 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.978712 -0.014419 0.204729 -0.052332 0.014149 0.999896 0.002782 -0.026231 -0.204748 0.000174 0.978815 0.574362 +0.479127 0.851781 0.500000 0.500000 
0.000000 0.000000 0.975851 -0.015731 0.217871 -0.049548 0.015597 0.999876 0.002337 -0.028200 -0.217881 0.001118 0.975975 0.603207 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.972793 -0.016672 0.231077 -0.046263 0.016608 0.999860 0.002222 -0.030494 -0.231082 0.001677 0.972933 0.632985 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.969519 -0.017930 0.244358 -0.043465 0.018010 0.999836 0.001906 -0.033135 -0.244353 0.002553 0.969683 0.663056 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.966072 -0.019309 0.257550 -0.039345 0.019691 0.999806 0.001098 -0.036074 -0.257521 0.004010 0.966264 0.694196 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.962296 -0.020509 0.271229 -0.037900 0.021040 0.999778 0.000950 -0.039881 -0.271188 0.004793 0.962514 0.725804 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.958309 -0.021980 0.284886 -0.035934 0.022588 0.999744 0.001153 -0.042032 -0.284839 0.005330 0.958561 0.757485 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.956166 -0.022523 0.291958 -0.035143 0.023091 0.999732 0.001499 -0.043170 -0.291913 0.005308 0.956430 0.773551 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.951773 -0.023822 0.305877 -0.033576 0.024385 0.999701 0.001983 -0.045296 -0.305832 0.005571 0.952069 0.805106 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.947139 -0.025188 0.319834 -0.031669 0.025724 0.999666 0.002548 -0.048658 -0.319792 0.005815 0.947470 0.836044 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.942244 -0.026412 0.333885 -0.029480 0.027156 0.999628 0.002439 -0.050936 -0.333826 0.006769 0.942610 0.866117 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.937211 -0.027883 0.347646 -0.025925 0.028736 0.999583 0.002704 -0.054069 -0.347576 0.007456 0.937622 0.894841 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.932058 -0.029490 0.361108 -0.023297 0.030453 0.999532 0.003025 -0.055484 -0.361028 0.008177 0.932519 0.923851 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.927152 -0.031750 0.373337 -0.020319 0.032481 0.999463 0.004335 -0.057515 -0.373274 0.008107 0.927686 0.951847 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.922147 -0.034050 0.385339 -0.016660 0.034910 0.999379 0.004767 -0.057194 -0.385262 0.009056 0.922763 0.979209 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.919576 -0.035357 0.391317 -0.014436 0.036114 0.999333 0.005428 -0.058002 -0.391248 0.009141 0.920240 0.992980 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.914382 -0.037761 0.403088 -0.009665 0.038427 0.999241 0.006437 -0.058470 -0.403025 0.009603 0.915139 1.020295 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.908907 -0.040310 0.415047 -0.004010 0.040763 0.999139 0.007773 -0.059114 -0.415003 0.009854 0.909767 1.046953 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.902992 -0.043396 0.427461 0.001671 0.043461 0.999009 0.009611 -0.058996 -0.427455 0.009899 0.903983 1.073699 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.897126 -0.046639 0.439305 0.008422 0.046384 0.998860 0.011321 -0.057753 -0.439332 0.010220 0.898267 1.099546 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.891028 -0.049891 0.451198 0.015645 0.049014 0.998705 0.013638 -0.057333 -0.451294 0.009963 0.892320 1.125882 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.884714 -0.053018 0.463110 0.023277 0.051652 0.998543 0.015641 -0.057310 -0.463265 0.010083 0.886163 1.152206 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.878201 -0.055794 0.475026 
0.031002 0.054005 0.998389 0.017424 -0.056747 -0.475233 0.010352 0.879799 1.178311 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.874890 -0.057200 0.480932 0.035561 0.054814 0.998315 0.019020 -0.057471 -0.481210 0.009721 0.876552 1.191407 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.867804 -0.059199 0.493367 0.043239 0.055854 0.998207 0.021532 -0.057060 -0.493757 0.008871 0.869555 1.217167 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.860479 -0.060933 0.505830 0.052601 0.056371 0.998113 0.024340 -0.056034 -0.506359 0.007570 0.862290 1.242476 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.852792 -0.062715 0.518471 0.061984 0.056816 0.998012 0.027270 -0.055713 -0.519150 0.006202 0.854660 1.267605 +0.479127 0.851781 0.500000 0.500000 0.000000 0.000000 0.844971 -0.064364 0.530926 0.072113 0.057320 0.997912 0.029752 -0.055120 -0.531732 0.005293 0.846896 1.292203 diff --git a/assets/re10k_poses/6b1091f5eb05783e.txt b/assets/re10k_poses/6b1091f5eb05783e.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bf8396456310e2bd727e1ac264dd34aa58ab3d5 --- /dev/null +++ b/assets/re10k_poses/6b1091f5eb05783e.txt @@ -0,0 +1,49 @@ +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 0.000000 -0.000000 -0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999996 -0.000499 0.002631 -0.010429 0.000503 0.999999 -0.001491 -0.001114 -0.002630 0.001492 0.999995 0.049869 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999984 -0.000661 0.005654 -0.020721 0.000680 0.999995 -0.003202 -0.002282 -0.005652 0.003206 0.999979 0.100130 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999957 -0.001089 0.009240 -0.030968 0.001133 0.999988 -0.004776 -0.003059 -0.009234 0.004786 0.999946 0.150037 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999936 -0.001286 0.011262 -0.035872 0.001346 0.999985 -0.005274 -0.004098 -0.011255 0.005289 0.999923 0.175446 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999874 -0.001652 0.015787 -0.045514 0.001756 0.999977 -0.006575 -0.005718 -0.015775 0.006602 0.999854 0.227800 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999791 -0.002233 0.020344 -0.053370 0.002378 0.999972 -0.007077 -0.008857 -0.020328 0.007124 0.999768 0.279422 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999676 -0.002087 0.025365 -0.061636 0.002277 0.999969 -0.007480 -0.012972 -0.025349 0.007536 0.999650 0.330565 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999522 -0.002319 0.030818 -0.069290 0.002577 0.999962 -0.008359 -0.015263 -0.030798 0.008434 0.999490 0.380701 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999320 -0.002366 0.036784 -0.078000 0.002720 0.999950 -0.009571 -0.016727 -0.036760 0.009665 0.999277 0.430320 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.999064 -0.002354 0.043189 -0.086370 0.002830 0.999936 -0.010976 -0.017965 -0.043161 0.011088 0.999007 0.479785 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.998727 -0.002643 0.050382 -0.095292 0.003284 0.999915 -0.012643 -0.018053 -0.050345 0.012793 0.998650 0.529269 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.998546 -0.002724 0.053831 -0.099889 0.003454 0.999903 -0.013472 -0.018047 -0.053789 0.013639 0.998459 0.553966 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.998110 -0.002657 0.061391 -0.109283 0.003585 0.999881 -0.015006 -0.018716 -0.061344 0.015198 0.998001 0.604242 +0.501420 
0.891414 0.500000 0.500000 0.000000 0.000000 0.997572 -0.002675 0.069594 -0.120575 0.003822 0.999859 -0.016354 -0.020038 -0.069540 0.016580 0.997441 0.654110 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.996869 -0.002235 0.079039 -0.133835 0.003612 0.999844 -0.017291 -0.022165 -0.078988 0.017523 0.996722 0.704160 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.996112 -0.002139 0.088066 -0.143864 0.003742 0.999830 -0.018044 -0.024733 -0.088012 0.018303 0.995951 0.756299 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.995178 -0.002203 0.098059 -0.155889 0.003997 0.999828 -0.018096 -0.028941 -0.098003 0.018401 0.995016 0.807195 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.994071 -0.002413 0.108706 -0.168539 0.004466 0.999816 -0.018648 -0.031546 -0.108641 0.019023 0.993899 0.858593 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.992763 -0.002492 0.120063 -0.181853 0.004796 0.999810 -0.018904 -0.034681 -0.119993 0.019343 0.992586 0.909583 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.992062 -0.002544 0.125723 -0.188082 0.004996 0.999803 -0.019193 -0.035617 -0.125650 0.019669 0.991880 0.935196 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.990639 -0.002782 0.136480 -0.196939 0.005583 0.999782 -0.020142 -0.036488 -0.136394 0.020716 0.990438 0.984816 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.989069 -0.002698 0.147431 -0.204579 0.005844 0.999764 -0.020910 -0.038011 -0.147340 0.021543 0.988851 1.034012 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.987535 -0.002852 0.157374 -0.208250 0.006323 0.999748 -0.021556 -0.039494 -0.157273 0.022282 0.987304 1.080650 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.985603 -0.003405 0.169041 -0.216257 0.007340 0.999716 -0.022659 -0.038691 -0.168916 0.023573 0.985349 1.127037 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.983601 -0.004036 0.180315 -0.222408 0.008318 0.999701 -0.022996 -0.039032 -0.180168 0.024119 0.983340 1.175368 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.981634 -0.004870 0.190711 -0.224244 0.009441 0.999689 -0.023067 -0.040097 -0.190539 0.024443 0.981375 1.221190 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.979437 -0.005705 0.201668 -0.226852 0.010480 0.999689 -0.022619 -0.042081 -0.201476 0.024267 0.979193 1.269333 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.978407 -0.006642 0.206582 -0.225581 0.011478 0.999687 -0.022221 -0.043498 -0.206370 0.024112 0.978177 1.293265 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.975839 -0.007627 0.218357 -0.228752 0.012703 0.999680 -0.021853 -0.044519 -0.218120 0.024099 0.975624 1.342997 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.973072 -0.008859 0.230332 -0.231643 0.014094 0.999678 -0.021092 -0.046906 -0.230071 0.023770 0.972884 1.391204 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.970183 -0.010246 0.242155 -0.233846 0.015542 0.999680 -0.019971 -0.050136 -0.241873 0.023139 0.970032 1.437838 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.967154 -0.011398 0.253934 -0.233738 0.016770 0.999679 -0.019002 -0.052380 -0.253636 0.022637 0.967035 1.485028 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.963810 -0.012588 0.266293 -0.235889 0.018112 0.999668 -0.018300 -0.053390 -0.265974 0.022460 0.963718 1.529152 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.960554 -0.014482 0.277717 -0.233843 0.020151 0.999643 -0.017571 -0.055150 -0.277363 0.022475 0.960502 1.573199 +0.501420 0.891414 0.500000 0.500000 
0.000000 0.000000 0.956614 -0.015220 0.290959 -0.238134 0.021340 0.999613 -0.017871 -0.054048 -0.290574 0.023305 0.956569 1.618829 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.954694 -0.015761 0.297171 -0.236747 0.022099 0.999594 -0.017981 -0.053634 -0.296767 0.023734 0.954655 1.643468 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.950495 -0.018125 0.310211 -0.235719 0.024820 0.999536 -0.017649 -0.054532 -0.309747 0.024474 0.950504 1.693785 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.946268 -0.020302 0.322746 -0.231351 0.027438 0.999469 -0.017576 -0.054188 -0.322218 0.025488 0.946322 1.744426 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.942072 -0.022228 0.334673 -0.223955 0.029652 0.999414 -0.017088 -0.055047 -0.334097 0.026021 0.942179 1.795191 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.937389 -0.024534 0.347419 -0.218573 0.031992 0.999364 -0.015747 -0.057684 -0.346811 0.025875 0.937578 1.847007 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.932692 -0.027081 0.359655 -0.211258 0.034683 0.999290 -0.014699 -0.060579 -0.359002 0.026184 0.932970 1.894406 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.928032 -0.029412 0.371338 -0.200798 0.036518 0.999259 -0.012118 -0.067646 -0.370706 0.024807 0.928419 1.945519 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.922867 -0.031836 0.383801 -0.192589 0.038730 0.999197 -0.010246 -0.072749 -0.383167 0.024321 0.923359 1.994479 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.920251 -0.033399 0.389902 -0.187824 0.040475 0.999131 -0.009942 -0.072113 -0.389231 0.024931 0.920803 2.020332 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.914774 -0.035337 0.402418 -0.178855 0.042483 0.999058 -0.008842 -0.073651 -0.401727 0.025184 0.915413 2.068351 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.908434 -0.037387 0.416354 -0.174262 0.045050 0.998948 -0.008592 -0.073055 -0.415595 0.026562 0.909162 2.116669 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.902757 -0.038937 0.428384 -0.163055 0.046499 0.998892 -0.007197 -0.075529 -0.427630 0.026417 0.903568 2.162950 +0.501420 0.891414 0.500000 0.500000 0.000000 0.000000 0.897003 -0.040889 0.440129 -0.151191 0.048437 0.998809 -0.005925 -0.078336 -0.439362 0.026633 0.897915 2.204097 diff --git a/assets/re10k_poses/6b6d20c6a46b9fe9.txt b/assets/re10k_poses/6b6d20c6a46b9fe9.txt new file mode 100644 index 0000000000000000000000000000000000000000..9406ba8d575fffc261080e12c2033de09d4431e2 --- /dev/null +++ b/assets/re10k_poses/6b6d20c6a46b9fe9.txt @@ -0,0 +1,49 @@ +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.999920 0.006862 -0.010603 0.022806 -0.006864 0.999976 -0.000138 0.000675 0.010602 0.000211 0.999944 -0.000791 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.999668 0.014057 -0.021599 0.045391 -0.014063 0.999901 -0.000142 0.001112 0.021595 0.000446 0.999767 -0.001212 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.999236 0.021283 -0.032766 0.067778 -0.021312 0.999773 -0.000538 0.001593 0.032747 0.001236 0.999463 -0.001751 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.998965 0.024834 -0.038115 0.079273 -0.024872 0.999691 -0.000527 0.001527 0.038090 0.001475 0.999273 -0.001727 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.998299 0.031616 -0.048990 0.102022 -0.031681 0.999498 -0.000554 
0.001773 0.048948 0.002105 0.998799 -0.002096 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.997441 0.038607 -0.060182 0.124746 -0.038724 0.999250 -0.000772 0.002129 0.060107 0.003101 0.998187 -0.002024 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.996405 0.045790 -0.071277 0.147756 -0.045971 0.998942 -0.000900 0.002148 0.071161 0.004173 0.997456 -0.001785 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.995226 0.052546 -0.082242 0.170898 -0.052797 0.998605 -0.000886 0.001783 0.082081 0.005224 0.996612 -0.001191 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.993871 0.059346 -0.093267 0.193851 -0.059683 0.998217 -0.000821 0.001207 0.093051 0.006383 0.995641 -0.000463 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.992306 0.066184 -0.104636 0.216737 -0.066660 0.997775 -0.001057 0.000824 0.104333 0.008024 0.994510 0.000277 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.990566 0.073041 -0.115946 0.240160 -0.073678 0.997281 -0.001210 -0.000018 0.115542 0.009741 0.993255 0.001499 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.989655 0.076419 -0.121423 0.252027 -0.077167 0.997017 -0.001459 -0.000122 0.120949 0.010813 0.992600 0.002100 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.987832 0.082479 -0.131851 0.275794 -0.083388 0.996516 -0.001381 -0.001785 0.131277 0.012359 0.991269 0.003851 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.985726 0.089065 -0.142872 0.299398 -0.090277 0.995915 -0.002006 -0.002865 0.142109 0.014876 0.989739 0.005392 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.983441 0.095700 -0.153897 0.323191 -0.097291 0.995252 -0.002823 -0.004223 0.152896 0.017749 0.988083 0.007069 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.980979 0.101766 -0.165299 0.347728 -0.103872 0.994582 -0.004120 -0.005445 0.163985 0.021212 0.986235 0.008909 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.978293 0.107875 -0.176933 0.371721 -0.110539 0.993858 -0.005240 -0.007091 0.175281 0.024684 0.984209 0.010349 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.975318 0.113304 -0.189518 0.395586 -0.116675 0.993148 -0.006687 -0.009149 0.187462 0.028634 0.981854 0.011607 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.971737 0.119055 -0.203847 0.419980 -0.123300 0.992336 -0.008205 -0.011229 0.201308 0.033107 0.978968 0.013089 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.969836 0.121974 -0.211047 0.431855 -0.126644 0.991910 -0.008706 -0.011997 0.208277 0.035171 0.977437 0.013503 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.965868 0.127219 -0.225641 0.455933 -0.132708 0.991112 -0.009267 -0.014596 0.222457 0.038896 0.974166 0.014846 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.961275 0.132861 -0.241452 0.479856 -0.139273 0.990207 -0.009609 -0.017285 0.237811 0.042865 0.970365 0.016078 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.956261 0.137947 -0.257946 0.504747 -0.145468 0.989310 -0.010208 -0.019778 0.253781 0.047285 0.966105 0.017502 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.950711 0.143216 -0.275022 0.530008 -0.151708 0.988377 -0.009743 -0.023816 0.270430 0.050986 0.961389 0.019234 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.944687 0.148231 -0.292563 0.555801 -0.157980 0.987393 -0.009843 -0.026747 0.287416 0.055518 0.956196 0.021300 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.938101 0.153313 -0.310584 0.581363 -0.164514 0.986324 -0.010026 -0.029291 0.304799 0.060501 0.950493 0.023592 
+0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.931092 0.158149 -0.328721 0.606439 -0.170629 0.985292 -0.009273 -0.033838 0.322419 0.064723 0.944382 0.026211 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.927403 0.160330 -0.337962 0.619721 -0.173607 0.984772 -0.009219 -0.035450 0.331337 0.067222 0.941115 0.027652 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.919780 0.165019 -0.356052 0.645578 -0.179640 0.983699 -0.008146 -0.040073 0.348903 0.071454 0.934431 0.030777 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.911727 0.169900 -0.374014 0.670630 -0.186129 0.982497 -0.007412 -0.044056 0.366209 0.076373 0.927393 0.033509 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.903567 0.174467 -0.391317 0.695672 -0.192131 0.981350 -0.006108 -0.049293 0.382953 0.080703 0.920236 0.037247 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.895038 0.178672 -0.408635 0.721959 -0.197892 0.980212 -0.004856 -0.054345 0.399681 0.085212 0.912685 0.041329 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.886327 0.183099 -0.425321 0.747706 -0.203979 0.978969 -0.003630 -0.059169 0.415712 0.089974 0.905035 0.045593 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.877793 0.187542 -0.440802 0.771627 -0.209887 0.977724 -0.001980 -0.063908 0.430612 0.094257 0.897602 0.050230 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.868578 0.192101 -0.456803 0.795769 -0.215908 0.976414 0.000082 -0.069340 0.446045 0.098557 0.889568 0.054742 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.863998 0.194080 -0.464586 0.808254 -0.218676 0.975797 0.000963 -0.071981 0.453528 0.100762 0.885528 0.057760 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.854842 0.198467 -0.479434 0.832263 -0.224573 0.974453 0.002967 -0.077866 0.467774 0.105132 0.877573 0.062803 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.845235 0.202713 -0.494455 0.856114 -0.230418 0.973079 0.005052 -0.083847 0.482168 0.109661 0.869189 0.068014 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.835512 0.206863 -0.509045 0.879733 -0.236174 0.971684 0.007228 -0.089877 0.496126 0.114185 0.860710 0.073690 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.825708 0.211070 -0.523122 0.903297 -0.241741 0.970290 0.009924 -0.096960 0.509674 0.118266 0.852200 0.079916 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.816237 0.214923 -0.536251 0.926424 -0.246961 0.968946 0.012438 -0.103910 0.522271 0.122281 0.843967 0.086049 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.806273 0.219015 -0.549506 0.949035 -0.252383 0.967506 0.015302 -0.111173 0.535002 0.126349 0.835350 0.091759 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.795760 0.222697 -0.563180 0.971573 -0.257589 0.966086 0.018050 -0.117812 0.548100 0.130706 0.826137 0.098264 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.790420 0.224640 -0.569888 0.983236 -0.260445 0.965296 0.019273 -0.120957 0.554441 0.133191 0.821496 0.102003 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.780206 0.228268 -0.582385 1.006152 -0.265149 0.963942 0.022608 -0.129088 0.566547 0.136780 0.812598 0.109198 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.770758 0.231998 -0.593387 1.028084 -0.270308 0.962444 0.025182 -0.135979 0.576945 0.140988 0.804523 0.116676 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 0.760290 0.235097 -0.605548 1.050716 -0.275741 0.960857 0.026838 -0.140506 0.588155 0.146570 0.795356 0.125336 +0.626046 1.112971 0.500000 0.500000 0.000000 0.000000 
0.750057 0.238770 -0.616768 1.071595 -0.280940 0.959265 0.029708 -0.147626 0.598737 0.150992 0.786584 0.133040 diff --git a/assets/re10k_poses/80d21c1f8300db84.txt b/assets/re10k_poses/80d21c1f8300db84.txt new file mode 100644 index 0000000000000000000000000000000000000000..29fa22eee57bd6f9e53b27c8f397cbad15a7e62b --- /dev/null +++ b/assets/re10k_poses/80d21c1f8300db84.txt @@ -0,0 +1,49 @@ +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 -0.000000 0.000000 0.000000 1.000000 0.000000 -0.000000 -0.000000 -0.000000 1.000000 0.000000 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.999915 0.001198 -0.012960 0.012737 -0.001204 0.999999 -0.000453 -0.008929 0.012959 0.000469 0.999916 0.035683 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.999696 0.002257 -0.024549 0.025917 -0.002277 0.999997 -0.000821 -0.018750 0.024547 0.000877 0.999698 0.070840 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.999351 0.003395 -0.035864 0.040524 -0.003424 0.999994 -0.000728 -0.027123 0.035862 0.000851 0.999356 0.106536 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.999141 0.004118 -0.041231 0.047853 -0.004174 0.999990 -0.001273 -0.030475 0.041225 0.001444 0.999149 0.125064 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.998640 0.004868 -0.051903 0.063964 -0.004956 0.999986 -0.001573 -0.034920 0.051894 0.001828 0.998651 0.163213 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.998001 0.006006 -0.062909 0.081507 -0.006126 0.999980 -0.001714 -0.037583 0.062898 0.002096 0.998018 0.204559 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.997194 0.007129 -0.074520 0.100189 -0.007334 0.999970 -0.002470 -0.038511 0.074500 0.003009 0.997216 0.248243 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.996136 0.008069 -0.087449 0.119601 -0.008307 0.999963 -0.002358 -0.039311 0.087427 0.003075 0.996166 0.294039 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.994722 0.009584 -0.102153 0.141254 -0.009907 0.999947 -0.002656 -0.041773 0.102122 0.003654 0.994765 0.339906 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.992929 0.010579 -0.118234 0.164036 -0.010924 0.999938 -0.002274 -0.048083 0.118203 0.003549 0.992983 0.385859 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.990842 0.011782 -0.134512 0.185403 -0.012297 0.999920 -0.002995 -0.057254 0.134466 0.004622 0.990907 0.431823 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.989581 0.012801 -0.143404 0.195589 -0.013394 0.999905 -0.003168 -0.062770 0.143350 0.005055 0.989659 0.455004 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.986726 0.014588 -0.161737 0.215954 -0.015497 0.999870 -0.004358 -0.072551 0.161652 0.006807 0.986824 0.502409 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.983643 0.016042 -0.179411 0.235701 -0.017027 0.999847 -0.003948 -0.079500 0.179320 0.006938 0.983766 0.551084 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.980451 0.018301 -0.195911 0.253483 -0.019375 0.999806 -0.003566 -0.086517 0.195808 0.007292 0.980615 0.598656 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.977197 0.019795 -0.211408 0.267566 -0.021047 0.999772 -0.003675 -0.090746 0.211287 0.008041 0.977391 0.644449 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.973869 0.021259 -0.226113 0.281240 -0.022673 0.999736 -0.003658 -0.094417 0.225975 0.008689 0.974094 0.689828 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.971016 0.022753 -0.237927 0.294774 -0.024426 0.999693 -0.004088 -0.097013 0.237761 0.009781 
0.971275 0.734818 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.968484 0.023377 -0.247975 0.306818 -0.025192 0.999674 -0.004147 -0.098347 0.247798 0.010264 0.968757 0.782875 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.967186 0.023975 -0.252937 0.312200 -0.025838 0.999658 -0.004043 -0.098720 0.252753 0.010446 0.967474 0.807375 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.964365 0.025084 -0.263384 0.321908 -0.027141 0.999623 -0.004175 -0.099324 0.263180 0.011174 0.964682 0.857908 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.961378 0.025898 -0.274009 0.329951 -0.028153 0.999594 -0.004301 -0.102448 0.273786 0.011849 0.961718 0.910743 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.958206 0.026831 -0.284817 0.335955 -0.029246 0.999563 -0.004229 -0.108451 0.284579 0.012382 0.958573 0.964920 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.954646 0.028331 -0.296391 0.340870 -0.030925 0.999513 -0.004068 -0.116027 0.296132 0.013050 0.955058 1.019504 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.951143 0.029166 -0.307370 0.344250 -0.032205 0.999470 -0.004820 -0.125016 0.307066 0.014483 0.951578 1.073207 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.947942 0.030009 -0.317027 0.347776 -0.033068 0.999444 -0.004273 -0.132747 0.316722 0.014534 0.948407 1.125245 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.944856 0.030833 -0.326031 0.350053 -0.033963 0.999415 -0.003911 -0.140439 0.325720 0.014769 0.945351 1.176845 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.943331 0.030951 -0.330408 0.349985 -0.034273 0.999404 -0.004229 -0.143460 0.330080 0.015314 0.943829 1.202211 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.940238 0.031594 -0.339048 0.350124 -0.035217 0.999369 -0.004537 -0.150646 0.338691 0.016206 0.940758 1.251543 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.936946 0.032581 -0.347952 0.350213 -0.036452 0.999325 -0.004581 -0.155994 0.347568 0.016976 0.937501 1.301848 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.933802 0.033418 -0.356225 0.349397 -0.037386 0.999292 -0.004258 -0.160371 0.355831 0.017294 0.934390 1.354229 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.930411 0.034216 -0.364917 0.347414 -0.038344 0.999256 -0.004070 -0.161386 0.364507 0.017780 0.931031 1.409197 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.926657 0.035240 -0.374253 0.345650 -0.039530 0.999211 -0.003790 -0.161596 0.373824 0.018306 0.927319 1.466257 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.922866 0.036219 -0.383415 0.344432 -0.040630 0.999168 -0.003408 -0.163195 0.382972 0.018724 0.923570 1.525748 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.918845 0.037104 -0.392871 0.344478 -0.041623 0.999129 -0.002987 -0.166638 0.392418 0.019097 0.919589 1.587358 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.916628 0.037614 -0.397968 0.344710 -0.042344 0.999098 -0.003099 -0.169051 0.397493 0.019692 0.917394 1.618612 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.911931 0.038554 -0.408529 0.345586 -0.043511 0.999049 -0.002844 -0.177032 0.408030 0.020369 0.912741 1.680971 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.907689 0.039873 -0.417744 0.345402 -0.044879 0.998990 -0.002162 -0.186324 0.417236 0.020711 0.908562 1.743520 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.902917 0.040190 -0.427931 0.344700 -0.045882 0.998942 -0.002992 -0.195634 0.427358 0.022336 0.903807 1.805539 +0.477737 0.849310 0.500000 
0.500000 0.000000 0.000000 0.898251 0.041594 -0.437509 0.342883 -0.047243 0.998881 -0.002030 -0.204524 0.436935 0.022493 0.899212 1.867171 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.894440 0.042603 -0.445154 0.338272 -0.048480 0.998823 -0.001819 -0.213010 0.444552 0.023208 0.895452 1.926459 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.890753 0.043302 -0.452420 0.332487 -0.049231 0.998787 -0.001334 -0.221921 0.451814 0.023462 0.891804 1.983956 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.887577 0.043707 -0.458582 0.324376 -0.050106 0.998742 -0.001790 -0.230050 0.457927 0.024566 0.888651 2.040565 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.886246 0.043864 -0.461134 0.319251 -0.050196 0.998738 -0.001470 -0.234455 0.460487 0.024450 0.887329 2.068950 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.883926 0.044463 -0.465508 0.309211 -0.050954 0.998700 -0.001364 -0.240506 0.464843 0.024925 0.885042 2.126600 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.882454 0.045294 -0.468214 0.296239 -0.051677 0.998664 -0.000788 -0.246250 0.467552 0.024891 0.883615 2.184092 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.881499 0.044897 -0.470047 0.281222 -0.051360 0.998680 -0.000928 -0.251009 0.469384 0.024960 0.882641 2.242193 +0.477737 0.849310 0.500000 0.500000 0.000000 0.000000 0.881078 0.045582 -0.470769 0.263367 -0.051789 0.998658 -0.000232 -0.256198 0.470126 0.024585 0.882257 2.299598 diff --git a/assets/re10k_poses/8135673a5a3e3d17.txt b/assets/re10k_poses/8135673a5a3e3d17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0af09e81e9ed8603584a37bdae89db2d9f59351d --- /dev/null +++ b/assets/re10k_poses/8135673a5a3e3d17.txt @@ -0,0 +1,49 @@ +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 0.000000 -0.000000 -0.000000 1.000000 0.000000 -0.000000 -0.000000 0.000000 1.000000 0.000000 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999993 -0.000067 -0.003871 0.008413 0.000067 1.000000 -0.000029 0.000107 0.003871 0.000028 0.999992 0.023519 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999969 -0.000147 -0.007811 0.017213 0.000148 1.000000 0.000078 0.000021 0.007811 -0.000079 0.999969 0.046643 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999924 -0.000126 -0.012340 0.026019 0.000128 1.000000 0.000131 -0.000829 0.012340 -0.000133 0.999924 0.069817 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999891 -0.000072 -0.014766 0.030358 0.000077 1.000000 0.000333 -0.001829 0.014766 -0.000334 0.999891 0.081205 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999805 -0.000003 -0.019755 0.039418 0.000009 1.000000 0.000298 -0.003285 0.019755 -0.000298 0.999805 0.103586 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999693 0.000046 -0.024793 0.047365 -0.000032 1.000000 0.000572 -0.004427 0.024793 -0.000571 0.999692 0.126135 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999541 0.000337 -0.030287 0.055426 -0.000317 1.000000 0.000660 -0.005185 0.030287 -0.000651 0.999541 0.147793 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999358 0.000636 -0.035816 0.063492 -0.000613 1.000000 0.000665 -0.005049 0.035817 -0.000643 0.999358 0.168949 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.999150 0.001274 -0.041200 0.071373 -0.001251 0.999999 0.000588 -0.004430 0.041201 -0.000536 0.999151 0.190637 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.998912 0.001616 -0.046598 0.079004 -0.001589 0.999999 0.000618 
-0.003285 0.046599 -0.000543 0.998914 0.212492 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.998627 0.001576 -0.052353 0.086657 -0.001535 0.999998 0.000834 -0.002284 0.052354 -0.000752 0.998628 0.235898 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.998460 0.001718 -0.055452 0.090627 -0.001674 0.999998 0.000838 -0.001442 0.055454 -0.000744 0.998461 0.247737 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.998111 0.002212 -0.061390 0.098792 -0.002143 0.999997 0.001184 -0.000719 0.061393 -0.001050 0.998113 0.271429 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.997720 0.002449 -0.067450 0.106658 -0.002346 0.999996 0.001616 -0.000998 0.067454 -0.001454 0.997721 0.295246 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.997277 0.002883 -0.073694 0.113970 -0.002766 0.999995 0.001699 -0.002037 0.073698 -0.001491 0.997279 0.318133 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.996803 0.003168 -0.079841 0.121414 -0.003026 0.999994 0.001899 -0.004463 0.079847 -0.001651 0.996806 0.340820 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.996297 0.003087 -0.085925 0.128562 -0.002931 0.999994 0.001940 -0.006819 0.085931 -0.001681 0.996300 0.362690 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.995720 0.002989 -0.092376 0.135977 -0.002812 0.999994 0.002051 -0.008642 0.092382 -0.001782 0.995722 0.384243 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.995089 0.003298 -0.098926 0.144240 -0.003095 0.999993 0.002205 -0.009521 0.098933 -0.001888 0.995092 0.405250 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.994767 0.003411 -0.102114 0.148256 -0.003184 0.999992 0.002386 -0.009572 0.102121 -0.002048 0.994770 0.416119 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.994094 0.003702 -0.108459 0.156371 -0.003437 0.999991 0.002630 -0.009422 0.108468 -0.002242 0.994097 0.438032 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.993385 0.004156 -0.114760 0.164722 -0.003877 0.999989 0.002655 -0.008917 0.114769 -0.002193 0.993390 0.460526 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.992639 0.004737 -0.121015 0.172994 -0.004419 0.999986 0.002894 -0.008599 0.121027 -0.002338 0.992646 0.483269 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.991839 0.004862 -0.127408 0.181286 -0.004501 0.999985 0.003118 -0.008920 0.127421 -0.002519 0.991845 0.506236 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.990903 0.004855 -0.134493 0.189468 -0.004456 0.999985 0.003266 -0.009716 0.134507 -0.002637 0.990909 0.528699 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.989885 0.005220 -0.141776 0.197865 -0.004793 0.999983 0.003353 -0.010967 0.141791 -0.002639 0.989893 0.549838 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.988703 0.005622 -0.149784 0.205777 -0.005147 0.999980 0.003559 -0.011966 0.149802 -0.002748 0.988712 0.572508 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.988081 0.005689 -0.153832 0.209163 -0.005180 0.999980 0.003711 -0.012641 0.153850 -0.002870 0.988090 0.582938 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.986728 0.005749 -0.162283 0.215731 -0.005187 0.999979 0.003889 -0.013143 0.162301 -0.002996 0.986737 0.603538 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.985283 0.006069 -0.170823 0.222276 -0.005436 0.999977 0.004177 -0.013684 0.170844 -0.003187 0.985293 0.623985 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.983753 0.006457 -0.179414 0.229013 -0.005784 0.999974 0.004269 -0.013588 0.179437 -0.003162 0.983764 0.644438 
+0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.982135 0.006875 -0.188050 0.235740 -0.006108 0.999971 0.004659 -0.013266 0.188077 -0.003428 0.982148 0.665830 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.980447 0.007433 -0.196645 0.242610 -0.006603 0.999966 0.004876 -0.012657 0.196674 -0.003482 0.980463 0.687916 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.978732 0.008230 -0.204978 0.249394 -0.007276 0.999959 0.005410 -0.012172 0.205014 -0.003804 0.978752 0.710373 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.976884 0.008625 -0.213595 0.255357 -0.007538 0.999954 0.005905 -0.012398 0.213636 -0.004159 0.976904 0.733121 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.975920 0.008919 -0.217947 0.258302 -0.007762 0.999951 0.006164 -0.012921 0.217991 -0.004324 0.975941 0.744588 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.974000 0.009511 -0.226347 0.264817 -0.008238 0.999945 0.006566 -0.014538 0.226397 -0.004531 0.974024 0.767239 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.972109 0.009844 -0.234323 0.271050 -0.008506 0.999941 0.006720 -0.016160 0.234375 -0.004539 0.972136 0.789496 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.970225 0.010039 -0.241997 0.277051 -0.008593 0.999938 0.007033 -0.018084 0.242053 -0.004744 0.970251 0.810735 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.968345 0.009942 -0.249417 0.282961 -0.008392 0.999938 0.007278 -0.019448 0.249474 -0.004954 0.968369 0.831122 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.966540 0.010364 -0.256305 0.289047 -0.008765 0.999934 0.007381 -0.020132 0.256365 -0.004887 0.966568 0.851752 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.964833 0.010630 -0.262648 0.295095 -0.008975 0.999932 0.007503 -0.019948 0.262709 -0.004882 0.964863 0.872784 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.962990 0.010885 -0.269318 0.301143 -0.009165 0.999929 0.007642 -0.019360 0.269382 -0.004891 0.963021 0.894089 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.962051 0.011093 -0.272643 0.303830 -0.009313 0.999926 0.007822 -0.019167 0.272710 -0.004986 0.962083 0.905359 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.960104 0.011404 -0.279412 0.310627 -0.009551 0.999922 0.007990 -0.018652 0.279481 -0.005003 0.960138 0.927948 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.958168 0.011632 -0.285970 0.316825 -0.009624 0.999918 0.008425 -0.018761 0.286044 -0.005321 0.958202 0.951114 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.956013 0.011773 -0.293086 0.322737 -0.009653 0.999916 0.008679 -0.019767 0.293164 -0.005468 0.956047 0.973885 +0.487073 0.865907 0.500000 0.500000 0.000000 0.000000 0.953713 0.011905 -0.300484 0.328704 -0.009704 0.999914 0.008816 -0.020841 0.300563 -0.005492 0.953746 0.997058 diff --git a/assets/re10k_poses/9696c0d0a01d2fd4.txt b/assets/re10k_poses/9696c0d0a01d2fd4.txt new file mode 100644 index 0000000000000000000000000000000000000000..4abaeae7d17790393edd3b52240f5a62cc699145 --- /dev/null +++ b/assets/re10k_poses/9696c0d0a01d2fd4.txt @@ -0,0 +1,49 @@ +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 -0.000000 0.000000 0.000000 1.000000 -0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999999 0.000951 0.001169 -0.004064 -0.000951 0.999999 -0.000397 0.000736 -0.001169 0.000396 0.999999 0.064782 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999996 0.001351 0.002598 -0.008417 
-0.001351 0.999999 0.000342 -0.003265 -0.002598 -0.000345 0.999997 0.124665 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999990 0.001634 0.004188 -0.009776 -0.001631 0.999998 -0.000796 -0.004510 -0.004189 0.000790 0.999991 0.184881 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999984 0.001933 0.005238 -0.011655 -0.001927 0.999997 -0.001150 -0.004545 -0.005240 0.001140 0.999986 0.212503 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999962 0.001789 0.008476 -0.015073 -0.001776 0.999997 -0.001466 -0.010239 -0.008478 0.001451 0.999963 0.297190 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999951 0.002400 0.009551 -0.014875 -0.002384 0.999996 -0.001695 -0.010922 -0.009555 0.001672 0.999953 0.324394 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999923 0.002108 0.012239 -0.017903 -0.002083 0.999996 -0.002061 -0.010383 -0.012243 0.002035 0.999923 0.380847 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999900 0.002166 0.013979 -0.017916 -0.002138 0.999996 -0.002004 -0.012809 -0.013983 0.001974 0.999900 0.436811 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999858 0.001993 0.016712 -0.020435 -0.001952 0.999995 -0.002452 -0.012087 -0.016717 0.002419 0.999857 0.493293 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999815 0.002853 0.019042 -0.020545 -0.002793 0.999991 -0.003176 -0.011178 -0.019051 0.003122 0.999814 0.552191 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999796 0.002648 0.020033 -0.016823 -0.002603 0.999994 -0.002281 -0.013693 -0.020039 0.002228 0.999797 0.609231 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999760 0.002187 0.021797 -0.018031 -0.002132 0.999994 -0.002563 -0.013898 -0.021803 0.002516 0.999759 0.638297 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999695 0.002384 0.024577 -0.016319 -0.002321 0.999994 -0.002585 -0.017190 -0.024583 0.002527 0.999695 0.695850 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999656 0.002033 0.026148 -0.011809 -0.001984 0.999996 -0.001895 -0.023113 -0.026152 0.001842 0.999656 0.750447 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999606 0.002249 0.027971 -0.009498 -0.002182 0.999995 -0.002434 -0.028415 -0.027976 0.002372 0.999606 0.802388 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999529 0.003045 0.030532 -0.008741 -0.002949 0.999991 -0.003193 -0.032835 -0.030541 0.003102 0.999529 0.851762 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999457 0.002740 0.032821 -0.008749 -0.002644 0.999992 -0.002973 -0.038742 -0.032829 0.002884 0.999457 0.897892 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999406 0.002767 0.034352 -0.010703 -0.002650 0.999991 -0.003440 -0.041938 -0.034361 0.003347 0.999404 0.942641 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999328 0.003142 0.036508 -0.012679 -0.003002 0.999988 -0.003889 -0.041761 -0.036520 0.003777 0.999326 0.987583 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999238 0.004460 0.038768 -0.014283 -0.004301 0.999982 -0.004179 -0.042934 -0.038786 0.004009 0.999240 1.030917 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999184 0.004165 0.040181 -0.015295 -0.003991 0.999982 -0.004424 -0.042554 -0.040199 0.004260 0.999183 1.052545 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.999103 0.003638 0.042177 -0.014419 -0.003459 0.999985 -0.004316 -0.042578 -0.042192 0.004166 0.999101 1.097377 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.998979 0.004848 0.044923 -0.016962 -0.004621 0.999976 
-0.005154 -0.039558 -0.044946 0.004941 0.998977 1.142034 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.998843 0.005107 0.047808 -0.020347 -0.004880 0.999976 -0.004869 -0.039968 -0.047832 0.004630 0.998845 1.186319 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.998729 0.005086 0.050144 -0.022800 -0.004828 0.999974 -0.005267 -0.040132 -0.050169 0.005018 0.998728 1.230976 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.998544 0.004754 0.053736 -0.026974 -0.004502 0.999978 -0.004811 -0.040736 -0.053758 0.004562 0.998544 1.274744 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.998338 0.003981 0.057492 -0.031516 -0.003660 0.999977 -0.005686 -0.039600 -0.057513 0.005466 0.998330 1.316581 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.998230 0.003710 0.059363 -0.034009 -0.003387 0.999979 -0.005531 -0.040036 -0.059383 0.005320 0.998221 1.336201 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.997961 0.004389 0.063681 -0.038818 -0.004016 0.999974 -0.005976 -0.040770 -0.063705 0.005709 0.997952 1.373317 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.997701 0.002959 0.067704 -0.040976 -0.002525 0.999976 -0.006498 -0.042794 -0.067721 0.006312 0.997684 1.406825 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.997336 0.003108 0.072873 -0.043404 -0.002570 0.999969 -0.007479 -0.041889 -0.072894 0.007272 0.997313 1.438142 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.996935 0.001958 0.078204 -0.044723 -0.001292 0.999962 -0.008563 -0.041551 -0.078218 0.008436 0.996901 1.468493 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.996539 0.001715 0.083109 -0.044687 -0.000990 0.999961 -0.008770 -0.040916 -0.083121 0.008658 0.996502 1.497284 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.996010 0.000992 0.089240 -0.044670 -0.000139 0.999954 -0.009566 -0.039415 -0.089245 0.009515 0.995964 1.525691 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.995440 0.000409 0.095393 -0.043966 0.000547 0.999950 -0.009993 -0.038787 -0.095393 0.009999 0.995390 1.553025 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.995127 -0.000495 0.098602 -0.043629 0.001480 0.999950 -0.009916 -0.040019 -0.098592 0.010014 0.995078 1.567194 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.994457 -0.001029 0.105139 -0.042453 0.002172 0.999940 -0.010754 -0.038815 -0.105121 0.010923 0.994399 1.595717 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.993769 -0.001989 0.111439 -0.040834 0.003221 0.999936 -0.010880 -0.038869 -0.111411 0.011171 0.993712 1.623076 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.993081 -0.002525 0.117403 -0.037610 0.003893 0.999927 -0.011430 -0.038010 -0.117365 0.011808 0.993019 1.652861 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.992336 -0.003328 0.123523 -0.035369 0.004860 0.999915 -0.012103 -0.038383 -0.123472 0.012611 0.992268 1.681466 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.991646 -0.003863 0.128932 -0.030783 0.005461 0.999913 -0.012044 -0.038418 -0.128874 0.012647 0.991580 1.712980 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.990858 -0.004182 0.134846 -0.027447 0.005922 0.999904 -0.012507 -0.038177 -0.134781 0.013191 0.990788 1.742178 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.990097 -0.004550 0.140308 -0.023629 0.006371 0.999901 -0.012530 -0.038000 -0.140238 0.013300 0.990029 1.772753 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.989709 -0.004503 0.143024 -0.021398 0.006402 0.999897 -0.012823 -0.036700 
-0.142951 0.013607 0.989636 1.789764 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.988933 -0.005238 0.148269 -0.016446 0.007263 0.999887 -0.013122 -0.037431 -0.148183 0.014054 0.988860 1.817507 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.988096 -0.005693 0.153735 -0.014576 0.007759 0.999887 -0.012841 -0.040323 -0.153644 0.013881 0.988029 1.845693 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.987444 -0.005969 0.157857 -0.010907 0.008159 0.999879 -0.013229 -0.041469 -0.157759 0.014350 0.987373 1.874296 +0.485181 0.862545 0.500000 0.500000 0.000000 0.000000 0.986728 -0.006071 0.162265 -0.007846 0.008365 0.999874 -0.013456 -0.041462 -0.162163 0.014635 0.986655 1.901476 diff --git a/assets/re10k_poses/ab3d616a3d001515.txt b/assets/re10k_poses/ab3d616a3d001515.txt new file mode 100644 index 0000000000000000000000000000000000000000..67c13ad1c2f6c2154cc3e3aee5e68d39d504d172 --- /dev/null +++ b/assets/re10k_poses/ab3d616a3d001515.txt @@ -0,0 +1,49 @@ +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 -0.000000 1.000000 -0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999998 -0.000485 0.001717 -0.000326 0.000484 1.000000 0.000292 -0.005131 -0.001717 -0.000291 0.999998 0.000463 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999994 -0.000879 0.003249 -0.000639 0.000880 1.000000 -0.000331 -0.010310 -0.003249 0.000334 0.999995 0.001323 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999987 -0.001288 0.004904 -0.002084 0.001295 0.999998 -0.001527 -0.014493 -0.004902 0.001533 0.999987 0.002659 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999985 -0.001432 0.005191 -0.002268 0.001441 0.999997 -0.001717 -0.016827 -0.005189 0.001724 0.999985 0.003490 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999980 -0.001750 0.006089 -0.003092 0.001768 0.999994 -0.002828 -0.021230 -0.006084 0.002839 0.999977 0.005676 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999971 -0.001982 0.007380 -0.004578 0.002010 0.999991 -0.003803 -0.025205 -0.007372 0.003817 0.999966 0.008331 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999958 -0.002291 0.008838 -0.006178 0.002335 0.999985 -0.004902 -0.028590 -0.008826 0.004922 0.999949 0.011336 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999943 -0.002597 0.010402 -0.007941 0.002661 0.999978 -0.006145 -0.031734 -0.010386 0.006173 0.999927 0.014651 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999928 -0.002812 0.011660 -0.010335 0.002895 0.999971 -0.007112 -0.034391 -0.011640 0.007145 0.999907 0.018595 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999911 -0.003045 0.012957 -0.013843 0.003149 0.999963 -0.008000 -0.036580 -0.012932 0.008040 0.999884 0.023446 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999897 -0.003283 0.014000 -0.016906 0.003408 0.999954 -0.008950 -0.038224 -0.013970 0.008997 0.999862 0.029333 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999888 -0.003415 0.014566 -0.018593 0.003554 0.999949 -0.009502 -0.039231 -0.014533 0.009553 0.999849 0.032427 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999869 -0.003590 0.015797 -0.022067 0.003751 0.999941 -0.010200 -0.040974 -0.015759 0.010258 0.999823 0.038822 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999850 -0.003858 0.016863 -0.025543 0.004043 0.999932 -0.010964 -0.043269 -0.016820 0.011030 0.999798 0.045734 +0.499282 0.887612 0.500000 0.500000 0.000000 
0.000000 0.999825 -0.004067 0.018272 -0.029945 0.004285 0.999920 -0.011923 -0.045360 -0.018222 0.012000 0.999762 0.053349 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999803 -0.004304 0.019402 -0.034843 0.004549 0.999910 -0.012621 -0.046975 -0.019346 0.012707 0.999732 0.062490 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999791 -0.004504 0.019931 -0.040296 0.004776 0.999896 -0.013581 -0.048200 -0.019868 0.013674 0.999709 0.072455 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999773 -0.004453 0.020821 -0.045114 0.004748 0.999889 -0.014134 -0.048758 -0.020755 0.014230 0.999683 0.082761 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999746 -0.004513 0.022092 -0.051274 0.004844 0.999876 -0.014973 -0.049008 -0.022022 0.015076 0.999644 0.093260 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999734 -0.004663 0.022571 -0.054101 0.005009 0.999871 -0.015288 -0.049037 -0.022497 0.015397 0.999628 0.098670 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999723 -0.004793 0.023026 -0.060561 0.005166 0.999856 -0.016175 -0.048863 -0.022945 0.016290 0.999604 0.109569 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999712 -0.005038 0.023452 -0.067085 0.005430 0.999846 -0.016675 -0.048608 -0.023365 0.016798 0.999586 0.120657 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999710 -0.005409 0.023487 -0.074230 0.005811 0.999837 -0.017105 -0.048890 -0.023391 0.017236 0.999578 0.132014 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999697 -0.005215 0.024064 -0.082337 0.005641 0.999828 -0.017664 -0.049595 -0.023968 0.017794 0.999554 0.143662 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999680 -0.005245 0.024759 -0.091885 0.005691 0.999822 -0.017987 -0.049602 -0.024660 0.018122 0.999532 0.155812 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999682 -0.005647 0.024580 -0.101909 0.006094 0.999817 -0.018151 -0.049877 -0.024473 0.018295 0.999533 0.168439 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999682 -0.005552 0.024617 -0.112707 0.006011 0.999809 -0.018578 -0.049541 -0.024509 0.018720 0.999524 0.181189 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999690 -0.005421 0.024280 -0.117844 0.005873 0.999810 -0.018609 -0.049633 -0.024175 0.018746 0.999532 0.187612 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999701 -0.005242 0.023885 -0.128698 0.005696 0.999804 -0.018979 -0.048545 -0.023781 0.019109 0.999535 0.200795 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999698 -0.005224 0.024015 -0.139721 0.005688 0.999797 -0.019313 -0.047905 -0.023909 0.019444 0.999525 0.213596 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999705 -0.005207 0.023722 -0.150395 0.005674 0.999791 -0.019668 -0.046969 -0.023615 0.019797 0.999525 0.226089 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999714 -0.005032 0.023369 -0.160390 0.005495 0.999789 -0.019779 -0.046630 -0.023265 0.019902 0.999531 0.237971 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999710 -0.005057 0.023558 -0.171260 0.005533 0.999781 -0.020192 -0.046660 -0.023451 0.020316 0.999519 0.249779 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999716 -0.005028 0.023306 -0.181933 0.005504 0.999777 -0.020392 -0.046639 -0.023198 0.020514 0.999520 0.262356 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999725 -0.005054 0.022898 -0.192801 0.005528 0.999771 -0.020679 -0.045502 -0.022788 0.020800 0.999524 0.275931 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999728 
-0.005009 0.022790 -0.197825 0.005480 0.999772 -0.020638 -0.045016 -0.022682 0.020757 0.999527 0.283256 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999743 -0.004881 0.022139 -0.208547 0.005354 0.999758 -0.021344 -0.043415 -0.022029 0.021457 0.999527 0.298613 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999776 -0.004738 0.020635 -0.219383 0.005176 0.999761 -0.021242 -0.041503 -0.020530 0.021344 0.999561 0.314592 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999834 -0.004244 0.017727 -0.229654 0.004626 0.999757 -0.021565 -0.039864 -0.017632 0.021644 0.999610 0.330743 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999888 -0.003729 0.014519 -0.240414 0.004042 0.999758 -0.021642 -0.038302 -0.014435 0.021698 0.999660 0.347401 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999946 -0.003123 0.009953 -0.251440 0.003339 0.999756 -0.021822 -0.037225 -0.009882 0.021854 0.999712 0.364212 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999991 -0.002184 0.003621 -0.263150 0.002262 0.999760 -0.021810 -0.037185 -0.003573 0.021818 0.999756 0.380981 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999983 -0.000823 -0.005692 -0.275144 0.000696 0.999752 -0.022247 -0.036988 0.005709 0.022243 0.999736 0.396812 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999935 0.000177 -0.011407 -0.281579 -0.000433 0.999748 -0.022430 -0.036915 0.011400 0.022433 0.999683 0.404647 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999662 0.002665 -0.025859 -0.295223 -0.003254 0.999735 -0.022796 -0.036696 0.025791 0.022873 0.999406 0.419406 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.999038 0.005686 -0.043485 -0.308694 -0.006684 0.999717 -0.022841 -0.036357 0.043343 0.023110 0.998793 0.433006 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.997869 0.009007 -0.064617 -0.321625 -0.010563 0.999662 -0.023775 -0.035504 0.064381 0.024407 0.997627 0.446049 +0.499282 0.887612 0.500000 0.500000 0.000000 0.000000 0.996012 0.012716 -0.088308 -0.333908 -0.014974 0.999576 -0.024957 -0.033871 0.087954 0.026180 0.995781 0.458989 diff --git a/assets/re10k_poses/c0bdeae8f2b84b7f.txt b/assets/re10k_poses/c0bdeae8f2b84b7f.txt new file mode 100644 index 0000000000000000000000000000000000000000..f287fce8bdcd4ae093706ad60a7eca7b82e441bb --- /dev/null +++ b/assets/re10k_poses/c0bdeae8f2b84b7f.txt @@ -0,0 +1,49 @@ +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 -0.000000 -0.000000 -0.000000 -0.000000 1.000000 0.000000 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000035 0.000813 0.011953 0.000035 1.000000 -0.000333 -0.001293 -0.000813 0.000333 1.000000 0.027228 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999998 -0.000130 0.002049 0.023333 0.000131 1.000000 -0.000407 -0.002818 -0.002049 0.000407 0.999998 0.055190 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999995 -0.000306 0.003093 0.034288 0.000308 1.000000 -0.000752 -0.003946 -0.003093 0.000752 0.999995 0.081853 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999993 -0.000368 0.003616 0.038969 0.000372 0.999999 -0.000984 -0.004550 -0.003616 0.000985 0.999993 0.095366 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999990 -0.000518 0.004402 0.047686 0.000526 0.999998 -0.001788 -0.005800 -0.004401 0.001790 0.999989 0.122459 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999989 -0.000649 0.004707 0.055329 0.000660 0.999997 -0.002270 -0.007439 -0.004705 0.002274 
0.999986 0.146884 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999988 -0.000632 0.004843 0.062087 0.000645 0.999996 -0.002732 -0.009171 -0.004842 0.002736 0.999985 0.170706 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999985 -0.000559 0.005393 0.067726 0.000576 0.999995 -0.003135 -0.011563 -0.005391 0.003139 0.999981 0.194489 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999983 -0.000462 0.005875 0.073024 0.000484 0.999993 -0.003710 -0.013456 -0.005874 0.003713 0.999976 0.216432 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999981 -0.000379 0.006181 0.078012 0.000405 0.999991 -0.004278 -0.015020 -0.006179 0.004281 0.999972 0.238447 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999980 -0.000196 0.006386 0.082690 0.000225 0.999990 -0.004563 -0.016280 -0.006385 0.004565 0.999969 0.259839 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999979 -0.000159 0.006415 0.084968 0.000188 0.999989 -0.004627 -0.016342 -0.006414 0.004628 0.999969 0.270846 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999979 -0.000072 0.006533 0.089774 0.000105 0.999987 -0.005022 -0.016986 -0.006532 0.005023 0.999966 0.293888 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999978 -0.000101 0.006631 0.095192 0.000136 0.999986 -0.005320 -0.016529 -0.006631 0.005321 0.999964 0.318729 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999976 0.000027 0.006973 0.101006 0.000013 0.999984 -0.005649 -0.015788 -0.006973 0.005649 0.999960 0.345582 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999973 -0.000074 0.007317 0.107557 0.000117 0.999982 -0.005919 -0.015701 -0.007316 0.005920 0.999956 0.374176 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999973 -0.000059 0.007399 0.113781 0.000103 0.999983 -0.005886 -0.016267 -0.007399 0.005887 0.999955 0.403728 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999970 0.000114 0.007698 0.120324 -0.000067 0.999982 -0.006072 -0.018721 -0.007698 0.006071 0.999952 0.433728 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999967 -0.000077 0.008107 0.126294 0.000130 0.999979 -0.006553 -0.021362 -0.008107 0.006554 0.999946 0.463366 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999966 -0.000081 0.008188 0.129461 0.000135 0.999978 -0.006620 -0.023272 -0.008187 0.006621 0.999945 0.477213 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999964 -0.000177 0.008429 0.135788 0.000236 0.999976 -0.006985 -0.026442 -0.008428 0.006987 0.999940 0.504591 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999960 -0.000184 0.008972 0.142306 0.000246 0.999976 -0.006944 -0.029111 -0.008970 0.006945 0.999936 0.530229 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999955 -0.000297 0.009434 0.148756 0.000365 0.999974 -0.007257 -0.030909 -0.009431 0.007260 0.999929 0.555418 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999950 -0.000518 0.009971 0.156144 0.000596 0.999969 -0.007811 -0.032597 -0.009967 0.007817 0.999920 0.580444 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999946 -0.000436 0.010398 0.163764 0.000519 0.999968 -0.008020 -0.033923 -0.010394 0.008025 0.999914 0.605355 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999939 -0.000520 0.011014 0.172436 0.000614 0.999964 -0.008520 -0.034973 -0.011009 0.008526 0.999903 0.631049 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999927 -0.000467 0.012075 0.180833 0.000572 0.999962 -0.008648 -0.035757 -0.012070 0.008655 0.999890 0.658028 +0.470181 0.835878 0.500000 
0.500000 0.000000 0.000000 0.999921 -0.000501 0.012534 0.185013 0.000611 0.999961 -0.008777 -0.036157 -0.012529 0.008784 0.999883 0.671476 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999908 -0.000514 0.013549 0.192915 0.000636 0.999959 -0.008982 -0.036714 -0.013544 0.008990 0.999868 0.700236 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999899 -0.000637 0.014188 0.201377 0.000771 0.999956 -0.009383 -0.037739 -0.014181 0.009393 0.999855 0.729866 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999891 -0.000728 0.014747 0.210753 0.000869 0.999954 -0.009552 -0.039348 -0.014739 0.009563 0.999846 0.759760 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999882 -0.000904 0.015323 0.220835 0.001056 0.999950 -0.009961 -0.042138 -0.015313 0.009977 0.999833 0.788755 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999874 -0.000867 0.015847 0.230015 0.001030 0.999947 -0.010287 -0.044342 -0.015837 0.010302 0.999822 0.817443 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999866 -0.001011 0.016312 0.239918 0.001184 0.999943 -0.010633 -0.047499 -0.016300 0.010651 0.999810 0.843864 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999863 -0.001220 0.016501 0.249171 0.001395 0.999942 -0.010639 -0.050347 -0.016487 0.010661 0.999807 0.870351 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999856 -0.001204 0.016933 0.253302 0.001387 0.999940 -0.010824 -0.052344 -0.016919 0.010846 0.999798 0.882950 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999847 -0.001204 0.017458 0.261259 0.001399 0.999937 -0.011116 -0.055711 -0.017443 0.011139 0.999786 0.909235 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999848 -0.001260 0.017411 0.269017 0.001460 0.999933 -0.011485 -0.058941 -0.017396 0.011508 0.999782 0.935793 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999852 -0.001306 0.017155 0.276749 0.001505 0.999932 -0.011559 -0.060936 -0.017139 0.011583 0.999786 0.963491 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999844 -0.001235 0.017624 0.284631 0.001442 0.999930 -0.011765 -0.062503 -0.017608 0.011789 0.999775 0.991940 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999834 -0.001118 0.018182 0.293036 0.001335 0.999928 -0.011950 -0.063605 -0.018168 0.011972 0.999763 1.021376 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999824 -0.001058 0.018735 0.301022 0.001284 0.999926 -0.012073 -0.064935 -0.018720 0.012095 0.999752 1.052890 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999827 -0.001200 0.018552 0.309545 0.001425 0.999925 -0.012145 -0.066111 -0.018536 0.012170 0.999754 1.085754 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999830 -0.001254 0.018375 0.314069 0.001479 0.999924 -0.012215 -0.066669 -0.018358 0.012240 0.999757 1.102973 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999829 -0.001223 0.018464 0.323822 0.001445 0.999926 -0.012045 -0.068096 -0.018447 0.012069 0.999757 1.137579 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999816 -0.001154 0.019157 0.332336 0.001385 0.999927 -0.012021 -0.069171 -0.019141 0.012045 0.999744 1.173947 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999815 -0.001375 0.019192 0.342363 0.001606 0.999927 -0.012001 -0.070475 -0.019174 0.012030 0.999744 1.210997 +0.470181 0.835878 0.500000 0.500000 0.000000 0.000000 0.999810 -0.001296 0.019439 0.352856 0.001534 0.999924 -0.012209 -0.072263 -0.019422 0.012237 0.999736 1.248029 diff --git a/assets/re10k_poses/d5cddd204a805bad.txt 
b/assets/re10k_poses/d5cddd204a805bad.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e149197b29a5295744ddd62f1a6e652c80497c7 --- /dev/null +++ b/assets/re10k_poses/d5cddd204a805bad.txt @@ -0,0 +1,49 @@ +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.999942 0.000380 -0.010757 -0.019407 -0.000362 0.999999 0.001674 0.003743 0.010757 -0.001670 0.999941 0.044714 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.999765 0.000750 -0.021665 -0.037213 -0.000664 0.999992 0.003990 0.009484 0.021667 -0.003974 0.999757 0.090514 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.999494 0.001001 -0.031776 -0.055919 -0.000855 0.999989 0.004613 0.015675 0.031780 -0.004584 0.999484 0.138301 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.999288 0.000945 -0.037726 -0.064944 -0.000760 0.999988 0.004899 0.019009 0.037730 -0.004867 0.999276 0.162963 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.998787 0.001472 -0.049224 -0.081851 -0.001142 0.999977 0.006740 0.022928 0.049232 -0.006676 0.998765 0.213127 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.998150 0.001685 -0.060783 -0.099309 -0.001161 0.999962 0.008649 0.022523 0.060796 -0.008562 0.998114 0.265150 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.997307 0.001370 -0.073328 -0.114258 -0.000684 0.999956 0.009379 0.019631 0.073338 -0.009303 0.997264 0.315392 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.996335 0.002138 -0.085506 -0.127648 -0.001131 0.999930 0.011817 0.015204 0.085525 -0.011677 0.996268 0.363403 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.995216 0.002989 -0.097654 -0.139806 -0.001773 0.999920 0.012535 0.009366 0.097683 -0.012302 0.995141 0.409051 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.993840 0.003403 -0.110776 -0.150279 -0.001961 0.999912 0.013125 0.004833 0.110811 -0.012827 0.993759 0.452403 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.992415 0.004106 -0.122862 -0.160616 -0.002293 0.999886 0.014900 0.001779 0.122910 -0.014505 0.992312 0.495302 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.991537 0.004426 -0.129746 -0.165532 -0.002391 0.999872 0.015833 0.000757 0.129800 -0.015389 0.991421 0.516688 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.989703 0.005185 -0.143042 -0.175288 -0.002780 0.999851 0.017011 -0.000394 0.143109 -0.016438 0.989570 0.560027 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.987709 0.005704 -0.156200 -0.185797 -0.003102 0.999852 0.016898 0.001238 0.156274 -0.016206 0.987581 0.604269 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.985420 0.006334 -0.170022 -0.195947 -0.003307 0.999831 0.018077 0.005017 0.170107 -0.017251 0.985274 0.648615 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.982801 0.007514 -0.184517 -0.206640 -0.003760 0.999779 0.020690 0.009468 0.184632 -0.019641 0.982612 0.694840 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.980058 0.008846 -0.198514 -0.218238 -0.004384 0.999728 0.022910 0.012236 0.198662 -0.021583 0.979830 0.742605 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.977275 0.009810 -0.211746 -0.229752 -0.004984 0.999716 0.023310 0.012753 0.211915 -0.021725 0.977047 0.790915 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.974132 0.010526 -0.225732 -0.240410 -0.005310 0.999705 0.023701 0.011372 0.225915 -0.021890 
0.973901 0.837939 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.972385 0.010973 -0.233123 -0.245316 -0.005245 0.999669 0.025179 0.009565 0.233323 -0.023261 0.972121 0.860561 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.968690 0.011831 -0.247990 -0.253846 -0.005268 0.999618 0.027113 0.005290 0.248217 -0.024957 0.968383 0.904403 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.964671 0.012552 -0.263157 -0.260906 -0.005621 0.999618 0.027074 0.000986 0.263396 -0.024639 0.964373 0.945852 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.960491 0.014404 -0.277939 -0.265527 -0.006756 0.999572 0.028457 -0.001906 0.278230 -0.025455 0.960177 0.985619 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.956104 0.015370 -0.292625 -0.269310 -0.007257 0.999559 0.028791 -0.002625 0.292938 -0.025403 0.955794 1.025039 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.951185 0.016473 -0.308182 -0.270296 -0.007546 0.999517 0.030137 -0.001108 0.308529 -0.026341 0.950850 1.063455 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.945978 0.017756 -0.323745 -0.269330 -0.008159 0.999487 0.030979 0.003061 0.324129 -0.026664 0.945637 1.101103 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.940059 0.018928 -0.340486 -0.267211 -0.008394 0.999440 0.032384 0.009667 0.340908 -0.027585 0.939692 1.139350 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.936952 0.019642 -0.348904 -0.265744 -0.008490 0.999404 0.033463 0.013666 0.349354 -0.028391 0.936561 1.158643 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.930401 0.020616 -0.365963 -0.261202 -0.008269 0.999343 0.035274 0.021021 0.366450 -0.029793 0.929961 1.198613 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.922966 0.021337 -0.384290 -0.252687 -0.008502 0.999349 0.035068 0.026520 0.384788 -0.029100 0.922546 1.238247 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.915315 0.023283 -0.402065 -0.241884 -0.008947 0.999257 0.037498 0.029031 0.402639 -0.030725 0.914843 1.277587 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.906737 0.025785 -0.420907 -0.232449 -0.009634 0.999135 0.040454 0.029922 0.421586 -0.032626 0.906201 1.317419 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.897681 0.028042 -0.439753 -0.222358 -0.011152 0.999099 0.040945 0.030323 0.440505 -0.031852 0.897185 1.356960 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.887846 0.029916 -0.459167 -0.212220 -0.011828 0.999038 0.042220 0.031313 0.459989 -0.032053 0.887346 1.395743 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.878094 0.032752 -0.477365 -0.203560 -0.012920 0.998914 0.044770 0.031927 0.478313 -0.033144 0.877564 1.432926 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.872818 0.034054 -0.486856 -0.199788 -0.013408 0.998859 0.045831 0.032510 0.487862 -0.033474 0.872279 1.450581 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.861835 0.036647 -0.505862 -0.193482 -0.014485 0.998758 0.047678 0.034006 0.506981 -0.033763 0.861296 1.485191 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.850399 0.038922 -0.524697 -0.187225 -0.015519 0.998682 0.048930 0.036982 0.525910 -0.033467 0.849881 1.519922 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.839239 0.041659 -0.542165 -0.181826 -0.016506 0.998553 0.051178 0.041147 0.543512 -0.034002 0.838712 1.554743 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.827589 0.044937 -0.559533 -0.178087 -0.017474 0.998370 0.054335 0.045774 0.561063 -0.035190 0.827025 1.591248 +0.497882 0.885123 
0.500000 0.500000 0.000000 0.000000 0.815877 0.047018 -0.576311 -0.175745 -0.018227 0.998284 0.055641 0.050355 0.577938 -0.034892 0.815334 1.629258 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.804400 0.049470 -0.592024 -0.173198 -0.018599 0.998136 0.058134 0.053185 0.593797 -0.035752 0.803821 1.667530 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.792398 0.052670 -0.607727 -0.170991 -0.019391 0.997937 0.061206 0.052669 0.609697 -0.036716 0.791784 1.706168 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.786266 0.054261 -0.615500 -0.169641 -0.020293 0.997867 0.062046 0.051518 0.617554 -0.036294 0.785691 1.724857 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.773293 0.056031 -0.631568 -0.167008 -0.021666 0.997841 0.061999 0.049104 0.633678 -0.034260 0.772838 1.760388 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.759745 0.058699 -0.647566 -0.162958 -0.022919 0.997715 0.063550 0.046314 0.649817 -0.033440 0.759355 1.793546 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.745923 0.062392 -0.663103 -0.157865 -0.024053 0.997477 0.066796 0.041585 0.665598 -0.033875 0.745542 1.824212 +0.497882 0.885123 0.500000 0.500000 0.000000 0.000000 0.731464 0.063761 -0.678892 -0.152085 -0.024706 0.997443 0.067060 0.037093 0.681432 -0.032279 0.731169 1.852700 diff --git a/assets/re10k_poses/d6849f92207aa171.txt b/assets/re10k_poses/d6849f92207aa171.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e65f723bbf9e60e66ae7d8c972704dbae241d19 --- /dev/null +++ b/assets/re10k_poses/d6849f92207aa171.txt @@ -0,0 +1,49 @@ +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 -0.000000 1.000000 -0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 0.000064 0.001174 -0.006853 -0.000064 1.000000 -0.000222 0.002947 -0.001174 0.000222 0.999999 0.014517 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999997 -0.000032 0.002314 -0.014951 0.000033 1.000000 -0.000313 0.005496 -0.002314 0.000313 0.999997 0.030776 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999997 -0.000228 0.002337 -0.022979 0.000229 1.000000 -0.000321 0.007371 -0.002337 0.000322 0.999997 0.048828 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999997 0.000125 0.002530 -0.030696 -0.000124 1.000000 -0.000311 0.008946 -0.002530 0.000311 0.999997 0.068718 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999995 0.000244 0.003004 -0.034404 -0.000243 1.000000 -0.000237 0.009271 -0.003004 0.000237 0.999995 0.079236 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999992 0.000256 0.004042 -0.042718 -0.000253 1.000000 -0.000749 0.009632 -0.004042 0.000748 0.999992 0.101464 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999990 0.000035 0.004383 -0.050679 -0.000032 1.000000 -0.000635 0.010299 -0.004383 0.000635 0.999990 0.124883 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999990 -0.000525 0.004415 -0.057851 0.000529 1.000000 -0.000786 0.010093 -0.004415 0.000788 0.999990 0.148255 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999993 -0.001025 0.003702 -0.064173 0.001028 0.999999 -0.000652 0.009482 -0.003701 0.000656 0.999993 0.172088 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999992 -0.001211 0.003709 -0.070439 0.001215 0.999999 -0.001079 0.007707 -0.003707 0.001083 0.999993 0.196797 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999994 -0.001124 0.003212 -0.074953 0.001129 0.999998 
-0.001596 0.005743 -0.003210 0.001599 0.999994 0.222038 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999994 -0.001020 0.003284 -0.077487 0.001025 0.999998 -0.001575 0.004864 -0.003282 0.001579 0.999993 0.234668 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999995 -0.000791 0.003008 -0.082258 0.000797 0.999998 -0.002002 0.002893 -0.003006 0.002004 0.999993 0.259388 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999996 -0.000997 0.002521 -0.087103 0.001003 0.999997 -0.002444 0.000514 -0.002519 0.002447 0.999994 0.283747 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999997 -0.000873 0.002177 -0.092092 0.000880 0.999994 -0.003239 -0.001292 -0.002174 0.003241 0.999992 0.307550 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999998 -0.000949 0.001548 -0.096197 0.000955 0.999992 -0.003884 -0.003314 -0.001545 0.003886 0.999991 0.331484 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001042 0.001321 -0.100752 0.001048 0.999989 -0.004515 -0.004850 -0.001317 0.004517 0.999989 0.355573 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999998 -0.001263 0.001203 -0.105216 0.001270 0.999984 -0.005432 -0.006344 -0.001196 0.005433 0.999984 0.381506 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001373 0.000825 -0.109769 0.001378 0.999983 -0.005652 -0.006905 -0.000817 0.005654 0.999984 0.409912 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001355 0.000693 -0.112642 0.001359 0.999982 -0.005837 -0.006438 -0.000685 0.005838 0.999983 0.425071 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001480 0.000066 -0.118037 0.001481 0.999977 -0.006658 -0.004049 -0.000056 0.006658 0.999978 0.456552 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001641 0.000303 -0.123564 0.001643 0.999967 -0.007922 -0.000699 -0.000290 0.007922 0.999969 0.489741 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001667 -0.000078 -0.131195 0.001666 0.999958 -0.009021 0.001976 0.000093 0.009021 0.999959 0.523973 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001578 0.000246 -0.140301 0.001581 0.999946 -0.010309 0.003358 -0.000229 0.010309 0.999947 0.559209 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999999 -0.001583 0.000225 -0.150329 0.001585 0.999931 -0.011658 0.004426 -0.000206 0.011659 0.999932 0.594496 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999998 -0.001723 -0.000709 -0.160046 0.001714 0.999919 -0.012596 0.005561 0.000730 0.012595 0.999920 0.629359 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999997 -0.002242 -0.000826 -0.169308 0.002230 0.999892 -0.014498 0.006108 0.000858 0.014496 0.999895 0.663225 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999997 -0.002378 -0.001045 -0.173499 0.002361 0.999882 -0.015166 0.007081 0.001081 0.015164 0.999884 0.680129 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999996 -0.002353 -0.001609 -0.181995 0.002326 0.999867 -0.016122 0.006858 0.001647 0.016118 0.999869 0.713521 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999994 -0.002526 -0.002525 -0.188216 0.002479 0.999826 -0.018484 0.007530 0.002571 0.018477 0.999826 0.746714 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999993 -0.002335 -0.002892 -0.191836 0.002279 0.999815 -0.019126 0.008786 0.002936 0.019120 0.999813 0.780679 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999993 -0.002052 -0.003127 -0.195215 0.001986 0.999776 -0.021049 0.010979 0.003170 0.021043 
0.999774 0.816036 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999990 -0.002018 -0.003905 -0.197136 0.001930 0.999750 -0.022270 0.014766 0.003949 0.022262 0.999744 0.853522 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999988 -0.002283 -0.004440 -0.197500 0.002176 0.999708 -0.024087 0.019450 0.004494 0.024077 0.999700 0.892137 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999988 -0.002782 -0.004105 -0.196915 0.002676 0.999671 -0.025511 0.024639 0.004175 0.025500 0.999666 0.931874 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999986 -0.002986 -0.004305 -0.196323 0.002872 0.999651 -0.026249 0.026329 0.004382 0.026236 0.999646 0.951859 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999982 -0.003367 -0.004939 -0.194206 0.003229 0.999612 -0.027662 0.029722 0.005030 0.027646 0.999605 0.992540 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999979 -0.003929 -0.005229 -0.189245 0.003778 0.999581 -0.028702 0.030496 0.005340 0.028682 0.999574 1.033937 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999970 -0.004790 -0.006162 -0.183635 0.004603 0.999538 -0.030044 0.029527 0.006303 0.030014 0.999530 1.074981 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999966 -0.005280 -0.006328 -0.177010 0.005075 0.999474 -0.032025 0.028281 0.006494 0.031991 0.999467 1.116395 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999965 -0.005175 -0.006649 -0.170823 0.004947 0.999415 -0.033842 0.025539 0.006820 0.033808 0.999405 1.155532 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999965 -0.005536 -0.006211 -0.165555 0.005309 0.999343 -0.035862 0.022663 0.006406 0.035828 0.999337 1.194167 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999964 -0.005853 -0.006150 -0.161629 0.005612 0.999246 -0.038409 0.019634 0.006370 0.038373 0.999243 1.231885 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999960 -0.006007 -0.006569 -0.159732 0.005742 0.999197 -0.039641 0.018813 0.006801 0.039602 0.999192 1.251023 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999955 -0.005963 -0.007335 -0.156620 0.005650 0.999105 -0.041925 0.017164 0.007579 0.041882 0.999094 1.289603 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999953 -0.006007 -0.007628 -0.153704 0.005660 0.998985 -0.044694 0.016587 0.007889 0.044649 0.998972 1.328780 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999948 -0.006066 -0.008158 -0.150858 0.005673 0.998863 -0.047328 0.016921 0.008436 0.047279 0.998846 1.370076 +0.502934 0.894104 0.500000 0.500000 0.000000 0.000000 0.999948 -0.006063 -0.008224 -0.149121 0.005645 0.998735 -0.049965 0.016199 0.008516 0.049916 0.998717 1.412501 diff --git a/assets/re10k_poses/ebf9eb32e850ea81.txt b/assets/re10k_poses/ebf9eb32e850ea81.txt new file mode 100644 index 0000000000000000000000000000000000000000..71d3ef611e4eabd0399c82cc6a59975f15c429e5 --- /dev/null +++ b/assets/re10k_poses/ebf9eb32e850ea81.txt @@ -0,0 +1,49 @@ +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 0.000000 0.000000 -0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 1.000000 0.000189 0.000635 0.014635 -0.000189 1.000000 -0.000125 -0.000171 -0.000635 0.000125 1.000000 0.000311 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999996 0.000519 0.002700 0.042540 -0.000518 1.000000 -0.000341 -0.000592 -0.002701 0.000340 0.999996 0.000652 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999993 0.000785 0.003730 
0.056438 -0.000784 1.000000 -0.000364 -0.000925 -0.003730 0.000361 0.999993 0.000780 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999983 0.001140 0.005706 0.084397 -0.001137 0.999999 -0.000516 -0.001437 -0.005707 0.000510 0.999984 0.000906 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999977 0.001359 0.006600 0.098384 -0.001355 0.999999 -0.000650 -0.001562 -0.006601 0.000641 0.999978 0.001111 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999963 0.001857 0.008350 0.126390 -0.001850 0.999998 -0.000816 -0.002005 -0.008352 0.000801 0.999965 0.001400 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999955 0.001932 0.009275 0.140430 -0.001922 0.999998 -0.001090 -0.001902 -0.009277 0.001072 0.999956 0.001387 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999932 0.002478 0.011385 0.167525 -0.002467 0.999996 -0.001014 -0.002631 -0.011387 0.000986 0.999935 0.001365 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999922 0.002597 0.012245 0.181588 -0.002584 0.999996 -0.001120 -0.002718 -0.012248 0.001088 0.999924 0.001296 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999895 0.003001 0.014164 0.208939 -0.002985 0.999995 -0.001198 -0.003097 -0.014168 0.001155 0.999899 0.001294 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999878 0.003289 0.015255 0.222249 -0.003274 0.999994 -0.000993 -0.003634 -0.015258 0.000943 0.999883 0.001178 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999851 0.003713 0.016875 0.250209 -0.003692 0.999992 -0.001274 -0.003715 -0.016880 0.001211 0.999857 0.000994 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999834 0.003946 0.017793 0.263751 -0.003925 0.999992 -0.001169 -0.004028 -0.017798 0.001098 0.999841 0.000819 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999803 0.004356 0.019359 0.291357 -0.004334 0.999990 -0.001189 -0.004308 -0.019364 0.001105 0.999812 0.000570 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999785 0.004631 0.020197 0.304967 -0.004607 0.999989 -0.001243 -0.004468 -0.020203 0.001150 0.999795 0.000469 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999749 0.005162 0.021813 0.332076 -0.005139 0.999986 -0.001106 -0.004942 -0.021818 0.000994 0.999761 0.000273 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999726 0.005400 0.022783 0.345403 -0.005371 0.999985 -0.001334 -0.004778 -0.022790 0.001211 0.999740 0.000062 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999686 0.005985 0.024339 0.372389 -0.005955 0.999981 -0.001297 -0.005081 -0.024347 0.001152 0.999703 -0.000281 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999663 0.006235 0.025183 0.385965 -0.006197 0.999980 -0.001582 -0.004772 -0.025192 0.001425 0.999682 -0.000546 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999618 0.006859 0.026790 0.412664 -0.006823 0.999976 -0.001417 -0.005184 -0.026799 0.001234 0.999640 -0.001048 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999595 0.007144 0.027537 0.426054 -0.007114 0.999974 -0.001182 -0.005595 -0.027545 0.000986 0.999620 -0.001216 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999546 0.007729 0.029114 0.452901 -0.007692 0.999969 -0.001389 -0.005410 -0.029123 0.001165 0.999575 -0.002048 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999524 0.008018 0.029797 0.466295 -0.007982 0.999967 -0.001318 -0.005494 -0.029807 0.001079 0.999555 -0.002368 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999472 0.008674 0.031301 0.492989 -0.008635 0.999962 -0.001375 
-0.005456 -0.031312 0.001104 0.999509 -0.003251 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999447 0.008954 0.032016 0.506354 -0.008911 0.999959 -0.001471 -0.005306 -0.032028 0.001185 0.999486 -0.003572 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999393 0.009723 0.033463 0.532752 -0.009682 0.999952 -0.001401 -0.005397 -0.033475 0.001076 0.999439 -0.004537 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999368 0.010139 0.034060 0.546404 -0.010095 0.999948 -0.001461 -0.005335 -0.034073 0.001116 0.999419 -0.004987 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999310 0.010939 0.035508 0.572863 -0.010895 0.999940 -0.001413 -0.005354 -0.035522 0.001025 0.999368 -0.006006 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999284 0.011314 0.036102 0.586254 -0.011272 0.999936 -0.001346 -0.005411 -0.036115 0.000938 0.999347 -0.006601 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999218 0.012184 0.037617 0.612687 -0.012147 0.999925 -0.001207 -0.005500 -0.037629 0.000749 0.999291 -0.008041 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999190 0.012580 0.038216 0.626235 -0.012542 0.999921 -0.001220 -0.005449 -0.038228 0.000739 0.999269 -0.008666 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999124 0.013512 0.039596 0.652940 -0.013484 0.999909 -0.000951 -0.005687 -0.039605 0.000416 0.999215 -0.010143 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999091 0.013998 0.040252 0.666384 -0.013968 0.999902 -0.001036 -0.005527 -0.040263 0.000473 0.999189 -0.010825 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.999019 0.015021 0.041654 0.693261 -0.015004 0.999887 -0.000718 -0.005808 -0.041660 0.000092 0.999132 -0.012439 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998994 0.015566 0.042056 0.707246 -0.015536 0.999879 -0.001047 -0.005312 -0.042067 0.000392 0.999115 -0.013171 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998916 0.016664 0.043457 0.733959 -0.016649 0.999861 -0.000713 -0.005677 -0.043463 -0.000011 0.999055 -0.014835 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998873 0.017258 0.044224 0.747400 -0.017243 0.999851 -0.000730 -0.005507 -0.044230 -0.000034 0.999021 -0.015753 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998801 0.018320 0.045387 0.775021 -0.018302 0.999832 -0.000816 -0.005331 -0.045394 -0.000016 0.998969 -0.017364 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998720 0.019411 0.046700 0.802249 -0.019392 0.999812 -0.000865 -0.005198 -0.046708 -0.000041 0.998909 -0.019138 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998686 0.019991 0.047178 0.816519 -0.019961 0.999800 -0.001109 -0.004863 -0.047191 0.000166 0.998886 -0.019770 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998628 0.020635 0.048126 0.829710 -0.020617 0.999787 -0.000866 -0.005176 -0.048134 -0.000128 0.998841 -0.020716 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998525 0.021830 0.049702 0.856855 -0.021823 0.999762 -0.000697 -0.005408 -0.049705 -0.000388 0.998764 -0.022536 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998486 0.022434 0.050225 0.870902 -0.022423 0.999748 -0.000782 -0.005286 -0.050230 -0.000346 0.998738 -0.023387 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998389 0.023634 0.051591 0.898296 -0.023624 0.999721 -0.000794 -0.005331 -0.051595 -0.000426 0.998668 -0.025107 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998326 0.024276 0.052488 0.911671 -0.024280 0.999705 -0.000578 -0.005671 
-0.052486 -0.000698 0.998621 -0.026015 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998223 0.025496 0.053853 0.939418 -0.025500 0.999675 -0.000603 -0.005786 -0.053851 -0.000772 0.998549 -0.027736 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998103 0.026705 0.055467 0.966590 -0.026712 0.999643 -0.000620 -0.005963 -0.055463 -0.000863 0.998460 -0.029470 +0.528932 0.940323 0.500000 0.500000 0.000000 0.000000 0.998039 0.027387 0.056284 0.980128 -0.027378 0.999625 -0.000935 -0.005629 -0.056288 -0.000608 0.998414 -0.030254 diff --git a/assets/re10k_poses/ec5c53a3d68fe3e7.txt b/assets/re10k_poses/ec5c53a3d68fe3e7.txt new file mode 100644 index 0000000000000000000000000000000000000000..828e946b72baf2c5195795650f542245c08bdbad --- /dev/null +++ b/assets/re10k_poses/ec5c53a3d68fe3e7.txt @@ -0,0 +1,49 @@ +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 -0.000000 0.000000 -0.000000 1.000000 0.000000 0.000000 -0.000000 -0.000000 1.000000 0.000000 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.999892 -0.000818 0.014672 -0.017619 0.000821 1.000000 -0.000249 0.001386 -0.014671 0.000261 0.999892 -0.003520 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.999575 -0.001460 0.029104 -0.034500 0.001471 0.999999 -0.000380 0.002525 -0.029103 0.000422 0.999576 -0.007302 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.999039 -0.002125 0.043790 -0.050857 0.002148 0.999998 -0.000477 0.003810 -0.043789 0.000571 0.999041 -0.011142 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.998260 -0.002781 0.058900 -0.066446 0.002823 0.999996 -0.000637 0.005194 -0.058898 0.000802 0.998264 -0.015410 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.997774 -0.003153 0.066611 -0.074006 0.003200 0.999995 -0.000592 0.005792 -0.066608 0.000804 0.997779 -0.017845 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.996609 -0.003722 0.082198 -0.089242 0.003790 0.999993 -0.000663 0.006620 -0.082195 0.000972 0.996616 -0.023226 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.995169 -0.004139 0.098085 -0.104487 0.004228 0.999991 -0.000703 0.007291 -0.098081 0.001114 0.995178 -0.028187 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.993469 -0.004507 0.114015 -0.119654 0.004616 0.999989 -0.000688 0.007139 -0.114010 0.001210 0.993479 -0.033958 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.991555 -0.004996 0.129592 -0.134803 0.005133 0.999987 -0.000720 0.007446 -0.129587 0.001379 0.991567 -0.038934 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.989434 -0.005079 0.144897 -0.150102 0.005255 0.999986 -0.000828 0.007222 -0.144890 0.001580 0.989446 -0.043521 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.987142 -0.005153 0.159764 -0.165227 0.005322 0.999986 -0.000633 0.006785 -0.159758 0.001476 0.987155 -0.048171 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.984748 -0.005406 0.173904 -0.180050 0.005599 0.999984 -0.000623 0.005683 -0.173898 0.001587 0.984762 -0.054045 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.983504 -0.005639 0.180798 -0.187330 0.005849 0.999983 -0.000627 0.005380 -0.180791 0.001675 0.983520 -0.056882 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.980987 -0.006144 0.193975 -0.201933 0.006342 0.999980 -0.000399 0.004372 -0.193969 0.001621 0.981006 -0.063050 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.978404 -0.006737 0.206593 -0.216874 0.006955 0.999976 -0.000331 0.003231 -0.206586 0.001761 0.978427 -0.069268 +0.432004 0.768007 0.500000 0.500000 
0.000000 0.000000 0.975782 -0.007255 0.218625 -0.231944 0.007479 0.999972 -0.000196 0.002273 -0.218617 0.001826 0.975809 -0.076339 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.973174 -0.007483 0.229947 -0.247272 0.007723 0.999970 -0.000142 0.001053 -0.229939 0.001914 0.973203 -0.083939 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.970632 -0.007646 0.240449 -0.262609 0.007899 0.999969 -0.000091 0.000237 -0.240441 0.001987 0.970662 -0.091880 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.968226 -0.007766 0.249955 -0.278207 0.008028 0.999968 -0.000029 -0.000259 -0.249947 0.002035 0.968257 -0.100089 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.965935 -0.007782 0.258666 -0.294252 0.008088 0.999967 -0.000117 -0.000437 -0.258657 0.002205 0.965967 -0.108482 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.964836 -0.007932 0.262733 -0.302357 0.008252 0.999966 -0.000113 -0.000433 -0.262723 0.002277 0.964869 -0.112972 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.962634 -0.008292 0.270678 -0.318669 0.008668 0.999962 -0.000195 -0.000261 -0.270666 0.002534 0.962670 -0.122323 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.960546 -0.008449 0.277993 -0.335300 0.008874 0.999961 -0.000271 -0.000680 -0.277980 0.002727 0.960583 -0.131508 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.958596 -0.008455 0.284645 -0.352212 0.008906 0.999960 -0.000291 -0.001183 -0.284631 0.002814 0.958633 -0.141409 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.956809 -0.008532 0.290592 -0.369415 0.009034 0.999959 -0.000387 -0.001333 -0.290577 0.002996 0.956847 -0.151297 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.955271 -0.008800 0.295603 -0.386473 0.009338 0.999956 -0.000409 -0.001227 -0.295586 0.003151 0.955311 -0.161364 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.953990 -0.008859 0.299706 -0.403249 0.009436 0.999955 -0.000479 -0.001589 -0.299689 0.003285 0.954031 -0.171308 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.952800 -0.008722 0.303474 -0.420061 0.009331 0.999956 -0.000558 -0.001716 -0.303456 0.003363 0.952839 -0.181650 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.952225 -0.008873 0.305269 -0.428422 0.009488 0.999955 -0.000529 -0.001984 -0.305250 0.003400 0.952266 -0.186040 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.951126 -0.009082 0.308670 -0.444875 0.009775 0.999952 -0.000698 -0.001944 -0.308649 0.003682 0.951169 -0.195442 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.950193 -0.009399 0.311519 -0.461193 0.010104 0.999949 -0.000648 -0.001886 -0.311497 0.003764 0.950240 -0.205146 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.949454 -0.009725 0.313756 -0.477099 0.010451 0.999945 -0.000633 -0.002181 -0.313732 0.003880 0.949504 -0.215422 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.948767 -0.009993 0.315820 -0.492554 0.010715 0.999942 -0.000550 -0.002558 -0.315796 0.003906 0.948819 -0.226017 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.948057 -0.010103 0.317940 -0.507942 0.010852 0.999941 -0.000585 -0.003093 -0.317915 0.004005 0.948111 -0.236776 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.947367 -0.010125 0.319990 -0.523624 0.010914 0.999940 -0.000670 -0.003674 -0.319964 0.004127 0.947421 -0.247957 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.946752 -0.010255 0.321799 -0.539260 0.011077 0.999938 -0.000723 -0.003901 -0.321772 0.004249 0.946808 -0.259314 +0.432004 0.768007 0.500000 0.500000 0.000000 
0.000000 0.946487 -0.010532 0.322570 -0.546866 0.011349 0.999935 -0.000653 -0.004092 -0.322542 0.004279 0.946545 -0.264981 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.946082 -0.010996 0.323740 -0.562210 0.011849 0.999930 -0.000663 -0.004191 -0.323710 0.004463 0.946146 -0.276273 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.945781 -0.011197 0.324613 -0.577144 0.012037 0.999927 -0.000579 -0.004603 -0.324583 0.004455 0.945847 -0.287975 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.945479 -0.011373 0.325484 -0.591569 0.012210 0.999925 -0.000527 -0.004464 -0.325454 0.004472 0.945547 -0.299837 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.945179 -0.011552 0.326350 -0.605704 0.012400 0.999923 -0.000516 -0.004309 -0.326318 0.004534 0.945249 -0.311948 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.944881 -0.011762 0.327203 -0.619002 0.012622 0.999920 -0.000505 -0.003874 -0.327171 0.004607 0.944954 -0.324393 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.944650 -0.011863 0.327866 -0.632206 0.012736 0.999919 -0.000514 -0.003318 -0.327834 0.004661 0.944724 -0.336802 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.944535 -0.011918 0.328195 -0.644489 0.012736 0.999919 -0.000343 -0.002597 -0.328164 0.004504 0.944610 -0.349411 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.944538 -0.011924 0.328186 -0.650551 0.012700 0.999919 -0.000222 -0.002436 -0.328157 0.004377 0.944613 -0.355610 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.944574 -0.012027 0.328078 -0.661877 0.012792 0.999918 -0.000172 -0.001866 -0.328049 0.004360 0.944651 -0.368515 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.944648 -0.012119 0.327862 -0.672403 0.012874 0.999917 -0.000132 -0.001579 -0.327833 0.004345 0.944726 -0.381544 +0.432004 0.768007 0.500000 0.500000 0.000000 0.000000 0.944645 -0.012183 0.327868 -0.682081 0.012913 0.999917 -0.000049 -0.001455 -0.327840 0.004280 0.944724 -0.394843 diff --git a/assets/re10k_poses/ec8ee53e2d07e6ba.txt b/assets/re10k_poses/ec8ee53e2d07e6ba.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4ccb1156f32e9cef4b542a3c0f5ecf473f45385 --- /dev/null +++ b/assets/re10k_poses/ec8ee53e2d07e6ba.txt @@ -0,0 +1,49 @@ +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 -0.000000 1.000000 0.000000 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000152 -0.000121 -0.012012 0.000152 1.000000 -0.000075 -0.010840 0.000121 0.000075 1.000000 0.047443 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000448 -0.000629 -0.014647 0.000448 1.000000 0.000464 -0.018447 0.000629 -0.000465 1.000000 0.072636 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999999 -0.000980 0.001105 -0.032179 0.000979 0.999999 0.000591 -0.037677 -0.001106 -0.000590 0.999999 0.149587 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999998 -0.001463 0.001368 -0.035051 0.001462 0.999999 0.000883 -0.042654 -0.001369 -0.000881 0.999999 0.177634 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999994 -0.002115 0.002613 -0.045213 0.002110 0.999996 0.001939 -0.061827 -0.002617 -0.001934 0.999995 0.256417 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999991 -0.002621 0.003318 -0.048726 0.002612 0.999993 0.002512 -0.067966 -0.003324 -0.002504 0.999991 0.282744 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999984 -0.003135 0.004788 -0.052807 0.003120 0.999991 
0.002990 -0.078875 -0.004797 -0.002975 0.999984 0.336163 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999970 -0.004063 0.006529 -0.055728 0.004040 0.999986 0.003446 -0.088265 -0.006543 -0.003420 0.999973 0.391586 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999952 -0.005052 0.008380 -0.058304 0.005021 0.999981 0.003660 -0.095974 -0.008398 -0.003617 0.999958 0.445434 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999945 -0.005575 0.008883 -0.056713 0.005539 0.999977 0.004033 -0.101209 -0.008905 -0.003983 0.999952 0.473153 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999909 -0.007076 0.011456 -0.056895 0.007034 0.999968 0.003690 -0.112931 -0.011482 -0.003609 0.999928 0.554367 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999896 -0.007932 0.012070 -0.055131 0.007888 0.999962 0.003690 -0.116415 -0.012099 -0.003594 0.999920 0.581413 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999839 -0.010948 0.014201 -0.054561 0.010901 0.999935 0.003347 -0.127628 -0.014237 -0.003192 0.999894 0.661084 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999822 -0.011939 0.014624 -0.054325 0.011893 0.999924 0.003189 -0.132390 -0.014661 -0.003014 0.999888 0.687156 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999781 -0.014383 0.015197 -0.053487 0.014332 0.999891 0.003441 -0.141690 -0.015245 -0.003223 0.999879 0.739590 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999749 -0.016747 0.014885 -0.051762 0.016701 0.999855 0.003196 -0.148735 -0.014936 -0.002947 0.999884 0.792615 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999733 -0.017888 0.014600 -0.048967 0.017828 0.999832 0.004211 -0.156617 -0.014673 -0.003949 0.999884 0.816935 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999664 -0.020530 0.015821 -0.045509 0.020452 0.999778 0.005078 -0.175018 -0.015922 -0.004753 0.999862 0.896843 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999633 -0.021369 0.016635 -0.039334 0.021279 0.999758 0.005566 -0.188448 -0.016750 -0.005210 0.999846 0.950668 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999597 -0.021269 0.018780 -0.041828 0.021148 0.999755 0.006602 -0.204449 -0.018916 -0.006203 0.999802 1.009104 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999570 -0.021563 0.019849 -0.042187 0.021423 0.999745 0.007203 -0.211364 -0.019999 -0.006774 0.999777 1.040346 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999505 -0.021821 0.022667 -0.047121 0.021653 0.999736 0.007634 -0.222710 -0.022828 -0.007139 0.999714 1.102351 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999449 -0.021384 0.025400 -0.053584 0.021209 0.999750 0.007119 -0.232121 -0.025546 -0.006577 0.999652 1.163674 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999381 -0.020745 0.028423 -0.059754 0.020561 0.999766 0.006758 -0.242933 -0.028556 -0.006169 0.999573 1.223891 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999288 -0.020313 0.031784 -0.067879 0.020130 0.999779 0.006062 -0.251665 -0.031900 -0.005417 0.999476 1.285750 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999171 -0.020274 0.035311 -0.075841 0.020034 0.999774 0.007147 -0.264177 -0.035448 -0.006434 0.999351 1.349794 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999055 -0.019508 0.038828 -0.082157 0.019226 0.999786 0.007624 -0.274211 -0.038969 -0.006870 0.999217 1.412946 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.999033 -0.017955 0.040134 -0.083642 0.017640 0.999811 0.008175 -0.280946 
-0.040273 -0.007459 0.999161 1.449174 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.998883 -0.015580 0.044611 -0.094371 0.015244 0.999853 0.007862 -0.290355 -0.044727 -0.007173 0.998973 1.512075 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.998699 -0.018566 0.047486 -0.097665 0.018221 0.999804 0.007690 -0.301511 -0.047620 -0.006815 0.998842 1.563515 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.998456 -0.020917 0.051457 -0.103968 0.020587 0.999764 0.006937 -0.311120 -0.051590 -0.005867 0.998651 1.614183 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.998239 -0.024161 0.054175 -0.106039 0.023903 0.999700 0.005411 -0.316985 -0.054290 -0.004106 0.998517 1.665657 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.998046 -0.025626 0.056989 -0.108739 0.025390 0.999666 0.004855 -0.326821 -0.057094 -0.003398 0.998363 1.723733 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997826 -0.026543 0.060324 -0.111867 0.026328 0.999644 0.004367 -0.337291 -0.060418 -0.002769 0.998169 1.791481 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997606 -0.027239 0.063569 -0.113263 0.027069 0.999627 0.003534 -0.345836 -0.063641 -0.001805 0.997971 1.865463 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997532 -0.027002 0.064809 -0.115314 0.026887 0.999635 0.002648 -0.347884 -0.064857 -0.000899 0.997894 1.896211 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997553 -0.028820 0.063692 -0.110617 0.028779 0.999585 0.001560 -0.356787 -0.063710 0.000276 0.997968 1.942270 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997561 -0.030254 0.062906 -0.109431 0.030336 0.999540 -0.000349 -0.362528 -0.062867 0.002257 0.998019 1.979610 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997540 -0.032428 0.062145 -0.108583 0.032598 0.999467 -0.001728 -0.372199 -0.062056 0.003750 0.998066 2.009728 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997538 -0.035528 0.060457 -0.102035 0.035777 0.999355 -0.003049 -0.381277 -0.060309 0.005204 0.998166 2.035357 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997436 -0.036833 0.061358 -0.099398 0.037072 0.999309 -0.002766 -0.398238 -0.061214 0.005034 0.998112 2.086371 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997353 -0.038584 0.061631 -0.094389 0.038857 0.999240 -0.003226 -0.411190 -0.061460 0.005612 0.998094 2.141555 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997314 -0.037843 0.062704 -0.091990 0.038110 0.999269 -0.003069 -0.425066 -0.062542 0.005450 0.998027 2.198218 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997250 -0.037982 0.063642 -0.090311 0.038254 0.999263 -0.003050 -0.431885 -0.063479 0.005476 0.997968 2.227505 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.997057 -0.036439 0.067446 -0.089299 0.036702 0.999323 -0.002673 -0.450568 -0.067303 0.005140 0.997719 2.301579 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.996964 -0.036144 0.068960 -0.089361 0.036409 0.999334 -0.002578 -0.456039 -0.068820 0.005081 0.997616 2.326873 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.996814 -0.035002 0.071666 -0.087715 0.035250 0.999376 -0.002195 -0.466253 -0.071545 0.004714 0.997426 2.376839 +0.477267 0.848475 0.500000 0.500000 0.000000 0.000000 0.996629 -0.034577 0.074396 -0.085681 0.034857 0.999389 -0.002460 -0.472506 -0.074265 0.005045 0.997226 2.423373 diff --git a/assets/re10k_poses/ffa95c3b40609c76.txt b/assets/re10k_poses/ffa95c3b40609c76.txt new file mode 100644 index 
0000000000000000000000000000000000000000..0d0c33a07301d0a84819c0fd3d0a99ebf19ea4dd --- /dev/null +++ b/assets/re10k_poses/ffa95c3b40609c76.txt @@ -0,0 +1,49 @@ +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 1.000000 -0.000000 -0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -0.000000 0.000000 1.000000 0.000000 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 1.000000 0.000215 -0.000327 0.001813 -0.000214 1.000000 0.000477 -0.009889 0.000327 -0.000477 1.000000 -0.001529 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 1.000000 0.000317 -0.000699 0.003404 -0.000316 0.999999 0.001008 -0.020295 0.000699 -0.001008 0.999999 -0.003326 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 1.000000 0.000318 -0.000751 0.004891 -0.000317 0.999999 0.001477 -0.030886 0.000751 -0.001476 0.999999 -0.005395 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 1.000000 0.000409 -0.000742 0.006001 -0.000408 0.999998 0.001824 -0.036560 0.000742 -0.001824 0.999998 -0.006422 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 1.000000 0.000526 -0.000768 0.007271 -0.000524 0.999997 0.002248 -0.047519 0.000769 -0.002248 0.999997 -0.008329 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999999 0.000716 -0.000859 0.008957 -0.000713 0.999996 0.002643 -0.058917 0.000861 -0.002642 0.999996 -0.010188 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999999 0.000731 -0.001036 0.010127 -0.000728 0.999995 0.003061 -0.070428 0.001038 -0.003060 0.999995 -0.012113 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999999 0.000898 -0.001072 0.012077 -0.000894 0.999993 0.003660 -0.081854 0.001076 -0.003659 0.999993 -0.014271 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999999 0.000992 -0.001138 0.013399 -0.000988 0.999991 0.004035 -0.093187 0.001142 -0.004034 0.999991 -0.016292 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999998 0.001144 -0.001424 0.015275 -0.001137 0.999989 0.004575 -0.104784 0.001429 -0.004574 0.999989 -0.018427 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999998 0.001333 -0.001589 0.017036 -0.001325 0.999986 0.005046 -0.116144 0.001596 -0.005044 0.999986 -0.020853 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999998 0.001341 -0.001517 0.017487 -0.001333 0.999985 0.005314 -0.121989 0.001524 -0.005312 0.999985 -0.022131 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999997 0.001529 -0.001691 0.019269 -0.001520 0.999983 0.005645 -0.133104 0.001700 -0.005642 0.999983 -0.024771 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999997 0.001416 -0.001932 0.020898 -0.001404 0.999980 0.006180 -0.144576 0.001941 -0.006177 0.999979 -0.027307 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999997 0.001548 -0.001990 0.022102 -0.001535 0.999976 0.006696 -0.155835 0.002001 -0.006693 0.999976 -0.030287 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999996 0.001815 -0.002178 0.023868 -0.001799 0.999972 0.007242 -0.167078 0.002191 -0.007238 0.999971 -0.033047 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999996 0.001735 -0.002195 0.025182 -0.001718 0.999968 0.007756 -0.178377 0.002208 -0.007752 0.999968 -0.036110 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999995 0.001888 -0.002457 0.026766 -0.001868 0.999965 0.008184 -0.189906 0.002473 -0.008180 0.999963 -0.039023 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999995 0.002015 -0.002575 0.028158 -0.001993 0.999961 0.008660 -0.201422 0.002592 -0.008654 0.999959 -0.041661 +0.615278 1.093828 0.500000 0.500000 
0.000000 0.000000 0.999995 0.001951 -0.002455 0.028327 -0.001929 0.999959 0.008898 -0.207477 0.002472 -0.008893 0.999957 -0.043296 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999994 0.002039 -0.002711 0.029982 -0.002014 0.999954 0.009427 -0.219348 0.002730 -0.009421 0.999952 -0.045578 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999994 0.001914 -0.002915 0.030768 -0.001885 0.999949 0.009949 -0.231634 0.002934 -0.009943 0.999946 -0.047678 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999994 0.002042 -0.002807 0.030782 -0.002013 0.999942 0.010552 -0.244134 0.002828 -0.010546 0.999940 -0.050397 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999993 0.002103 -0.003098 0.031351 -0.002069 0.999938 0.010904 -0.256507 0.003121 -0.010897 0.999936 -0.052897 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999992 0.002243 -0.003249 0.031786 -0.002205 0.999932 0.011464 -0.268867 0.003275 -0.011457 0.999929 -0.055193 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999992 0.002395 -0.003306 0.031250 -0.002355 0.999927 0.011886 -0.281574 0.003335 -0.011879 0.999924 -0.057737 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999991 0.002539 -0.003471 0.031590 -0.002496 0.999920 0.012430 -0.294074 0.003502 -0.012421 0.999917 -0.060433 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999991 0.002506 -0.003517 0.031775 -0.002462 0.999916 0.012714 -0.300324 0.003549 -0.012705 0.999913 -0.061508 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999991 0.002524 -0.003456 0.031207 -0.002478 0.999909 0.013263 -0.312924 0.003490 -0.013254 0.999906 -0.064115 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999991 0.002610 -0.003302 0.029912 -0.002564 0.999900 0.013903 -0.325419 0.003338 -0.013894 0.999898 -0.066755 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999990 0.002694 -0.003567 0.029532 -0.002643 0.999895 0.014272 -0.337734 0.003605 -0.014262 0.999892 -0.068922 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999990 0.002782 -0.003597 0.028573 -0.002729 0.999887 0.014752 -0.350293 0.003638 -0.014742 0.999885 -0.071044 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999989 0.002861 -0.003731 0.027236 -0.002803 0.999878 0.015366 -0.362400 0.003775 -0.015356 0.999875 -0.073339 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999988 0.002949 -0.003789 0.026220 -0.002889 0.999872 0.015718 -0.374106 0.003835 -0.015707 0.999869 -0.075246 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999988 0.003087 -0.003925 0.025008 -0.003024 0.999865 0.016126 -0.385810 0.003974 -0.016114 0.999862 -0.077187 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999987 0.003183 -0.003887 0.024026 -0.003118 0.999860 0.016448 -0.391591 0.003939 -0.016435 0.999857 -0.078230 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999986 0.003320 -0.004004 0.022551 -0.003253 0.999855 0.016728 -0.402909 0.004059 -0.016715 0.999852 -0.080099 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999986 0.003457 -0.003986 0.021424 -0.003388 0.999844 0.017350 -0.414243 0.004045 -0.017336 0.999842 -0.082075 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999985 0.003597 -0.004141 0.020466 -0.003523 0.999839 0.017612 -0.425279 0.004203 -0.017598 0.999836 -0.084044 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999985 0.003755 -0.004066 0.019628 -0.003681 0.999831 0.017998 -0.436512 0.004133 -0.017983 0.999830 -0.085721 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999986 
0.003677 -0.003773 0.019102 -0.003606 0.999822 0.018520 -0.447487 0.003840 -0.018506 0.999821 -0.087582 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999991 0.003373 -0.002459 0.018532 -0.003325 0.999813 0.019034 -0.458501 0.002523 -0.019026 0.999816 -0.089160 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999996 0.002966 -0.000473 0.016508 -0.002957 0.999807 0.019413 -0.469394 0.000530 -0.019411 0.999811 -0.090956 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999996 0.002803 0.000248 0.016510 -0.002807 0.999803 0.019638 -0.474855 -0.000193 -0.019639 0.999807 -0.091925 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999995 0.002560 0.002043 0.014604 -0.002601 0.999794 0.020124 -0.485998 -0.001991 -0.020129 0.999795 -0.093879 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999989 0.002242 0.004014 0.013148 -0.002324 0.999786 0.020558 -0.496942 -0.003967 -0.020567 0.999781 -0.095912 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999979 0.001809 0.006297 0.011479 -0.001942 0.999776 0.021091 -0.508253 -0.006258 -0.021103 0.999758 -0.097527 +0.615278 1.093828 0.500000 0.500000 0.000000 0.000000 0.999961 0.001255 0.008746 0.010364 -0.001441 0.999773 0.021260 -0.519731 -0.008718 -0.021272 0.999736 -0.099358 diff --git a/finetune/accelerate_config.yaml b/finetune/accelerate_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..56adf3c5884f5629d1c0389209191cab05c5464b --- /dev/null +++ b/finetune/accelerate_config.yaml @@ -0,0 +1,21 @@ +compute_environment: LOCAL_MACHINE + +gpu_ids: "0,1,2,3,4,5,6,7" +num_processes: 8 # should be the same as the number of GPUs + +debug: false +deepspeed_config: + deepspeed_config_file: configs/zero2_controlnet.yaml # e.g. configs/zero2.yaml, need use absolute path + zero3_init_flag: false +distributed_type: DEEPSPEED +downcast_bf16: 'no' +enable_cpu_affinity: false +machine_rank: 0 +main_training_function: main +num_machines: 1 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/finetune/configs/zero2.yaml b/finetune/configs/zero2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..96afa13811435e9d85abdfc9fb55d305aba95033 --- /dev/null +++ b/finetune/configs/zero2.yaml @@ -0,0 +1,38 @@ +{ + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto", + "torch_adam": true, + "adam_w_mode": true + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/finetune/configs/zero2_controlnet.yaml b/finetune/configs/zero2_controlnet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fa9f7744a82be84a72a399bf1971c66d62836f5d --- /dev/null +++ b/finetune/configs/zero2_controlnet.yaml @@ -0,0 +1,38 @@ +{ + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + 
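Note on the assets/re10k_poses/*.txt files added above: each line holds 18 floats, which matches a RealEstate10K-style layout (normalized intrinsics fx fy cx cy, two zeros, then a row-major 3x4 extrinsic). A minimal parsing sketch under that assumption is given below; the helper name and the exact extrinsic convention (world-to-camera vs. camera-to-world) are assumptions, not confirmed by the patch.

    import numpy as np

    def load_re10k_pose_file(path):
        """Hypothetical helper: parse lines of 'fx fy cx cy 0 0' + 12 extrinsic entries."""
        intrinsics, extrinsics = [], []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                vals = [float(v) for v in line.split()]
                if len(vals) != 18:
                    continue  # skip empty or malformed lines
                intrinsics.append(vals[0:4])                      # fx, fy, cx, cy (normalized)
                extrinsics.append(np.asarray(vals[6:]).reshape(3, 4))
        return np.asarray(intrinsics), np.stack(extrinsics)

    # e.g. K, E = load_re10k_pose_file("assets/re10k_poses/ffa95c3b40609c76.txt")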
"weight_decay": "auto", + "torch_adam": true, + "adam_w_mode": true + } + }, + "scheduler": { + "type": "WarmupCosineLR", + "params": { + "warmup_min_ratio": 0.0, + "cos_min_ratio": 0.0001, + "warmup_num_steps": 250, + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/finetune/configs/zero2_offload.yaml b/finetune/configs/zero2_offload.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b542665c84a2b9be74ea70bbf1fe9b91ef508669 --- /dev/null +++ b/finetune/configs/zero2_offload.yaml @@ -0,0 +1,42 @@ +{ + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto", + "torch_adam": true, + "adam_w_mode": true + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "contiguous_gradients": true, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + } + }, + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/finetune/configs/zero3.yaml b/finetune/configs/zero3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f73fe80142e480c45f78c3ce233e0331adfc488 --- /dev/null +++ b/finetune/configs/zero3.yaml @@ -0,0 +1,43 @@ +{ + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto", + "torch_adam": true, + "adam_w_mode": true + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": 5e8, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": "auto", + "stage3_prefetch_bucket_size": 5e8, + "stage3_param_persistence_threshold": 1e5 + }, + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/finetune/configs/zero3_offload.yaml b/finetune/configs/zero3_offload.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a2c502ce6b28709e2952fe1d8060b7c9bdb42b4 --- /dev/null +++ b/finetune/configs/zero3_offload.yaml @@ -0,0 +1,51 @@ +{ + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "weight_decay": "auto", + "torch_adam": true, + 
"adam_w_mode": true + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "reduce_bucket_size": 5e8, + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "sub_group_size": 1e9, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": "auto", + "stage3_prefetch_bucket_size": 5e8, + "stage3_param_persistence_threshold": 1e6 + }, + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "wall_clock_breakdown": false +} \ No newline at end of file diff --git a/finetune/constants.py b/finetune/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..89dde585a03026fb562a8f43ad9ecbb2e778afcd --- /dev/null +++ b/finetune/constants.py @@ -0,0 +1,2 @@ +LOG_NAME = "trainer" +LOG_LEVEL = "INFO" diff --git a/finetune/datasets/__init__.py b/finetune/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9aaa2432221e4307db532d90759ec3bc3d7353a0 --- /dev/null +++ b/finetune/datasets/__init__.py @@ -0,0 +1,14 @@ +from .bucket_sampler import BucketSampler +from .i2v_dataset import I2VDatasetWithBuckets, I2VDatasetWithResize +from .t2v_dataset import T2VDatasetWithBuckets, T2VDatasetWithResize +from .i2v_flow_dataset import I2VFlowDataset + + +__all__ = [ + "I2VDatasetWithResize", + "I2VDatasetWithBuckets", + "T2VDatasetWithResize", + "T2VDatasetWithBuckets", + "BucketSampler", + "I2VFlowDataset", +] diff --git a/finetune/datasets/bucket_sampler.py b/finetune/datasets/bucket_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc1dde237db37f7f735d10f29a642e184a508e4 --- /dev/null +++ b/finetune/datasets/bucket_sampler.py @@ -0,0 +1,71 @@ +import logging +import random + +from torch.utils.data import Dataset, Sampler + + +logger = logging.getLogger(__name__) + + +class BucketSampler(Sampler): + r""" + PyTorch Sampler that groups 3D data by height, width and frames. + + Args: + data_source (`VideoDataset`): + A PyTorch dataset object that is an instance of `VideoDataset`. + batch_size (`int`, defaults to `8`): + The batch size to use for training. + shuffle (`bool`, defaults to `True`): + Whether or not to shuffle the data in each batch before dispatching to dataloader. + drop_last (`bool`, defaults to `False`): + Whether or not to drop incomplete buckets of data after completely iterating over all data + in the dataset. If set to True, only batches that have `batch_size` number of entries will + be yielded. If set to False, it is guaranteed that all data in the dataset will be processed + and batches that do not have `batch_size` number of entries will also be yielded. 
+ """ + + def __init__( + self, data_source: Dataset, batch_size: int = 8, shuffle: bool = True, drop_last: bool = False + ) -> None: + self.data_source = data_source + self.batch_size = batch_size + self.shuffle = shuffle + self.drop_last = drop_last + + self.buckets = {resolution: [] for resolution in data_source.video_resolution_buckets} + + self._raised_warning_for_drop_last = False + + def __len__(self): + if self.drop_last and not self._raised_warning_for_drop_last: + self._raised_warning_for_drop_last = True + logger.warning( + "Calculating the length for bucket sampler is not possible when `drop_last` is set to True. This may cause problems when setting the number of epochs used for training." + ) + return (len(self.data_source) + self.batch_size - 1) // self.batch_size + + def __iter__(self): + for index, data in enumerate(self.data_source): + video_metadata = data["video_metadata"] + f, h, w = video_metadata["num_frames"], video_metadata["height"], video_metadata["width"] + + self.buckets[(f, h, w)].append(data) + if len(self.buckets[(f, h, w)]) == self.batch_size: + if self.shuffle: + random.shuffle(self.buckets[(f, h, w)]) + yield self.buckets[(f, h, w)] + del self.buckets[(f, h, w)] + self.buckets[(f, h, w)] = [] + + if self.drop_last: + return + + for fhw, bucket in list(self.buckets.items()): + if len(bucket) == 0: + continue + if self.shuffle: + random.shuffle(bucket) + yield bucket + del self.buckets[fhw] + self.buckets[fhw] = [] diff --git a/finetune/datasets/i2v_dataset.py b/finetune/datasets/i2v_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..cad6331f2025a9a3312c5a7119b0590c8c47bde1 --- /dev/null +++ b/finetune/datasets/i2v_dataset.py @@ -0,0 +1,311 @@ +import hashlib +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Tuple + +import torch +from accelerate.logging import get_logger +from safetensors.torch import load_file, save_file +from torch.utils.data import Dataset +from torchvision import transforms +from typing_extensions import override + +from finetune.constants import LOG_LEVEL, LOG_NAME + +from .utils import ( + load_images, + load_images_from_videos, + load_prompts, + load_videos, + preprocess_image_with_resize, + preprocess_video_with_buckets, + preprocess_video_with_resize, +) + + +if TYPE_CHECKING: + from finetune.trainer import Trainer + +# Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error +# Very few bug reports but it happens. Look in decord Github issues for more relevant information. +import decord # isort:skip + +decord.bridge.set_bridge("torch") + +logger = get_logger(LOG_NAME, LOG_LEVEL) + + +class BaseI2VDataset(Dataset): + """ + Base dataset class for Image-to-Video (I2V) training. + + This dataset loads prompts, videos and corresponding conditioning images for I2V training. 
+ + Args: + data_root (str): Root directory containing the dataset files + caption_column (str): Path to file containing text prompts/captions + video_column (str): Path to file containing video paths + image_column (str): Path to file containing image paths + device (torch.device): Device to load the data on + encode_video_fn (Callable[[torch.Tensor], torch.Tensor], optional): Function to encode videos + """ + + def __init__( + self, + data_root: str, + caption_column: str, + video_column: str, + image_column: str | None, + device: torch.device, + trainer: "Trainer" = None, + *args, + **kwargs, + ) -> None: + super().__init__() + + data_root = Path(data_root) + self.prompts = load_prompts(data_root / caption_column) + self.videos = load_videos(data_root / video_column) + if image_column is not None: + self.images = load_images(data_root / image_column) + else: + self.images = load_images_from_videos(self.videos) + self.trainer = trainer + + self.device = device + self.encode_video = trainer.encode_video + self.encode_text = trainer.encode_text + + # Check if number of prompts matches number of videos and images + if not (len(self.videos) == len(self.prompts) == len(self.images)): + raise ValueError( + f"Expected length of prompts, videos and images to be the same but found {len(self.prompts)=}, {len(self.videos)=} and {len(self.images)=}. Please ensure that the number of caption prompts, videos and images match in your dataset." + ) + + # Check if all video files exist + if any(not path.is_file() for path in self.videos): + raise ValueError( + f"Some video files were not found. Please ensure that all video files exist in the dataset directory. Missing file: {next(path for path in self.videos if not path.is_file())}" + ) + + # Check if all image files exist + if any(not path.is_file() for path in self.images): + raise ValueError( + f"Some image files were not found. Please ensure that all image files exist in the dataset directory. Missing file: {next(path for path in self.images if not path.is_file())}" + ) + + def __len__(self) -> int: + return len(self.videos) + + def __getitem__(self, index: int) -> Dict[str, Any]: + if isinstance(index, list): + # Here, index is actually a list of data objects that we need to return. + # The BucketSampler should ideally return indices. But, in the sampler, we'd like + # to have information about num_frames, height and width. Since this is not stored + # as metadata, we need to read the video to get this information. You could read this + # information without loading the full video in memory, but we do it anyway. In order + # to not load the video twice (once to get the metadata, and once to return the loaded video + # based on sampled indices), we cache it in the BucketSampler. When the sampler is + # to yield, we yield the cache data instead of indices. So, this special check ensures + # that data is not loaded a second time. PRs are welcome for improvements. 
+ return index + + prompt = self.prompts[index] + video = self.videos[index] + image = self.images[index] + train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution) + + cache_dir = self.trainer.args.data_root / "cache" + video_latent_dir = cache_dir / "video_latent" / self.trainer.args.model_name / train_resolution_str + prompt_embeddings_dir = cache_dir / "prompt_embeddings" + video_latent_dir.mkdir(parents=True, exist_ok=True) + prompt_embeddings_dir.mkdir(parents=True, exist_ok=True) + + prompt_hash = str(hashlib.sha256(prompt.encode()).hexdigest()) + prompt_embedding_path = prompt_embeddings_dir / (prompt_hash + ".safetensors") + encoded_video_path = video_latent_dir / (video.stem + ".safetensors") + + if prompt_embedding_path.exists(): + prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"] + logger.debug( + f"process {self.trainer.accelerator.process_index}: Loaded prompt embedding from {prompt_embedding_path}", + main_process_only=False, + ) + else: + prompt_embedding = self.encode_text(prompt) + prompt_embedding = prompt_embedding.to("cpu") + # [1, seq_len, hidden_size] -> [seq_len, hidden_size] + prompt_embedding = prompt_embedding[0] + save_file({"prompt_embedding": prompt_embedding}, prompt_embedding_path) + logger.info(f"Saved prompt embedding to {prompt_embedding_path}", main_process_only=False) + + if encoded_video_path.exists(): + encoded_video = load_file(encoded_video_path)["encoded_video"] + logger.debug(f"Loaded encoded video from {encoded_video_path}", main_process_only=False) + # shape of image: [C, H, W] + _, image = self.preprocess(None, self.images[index]) + image = self.image_transform(image) + else: + frames, image = self.preprocess(video, image) + frames = frames.to(self.device) + image = image.to(self.device) + image = self.image_transform(image) + # Current shape of frames: [F, C, H, W] + frames = self.video_transform(frames) + + # Convert to [B, C, F, H, W] + frames = frames.unsqueeze(0) + frames = frames.permute(0, 2, 1, 3, 4).contiguous() + encoded_video = self.encode_video(frames) + + # [1, C, F, H, W] -> [C, F, H, W] + encoded_video = encoded_video[0] + encoded_video = encoded_video.to("cpu") + image = image.to("cpu") + save_file({"encoded_video": encoded_video}, encoded_video_path) + logger.info(f"Saved encoded video to {encoded_video_path}", main_process_only=False) + + # shape of encoded_video: [C, F, H, W] + # shape of image: [C, H, W] + return { + "image": image, + "prompt_embedding": prompt_embedding, + "encoded_video": encoded_video, + "video_metadata": { + "num_frames": encoded_video.shape[1], + "height": encoded_video.shape[2], + "width": encoded_video.shape[3], + }, + } + + def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Loads and preprocesses a video and an image. + If either path is None, no preprocessing will be done for that input. + + Args: + video_path: Path to the video file to load + image_path: Path to the image file to load + + Returns: + A tuple containing: + - video(torch.Tensor) of shape [F, C, H, W] where F is number of frames, + C is number of channels, H is height and W is width + - image(torch.Tensor) of shape [C, H, W] + """ + raise NotImplementedError("Subclass must implement this method") + + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + """ + Applies transformations to a video. 
+ + Args: + frames (torch.Tensor): A 4D tensor representing a video + with shape [F, C, H, W] where: + - F is number of frames + - C is number of channels (3 for RGB) + - H is height + - W is width + + Returns: + torch.Tensor: The transformed video tensor + """ + raise NotImplementedError("Subclass must implement this method") + + def image_transform(self, image: torch.Tensor) -> torch.Tensor: + """ + Applies transformations to an image. + + Args: + image (torch.Tensor): A 3D tensor representing an image + with shape [C, H, W] where: + - C is number of channels (3 for RGB) + - H is height + - W is width + + Returns: + torch.Tensor: The transformed image tensor + """ + raise NotImplementedError("Subclass must implement this method") + + +class I2VDatasetWithResize(BaseI2VDataset): + """ + A dataset class for image-to-video generation that resizes inputs to fixed dimensions. + + This class preprocesses videos and images by resizing them to specified dimensions: + - Videos are resized to max_num_frames x height x width + - Images are resized to height x width + + Args: + max_num_frames (int): Maximum number of frames to extract from videos + height (int): Target height for resizing videos and images + width (int): Target width for resizing videos and images + """ + + def __init__(self, max_num_frames: int, height: int, width: int, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + self.max_num_frames = max_num_frames + self.height = height + self.width = width + + self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)]) + self.__image_transforms = self.__frame_transforms + + @override + def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]: + if video_path is not None: + video = preprocess_video_with_resize(video_path, self.max_num_frames, self.height, self.width) + else: + video = None + if image_path is not None: + image = preprocess_image_with_resize(image_path, self.height, self.width) + else: + image = None + return video, image + + @override + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + return torch.stack([self.__frame_transforms(f) for f in frames], dim=0) + + @override + def image_transform(self, image: torch.Tensor) -> torch.Tensor: + return self.__image_transforms(image) + + +class I2VDatasetWithBuckets(BaseI2VDataset): + def __init__( + self, + video_resolution_buckets: List[Tuple[int, int, int]], + vae_temporal_compression_ratio: int, + vae_height_compression_ratio: int, + vae_width_compression_ratio: int, + *args, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + + self.video_resolution_buckets = [ + ( + int(b[0] / vae_temporal_compression_ratio), + int(b[1] / vae_height_compression_ratio), + int(b[2] / vae_width_compression_ratio), + ) + for b in video_resolution_buckets + ] + self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)]) + self.__image_transforms = self.__frame_transforms + + @override + def preprocess(self, video_path: Path, image_path: Path) -> Tuple[torch.Tensor, torch.Tensor]: + video = preprocess_video_with_buckets(video_path, self.video_resolution_buckets) + image = preprocess_image_with_resize(image_path, video.shape[2], video.shape[3]) + return video, image + + @override + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + return torch.stack([self.__frame_transforms(f) for f in frames], dim=0) + + @override + def image_transform(self, image: torch.Tensor) -> 
torch.Tensor: + return self.__image_transforms(image) diff --git a/finetune/datasets/i2v_flow_dataset.py b/finetune/datasets/i2v_flow_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..48cd3d075989269042fe499c6185e4959cff6131 --- /dev/null +++ b/finetune/datasets/i2v_flow_dataset.py @@ -0,0 +1,188 @@ +import hashlib +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Tuple +import json +import random + +import torch +from accelerate.logging import get_logger +from safetensors.torch import load_file, save_file +from torch.utils.data import Dataset +from torchvision import transforms +from typing_extensions import override + +from finetune.constants import LOG_LEVEL, LOG_NAME + +from .utils import ( + load_images, + load_images_from_videos, + load_prompts, + load_videos, + preprocess_image_with_resize, + preprocess_video_with_buckets, + preprocess_video_with_resize, + load_binary_mask_compressed, +) + +import pdb + +if TYPE_CHECKING: + from finetune.trainer import Trainer + +# Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error +# Very few bug reports but it happens. Look in decord Github issues for more relevant information. +import decord # isort:skip + +decord.bridge.set_bridge("torch") + +logger = get_logger(LOG_NAME, LOG_LEVEL) + + +class I2VFlowDataset(Dataset): + """ + A dataset class for (image,flow)-to-video generation or image-to-flow_video that resizes inputs to fixed dimensions. + + This class preprocesses videos and images by resizing them to specified dimensions: + - Videos are resized to max_num_frames x height x width + - Images are resized to height x width + + Args: + max_num_frames (int): Maximum number of frames to extract from videos + height (int): Target height for resizing videos and images + width (int): Target width for resizing videos and images + """ + + def __init__( + self, + max_num_frames: int, + height: int, + width: int, + data_root: str, + caption_column: str, + video_column: str, + image_column: str | None, + device: torch.device, + trainer: "Trainer" = None, + *args, + **kwargs + ) -> None: + data_root = Path(data_root) + metadata_path = data_root / "metadata_revised.jsonl" + assert metadata_path.is_file(), "For this dataset type, you need metadata.jsonl in the root path" + + # Load metadata + # metadata = { + # "video_path": ..., + # "hash_code": ..., + # "prompt": ..., + # } + metadata = [] + with open(metadata_path, "r") as f: + for line in f: + metadata.append( json.loads(line) ) + + self.prompts = [x["prompt"] for x in metadata] + if 'curated' in str(data_root).lower(): + self.prompt_embeddings = [data_root / "prompt_embeddings" / (x["hash_code"] + '.safetensors') for x in metadata] + else: + self.prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata] + self.videos = [data_root / "video_latent" / "x".join(str(x) for x in trainer.args.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata] + self.images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata] + self.flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata] + + + # data_root = Path(data_root) + # self.prompts = load_prompts(data_root / caption_column) + # self.videos = load_videos(data_root / video_column) + + self.trainer = trainer + + self.device = device + self.encode_video = trainer.encode_video + self.encode_text = 
trainer.encode_text + + # Check if number of prompts matches number of videos and images + if not (len(self.videos) == len(self.prompts) == len(self.images) == len(self.flows)): + raise ValueError( + f"Expected length of prompts, videos and images to be the same but found {len(self.prompts)=}, {len(self.videos)=}, {len(self.images)=} and {len(self.flows)=}. Please ensure that the number of caption prompts, videos and images match in your dataset." + ) + + self.max_num_frames = max_num_frames + self.height = height + self.width = width + + self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)]) + self.__image_transforms = self.__frame_transforms + + self.length = len(self.videos) + + print(f"Dataset size: {self.length}") + + def __len__(self) -> int: + return self.length + + def load_data_pair(self, index): + # prompt = self.prompts[index] + prompt_embedding_path = self.prompt_embeddings[index] + encoded_video_path = self.videos[index] + encoded_flow_path = self.flows[index] + # mask_path = self.masks[index] + # image_path = self.images[index] + # train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution) + + prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"] + encoded_video = load_file(encoded_video_path)["encoded_video"] # CFHW + encoded_flow = load_file(encoded_flow_path)["encoded_flow_f"] # CFHW + + return prompt_embedding, encoded_video, encoded_flow + + def __getitem__(self, index: int) -> Dict[str, Any]: + while True: + try: + prompt_embedding, encoded_video, encoded_flow = self.load_data_pair(index) + break + except Exception as e: + print(f"Error loading {self.prompt_embeddings[index]}: {str(e)}") + index = random.randint(0, self.length - 1) + + image_path = self.images[index] + prompt = self.prompts[index] + train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution) + + _, image = self.preprocess(None, image_path) + image = self.image_transform(image) + + + # shape of encoded_video: [C, F, H, W] + # shape and scale of image: [C, H, W], [-1,1] + return { + "image": image, + "prompt_embedding": prompt_embedding, + "encoded_video": encoded_video, + "encoded_flow": encoded_flow, + "video_metadata": { + "num_frames": encoded_video.shape[1], + "height": encoded_video.shape[2], + "width": encoded_video.shape[3], + }, + } + + @override + def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]: + if video_path is not None: + video = preprocess_video_with_resize(video_path, self.max_num_frames, self.height, self.width) + else: + video = None + if image_path is not None: + image = preprocess_image_with_resize(image_path, self.height, self.width) + else: + image = None + return video, image + + @override + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + return torch.stack([self.__frame_transforms(f) for f in frames], dim=0) + + @override + def image_transform(self, image: torch.Tensor) -> torch.Tensor: + return self.__image_transforms(image) \ No newline at end of file diff --git a/finetune/datasets/t2v_dataset.py b/finetune/datasets/t2v_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d123ccf4a23eab9585187dfc6fa918fdbf403d7e --- /dev/null +++ b/finetune/datasets/t2v_dataset.py @@ -0,0 +1,251 @@ +import hashlib +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Tuple + +import torch +from accelerate.logging import get_logger +from safetensors.torch 
import load_file, save_file +from torch.utils.data import Dataset +from torchvision import transforms +from typing_extensions import override + +from finetune.constants import LOG_LEVEL, LOG_NAME + +from .utils import load_prompts, load_videos, preprocess_video_with_buckets, preprocess_video_with_resize + + +if TYPE_CHECKING: + from finetune.trainer import Trainer + +# Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error +# Very few bug reports but it happens. Look in decord Github issues for more relevant information. +import decord # isort:skip + +decord.bridge.set_bridge("torch") + +logger = get_logger(LOG_NAME, LOG_LEVEL) + + +class BaseT2VDataset(Dataset): + """ + Base dataset class for Text-to-Video (T2V) training. + + This dataset loads prompts and videos for T2V training. + + Args: + data_root (str): Root directory containing the dataset files + caption_column (str): Path to file containing text prompts/captions + video_column (str): Path to file containing video paths + device (torch.device): Device to load the data on + encode_video_fn (Callable[[torch.Tensor], torch.Tensor], optional): Function to encode videos + """ + + def __init__( + self, + data_root: str, + caption_column: str, + video_column: str, + device: torch.device = None, + trainer: "Trainer" = None, + *args, + **kwargs, + ) -> None: + super().__init__() + + data_root = Path(data_root) + self.prompts = load_prompts(data_root / caption_column) + self.videos = load_videos(data_root / video_column) + self.device = device + self.encode_video = trainer.encode_video + self.encode_text = trainer.encode_text + self.trainer = trainer + + # Check if all video files exist + if any(not path.is_file() for path in self.videos): + raise ValueError( + f"Some video files were not found. Please ensure that all video files exist in the dataset directory. Missing file: {next(path for path in self.videos if not path.is_file())}" + ) + + # Check if number of prompts matches number of videos + if len(self.videos) != len(self.prompts): + raise ValueError( + f"Expected length of prompts and videos to be the same but found {len(self.prompts)=} and {len(self.videos)=}. Please ensure that the number of caption prompts and videos match in your dataset." + ) + + def __len__(self) -> int: + return len(self.videos) + + def __getitem__(self, index: int) -> Dict[str, Any]: + if isinstance(index, list): + # Here, index is actually a list of data objects that we need to return. + # The BucketSampler should ideally return indices. But, in the sampler, we'd like + # to have information about num_frames, height and width. Since this is not stored + # as metadata, we need to read the video to get this information. You could read this + # information without loading the full video in memory, but we do it anyway. In order + # to not load the video twice (once to get the metadata, and once to return the loaded video + # based on sampled indices), we cache it in the BucketSampler. When the sampler is + # to yield, we yield the cache data instead of indices. So, this special check ensures + # that data is not loaded a second time. PRs are welcome for improvements. 
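The prompt-embedding cache key used by these datasets is the SHA-256 hex digest of the raw caption, saved as a .safetensors file under the cache directory. A standalone sketch of the path derivation (the data_root value here is illustrative):

    import hashlib
    from pathlib import Path

    prompt = "A scenic flight over a coastal town"        # illustrative caption
    cache_dir = Path("/path/to/data_root") / "cache"      # mirrors trainer.args.data_root / "cache"
    prompt_hash = hashlib.sha256(prompt.encode()).hexdigest()
    prompt_embedding_path = cache_dir / "prompt_embeddings" / (prompt_hash + ".safetensors")
    print(prompt_embedding_path)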
+ return index + + prompt = self.prompts[index] + video = self.videos[index] + train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution) + + cache_dir = self.trainer.args.data_root / "cache" + video_latent_dir = cache_dir / "video_latent" / self.trainer.args.model_name / train_resolution_str + prompt_embeddings_dir = cache_dir / "prompt_embeddings" + video_latent_dir.mkdir(parents=True, exist_ok=True) + prompt_embeddings_dir.mkdir(parents=True, exist_ok=True) + + prompt_hash = str(hashlib.sha256(prompt.encode()).hexdigest()) + prompt_embedding_path = prompt_embeddings_dir / (prompt_hash + ".safetensors") + encoded_video_path = video_latent_dir / (video.stem + ".safetensors") + + if prompt_embedding_path.exists(): + prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"] + logger.debug( + f"process {self.trainer.accelerator.process_index}: Loaded prompt embedding from {prompt_embedding_path}", + main_process_only=False, + ) + else: + prompt_embedding = self.encode_text(prompt) + prompt_embedding = prompt_embedding.to("cpu") + # [1, seq_len, hidden_size] -> [seq_len, hidden_size] + prompt_embedding = prompt_embedding[0] + save_file({"prompt_embedding": prompt_embedding}, prompt_embedding_path) + logger.info(f"Saved prompt embedding to {prompt_embedding_path}", main_process_only=False) + + if encoded_video_path.exists(): + # encoded_video = torch.load(encoded_video_path, weights_only=True) + encoded_video = load_file(encoded_video_path)["encoded_video"] + logger.debug(f"Loaded encoded video from {encoded_video_path}", main_process_only=False) + # shape of image: [C, H, W] + else: + frames = self.preprocess(video) + frames = frames.to(self.device) + # Current shape of frames: [F, C, H, W] + frames = self.video_transform(frames) + # Convert to [B, C, F, H, W] + frames = frames.unsqueeze(0) + frames = frames.permute(0, 2, 1, 3, 4).contiguous() + encoded_video = self.encode_video(frames) + + # [1, C, F, H, W] -> [C, F, H, W] + encoded_video = encoded_video[0] + encoded_video = encoded_video.to("cpu") + save_file({"encoded_video": encoded_video}, encoded_video_path) + logger.info(f"Saved encoded video to {encoded_video_path}", main_process_only=False) + + # shape of encoded_video: [C, F, H, W] + return { + "prompt_embedding": prompt_embedding, + "encoded_video": encoded_video, + "video_metadata": { + "num_frames": encoded_video.shape[1], + "height": encoded_video.shape[2], + "width": encoded_video.shape[3], + }, + } + + def preprocess(self, video_path: Path) -> torch.Tensor: + """ + Loads and preprocesses a video. + + Args: + video_path: Path to the video file to load. + + Returns: + torch.Tensor: Video tensor of shape [F, C, H, W] where: + - F is number of frames + - C is number of channels (3 for RGB) + - H is height + - W is width + """ + raise NotImplementedError("Subclass must implement this method") + + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + """ + Applies transformations to a video. + + Args: + frames (torch.Tensor): A 4D tensor representing a video + with shape [F, C, H, W] where: + - F is number of frames + - C is number of channels (3 for RGB) + - H is height + - W is width + + Returns: + torch.Tensor: The transformed video tensor with the same shape as the input + """ + raise NotImplementedError("Subclass must implement this method") + + +class T2VDatasetWithResize(BaseT2VDataset): + """ + A dataset class for text-to-video generation that resizes inputs to fixed dimensions. 
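The per-frame transform shared by these datasets (x / 255 * 2 - 1) maps uint8 pixels into [-1, 1]; a quick self-contained check of that mapping and its inverse (illustrative only):

    import torch

    frames = torch.randint(0, 256, (8, 3, 32, 32), dtype=torch.uint8)   # [F, C, H, W]
    normalized = frames.float() / 255.0 * 2.0 - 1.0                      # same lambda as __frame_transform
    assert normalized.min() >= -1.0 and normalized.max() <= 1.0
    recovered = ((normalized + 1.0) / 2.0 * 255.0).round().to(torch.uint8)
    assert torch.equal(recovered, frames)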
+ + This class preprocesses videos by resizing them to specified dimensions: + - Videos are resized to max_num_frames x height x width + + Args: + max_num_frames (int): Maximum number of frames to extract from videos + height (int): Target height for resizing videos + width (int): Target width for resizing videos + """ + + def __init__(self, max_num_frames: int, height: int, width: int, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + self.max_num_frames = max_num_frames + self.height = height + self.width = width + + self.__frame_transform = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)]) + + @override + def preprocess(self, video_path: Path) -> torch.Tensor: + return preprocess_video_with_resize( + video_path, + self.max_num_frames, + self.height, + self.width, + ) + + @override + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + return torch.stack([self.__frame_transform(f) for f in frames], dim=0) + + +class T2VDatasetWithBuckets(BaseT2VDataset): + def __init__( + self, + video_resolution_buckets: List[Tuple[int, int, int]], + vae_temporal_compression_ratio: int, + vae_height_compression_ratio: int, + vae_width_compression_ratio: int, + *args, + **kwargs, + ) -> None: + """ """ + super().__init__(*args, **kwargs) + + self.video_resolution_buckets = [ + ( + int(b[0] / vae_temporal_compression_ratio), + int(b[1] / vae_height_compression_ratio), + int(b[2] / vae_width_compression_ratio), + ) + for b in video_resolution_buckets + ] + + self.__frame_transform = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)]) + + @override + def preprocess(self, video_path: Path) -> torch.Tensor: + return preprocess_video_with_buckets(video_path, self.video_resolution_buckets) + + @override + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + return torch.stack([self.__frame_transform(f) for f in frames], dim=0) diff --git a/finetune/datasets/utils.py b/finetune/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8ffe2ff733efc347f1957019a91b64a6cd90b156 --- /dev/null +++ b/finetune/datasets/utils.py @@ -0,0 +1,211 @@ +import logging +from pathlib import Path +from typing import List, Tuple + +import cv2 +import torch +from torchvision.transforms.functional import resize +from einops import repeat, rearrange + +# Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error +# Very few bug reports but it happens. Look in decord Github issues for more relevant information. 
+import decord # isort:skip + +decord.bridge.set_bridge("torch") + +from PIL import Image +import numpy as np +import pdb + +########## loaders ########## + + +def load_prompts(prompt_path: Path) -> List[str]: + with open(prompt_path, "r", encoding="utf-8") as file: + return [line.strip() for line in file.readlines() if len(line.strip()) > 0] + + +def load_videos(video_path: Path) -> List[Path]: + with open(video_path, "r", encoding="utf-8") as file: + return [video_path.parent / line.strip() for line in file.readlines() if len(line.strip()) > 0] + + +def load_images(image_path: Path) -> List[Path]: + with open(image_path, "r", encoding="utf-8") as file: + return [image_path.parent / line.strip() for line in file.readlines() if len(line.strip()) > 0] + + +def load_images_from_videos(videos_path: List[Path]) -> List[Path]: + first_frames_dir = videos_path[0].parent.parent / "first_frames" + first_frames_dir.mkdir(exist_ok=True) + + first_frame_paths = [] + for video_path in videos_path: + frame_path = first_frames_dir / f"{video_path.stem}.png" + if frame_path.exists(): + first_frame_paths.append(frame_path) + continue + + # Open video + cap = cv2.VideoCapture(str(video_path)) + + # Read first frame + ret, frame = cap.read() + if not ret: + raise RuntimeError(f"Failed to read video: {video_path}") + + # Save frame as PNG with same name as video + cv2.imwrite(str(frame_path), frame) + logging.info(f"Saved first frame to {frame_path}") + + # Release video capture + cap.release() + + first_frame_paths.append(frame_path) + + return first_frame_paths + + +def load_binary_mask_compressed(path, shape, device, dtype): + # shape: (F,C,H,W), C=1 + with open(path, 'rb') as f: + packed = np.frombuffer(f.read(), dtype=np.uint8) + unpacked = np.unpackbits(packed)[:np.prod(shape)] + mask_loaded = torch.from_numpy(unpacked).to(device, dtype).reshape(shape) + + mask_interp = torch.nn.functional.interpolate(rearrange(mask_loaded, 'f c h w -> c f h w').unsqueeze(0), size=(shape[0]//4+1, shape[2]//8, shape[3]//8), mode='trilinear', align_corners=False).squeeze(0) # CFHW + mask_interp[mask_interp>=0.5] = 1.0 + mask_interp[mask_interp<0.5] = 0.0 + + return rearrange(mask_loaded, 'f c h w -> c f h w'), mask_interp + +########## preprocessors ########## + + +def preprocess_image_with_resize( + image_path: Path | str, + height: int, + width: int, +) -> torch.Tensor: + """ + Loads and resizes a single image. + + Args: + image_path: Path to the image file. + height: Target height for resizing. + width: Target width for resizing. + + Returns: + torch.Tensor: Image tensor with shape [C, H, W] where: + C = number of channels (3 for RGB) + H = height + W = width + """ + if isinstance(image_path, str): + image_path = Path(image_path) + # image = cv2.imread(image_path.as_posix()) + # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + # image = cv2.resize(image, (width, height)) + # image = torch.from_numpy(image).float() + # image = image.permute(2, 0, 1).contiguous() + + image = np.array(Image.open(image_path.as_posix()).resize((width, height))) + image = torch.from_numpy(image).float() + image = image.permute(2, 0, 1).contiguous() + + return image + + +def preprocess_video_with_resize( + video_path: Path | str, + max_num_frames: int, + height: int, + width: int, +) -> torch.Tensor: + """ + Loads and resizes a single video. + + The function processes the video through these steps: + 1. If video frame count > max_num_frames, downsample frames evenly + 2. 
If video dimensions don't match (height, width), resize frames
+
+    Args:
+        video_path: Path to the video file.
+        max_num_frames: Maximum number of frames to keep.
+        height: Target height for resizing.
+        width: Target width for resizing.
+
+    Returns:
+        A torch.Tensor with shape [F, C, H, W] where:
+            F = number of frames
+            C = number of channels (3 for RGB)
+            H = height
+            W = width
+    """
+    if isinstance(video_path, str):
+        video_path = Path(video_path)
+    video_reader = decord.VideoReader(uri=video_path.as_posix(), width=width, height=height)
+    video_num_frames = len(video_reader)
+    if video_num_frames < max_num_frames:
+        # Get all frames first
+        frames = video_reader.get_batch(list(range(video_num_frames)))
+        # Repeat the last frame until we reach max_num_frames
+        last_frame = frames[-1:]
+        num_repeats = max_num_frames - video_num_frames
+        repeated_frames = last_frame.repeat(num_repeats, 1, 1, 1)
+        frames = torch.cat([frames, repeated_frames], dim=0)
+        return frames.float().permute(0, 3, 1, 2).contiguous()
+    else:
+        indices = list(range(0, video_num_frames, video_num_frames // max_num_frames))
+        frames = video_reader.get_batch(indices)
+        frames = frames[:max_num_frames].float()
+        frames = frames.permute(0, 3, 1, 2).contiguous()
+        return frames
+
+
+def preprocess_video_with_buckets(
+    video_path: Path,
+    resolution_buckets: List[Tuple[int, int, int]],
+) -> torch.Tensor:
+    """
+    Args:
+        video_path: Path to the video file.
+        resolution_buckets: List of tuples (num_frames, height, width) representing
+            available resolution buckets.
+
+    Returns:
+        torch.Tensor: Video tensor with shape [F, C, H, W] where:
+            F = number of frames
+            C = number of channels (3 for RGB)
+            H = height
+            W = width
+
+    The function processes the video through these steps:
+        1. Finds nearest frame bucket <= video frame count
+        2. Downsamples frames evenly to match bucket size
+        3. Finds nearest resolution bucket based on dimensions
+        4.
Resizes frames to match bucket resolution + """ + video_reader = decord.VideoReader(uri=video_path.as_posix()) + video_num_frames = len(video_reader) + resolution_buckets = [bucket for bucket in resolution_buckets if bucket[0] <= video_num_frames] + if len(resolution_buckets) == 0: + raise ValueError(f"video frame count in {video_path} is less than all frame buckets {resolution_buckets}") + + nearest_frame_bucket = min( + resolution_buckets, + key=lambda bucket: video_num_frames - bucket[0], + default=1, + )[0] + frame_indices = list(range(0, video_num_frames, video_num_frames // nearest_frame_bucket)) + frames = video_reader.get_batch(frame_indices) + frames = frames[:nearest_frame_bucket].float() + frames = frames.permute(0, 3, 1, 2).contiguous() + + nearest_res = min(resolution_buckets, key=lambda x: abs(x[1] - frames.shape[2]) + abs(x[2] - frames.shape[3])) + nearest_res = (nearest_res[1], nearest_res[2]) + frames = torch.stack([resize(f, nearest_res) for f in frames], dim=0) + + return frames diff --git a/finetune/models/__init__.py b/finetune/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fa8142e5d03e5fc3160c15fd7dfa23aa58a2cb5b --- /dev/null +++ b/finetune/models/__init__.py @@ -0,0 +1,12 @@ +import importlib +from pathlib import Path + + +package_dir = Path(__file__).parent + +for subdir in package_dir.iterdir(): + if subdir.is_dir() and not subdir.name.startswith("_"): + for module_path in subdir.glob("*.py"): + module_name = module_path.stem + full_module_name = f".{subdir.name}.{module_name}" + importlib.import_module(full_module_name, package=__name__) diff --git a/finetune/models/cogvideox_i2v/flovd_OMSM_lora_trainer.py b/finetune/models/cogvideox_i2v/flovd_OMSM_lora_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..3e34b6e3267e3aa8c1ed74594411173eb17256c9 --- /dev/null +++ b/finetune/models/cogvideox_i2v/flovd_OMSM_lora_trainer.py @@ -0,0 +1,748 @@ +from typing import Any, Dict, List, Tuple +from pathlib import Path +import os +import hashlib +import json +import random +import wandb +import math +import numpy as np +from einops import rearrange, repeat +from safetensors.torch import load_file, save_file +from accelerate.logging import get_logger + +import torch + +from accelerate.utils import gather_object + +from diffusers import ( + AutoencoderKLCogVideoX, + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXTransformer3DModel, +) +from diffusers.utils.export_utils import export_to_video + +from finetune.pipeline.flovd_OMSM_cogvideox_pipeline import FloVDOMSMCogVideoXImageToVideoPipeline +from finetune.constants import LOG_LEVEL, LOG_NAME + +from diffusers.models.embeddings import get_3d_rotary_pos_embed +from PIL import Image +from numpy import dtype +from transformers import AutoTokenizer, T5EncoderModel +from typing_extensions import override + +from finetune.schemas import Args, Components, State +from finetune.trainer import Trainer +from finetune.utils import ( + cast_training_params, + free_memory, + get_memory_statistics, + string_to_filename, + unwrap_model, +) +from finetune.datasets.utils import ( + preprocess_image_with_resize, + load_binary_mask_compressed, +) +from finetune.modules.camera_sampler import SampleManualCam +from finetune.modules.camera_flow_generator import CameraFlowGenerator +from finetune.modules.utils import get_camera_flow_generator_input, forward_bilinear_splatting, flow_to_color + +from ..utils import register + +import sys 
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import pdb + +logger = get_logger(LOG_NAME, LOG_LEVEL) + +class FloVDOMSMCogVideoXI2VLoraTrainer(Trainer): + UNLOAD_LIST = ["text_encoder"] + + @override + def __init__(self, args: Args) -> None: + super().__init__(args) + + + @override + def load_components(self) -> Dict[str, Any]: + # TODO. Change the pipeline and ... + components = Components() + model_path = str(self.args.model_path) + + components.pipeline_cls = FloVDOMSMCogVideoXImageToVideoPipeline + + components.tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer") + + components.text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder") + + components.transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer") + + components.vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae") + + components.scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder="scheduler") + + return components + + + @override + def initialize_pipeline(self) -> FloVDOMSMCogVideoXImageToVideoPipeline: + # TODO. Change the pipeline and ... + pipe = FloVDOMSMCogVideoXImageToVideoPipeline( + tokenizer=self.components.tokenizer, + text_encoder=self.components.text_encoder, + vae=self.components.vae, + transformer=unwrap_model(self.accelerator, self.components.transformer), + scheduler=self.components.scheduler, + ) + return pipe + + def initialize_flow_generator(self): + depth_estimator_kwargs = { + "target": 'modules.depth_warping.depth_warping.DepthWarping_wrapper', + "kwargs": { + "ckpt_path": '/workspace/workspace/checkpoints/depth_anything/depth_anything_v2_metric_hypersim_vitb.pth', + "model_config": { + "max_depth": 20, + "encoder": 'vitb', + "features": 128, + "out_channels": [96, 192, 384, 768], + } + + } + } + + return CameraFlowGenerator(depth_estimator_kwargs) + + @override + def collate_fn(self, samples: List[Dict[str, Any]]) -> Dict[str, Any]: + ret = {"encoded_videos": [], "prompt_embedding": [], "images": [], "encoded_flow": []} + + for sample in samples: + encoded_video = sample["encoded_video"] + prompt_embedding = sample["prompt_embedding"] + image = sample["image"] + encoded_flow = sample["encoded_flow"] + + ret["encoded_videos"].append(encoded_video) + ret["prompt_embedding"].append(prompt_embedding) + ret["images"].append(image) + ret["encoded_flow"].append(encoded_flow) + + ret["encoded_videos"] = torch.stack(ret["encoded_videos"]) + ret["prompt_embedding"] = torch.stack(ret["prompt_embedding"]) + ret["images"] = torch.stack(ret["images"]) + ret["encoded_flow"] = torch.stack(ret["encoded_flow"]) + + return ret + + + @override + def compute_loss(self, batch) -> torch.Tensor: + prompt_embedding = batch["prompt_embedding"] + images = batch["images"] + latent_flow = batch["encoded_flow"] + + # Shape of prompt_embedding: [B, seq_len, hidden_size] + # Shape of images: [B, C, H, W] + # Shape of latent_flow: [B, C, F, H, W] + + patch_size_t = self.state.transformer_config.patch_size_t # WJ: None in i2v setting... 
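+        # patch_size_t is None for the CogVideoX I2V checkpoints targeted here, so the branch below only guards against unsupported configurations.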
+ if patch_size_t is not None: + # ncopy = latent.shape[2] % patch_size_t + # # Copy the first frame ncopy times to match patch_size_t + # first_frame = latent[:, :, :1, :, :] # Get first frame [B, C, 1, H, W] + # latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2) + # assert latent.shape[2] % patch_size_t == 0 + raise NotImplementedError("Do not use the case whose patch_size_t is not None") + + batch_size, num_channels, num_frames, height, width = latent_flow.shape + + # Get prompt embeddings + _, seq_len, _ = prompt_embedding.shape + prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent_flow.dtype) + + # Add frame dimension to images [B,C,H,W] -> [B,C,F,H,W] + images = images.unsqueeze(2) + # Add noise to images + image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=self.accelerator.device) + image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype) + noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None] + image_latent_dist = self.components.vae.encode(noisy_images.to(dtype=self.components.vae.dtype)).latent_dist + image_latents = image_latent_dist.sample() * self.components.vae.config.scaling_factor + + # Sample a random timestep for each sample + timesteps = torch.randint( + 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device + ) + timesteps = timesteps.long() + + # from [B, C, F, H, W] to [B, F, C, H, W] + latent_flow = latent_flow.permute(0, 2, 1, 3, 4) + image_latents = image_latents.permute(0, 2, 1, 3, 4) + assert (image_latents.shape[0], *image_latents.shape[2:]) == (latent_flow.shape[0], *latent_flow.shape[2:]) + + # Padding image_latents to the same frame number as latent + padding_shape = (latent_flow.shape[0], latent_flow.shape[1] - 1, *latent_flow.shape[2:]) + latent_padding = image_latents.new_zeros(padding_shape) + image_latents = torch.cat([image_latents, latent_padding], dim=1) + + # Add noise to latent + noise = torch.randn_like(latent_flow) + latent_flow_noisy = self.components.scheduler.add_noise(latent_flow, noise, timesteps) + + + # Concatenate latent and image_latents in the channel dimension + latent_flow_img_noisy = torch.cat([latent_flow_noisy, image_latents], dim=2) + + # Prepare rotary embeds + vae_scale_factor_spatial = 2 ** (len(self.components.vae.config.block_out_channels) - 1) + transformer_config = self.state.transformer_config + rotary_emb = ( + self.prepare_rotary_positional_embeddings( + height=height * vae_scale_factor_spatial, + width=width * vae_scale_factor_spatial, + num_frames=num_frames, + transformer_config=transformer_config, + vae_scale_factor_spatial=vae_scale_factor_spatial, + device=self.accelerator.device, + ) + if transformer_config.use_rotary_positional_embeddings + else None + ) + + # Predict noise, For CogVideoX1.5 Only. 
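+        # ofs_emb is only constructed when the transformer config defines ofs_embed_dim (CogVideoX 1.5); it stays None for other checkpoints.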
+ ofs_emb = ( + None if self.state.transformer_config.ofs_embed_dim is None else latent_flow.new_full((1,), fill_value=2.0) + ) + + predicted_noise = self.components.transformer( + hidden_states=latent_flow_img_noisy, + encoder_hidden_states=prompt_embedding, + timestep=timesteps, + ofs=ofs_emb, + image_rotary_emb=rotary_emb, + return_dict=False, + )[0] + + # Denoise + latent_pred = self.components.scheduler.get_velocity(predicted_noise, latent_flow_noisy, timesteps) + + alphas_cumprod = self.components.scheduler.alphas_cumprod[timesteps] + weights = 1 / (1 - alphas_cumprod) + while len(weights.shape) < len(latent_pred.shape): + weights = weights.unsqueeze(-1) + + loss = torch.mean((weights * (latent_pred - latent_flow) ** 2).reshape(batch_size, -1), dim=1) + loss = loss.mean() + + return loss + + def prepare_rotary_positional_embeddings( + self, + height: int, + width: int, + num_frames: int, + transformer_config: Dict, + vae_scale_factor_spatial: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (vae_scale_factor_spatial * transformer_config.patch_size) + grid_width = width // (vae_scale_factor_spatial * transformer_config.patch_size) + + if transformer_config.patch_size_t is None: + base_num_frames = num_frames + else: + base_num_frames = (num_frames + transformer_config.patch_size_t - 1) // transformer_config.patch_size_t + + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=transformer_config.attention_head_dim, + crops_coords=None, + grid_size=(grid_height, grid_width), + temporal_size=base_num_frames, + grid_type="slice", + max_size=(grid_height, grid_width), + device=device, + ) + + return freqs_cos, freqs_sin + + # Validation + + @override + def prepare_for_validation(self): + # Load from dataset? 
+        # Data_root
+        # - metadata.jsonl
+        # - video_latent / args.resolution /
+        # - prompt_embeddings /
+        # - first_frames /
+        # - flow_direct_f_latent /
+
+        data_root = self.args.data_root
+        metadata_path = data_root / "metadata_revised.jsonl"
+        assert metadata_path.is_file(), "For this dataset type, you need metadata_revised.jsonl in the dataset root path"
+
+        # Load metadata
+        # metadata = {
+        #     "video_path": ...,
+        #     "hash_code": ...,
+        #     "prompt": ...,
+        # }
+        metadata = []
+        with open(metadata_path, "r") as f:
+            for line in f:
+                metadata.append(json.loads(line))
+
+        metadata = random.sample(metadata, self.args.max_scene)
+
+        prompts = [x["prompt"] for x in metadata]
+        if 'curated' in str(data_root).lower():
+            self.prompt_embeddings = [data_root / "prompt_embeddings" / (x["hash_code"] + '.safetensors') for x in metadata]
+        else:
+            self.prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata]
+        videos = [data_root / "video_latent" / "x".join(str(x) for x in self.args.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata]
+        images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata]
+        flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata]
+
+        # load prompt embedding
+        validation_prompts = []
+        validation_prompt_embeddings = []
+        validation_video_latents = []
+        validation_images = []
+        validation_flow_latents = []
+        for prompt, prompt_embedding, video_latent, image, flow_latent in zip(prompts, self.prompt_embeddings, videos, images, flows):
+            validation_prompts.append(prompt)
+            validation_prompt_embeddings.append(load_file(prompt_embedding)["prompt_embedding"].unsqueeze(0))
+            validation_video_latents.append(load_file(video_latent)["encoded_video"].unsqueeze(0))
+            validation_flow_latents.append(load_file(flow_latent)["encoded_flow_f"].unsqueeze(0))
+            # validation_images.append(preprocess_image_with_resize(image, self.args.train_resolution[1], self.args.train_resolution[2]))
+            validation_images.append(image)
+
+        validation_videos = [None] * len(validation_prompts)
+
+        self.state.validation_prompts = validation_prompts
+        self.state.validation_prompt_embeddings = validation_prompt_embeddings
+        self.state.validation_images = validation_images
+        self.state.validation_videos = validation_videos
+        self.state.validation_video_latents = validation_video_latents
+        self.state.validation_flow_latents = validation_flow_latents
+
+        # Debug: run one validation pass immediately.
+        self.validate(0)
+
+    @override
+    def validation_step(
+        self, eval_data: Dict[str, Any], pipe: FloVDOMSMCogVideoXImageToVideoPipeline
+    ) -> List[Tuple[str, Image.Image | List[Image.Image]]]:
+        """
+        Return the data that needs to be saved. For videos, the data format is List[PIL],
+        and for images, the data format is PIL
+        """
+        prompt_embedding, image = eval_data["prompt_embedding"], eval_data["image"]
+
+        flow_latent_generate = pipe(
+            num_frames=self.state.train_frames,
+            height=self.state.train_height,
+            width=self.state.train_width,
+            prompt=None,
+            prompt_embeds=prompt_embedding,
+            image=image,
+            generator=self.state.generator,
+            num_inference_steps=50,
+            output_type='latent'
+        ).frames[0]
+
+        flow_generate = decode_flow(flow_latent_generate.unsqueeze(0).to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36])  # BF,C,H,W
+
+        return [("synthesized_flow", flow_generate)]
+
+    @override
+    def validate(self, step: int) -> None:
+        # TODO: clean up this validation code.
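+        # Validation outline: decode the cached video/flow latents for reference, synthesize flow from the first frame with the OMSM pipeline, warp the first frame with both the ground-truth and synthesized flow, and log the resulting artifacts to the trackers.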
+ logger.info("Starting validation") + + accelerator = self.accelerator + num_validation_samples = len(self.state.validation_prompts) + + if num_validation_samples == 0: + logger.warning("No validation samples found. Skipping validation.") + return + + self.components.transformer.eval() + torch.set_grad_enabled(False) + + memory_statistics = get_memory_statistics() + logger.info(f"Memory before validation start: {json.dumps(memory_statistics, indent=4)}") + + ##### Initialize pipeline ##### + pipe = self.initialize_pipeline() + camera_flow_generator = self.initialize_flow_generator().to(device=self.accelerator.device, dtype=self.state.weight_dtype) + + if self.state.using_deepspeed: + # Can't using model_cpu_offload in deepspeed, + # so we need to move all components in pipe to device + # pipe.to(self.accelerator.device, dtype=self.state.weight_dtype) + self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=["transformer"]) + else: + # if not using deepspeed, use model_cpu_offload to further reduce memory usage + # Or use pipe.enable_sequential_cpu_offload() to further reduce memory usage + pipe.enable_model_cpu_offload(device=self.accelerator.device) + + # Convert all model weights to training dtype + # Note, this will change LoRA weights in self.components.transformer to training dtype, rather than keep them in fp32 + pipe = pipe.to(dtype=self.state.weight_dtype) + + ################################# + all_processes_artifacts = [] + for i in range(num_validation_samples): + if self.state.using_deepspeed and self.accelerator.deepspeed_plugin.zero_stage != 3: + # Skip current validation on all processes but one + if i % accelerator.num_processes != accelerator.process_index: + continue + + prompt = self.state.validation_prompts[i] + image = self.state.validation_images[i] + video = self.state.validation_videos[i] + video_latent = self.state.validation_video_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90]) + prompt_embedding = self.state.validation_prompt_embeddings[i] + flow_latent = self.state.validation_flow_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90]) + + + if image is not None: + image = preprocess_image_with_resize(image, self.state.train_height, self.state.train_width) + image_torch = image.detach().clone() + # Convert image tensor (C, H, W) to PIL images + image = image.to(torch.uint8) + image = image.permute(1, 2, 0).cpu().numpy() + image = Image.fromarray(image) + + if video is not None: + video = preprocess_video_with_resize( + video, self.state.train_frames, self.state.train_height, self.state.train_width + ) + # Convert video tensor (F, C, H, W) to list of PIL images + video = video.round().clamp(0, 255).to(torch.uint8) + video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video] + else: + with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype): + try: + video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae) + except: + pass + video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae) + video = ((video_decoded + 1.) / 2. 
* 255.)[0].permute(1,0,2,3).float().clip(0., 255.).to(torch.uint8) + video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video] + + with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype): + try: + flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36]) + except: + pass + flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36]) # (BF)CHW (C=2) + + + logger.debug( + f"Validating sample {i + 1}/{num_validation_samples} on process {accelerator.process_index}. Prompt: {prompt}", + main_process_only=False, + ) + # validation_artifacts = self.validation_step({"prompt": prompt, "image": image, "video": video}, pipe) + validation_artifacts = self.validation_step({"prompt_embedding": prompt_embedding, "image": image}, pipe) + + if ( + self.state.using_deepspeed + and self.accelerator.deepspeed_plugin.zero_stage == 3 + and not accelerator.is_main_process + ): + continue + + prompt_filename = string_to_filename(prompt)[:25] + # Calculate hash of reversed prompt as a unique identifier + reversed_prompt = prompt[::-1] + hash_suffix = hashlib.md5(reversed_prompt.encode()).hexdigest()[:5] + + artifacts = { + "image": {"type": "image", "value": image}, + "video": {"type": "video", "value": video}, + } + for i, (artifact_type, artifact_value) in enumerate(validation_artifacts): + artifacts.update({f"artifact_{i}": {"type": artifact_type, "value": artifact_value}}) + + # Log flow + artifacts.update({f"artifact_flow_{i}": {"type": 'flow', "value": flow_decoded}}) + + # Log flow_warped_frames + image_tensor = repeat(rearrange(torch.tensor(np.array(image)).to(flow_decoded.device, torch.float), 'h w c -> 1 c h w'), 'b c h w -> (b f) c h w', f=flow_decoded.size(0)) # scale~(0,255) (BF) C H W + warped_video = forward_bilinear_splatting(image_tensor, flow_decoded.to(torch.float)) # if we have an occlusion mask from dataset, we can use it. + frame_list = [] + for frame in warped_video: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + artifacts.update({f"artifact_warped_video_{i}": {"type": 'warped_video', "value": frame_list}}) + + # Log synthesized_flow_wraped_frames + # artifact_value: synthesized optical flow + warped_video2 = forward_bilinear_splatting(image_tensor, artifact_value.to(torch.float)) # if we have an occlusion mask from dataset, we can use it. For OMSM, do not use. 
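+                # Convert the splatted frames to PIL images so they can be exported as an mp4 artifact below.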
+ frame_list2 = [] + for frame in warped_video2: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list2.append(Image.fromarray(frame)) + + artifacts.update({f"artifact_synthesized_flow_warped_video_{i}": {"type": 'synthesized_flow_warped_video', "value": frame_list2}}) + + + logger.debug( + f"Validation artifacts on process {accelerator.process_index}: {list(artifacts.keys())}", + main_process_only=False, + ) + + for key, value in list(artifacts.items()): + artifact_type = value["type"] + artifact_value = value["value"] + if artifact_type not in ["image", "video", "flow", "warped_video", "synthesized_flow", "synthesized_flow_warped_video"] or artifact_value is None: + continue + + extension = "png" if artifact_type == "image" else "mp4" + if artifact_type == "warped_video" or artifact_type == "synthesized_flow_warped_video": + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}_{artifact_type}.{extension}" + elif artifact_type == "synthesized_flow": + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}_synthesized_flow.{extension}" + elif artifact_type == "flow": + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}_original_flow.{extension}" + else: + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}.{extension}" + validation_path = self.args.output_dir / "validation_res" + validation_path.mkdir(parents=True, exist_ok=True) + filename = str(validation_path / filename) + + if artifact_type == "image": + logger.debug(f"Saving image to {filename}") + artifact_value.save(filename) + artifact_value = wandb.Image(filename) + elif artifact_type == "video" or artifact_type == "warped_video" or artifact_type == "synthesized_flow_warped_video": + logger.debug(f"Saving video to {filename}") + export_to_video(artifact_value, filename, fps=self.args.gen_fps) + artifact_value = wandb.Video(filename, caption=f"[{artifact_type}]--{prompt}") + elif artifact_type == "synthesized_flow" or artifact_type == "flow": + # TODO. RGB Visualization of optical flow. 
(F,2,H,W) + artifact_value_RGB = flow_to_color(artifact_value) # BF,C,H,W (B=1) + + frame_list = [] + for frame in artifact_value_RGB: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + logger.debug(f"Saving video to {filename}") + export_to_video(frame_list, filename, fps=self.args.gen_fps) + artifact_value = wandb.Video(filename, caption=f"[{artifact_type}]--{prompt}") + + all_processes_artifacts.append(artifact_value) + + all_artifacts = gather_object(all_processes_artifacts) + + if accelerator.is_main_process: + tracker_key = "validation" + for tracker in accelerator.trackers: + if tracker.name == "wandb": + image_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Image)] + video_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Video)] + tracker.log( + { + tracker_key: {f"images": image_artifacts, f"videos": video_artifacts}, + }, + step=step, + ) + + ########## Clean up ########## + if self.state.using_deepspeed: + del pipe + # Unload models except those needed for training + self.__move_components_to_cpu(unload_list=self.UNLOAD_LIST) + else: + pipe.remove_all_hooks() + del pipe + # Load models except those not needed for training + self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=self.UNLOAD_LIST) + self.components.transformer.to(self.accelerator.device, dtype=self.state.weight_dtype) + + # Change trainable weights back to fp32 to keep with dtype after prepare the model + cast_training_params([self.components.transformer], dtype=torch.float32) + + del camera_flow_generator + + free_memory() + accelerator.wait_for_everyone() + ################################ + + memory_statistics = get_memory_statistics() + logger.info(f"Memory after validation end: {json.dumps(memory_statistics, indent=4)}") + torch.cuda.reset_peak_memory_stats(accelerator.device) + + torch.set_grad_enabled(True) + self.components.transformer.train() + + + # mangling + def __move_components_to_device(self, dtype, ignore_list: List[str] = []): + ignore_list = set(ignore_list) + components = self.components.model_dump() + for name, component in components.items(): + if not isinstance(component, type) and hasattr(component, "to"): + if name not in ignore_list: + setattr(self.components, name, component.to(self.accelerator.device, dtype=dtype)) + + # mangling + def __move_components_to_cpu(self, unload_list: List[str] = []): + unload_list = set(unload_list) + components = self.components.model_dump() + for name, component in components.items(): + if not isinstance(component, type) and hasattr(component, "to"): + if name in unload_list: + setattr(self.components, name, component.to("cpu")) + + +register("cogvideox-flovd-omsm", "lora", FloVDOMSMCogVideoXI2VLoraTrainer) + + +#-------------------------------------------------------------------------------------------------- +# Extract function +def encode_text(prompt: str, components, device) -> torch.Tensor: + prompt_token_ids = components.tokenizer( + prompt, + padding="max_length", + max_length=components.transformer.config.max_text_seq_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + prompt_token_ids = prompt_token_ids.input_ids + prompt_embedding = components.text_encoder(prompt_token_ids.to(device))[0] + return prompt_embedding + +def encode_video(video: torch.Tensor, vae) -> torch.Tensor: + # shape of input video: [B, C, F, H, W] + video = 
video.to(vae.device, dtype=vae.dtype) + latent_dist = vae.encode(video).latent_dist + latent = latent_dist.sample() * vae.config.scaling_factor + return latent + +def decode_latents(latents: torch.Tensor, vae) -> torch.Tensor: + latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + latents = 1 / vae.config.scaling_factor * latents + + frames = vae.decode(latents).sample + return frames + +def compute_optical_flow(raft, ctxt, trgt, raft_iter=20, chunk=2, only_forward=True): + num_frames = ctxt.shape[0] + chunk_size = (num_frames // chunk) + 1 + + flow_f_list = [] + if not only_forward: + flow_b_list = [] + for i in range(chunk): + start = chunk_size * i + end = chunk_size * (i+1) + + with torch.no_grad(): + flow_f = raft(ctxt[start:end], trgt[start:end], num_flow_updates=raft_iter)[-1] + if not only_forward: + flow_b = raft(trgt[start:end], ctxt[start:end], num_flow_updates=raft_iter)[-1] + + flow_f_list.append(flow_f) + if not only_forward: + flow_b_list.append(flow_b) + + flow_f = torch.cat(flow_f_list) + if not only_forward: + flow_b = torch.cat(flow_b_list) + + if not only_forward: + return flow_f, flow_b + else: + return flow_f, None + +def encode_flow(flow, vae, flow_scale_factor): + # flow: BF,C,H,W + # flow_scale_factor [sf_x, sf_y] + assert flow.ndim == 4 + num_frames, _, height, width = flow.shape + + # Normalize optical flow + # ndim: 4 -> 5 + flow = rearrange(flow, '(b f) c h w -> b f c h w', b=1) + flow_norm = adaptive_normalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + # ndim: 5 -> 4 + flow_norm = rearrange(flow_norm, 'b f c h w -> (b f) c h w', b=1) + + # Duplicate mean value for third channel + num_frames, _, H, W = flow_norm.shape + flow_norm_extended = torch.empty((num_frames, 3, height, width)).to(flow_norm) + flow_norm_extended[:,:2] = flow_norm + flow_norm_extended[:,-1:] = flow_norm.mean(dim=1, keepdim=True) + flow_norm_extended = rearrange(flow_norm_extended, '(b f) c h w -> b c f h w', f=num_frames) + + return encode_video(flow_norm_extended, vae) + +def decode_flow(flow_latent, vae, flow_scale_factor): + flow_latent = flow_latent.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + flow_latent = 1 / vae.config.scaling_factor * flow_latent + + flow = vae.decode(flow_latent).sample # BCFHW + + # discard third channel (which is a mean value of f_x and f_y) + flow = flow[:,:2].detach().clone() + + # Unnormalize optical flow + flow = rearrange(flow, 'b c f h w -> b f c h w') + flow = adaptive_unnormalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + flow = rearrange(flow, 'b f c h w -> (b f) c h w') + return flow # BF,C,H,W + +def adaptive_normalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + b, f, c, h, w = flow.shape + + max_clip_x = math.sqrt(w/sf_x) * 1.0 + max_clip_y = math.sqrt(h/sf_y) * 1.0 + + flow_norm = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_x_norm = torch.sign(flow_x) * torch.sqrt(torch.abs(flow_x)/sf_x + 1e-7) + flow_y_norm = torch.sign(flow_y) * torch.sqrt(torch.abs(flow_y)/sf_y + 1e-7) + + flow_norm[:, :, 0] = torch.clamp(flow_x_norm, min=-max_clip_x, max=max_clip_x) + flow_norm[:, :, 1] = torch.clamp(flow_y_norm, min=-max_clip_y, max=max_clip_y) + + return flow_norm + + +def adaptive_unnormalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 
5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + + flow_orig = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_orig[:, :, 0] = torch.sign(flow_x) * sf_x * (flow_x**2 - 1e-7) + flow_orig[:, :, 1] = torch.sign(flow_y) * sf_y * (flow_y**2 - 1e-7) + + return flow_orig + +#-------------------------------------------------------------------------------------------------- diff --git a/finetune/models/cogvideox_i2v/flovd_controlnet_trainer.py b/finetune/models/cogvideox_i2v/flovd_controlnet_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..286bb00de2931836d187869a262cc9eb485a8c9c --- /dev/null +++ b/finetune/models/cogvideox_i2v/flovd_controlnet_trainer.py @@ -0,0 +1,814 @@ +from typing import Any, Dict, List, Tuple +from pathlib import Path +import os +import hashlib +import json +import random +import wandb +import math +import numpy as np +from einops import rearrange, repeat +from safetensors.torch import load_file, save_file +from accelerate.logging import get_logger + +import torch + +from accelerate.utils import gather_object + +from diffusers import ( + AutoencoderKLCogVideoX, + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXTransformer3DModel, +) +from diffusers.utils.export_utils import export_to_video + +from finetune.pipeline.flovd_FVSM_cogvideox_controlnet_pipeline import FloVDCogVideoXControlnetImageToVideoPipeline +from finetune.constants import LOG_LEVEL, LOG_NAME + +from diffusers.models.embeddings import get_3d_rotary_pos_embed +from PIL import Image +from numpy import dtype +from transformers import AutoTokenizer, T5EncoderModel +from typing_extensions import override + +from finetune.schemas import Args, Components, State +from finetune.trainer import Trainer +from finetune.utils import ( + cast_training_params, + free_memory, + get_memory_statistics, + string_to_filename, + unwrap_model, +) +from finetune.datasets.utils import ( + preprocess_image_with_resize, + load_binary_mask_compressed, +) + +from finetune.modules.cogvideox_controlnet import CogVideoXControlnet +from finetune.modules.cogvideox_custom_model import CustomCogVideoXTransformer3DModel +from finetune.modules.camera_sampler import SampleManualCam +from finetune.modules.camera_flow_generator import CameraFlowGenerator +from finetune.modules.utils import get_camera_flow_generator_input, forward_bilinear_splatting + +from ..utils import register + +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import pdb + +logger = get_logger(LOG_NAME, LOG_LEVEL) + +class FloVDCogVideoXI2VControlnetTrainer(Trainer): + UNLOAD_LIST = ["text_encoder"] + + @override + def __init__(self, args: Args) -> None: + super().__init__(args) + + # For validation + self.CameraSampler = SampleManualCam() + + + + @override + def load_components(self) -> Dict[str, Any]: + # TODO. Change the pipeline and ... 
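+        # Note: the ControlNet branch below is initialized from the base model's transformer weights (subfolder="transformer"), with its depth and output projection controlled by the controlnet_* args.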
+ components = Components() + model_path = str(self.args.model_path) + + components.pipeline_cls = FloVDCogVideoXControlnetImageToVideoPipeline + + components.tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer") + + components.text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder") + + # components.transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer") + + components.transformer = CustomCogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer") + + additional_kwargs = { + 'num_layers': self.args.controlnet_transformer_num_layers, + 'out_proj_dim_factor': self.args.controlnet_out_proj_dim_factor, + 'out_proj_dim_zero_init': self.args.controlnet_out_proj_zero_init, + 'notextinflow': self.args.notextinflow, + } + components.controlnet = CogVideoXControlnet.from_pretrained(model_path, subfolder="transformer", **additional_kwargs) + + components.vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae") + + components.scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder="scheduler") + + return components + + + @override + def initialize_pipeline(self) -> FloVDCogVideoXControlnetImageToVideoPipeline: + # TODO. Change the pipeline and ... + pipe = FloVDCogVideoXControlnetImageToVideoPipeline( + tokenizer=self.components.tokenizer, + text_encoder=unwrap_model(self.accelerator, self.components.text_encoder), + vae=unwrap_model(self.accelerator, self.components.vae), + transformer=unwrap_model(self.accelerator, self.components.transformer), + controlnet=unwrap_model(self.accelerator, self.components.controlnet), + scheduler=self.components.scheduler, + ) + return pipe + + def initialize_flow_generator(self, ckpt_path): + depth_estimator_kwargs = { + "target": 'modules.depth_warping.depth_warping.DepthWarping_wrapper', + "kwargs": { + "ckpt_path": ckpt_path, + "model_config": { + "max_depth": 20, + "encoder": 'vitb', + "features": 128, + "out_channels": [96, 192, 384, 768], + } + + } + } + + return CameraFlowGenerator(depth_estimator_kwargs) + + @override + def collate_fn(self, samples: List[Dict[str, Any]]) -> Dict[str, Any]: + ret = {"encoded_videos": [], "prompt_embedding": [], "images": [], "encoded_flow": []} + + for sample in samples: + encoded_video = sample["encoded_video"] + prompt_embedding = sample["prompt_embedding"] + image = sample["image"] + encoded_flow = sample["encoded_flow"] + + ret["encoded_videos"].append(encoded_video) + ret["prompt_embedding"].append(prompt_embedding) + ret["images"].append(image) + ret["encoded_flow"].append(encoded_flow) + + + ret["encoded_videos"] = torch.stack(ret["encoded_videos"]) + ret["prompt_embedding"] = torch.stack(ret["prompt_embedding"]) + ret["images"] = torch.stack(ret["images"]) + ret["encoded_flow"] = torch.stack(ret["encoded_flow"]) + + return ret + + + @override + def compute_loss(self, batch) -> torch.Tensor: + prompt_embedding = batch["prompt_embedding"] + latent = batch["encoded_videos"] + images = batch["images"] + latent_flow = batch["encoded_flow"] + + # Shape of prompt_embedding: [B, seq_len, hidden_size] + # Shape of latent: [B, C, F, H, W] + # Shape of images: [B, C, H, W] + # Shape of latent_flow: [B, C, F, H, W] + + patch_size_t = self.state.transformer_config.patch_size_t # WJ: None in i2v setting... 
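+        # When the checkpoint uses temporal patching (patch_size_t set, e.g. CogVideoX 1.5), pad with copies of the first frame so the frame count is divisible by patch_size_t.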
+ if patch_size_t is not None: + ncopy = latent.shape[2] % patch_size_t + # Copy the first frame ncopy times to match patch_size_t + first_frame = latent[:, :, :1, :, :] # Get first frame [B, C, 1, H, W] + latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2) + assert latent.shape[2] % patch_size_t == 0 + + batch_size, num_channels, num_frames, height, width = latent.shape + + # Get prompt embeddings + _, seq_len, _ = prompt_embedding.shape + prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent.dtype) + + # Add frame dimension to images [B,C,H,W] -> [B,C,F,H,W] + images = images.unsqueeze(2) + # Add noise to images + image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=self.accelerator.device) + image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype) + noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None] + image_latent_dist = self.components.vae.encode(noisy_images.to(dtype=self.components.vae.dtype)).latent_dist + image_latents = image_latent_dist.sample() * self.components.vae.config.scaling_factor + + """ + Modify below + """ + # Sample a random timestep for each sample + # timesteps = torch.randint( + # 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device + # ) + if self.args.enable_time_sampling: + if self.args.time_sampling_type == "truncated_normal": + time_sampling_dict = { + 'mean': self.args.time_sampling_mean, + 'std': self.args.time_sampling_std, + 'a': 1 - self.args.controlnet_guidance_end, + 'b': 1 - self.args.controlnet_guidance_start, + } + timesteps = torch.nn.init.trunc_normal_( + torch.empty(batch_size, device=latent.device), **time_sampling_dict + ) * self.components.scheduler.config.num_train_timesteps + elif self.args.time_sampling_type == "truncated_uniform": + timesteps = torch.randint( + int((1- self.args.controlnet_guidance_end) * self.components.scheduler.config.num_train_timesteps), + int((1 - self.args.controlnet_guidance_start) * self.components.scheduler.config.num_train_timesteps), + (batch_size,), device=latent.device + ) + else: + timesteps = torch.randint( + 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device + ) + timesteps = timesteps.long() + + # from [B, C, F, H, W] to [B, F, C, H, W] + latent = latent.permute(0, 2, 1, 3, 4) + latent_flow = latent_flow.permute(0, 2, 1, 3, 4) + image_latents = image_latents.permute(0, 2, 1, 3, 4) + assert (latent.shape[0], *latent.shape[2:]) == (image_latents.shape[0], *image_latents.shape[2:]) == (latent_flow.shape[0], *latent_flow.shape[2:]) + + # Padding image_latents to the same frame number as latent + padding_shape = (latent.shape[0], latent.shape[1] - 1, *latent.shape[2:]) + latent_padding = image_latents.new_zeros(padding_shape) + image_latents = torch.cat([image_latents, latent_padding], dim=1) + + # Add noise to latent + noise = torch.randn_like(latent) + latent_noisy = self.components.scheduler.add_noise(latent, noise, timesteps) + + + # Concatenate latent and image_latents in the channel dimension + # latent_img_flow_noisy = torch.cat([latent_noisy, image_latents, latent_flow], dim=2) + latent_img_noisy = torch.cat([latent_noisy, image_latents], dim=2) + + # Prepare rotary embeds + vae_scale_factor_spatial = 2 ** (len(self.components.vae.config.block_out_channels) - 1) + transformer_config = self.state.transformer_config + rotary_emb = ( + self.prepare_rotary_positional_embeddings( + 
height=height * vae_scale_factor_spatial, + width=width * vae_scale_factor_spatial, + num_frames=num_frames, + transformer_config=transformer_config, + vae_scale_factor_spatial=vae_scale_factor_spatial, + device=self.accelerator.device, + ) + if transformer_config.use_rotary_positional_embeddings + else None + ) + + # Predict noise, For CogVideoX1.5 Only. + ofs_emb = ( + None if self.state.transformer_config.ofs_embed_dim is None else latent.new_full((1,), fill_value=2.0) + ) + + # Controlnet feedforward + controlnet_states = self.components.controlnet( + hidden_states=latent_noisy, + encoder_hidden_states=prompt_embedding, + image_rotary_emb=rotary_emb, + controlnet_hidden_states=latent_flow, + timestep=timesteps, + return_dict=False, + )[0] + if isinstance(controlnet_states, (tuple, list)): + controlnet_states = [x.to(dtype=self.state.weight_dtype) for x in controlnet_states] + else: + controlnet_states = controlnet_states.to(dtype=self.state.weight_dtype) + + + # Transformer feedforward + predicted_noise = self.components.transformer( + hidden_states=latent_img_noisy, + encoder_hidden_states=prompt_embedding, + controlnet_states=controlnet_states, + controlnet_weights=self.args.controlnet_weights, + timestep=timesteps, + # ofs=ofs_emb, + image_rotary_emb=rotary_emb, + return_dict=False, + )[0] + + + # Denoise + latent_pred = self.components.scheduler.get_velocity(predicted_noise, latent_noisy, timesteps) + + alphas_cumprod = self.components.scheduler.alphas_cumprod[timesteps] + weights = 1 / (1 - alphas_cumprod) + while len(weights.shape) < len(latent_pred.shape): + weights = weights.unsqueeze(-1) + + loss = torch.mean((weights * (latent_pred - latent) ** 2).reshape(batch_size, -1), dim=1) + loss = loss.mean() + + return loss + + def prepare_rotary_positional_embeddings( + self, + height: int, + width: int, + num_frames: int, + transformer_config: Dict, + vae_scale_factor_spatial: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (vae_scale_factor_spatial * transformer_config.patch_size) + grid_width = width // (vae_scale_factor_spatial * transformer_config.patch_size) + + if transformer_config.patch_size_t is None: + base_num_frames = num_frames + else: + base_num_frames = (num_frames + transformer_config.patch_size_t - 1) // transformer_config.patch_size_t + + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=transformer_config.attention_head_dim, + crops_coords=None, + grid_size=(grid_height, grid_width), + temporal_size=base_num_frames, + grid_type="slice", + max_size=(grid_height, grid_width), + device=device, + ) + + return freqs_cos, freqs_sin + + # Validation + + @override + def prepare_for_validation(self): + # Load from dataset? 
+ # Data_root + # - metadata.jsonl + # - video_latent / args.resolution / + # - prompt_embeddings / + # - first_frames / + # - flow_direct_f_latent / + + data_root = self.args.data_root + metadata_path = data_root / "metadata_revised.jsonl" + assert metadata_path.is_file(), "For this dataset type, you need metadata.jsonl or metadata_revised.jsonl in the root path" + + # Load metadata + # metadata = { + # "video_path": ..., + # "hash_code": ..., + # "prompt": ..., + # } + metadata = [] + with open(metadata_path, "r") as f: + for line in f: + metadata.append( json.loads(line) ) + + metadata = random.sample(metadata, self.args.max_scene) + + prompts = [x["prompt"] for x in metadata] + prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata] + videos = [data_root / "video_latent" / "x".join(str(x) for x in self.args.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata] + images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata] + flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata] + + # load prompt embedding + validation_prompts = [] + validation_prompt_embeddings = [] + validation_video_latents = [] + validation_images = [] + validation_flow_latents = [] + for prompt, prompt_embedding, video_latent, image, flow_latent in zip(prompts, prompt_embeddings, videos, images, flows): + validation_prompts.append(prompt) + validation_prompt_embeddings.append(load_file(prompt_embedding)["prompt_embedding"].unsqueeze(0)) + validation_video_latents.append(load_file(video_latent)["encoded_video"].unsqueeze(0)) + validation_flow_latents.append(load_file(flow_latent)["encoded_flow_f"].unsqueeze(0)) + # validation_images.append(preprocess_image_with_resize(image, self.args.train_resolution[1], self.args.train_resolution[2])) + validation_images.append(image) + + + validation_videos = [None] * len(validation_prompts) + + + self.state.validation_prompts = validation_prompts + self.state.validation_prompt_embeddings = validation_prompt_embeddings + self.state.validation_images = validation_images + self.state.validation_videos = validation_videos + self.state.validation_video_latents = validation_video_latents + self.state.validation_flow_latents = validation_flow_latents + + # Debug.. + # self.validate(0) + + + @override + def validation_step( + self, eval_data: Dict[str, Any], pipe: FloVDCogVideoXControlnetImageToVideoPipeline + ) -> List[Tuple[str, Image.Image | List[Image.Image]]]: + """ + Return the data that needs to be saved. For videos, the data format is List[PIL], + and for images, the data format is PIL + """ + + prompt_embedding, image, flow_latent = eval_data["prompt_embedding"], eval_data["image"], eval_data["flow_latent"] + + video_generate = pipe( + num_frames=self.state.train_frames, + height=self.state.train_height, + width=self.state.train_width, + prompt=None, + prompt_embeds=prompt_embedding, + image=image, + flow_latent=flow_latent, + generator=self.state.generator, + num_inference_steps=50, + controlnet_guidance_start = self.args.controlnet_guidance_start, + controlnet_guidance_end = self.args.controlnet_guidance_end, + ).frames[0] + return [("synthesized_video", video_generate)] + + + @override + def validate(self, step: int) -> None: + #TODO. Fix the codes!!!! 
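+        # Two validation passes follow: 'training' conditions on ground-truth flow latents from the dataset, while 'inference' synthesizes camera flow from a sampled camera trajectory (via depth warping) before generating the video; both log warped and generated clips.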
+ logger.info("Starting validation") + + accelerator = self.accelerator + num_validation_samples = len(self.state.validation_prompts) + + if num_validation_samples == 0: + logger.warning("No validation samples found. Skipping validation.") + return + + self.components.controlnet.eval() + torch.set_grad_enabled(False) + + memory_statistics = get_memory_statistics() + logger.info(f"Memory before validation start: {json.dumps(memory_statistics, indent=4)}") + + ##### Initialize pipeline ##### + pipe = self.initialize_pipeline() + camera_flow_generator = self.initialize_flow_generator(ckpt_path=self.args.depth_ckpt_path).to(device=self.accelerator.device, dtype=self.state.weight_dtype) + + if self.state.using_deepspeed: + # Can't using model_cpu_offload in deepspeed, + # so we need to move all components in pipe to device + # pipe.to(self.accelerator.device, dtype=self.state.weight_dtype) + self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=["controlnet"]) + # self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=["transformer", "controlnet"]) + else: + # if not using deepspeed, use model_cpu_offload to further reduce memory usage + # Or use pipe.enable_sequential_cpu_offload() to further reduce memory usage + pipe.enable_model_cpu_offload(device=self.accelerator.device) + + # Convert all model weights to training dtype + # Note, this will change LoRA weights in self.components.transformer to training dtype, rather than keep them in fp32 + pipe = pipe.to(dtype=self.state.weight_dtype) + + + ################################# + inference_type = ['training', 'inference'] + # inference_type = ['inference'] + for infer_type in inference_type: + + + all_processes_artifacts = [] + for i in range(num_validation_samples): + if self.state.using_deepspeed and self.accelerator.deepspeed_plugin.zero_stage != 3: + # Skip current validation on all processes but one + if i % accelerator.num_processes != accelerator.process_index: + continue + + prompt = self.state.validation_prompts[i] + image = self.state.validation_images[i] + video = self.state.validation_videos[i] + video_latent = self.state.validation_video_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90]) + prompt_embedding = self.state.validation_prompt_embeddings[i] + flow_latent = self.state.validation_flow_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90]) + + + if image is not None: + image = preprocess_image_with_resize(image, self.state.train_height, self.state.train_width) + image_torch = image.detach().clone() + # Convert image tensor (C, H, W) to PIL images + image = image.to(torch.uint8) + image = image.permute(1, 2, 0).cpu().numpy() + image = Image.fromarray(image) + + if video is not None: + video = preprocess_video_with_resize( + video, self.state.train_frames, self.state.train_height, self.state.train_width + ) + # Convert video tensor (F, C, H, W) to list of PIL images + video = video.round().clamp(0, 255).to(torch.uint8) + video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video] + else: + if infer_type == 'training': + with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype): + try: + video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae) + except: + pass + video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae) + video = ((video_decoded + 1.) / 2. 
* 255.)[0].permute(1,0,2,3).float().clip(0., 255.).to(torch.uint8) + video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video] + + with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype): + try: + flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36]) + except: + pass + flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36]) # (BF)CHW (C=2) + + + # Prepare camera flow + if infer_type == 'inference': + with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype): + camparam, cam_name = self.CameraSampler.sample() + camera_flow_generator_input = get_camera_flow_generator_input(image_torch, camparam, device=self.accelerator.device, speed=0.5) + image_torch = ((image_torch.unsqueeze(0) / 255.) * 2. - 1.).to(self.accelerator.device) + camera_flow, log_dict = camera_flow_generator(image_torch, camera_flow_generator_input) + camera_flow = camera_flow.to(self.accelerator.device) + # WTF, unknown bug. Need warm up inference. + try: + flow_latent = rearrange(encode_flow(camera_flow, self.components.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(self.accelerator.device, self.state.weight_dtype) + except: + pass + flow_latent = rearrange(encode_flow(camera_flow, self.components.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(self.accelerator.device, self.state.weight_dtype) + + + logger.debug( + f"Validating sample {i + 1}/{num_validation_samples} on process {accelerator.process_index}. Prompt: {prompt}", + main_process_only=False, + ) + # validation_artifacts = self.validation_step({"prompt": prompt, "image": image, "video": video}, pipe) + validation_artifacts = self.validation_step({"prompt_embedding": prompt_embedding, "image": image, "flow_latent": flow_latent}, pipe) + + if ( + self.state.using_deepspeed + and self.accelerator.deepspeed_plugin.zero_stage == 3 + and not accelerator.is_main_process + ): + continue + + prompt_filename = string_to_filename(prompt)[:25] + # Calculate hash of reversed prompt as a unique identifier + reversed_prompt = prompt[::-1] + hash_suffix = hashlib.md5(reversed_prompt.encode()).hexdigest()[:5] + + artifacts = { + "image": {"type": "image", "value": image}, + "video": {"type": "video", "value": video}, + } + for i, (artifact_type, artifact_value) in enumerate(validation_artifacts): + artifacts.update({f"artifact_{i}": {"type": artifact_type, "value": artifact_value}}) + if infer_type == 'training': + # Log flow_warped_frames + image_tensor = repeat(rearrange(torch.tensor(np.array(image)).to(flow_decoded.device, torch.float), 'h w c -> 1 c h w'), 'b c h w -> (b f) c h w', f=flow_decoded.size(0)) # scale~(0,255) (BF) C H W + warped_video = forward_bilinear_splatting(image_tensor, flow_decoded.to(torch.float)) # if we have an occlusion mask from dataset, we can use it. + frame_list = [] + for frame in warped_video: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + artifacts.update({f"artifact_warped_video_{i}": {"type": 'warped_video', "value": frame_list}}) + + if infer_type == 'inference': + warped_video = log_dict['depth_warped_frames'] + frame_list = [] + for frame in warped_video: + frame = (frame + 1.)/2. * 255. 
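+ # frames are now scaled to [0, 255]; cast to uint8 HWC below and wrap as PIL images for logging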
+ frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + artifacts.update({f"artifact_warped_video_{i}": {"type": 'warped_video', "value": frame_list}}) + logger.debug( + f"Validation artifacts on process {accelerator.process_index}: {list(artifacts.keys())}", + main_process_only=False, + ) + + for key, value in list(artifacts.items()): + artifact_type = value["type"] + artifact_value = value["value"] + if artifact_type not in ["image", "video", "warped_video", "synthesized_video"] or artifact_value is None: + continue + + extension = "png" if artifact_type == "image" else "mp4" + if artifact_type == "warped_video": + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}-{infer_type}_warped_video.{extension}" + elif artifact_type == "synthesized_video": + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}-{infer_type}_synthesized_video.{extension}" + else: + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}-{infer_type}.{extension}" + validation_path = self.args.output_dir / "validation_res" + validation_path.mkdir(parents=True, exist_ok=True) + filename = str(validation_path / filename) + + if artifact_type == "image": + logger.debug(f"Saving image to {filename}") + artifact_value.save(filename) + artifact_value = wandb.Image(filename) + elif artifact_type == "video" or artifact_type == "warped_video" or artifact_type == "synthesized_video": + logger.debug(f"Saving video to {filename}") + export_to_video(artifact_value, filename, fps=self.args.gen_fps) + artifact_value = wandb.Video(filename, caption=prompt) + + all_processes_artifacts.append(artifact_value) + + all_artifacts = gather_object(all_processes_artifacts) + + if accelerator.is_main_process: + tracker_key = "validation" + for tracker in accelerator.trackers: + if tracker.name == "wandb": + image_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Image)] + video_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Video)] + tracker.log( + { + tracker_key: {f"images_{infer_type}": image_artifacts, f"videos_{infer_type}": video_artifacts}, + }, + step=step, + ) + + ########## Clean up ########## + if self.state.using_deepspeed: + del pipe + # Unload models except those needed for training + self.__move_components_to_cpu(unload_list=self.UNLOAD_LIST) + else: + pipe.remove_all_hooks() + del pipe + # Load models except those not needed for training + self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=self.UNLOAD_LIST) + self.components.controlnet.to(self.accelerator.device, dtype=self.state.weight_dtype) + + # Change trainable weights back to fp32 to keep with dtype after prepare the model + cast_training_params([self.components.controlnet], dtype=torch.float32) + + del camera_flow_generator + + free_memory() + accelerator.wait_for_everyone() + ################################ + + memory_statistics = get_memory_statistics() + logger.info(f"Memory after validation end: {json.dumps(memory_statistics, indent=4)}") + torch.cuda.reset_peak_memory_stats(accelerator.device) + + torch.set_grad_enabled(True) + self.components.controlnet.train() + + + # mangling + def __move_components_to_device(self, dtype, ignore_list: List[str] = []): + ignore_list = set(ignore_list) + components = self.components.model_dump() + for name, component in components.items(): + if 
not isinstance(component, type) and hasattr(component, "to"): + if name not in ignore_list: + setattr(self.components, name, component.to(self.accelerator.device, dtype=dtype)) + + # mangling + def __move_components_to_cpu(self, unload_list: List[str] = []): + unload_list = set(unload_list) + components = self.components.model_dump() + for name, component in components.items(): + if not isinstance(component, type) and hasattr(component, "to"): + if name in unload_list: + setattr(self.components, name, component.to("cpu")) + + +register("cogvideox-flovd", "controlnet", FloVDCogVideoXI2VControlnetTrainer) + + +#-------------------------------------------------------------------------------------------------- +# Extract function +def encode_text(prompt: str, components, device) -> torch.Tensor: + prompt_token_ids = components.tokenizer( + prompt, + padding="max_length", + max_length=components.transformer.config.max_text_seq_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + prompt_token_ids = prompt_token_ids.input_ids + prompt_embedding = components.text_encoder(prompt_token_ids.to(device))[0] + return prompt_embedding + +def encode_video(video: torch.Tensor, vae) -> torch.Tensor: + # shape of input video: [B, C, F, H, W] + video = video.to(vae.device, dtype=vae.dtype) + latent_dist = vae.encode(video).latent_dist + latent = latent_dist.sample() * vae.config.scaling_factor + return latent + +def decode_latents(latents: torch.Tensor, vae) -> torch.Tensor: + latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + latents = 1 / vae.config.scaling_factor * latents + + frames = vae.decode(latents).sample + return frames + +def compute_optical_flow(raft, ctxt, trgt, raft_iter=20, chunk=2, only_forward=True): + num_frames = ctxt.shape[0] + chunk_size = (num_frames // chunk) + 1 + + flow_f_list = [] + if not only_forward: + flow_b_list = [] + for i in range(chunk): + start = chunk_size * i + end = chunk_size * (i+1) + + with torch.no_grad(): + flow_f = raft(ctxt[start:end], trgt[start:end], num_flow_updates=raft_iter)[-1] + if not only_forward: + flow_b = raft(trgt[start:end], ctxt[start:end], num_flow_updates=raft_iter)[-1] + + flow_f_list.append(flow_f) + if not only_forward: + flow_b_list.append(flow_b) + + flow_f = torch.cat(flow_f_list) + if not only_forward: + flow_b = torch.cat(flow_b_list) + + if not only_forward: + return flow_f, flow_b + else: + return flow_f, None + +def encode_flow(flow, vae, flow_scale_factor): + # flow: BF,C,H,W + # flow_scale_factor [sf_x, sf_y] + assert flow.ndim == 4 + num_frames, _, height, width = flow.shape + + # Normalize optical flow + # ndim: 4 -> 5 + flow = rearrange(flow, '(b f) c h w -> b f c h w', b=1) + flow_norm = adaptive_normalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + # ndim: 5 -> 4 + flow_norm = rearrange(flow_norm, 'b f c h w -> (b f) c h w', b=1) + + # Duplicate mean value for third channel + num_frames, _, H, W = flow_norm.shape + flow_norm_extended = torch.empty((num_frames, 3, height, width)).to(flow_norm) + flow_norm_extended[:,:2] = flow_norm + flow_norm_extended[:,-1:] = flow_norm.mean(dim=1, keepdim=True) + flow_norm_extended = rearrange(flow_norm_extended, '(b f) c h w -> b c f h w', f=num_frames) + + return encode_video(flow_norm_extended, vae) + +def decode_flow(flow_latent, vae, flow_scale_factor): + flow_latent = flow_latent.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + flow_latent = 1 / vae.config.scaling_factor 
* flow_latent + + flow = vae.decode(flow_latent).sample # BCFHW + + # discard third channel (which is a mean value of f_x and f_y) + flow = flow[:,:2].detach().clone() + + # Unnormalize optical flow + flow = rearrange(flow, 'b c f h w -> b f c h w') + flow = adaptive_unnormalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + flow = rearrange(flow, 'b f c h w -> (b f) c h w') + return flow # BF,C,H,W + +def adaptive_normalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + b, f, c, h, w = flow.shape + + max_clip_x = math.sqrt(w/sf_x) * 1.0 + max_clip_y = math.sqrt(h/sf_y) * 1.0 + + flow_norm = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_x_norm = torch.sign(flow_x) * torch.sqrt(torch.abs(flow_x)/sf_x + 1e-7) + flow_y_norm = torch.sign(flow_y) * torch.sqrt(torch.abs(flow_y)/sf_y + 1e-7) + + flow_norm[:, :, 0] = torch.clamp(flow_x_norm, min=-max_clip_x, max=max_clip_x) + flow_norm[:, :, 1] = torch.clamp(flow_y_norm, min=-max_clip_y, max=max_clip_y) + + return flow_norm + + +def adaptive_unnormalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + + flow_orig = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_orig[:, :, 0] = torch.sign(flow_x) * sf_x * (flow_x**2 - 1e-7) + flow_orig[:, :, 1] = torch.sign(flow_y) * sf_y * (flow_y**2 - 1e-7) + + return flow_orig + +#-------------------------------------------------------------------------------------------------- diff --git a/finetune/models/cogvideox_i2v/lora_trainer.py b/finetune/models/cogvideox_i2v/lora_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a0241895258616e528a226d85cefd2ffabb6b0ad --- /dev/null +++ b/finetune/models/cogvideox_i2v/lora_trainer.py @@ -0,0 +1,246 @@ +from typing import Any, Dict, List, Tuple + +import torch +from diffusers import ( + AutoencoderKLCogVideoX, + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXTransformer3DModel, +) +from diffusers.models.embeddings import get_3d_rotary_pos_embed +from PIL import Image +from numpy import dtype +from transformers import AutoTokenizer, T5EncoderModel +from typing_extensions import override + +from finetune.schemas import Components +from finetune.trainer import Trainer +from finetune.utils import unwrap_model + +from ..utils import register + + +class CogVideoXI2VLoraTrainer(Trainer): + UNLOAD_LIST = ["text_encoder"] + + @override + def load_components(self) -> Dict[str, Any]: + components = Components() + model_path = str(self.args.model_path) + + components.pipeline_cls = CogVideoXImageToVideoPipeline + + components.tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer") + + components.text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder") + + components.transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer") + + components.vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae") + + components.scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder="scheduler") + + return components + + @override + def initialize_pipeline(self) -> CogVideoXImageToVideoPipeline: + pipe = CogVideoXImageToVideoPipeline( + 
tokenizer=self.components.tokenizer, + text_encoder=self.components.text_encoder, + vae=self.components.vae, + transformer=unwrap_model(self.accelerator, self.components.transformer), + scheduler=self.components.scheduler, + ) + return pipe + + @override + def encode_video(self, video: torch.Tensor) -> torch.Tensor: + # shape of input video: [B, C, F, H, W] + vae = self.components.vae + video = video.to(vae.device, dtype=vae.dtype) + latent_dist = vae.encode(video).latent_dist + latent = latent_dist.sample() * vae.config.scaling_factor + return latent + + @override + def encode_text(self, prompt: str) -> torch.Tensor: + prompt_token_ids = self.components.tokenizer( + prompt, + padding="max_length", + max_length=self.state.transformer_config.max_text_seq_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + prompt_token_ids = prompt_token_ids.input_ids + prompt_embedding = self.components.text_encoder(prompt_token_ids.to(self.accelerator.device))[0] + return prompt_embedding + + @override + def collate_fn(self, samples: List[Dict[str, Any]]) -> Dict[str, Any]: + ret = {"encoded_videos": [], "prompt_embedding": [], "images": []} + + for sample in samples: + encoded_video = sample["encoded_video"] + prompt_embedding = sample["prompt_embedding"] + image = sample["image"] + + ret["encoded_videos"].append(encoded_video) + ret["prompt_embedding"].append(prompt_embedding) + ret["images"].append(image) + + ret["encoded_videos"] = torch.stack(ret["encoded_videos"]) + ret["prompt_embedding"] = torch.stack(ret["prompt_embedding"]) + ret["images"] = torch.stack(ret["images"]) + + return ret + + @override + def compute_loss(self, batch) -> torch.Tensor: + prompt_embedding = batch["prompt_embedding"] + latent = batch["encoded_videos"] + images = batch["images"] + + # Shape of prompt_embedding: [B, seq_len, hidden_size] + # Shape of latent: [B, C, F, H, W] + # Shape of images: [B, C, H, W] + + patch_size_t = self.state.transformer_config.patch_size_t + if patch_size_t is not None: + ncopy = latent.shape[2] % patch_size_t + # Copy the first frame ncopy times to match patch_size_t + first_frame = latent[:, :, :1, :, :] # Get first frame [B, C, 1, H, W] + latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2) + assert latent.shape[2] % patch_size_t == 0 + + batch_size, num_channels, num_frames, height, width = latent.shape + + # Get prompt embeddings + _, seq_len, _ = prompt_embedding.shape + prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent.dtype) + + # Add frame dimension to images [B,C,H,W] -> [B,C,F,H,W] + images = images.unsqueeze(2) + # Add noise to images + image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=self.accelerator.device) + image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype) + noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None] + image_latent_dist = self.components.vae.encode(noisy_images.to(dtype=self.components.vae.dtype)).latent_dist + image_latents = image_latent_dist.sample() * self.components.vae.config.scaling_factor + + # Sample a random timestep for each sample + timesteps = torch.randint( + 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device + ) + timesteps = timesteps.long() + + # from [B, C, F, H, W] to [B, F, C, H, W] + latent = latent.permute(0, 2, 1, 3, 4) + image_latents = image_latents.permute(0, 2, 1, 3, 4) + assert (latent.shape[0], *latent.shape[2:]) == 
(image_latents.shape[0], *image_latents.shape[2:]) + + # Padding image_latents to the same frame number as latent + padding_shape = (latent.shape[0], latent.shape[1] - 1, *latent.shape[2:]) + latent_padding = image_latents.new_zeros(padding_shape) + image_latents = torch.cat([image_latents, latent_padding], dim=1) + + # Add noise to latent + noise = torch.randn_like(latent) + latent_noisy = self.components.scheduler.add_noise(latent, noise, timesteps) + + # Concatenate latent and image_latents in the channel dimension + latent_img_noisy = torch.cat([latent_noisy, image_latents], dim=2) + + # Prepare rotary embeds + vae_scale_factor_spatial = 2 ** (len(self.components.vae.config.block_out_channels) - 1) + transformer_config = self.state.transformer_config + rotary_emb = ( + self.prepare_rotary_positional_embeddings( + height=height * vae_scale_factor_spatial, + width=width * vae_scale_factor_spatial, + num_frames=num_frames, + transformer_config=transformer_config, + vae_scale_factor_spatial=vae_scale_factor_spatial, + device=self.accelerator.device, + ) + if transformer_config.use_rotary_positional_embeddings + else None + ) + + # Predict noise, For CogVideoX1.5 Only. + ofs_emb = ( + None if self.state.transformer_config.ofs_embed_dim is None else latent.new_full((1,), fill_value=2.0) + ) + predicted_noise = self.components.transformer( + hidden_states=latent_img_noisy, + encoder_hidden_states=prompt_embedding, + timestep=timesteps, + ofs=ofs_emb, + image_rotary_emb=rotary_emb, + return_dict=False, + )[0] + + # Denoise + latent_pred = self.components.scheduler.get_velocity(predicted_noise, latent_noisy, timesteps) + + alphas_cumprod = self.components.scheduler.alphas_cumprod[timesteps] + weights = 1 / (1 - alphas_cumprod) + while len(weights.shape) < len(latent_pred.shape): + weights = weights.unsqueeze(-1) + + loss = torch.mean((weights * (latent_pred - latent) ** 2).reshape(batch_size, -1), dim=1) + loss = loss.mean() + + return loss + + @override + def validation_step( + self, eval_data: Dict[str, Any], pipe: CogVideoXImageToVideoPipeline + ) -> List[Tuple[str, Image.Image | List[Image.Image]]]: + """ + Return the data that needs to be saved. 
For videos, the data format is List[PIL], + and for images, the data format is PIL + """ + prompt, image, video = eval_data["prompt"], eval_data["image"], eval_data["video"] + + video_generate = pipe( + num_frames=self.state.train_frames, + height=self.state.train_height, + width=self.state.train_width, + prompt=prompt, + image=image, + generator=self.state.generator, + ).frames[0] + return [("video", video_generate)] + + def prepare_rotary_positional_embeddings( + self, + height: int, + width: int, + num_frames: int, + transformer_config: Dict, + vae_scale_factor_spatial: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (vae_scale_factor_spatial * transformer_config.patch_size) + grid_width = width // (vae_scale_factor_spatial * transformer_config.patch_size) + + if transformer_config.patch_size_t is None: + base_num_frames = num_frames + else: + base_num_frames = (num_frames + transformer_config.patch_size_t - 1) // transformer_config.patch_size_t + + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=transformer_config.attention_head_dim, + crops_coords=None, + grid_size=(grid_height, grid_width), + temporal_size=base_num_frames, + grid_type="slice", + max_size=(grid_height, grid_width), + device=device, + ) + + return freqs_cos, freqs_sin + + +register("cogvideox-i2v", "lora", CogVideoXI2VLoraTrainer) diff --git a/finetune/models/cogvideox_i2v/sft_trainer.py b/finetune/models/cogvideox_i2v/sft_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..b55bee820f7e60a6cfa63a1f58aa4de047ab63b1 --- /dev/null +++ b/finetune/models/cogvideox_i2v/sft_trainer.py @@ -0,0 +1,9 @@ +from ..cogvideox_i2v.lora_trainer import CogVideoXI2VLoraTrainer +from ..utils import register + + +class CogVideoXI2VSftTrainer(CogVideoXI2VLoraTrainer): + pass + + +register("cogvideox-i2v", "sft", CogVideoXI2VSftTrainer) diff --git a/finetune/models/utils.py b/finetune/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c096c48136ad06b05fb81680a32cca6f0c2b5e24 --- /dev/null +++ b/finetune/models/utils.py @@ -0,0 +1,57 @@ +from typing import Dict, Literal + +from finetune.trainer import Trainer + + +SUPPORTED_MODELS: Dict[str, Dict[str, Trainer]] = {} + + +def register(model_name: str, training_type: Literal["lora", "sft", "controlnet"], trainer_cls: Trainer): + """Register a model and its associated functions for a specific training type. + + Args: + model_name (str): Name of the model to register (e.g. "cogvideox-5b") + training_type (Literal["lora", "sft", "controlnet"]): Type of training - either "lora" or "sft" or "controlnet" + trainer_cls (Trainer): Trainer class to register. 
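+
+        Example:
+            register("cogvideox-i2v", "lora", CogVideoXI2VLoraTrainer)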
+ """ + + # Check if model_name and training_type exists in SUPPORTED_MODELS + if model_name not in SUPPORTED_MODELS: + SUPPORTED_MODELS[model_name] = {} + else: + if training_type in SUPPORTED_MODELS[model_name]: + raise ValueError(f"Training type {training_type} already exists for model {model_name}") + + SUPPORTED_MODELS[model_name][training_type] = trainer_cls + + +def show_supported_models(): + """Print all currently supported models and their training types.""" + + print("\nSupported Models:") + print("================") + + for model_name, training_types in SUPPORTED_MODELS.items(): + print(f"\n{model_name}") + print("-" * len(model_name)) + for training_type in training_types: + print(f" • {training_type}") + + +def get_model_cls(model_type: str, training_type: Literal["lora", "sft"]) -> Trainer: + """Get the trainer class for a specific model and training type.""" + if model_type not in SUPPORTED_MODELS: + print(f"\nModel '{model_type}' is not supported.") + print("\nSupported models are:") + for supported_model in SUPPORTED_MODELS: + print(f" • {supported_model}") + raise ValueError(f"Model '{model_type}' is not supported") + + if training_type not in SUPPORTED_MODELS[model_type]: + print(f"\nTraining type '{training_type}' is not supported for model '{model_type}'.") + print(f"\nSupported training types for '{model_type}' are:") + for supported_type in SUPPORTED_MODELS[model_type]: + print(f" • {supported_type}") + raise ValueError(f"Training type '{training_type}' is not supported for model '{model_type}'") + + return SUPPORTED_MODELS[model_type][training_type] diff --git a/finetune/modules/__init__.py b/finetune/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/finetune/modules/camera_flow_generator.py b/finetune/modules/camera_flow_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..303286b3e0abc59121b7105fbd153ddb7d7c30d4 --- /dev/null +++ b/finetune/modules/camera_flow_generator.py @@ -0,0 +1,46 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat + +from .utils import instantiate_from_config, get_camera_flow_generator_input, warp_image + +import pdb + +class CameraFlowGenerator(nn.Module): + def __init__( + self, + depth_estimator_kwargs, + use_observed_mask=False, + cycle_th=3., + ): + super().__init__() + + self.depth_warping_module = instantiate_from_config(depth_estimator_kwargs) + self.use_observed_mask = use_observed_mask + self.cycle_th = cycle_th + + def forward(self, condition_image, camera_flow_generator_input): + # NOTE. camera_flow_generator_input is a dict of network inputs! 
+ # camera_flow_generator_input: Dict + # - image + # - intrinsics + # - extrinsics + with torch.no_grad(): + flow_f, flow_b, depth_warped_frames, depth_ctxt, depth_trgt = self.depth_warping_module(camera_flow_generator_input) + image_ctxt = repeat(condition_image, "b c h w -> (b v) c h w", v=(depth_warped_frames.shape[0]//condition_image.shape[0])) + + log_dict = { + 'depth_warped_frames': depth_warped_frames, + 'depth_ctxt': depth_ctxt, + 'depth_trgt': depth_trgt, + } + + # if self.use_observed_mask: + # observed_mask = run_filtering(flow_f, flow_b, cycle_th=self.cycle_th) + # log_dict[ + # 'observed_mask': observed_mask + # ] + + return flow_f, log_dict diff --git a/finetune/modules/camera_sampler.py b/finetune/modules/camera_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..ed699c23ee654286f4c2be9672e60fe0add9fd6c --- /dev/null +++ b/finetune/modules/camera_sampler.py @@ -0,0 +1,52 @@ +import numpy as np +from glob import glob +import random +import os +import pdb +random.seed(7777) + +class SampleManualCam: + def __init__( + self, + pose_type = 'manual', + root_path = '../assets/manual_poses', + ): + self.root_path = root_path + if pose_type == 'manual': + self.MANUAL_CAM = ['I', 'D', 'L', 'O', 'R', 'U'] + elif pose_type == 're10k': + self.RE10K_CAM = os.listdir(root_path) + # self.pose_path = glob(root_path, "*.txt") + + self.pose_type = pose_type + + def sample(self, order=None, name=None): + # Sample camera parameters (W2C) + + if self.pose_type == 'manual': + if name is not None: + assert name in self.MANUAL_CAM + cam_name = name + elif order is not None: + order = order % len(self.MANUAL_CAM) + cam_name = self.MANUAL_CAM[order] + else: + cam_name = random.choice(self.MANUAL_CAM) + path = os.path.join(self.root_path, f"camera_{cam_name}.txt") + elif self.pose_type == 're10k': + if name is not None: + assert name in self.RE10K_CAM + cam_name = name + elif order is not None: + order = order % len(self.RE10K_CAM) + cam_name = self.RE10K_CAM[order] + else: + cam_name = random.choice(self.RE10K_CAM) + path = os.path.join(self.root_path, cam_name) + with open(path, 'r') as f: + poses = f.readlines() + + poses = [pose.strip().split(' ') for pose in poses] + poses = [[float(x) for x in pose] for pose in poses] + + return poses, cam_name \ No newline at end of file diff --git a/finetune/modules/cogvideox_controlnet.py b/finetune/modules/cogvideox_controlnet.py new file mode 100644 index 0000000000000000000000000000000000000000..173556bbea12aeae83a7d7267041a13e6ad41259 --- /dev/null +++ b/finetune/modules/cogvideox_controlnet.py @@ -0,0 +1,353 @@ +from typing import Any, Dict, Optional, Tuple, Union + +import torch +from torch import nn +from einops import rearrange +import torch.nn.functional as F + +from diffusers.configuration_utils import FrozenDict + +from diffusers import CogVideoXTransformer3DModel +from diffusers.models.transformers.cogvideox_transformer_3d import Transformer2DModelOutput, CogVideoXBlock +from diffusers.utils import is_torch_version +from diffusers.loaders import PeftAdapterMixin +from diffusers.utils.torch_utils import maybe_allow_in_graph +from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.attention import Attention, FeedForward +from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor2_0 +from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero, 
AdaLayerNormZeroSingle +from diffusers.configuration_utils import ConfigMixin, register_to_config + +from .cogvideox_custom_modules import CustomCogVideoXPatchEmbed, CustomCogVideoXBlock + +import pdb + +class CogVideoXControlnet(ModelMixin, ConfigMixin, PeftAdapterMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 30, # 48 for 5B, 30 for 2B. + attention_head_dim: int = 64, + # in_channels: int = 3, + in_channels: int = 16, + out_channels: Optional[int] = 16, # Not used + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + time_embed_dim: int = 512, + ofs_embed_dim: Optional[int] = None, + text_embed_dim: int = 4096, + num_layers: int = 30, + dropout: float = 0.0, + attention_bias: bool = True, + sample_width: int = 90, + sample_height: int = 60, + sample_frames: int = 49, + patch_size: int = 2, + patch_size_t: Optional[int] = None, + temporal_compression_ratio: int = 4, + max_text_seq_length: int = 226, + activation_fn: str = "gelu-approximate", + timestep_activation_fn: str = "silu", + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + spatial_interpolation_scale: float = 1.875, + temporal_interpolation_scale: float = 1.0, + use_rotary_positional_embeddings: bool = False, + use_learned_positional_embeddings: bool = False, + patch_bias: bool = True, + out_proj_dim_factor: int = 8, + out_proj_dim_zero_init: bool = True, + notextinflow: bool = False, + ): + super().__init__() + inner_dim = num_attention_heads * attention_head_dim + + self.notextinflow = notextinflow + + if not use_rotary_positional_embeddings and use_learned_positional_embeddings: + raise ValueError( + "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional " + "embeddings. If you're using a custom model and/or believe this should be supported, please open an " + "issue at https://github.com/huggingface/diffusers/issues." + ) + + """ + Delete below. + In our case, FloVD, controlnet_hidden_states is already flow_latents encoded by 3D-Causal-VAE + """ + # start_channels = in_channels * (downscale_coef ** 2) + # input_channels = [start_channels, start_channels // 2, start_channels // 4] + # self.unshuffle = nn.PixelUnshuffle(downscale_coef) + + # self.controlnet_encode_first = nn.Sequential( + # nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0), + # nn.GroupNorm(2, input_channels[1]), + # nn.ReLU(), + # ) + + # self.controlnet_encode_second = nn.Sequential( + # nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0), + # nn.GroupNorm(2, input_channels[2]), + # nn.ReLU(), + # ) + + # """ + # Modify below. + # In our case, patch_embed takes encoder_hidden_states, hidden_states, controlnet_hidden_states (flow) + # """ + # 1. 
Patch embedding + self.patch_embed = CogVideoXPatchEmbed( + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + bias=True, + sample_width=sample_width, + sample_height=sample_height, + sample_frames=sample_frames, + temporal_compression_ratio=temporal_compression_ratio, + spatial_interpolation_scale=spatial_interpolation_scale, + temporal_interpolation_scale=temporal_interpolation_scale, + use_positional_embeddings=not use_rotary_positional_embeddings, + use_learned_positional_embeddings=use_learned_positional_embeddings, + ) + # self.patch_embed = CustomCogVideoXPatchEmbed( + # patch_size=patch_size, + # patch_size_t=patch_size_t, + # in_channels=in_channels, + # embed_dim=inner_dim, + # text_embed_dim=text_embed_dim, + # bias=patch_bias, + # sample_width=sample_width, + # sample_height=sample_height, + # sample_frames=sample_frames, + # temporal_compression_ratio=temporal_compression_ratio, + # max_text_seq_length=max_text_seq_length, + # spatial_interpolation_scale=spatial_interpolation_scale, + # temporal_interpolation_scale=temporal_interpolation_scale, + # use_positional_embeddings=not use_rotary_positional_embeddings, + # use_learned_positional_embeddings=use_learned_positional_embeddings, + # ) + + self.embedding_dropout = nn.Dropout(dropout) + + # 2. Time embeddings + self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift) + self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn) + + # 3. Define spatio-temporal transformers blocks + # self.transformer_blocks = nn.ModuleList( + # [ + # CogVideoXBlock( + # dim=inner_dim, + # num_attention_heads=num_attention_heads, + # attention_head_dim=attention_head_dim, + # time_embed_dim=time_embed_dim, + # dropout=dropout, + # activation_fn=activation_fn, + # attention_bias=attention_bias, + # norm_elementwise_affine=norm_elementwise_affine, + # norm_eps=norm_eps, + # ) + # for _ in range(num_layers) + # ] + # ) + self.transformer_blocks = nn.ModuleList( + [ + CustomCogVideoXBlock( + dim=inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + time_embed_dim=time_embed_dim, + dropout=dropout, + activation_fn=activation_fn, + attention_bias=attention_bias, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + ) + for _ in range(num_layers) + ] + ) + + self.out_projectors = None + if out_proj_dim_factor is not None: + out_proj_dim = num_attention_heads * out_proj_dim_factor + self.out_projectors = nn.ModuleList( + [nn.Linear(inner_dim, out_proj_dim) for _ in range(num_layers)] + ) + if out_proj_dim_zero_init: + for out_projector in self.out_projectors: + self.zeros_init_linear(out_projector) + + self.gradient_checkpointing = False + + def zeros_init_linear(self, linear: nn.Module): + if isinstance(linear, (nn.Linear, nn.Conv1d)): + if hasattr(linear, "weight"): + nn.init.zeros_(linear.weight) + if hasattr(linear, "bias"): + nn.init.zeros_(linear.bias) + + def _set_gradient_checkpointing(self, module, value=False): + self.gradient_checkpointing = value + + def compress_time(self, x, num_frames): + x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames) + batch_size, frames, channels, height, width = x.shape + x = rearrange(x, 'b f c h w -> (b h w) c f') + + if x.shape[-1] % 2 == 1: + x_first, x_rest = x[..., 0], x[..., 1:] + if x_rest.shape[-1] > 0: + x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2) + + x = torch.cat([x_first[..., None], x_rest], dim=-1) + else: + x = F.avg_pool1d(x, kernel_size=2, stride=2) + 
x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width) + return x + + # """ + # Add below. + # Load pre-trained weight from Diffusers + # For patch_embed, copy a projection layer for controlnet_states + # """ + @classmethod + def from_pretrained(cls, model_path, subfolder, **additional_kwargs): + base = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder=subfolder) + controlnet_config = FrozenDict({**base.config, **additional_kwargs}) + model = cls(**controlnet_config) + + missing, unexpected = model.load_state_dict(base.state_dict(), strict=False) + print(f"Load CogVideoXTransformer3DModel.") + # if len(missing) != 0 or len(unexpected) != 0: + # print(f"Missing keys: {missing}") + # print(f"Unexpected keys: {unexpected}") + + del base + torch.cuda.empty_cache() + + + return model + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + controlnet_hidden_states: torch.Tensor, + timestep: Union[int, float, torch.LongTensor], + controlnet_valid_mask: torch.Tensor = None, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + timestep_cond: Optional[torch.Tensor] = None, + return_dict: bool = True, + ): + """ + Delete below. + In our case, FloVD, controlnet_hidden_states is already flow_latents encoded by 3D-Causal-VAE + """ + # batch_size, num_frames, channels, height, width = controlnet_states.shape + # # 0. Controlnet encoder + # controlnet_states = rearrange(controlnet_states, 'b f c h w -> (b f) c h w') + # controlnet_states = self.unshuffle(controlnet_states) + # controlnet_states = self.controlnet_encode_first(controlnet_states) + # controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) + # num_frames = controlnet_states.shape[0] // batch_size + + # controlnet_states = self.controlnet_encode_second(controlnet_states) + # controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames) + # controlnet_states = rearrange(controlnet_states, '(b f) c h w -> b f c h w', b=batch_size) + + batch_size, num_frames, channels, height, width = hidden_states.shape + + + # """ + # Modify below. + # Distinguish hidden_states and controlnet_states (i.e., flow_hidden_states) + # """ + hidden_states = torch.cat([hidden_states, controlnet_hidden_states], dim=2) # instead of image_latents, we use flow_latents for condition. + + # controlnet_states = self.controlnext_encoder(controlnet_states, timestep=timestep) + # 1. Time embedding + timesteps = timestep + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=hidden_states.dtype) + emb = self.time_embedding(t_emb, timestep_cond) + + # """ + # Modify below. + # patch_embed takes encoder, hidden_states, controlnet_hidden_states + # """ + hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) + # hidden_states = self.patch_embed(encoder_hidden_states, hidden_states, controlnet_hidden_states) # output: [text_embeds, image_embeds, flow_embeds] [B, 35326, 3072] + hidden_states = self.embedding_dropout(hidden_states) + + """ + Not modified below. 
+ hidden_states include both hidden_states and controlnet_hidden_states + """ + text_seq_length = encoder_hidden_states.shape[1] + encoder_hidden_states = hidden_states[:, :text_seq_length] # [text_embeds] [B, 226, 3072] + hidden_states = hidden_states[:, text_seq_length:] # [image_embeds, flow_embeds] [B, 35100, 3072] + + # attention mask + if controlnet_valid_mask is not None: + mask_shape = controlnet_valid_mask.shape + attention_mask = torch.nn.functional.interpolate(controlnet_valid_mask, size=(mask_shape[2], mask_shape[3]//2, mask_shape[4]//2), mode='trilinear', align_corners=False) # CFHW + attention_mask[attention_mask>=0.5] = 1 + attention_mask[attention_mask<0.5] = 0 + attention_mask = attention_mask.to(torch.bool) + attention_mask = rearrange(attention_mask.squeeze(1), 'b f h w -> b (f h w)') # (B, N=(fxhxw)) + + # Consider encoder_hidden_states.. or do not use?? not sure.. + if not self.notextinflow: + attention_mask = F.pad(attention_mask, (text_seq_length, 0), value=0.0) + + attention_kwargs = { + 'attention_mask': attention_mask if controlnet_valid_mask is not None else None, + 'notextinflow': self.notextinflow, + } + + controlnet_hidden_states = () + # 3. Transformer blocks + for i, block in enumerate(self.transformer_blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + emb, + image_rotary_emb, + attention_kwargs, + **ckpt_kwargs, + ) + else: + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=emb, + image_rotary_emb=image_rotary_emb, + attention_kwargs=attention_kwargs, + ) + + if self.out_projectors is not None: + controlnet_hidden_states += (self.out_projectors[i](hidden_states),) + else: + controlnet_hidden_states += (hidden_states,) + + if not return_dict: + return (controlnet_hidden_states,) + return Transformer2DModelOutput(sample=controlnet_hidden_states) \ No newline at end of file diff --git a/finetune/modules/cogvideox_custom_model.py b/finetune/modules/cogvideox_custom_model.py new file mode 100644 index 0000000000000000000000000000000000000000..b5d71cc9abc235a49e3297368d17817ff1c55281 --- /dev/null +++ b/finetune/modules/cogvideox_custom_model.py @@ -0,0 +1,109 @@ +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import numpy as np +from diffusers.utils import is_torch_version +from diffusers.models.transformers.cogvideox_transformer_3d import CogVideoXTransformer3DModel, Transformer2DModelOutput + +import pdb + +class CustomCogVideoXTransformer3DModel(CogVideoXTransformer3DModel): + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: Union[int, float, torch.LongTensor], + start_frame = None, + timestep_cond: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + controlnet_states: torch.Tensor = None, + controlnet_weights: Optional[Union[float, int, list, np.ndarray, torch.FloatTensor]] = 1.0, + return_dict: bool = True, + ): + batch_size, num_frames, channels, height, width = hidden_states.shape + + if start_frame is not None: + hidden_states = torch.cat([start_frame, hidden_states], 
dim=2) + # 1. Time embedding + timesteps = timestep + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=hidden_states.dtype) + emb = self.time_embedding(t_emb, timestep_cond) + + # 2. Patch embedding + hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) + hidden_states = self.embedding_dropout(hidden_states) + + text_seq_length = encoder_hidden_states.shape[1] + encoder_hidden_states = hidden_states[:, :text_seq_length] + hidden_states = hidden_states[:, text_seq_length:] + + # 3. Transformer blocks + for i, block in enumerate(self.transformer_blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + emb, + image_rotary_emb, + **ckpt_kwargs, + ) + else: + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=emb, + image_rotary_emb=image_rotary_emb, + ) + + if (controlnet_states is not None) and (i < len(controlnet_states)): + controlnet_states_block = controlnet_states[i] + controlnet_block_weight = 1.0 + if isinstance(controlnet_weights, (list, np.ndarray)) or torch.is_tensor(controlnet_weights): + controlnet_block_weight = controlnet_weights[i] + elif isinstance(controlnet_weights, (float, int)): + controlnet_block_weight = controlnet_weights + hidden_states = hidden_states + controlnet_states_block * controlnet_block_weight + + if not self.config.use_rotary_positional_embeddings: + # CogVideoX-2B + hidden_states = self.norm_final(hidden_states) + else: + # CogVideoX-5B + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + hidden_states = self.norm_final(hidden_states) + hidden_states = hidden_states[:, text_seq_length:] + + # 4. Final block + hidden_states = self.norm_out(hidden_states, temb=emb) + hidden_states = self.proj_out(hidden_states) + + # 5. 
Unpatchify + p = self.config.patch_size + p_t = self.config.patch_size_t + + if p_t is None: + output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p) + output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4) + else: + output = hidden_states.reshape( + batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p + ) + output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2) + + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) \ No newline at end of file diff --git a/finetune/modules/cogvideox_custom_modules.py b/finetune/modules/cogvideox_custom_modules.py new file mode 100644 index 0000000000000000000000000000000000000000..45ed16df08f7214b2a9d92abfce295d199264423 --- /dev/null +++ b/finetune/modules/cogvideox_custom_modules.py @@ -0,0 +1,357 @@ +import math +from typing import List, Optional, Tuple, Union, Dict, Any +import copy + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from diffusers import CogVideoXTransformer3DModel +from diffusers.models.transformers.cogvideox_transformer_3d import CogVideoXBlock +from diffusers.models.normalization import CogVideoXLayerNormZero +from diffusers.models.attention import FeedForward +from diffusers.models.attention_processor import CogVideoXAttnProcessor2_0, Attention +from diffusers.models.embeddings import CogVideoXPatchEmbed +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers +from diffusers.utils.torch_utils import maybe_allow_in_graph + +from contextlib import contextmanager +from peft.tuners.lora.layer import LoraLayer # PEFT의 LoRA 레이어 기본 클래스 + +import pdb + +# Code heavily borrowed from https://github.com/huggingface/diffusers + + +class enable_lora: + def __init__(self, modules, enable=True): + self.modules = modules + self.enable = enable + self.prev_states = {} + + def __enter__(self): + for module in self.modules: + self.prev_states[module] = getattr(module, "lora_enabled", True) + setattr(module, "lora_enabled", self.enable) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + for module in self.modules: + setattr(module, "lora_enabled", self.prev_states[module]) + return False + + + +class CustomCogVideoXPatchEmbed(CogVideoXPatchEmbed): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + patch_size = kwargs['patch_size'] + patch_size_t = kwargs['patch_size_t'] + bias = kwargs['bias'] + in_channels = kwargs['in_channels'] + embed_dim = kwargs['embed_dim'] + + # projection layer for flow latents + if patch_size_t is None: + # CogVideoX 1.0 checkpoints + self.flow_proj = nn.Conv2d(in_channels//2, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias) + else: + # CogVideoX 1.5 checkpoints + self.flow_proj = nn.Linear(in_channels//2 * patch_size * patch_size * patch_size_t, embed_dim) + + # Add positional embedding for flow_embeds + if self.use_positional_embeddings or self.use_learned_positional_embeddings: + flow_pos_embedding = self._get_positional_embeddings(self.sample_height, self.sample_width, self.sample_frames)[:,self.max_text_seq_length:] # shape: [1, 17550, 3072] + self.flow_pos_embedding = nn.Parameter(flow_pos_embedding) + + def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor, flow_embeds: torch.Tensor): + r""" + Args: + text_embeds (`torch.Tensor`): + Input 
text embeddings. Expected shape: (batch_size, seq_length, embedding_dim). + image_embeds (`torch.Tensor`): + Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width). + flow_embeds (`torch.Tensor`): + Input flow embeddings. Expected shape: (batch_size, num_frames, channels, height, width). + """ + text_embeds = self.text_proj(text_embeds) + + batch_size, num_frames, channels, height, width = image_embeds.shape + + if self.patch_size_t is None: + # embed video latents + image_embeds = image_embeds.reshape(-1, channels, height, width) + image_embeds = self.proj(image_embeds) + image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:]) + image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels] + image_embeds = image_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels] + + # embed flow latents + flow_embeds = flow_embeds.reshape(-1, channels//2, height, width) + flow_embeds = self.flow_proj(flow_embeds) + flow_embeds = flow_embeds.view(batch_size, num_frames, *flow_embeds.shape[1:]) + flow_embeds = flow_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels] + flow_embeds = flow_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels] + else: + p = self.patch_size + p_t = self.patch_size_t + + # embed video latents + image_embeds = image_embeds.permute(0, 1, 3, 4, 2) + image_embeds = image_embeds.reshape( + batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels + ) + image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3) + image_embeds = self.proj(image_embeds) + + # embed flow latents + flow_embeds = flow_embeds.permute(0, 1, 3, 4, 2) + flow_embeds = flow_embeds.reshape( + batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels//2 + ) + flow_embeds = flow_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3) + flow_embeds = self.flow_proj(flow_embeds) + + # Curriculum learning of flow token + # flow_embeds = self.flow_scale * flow_embeds + + + embeds = torch.cat( + [text_embeds, image_embeds, flow_embeds], dim=1 + ).contiguous() # [batch, num_frames x height x width + seq_length + num_frames x height x width, channels] + + if self.use_positional_embeddings or self.use_learned_positional_embeddings: + if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height): + raise ValueError( + "It is currently not possible to generate videos at a different resolution that the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'." + "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues." + ) + + pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1 + + if ( + self.sample_height != height + or self.sample_width != width + or self.sample_frames != pre_time_compression_frames + ): + pos_embedding = self._get_positional_embeddings( + height, width, pre_time_compression_frames, device=embeds.device + ) + else: + pos_embedding = self.pos_embedding + + # Previous version.. + # pos_embedding = pos_embedding.to(dtype=embeds.dtype) + # embeds = embeds + pos_embedding + + # Add flow embedding.. 
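+ # the base positional table covers the [text, image] tokens; flow tokens use a separate
+ # table (sliced from the image-token positions in __init__), concatenated below so that
+ # every token in the [text, image, flow] sequence receives a positional embedding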
+ # flow_pos_embedding = self.flow_pos_scale * self.flow_pos_embedding + flow_pos_embedding = self.flow_pos_embedding + pos_embedding_total = torch.cat([pos_embedding, flow_pos_embedding], dim=1).to(dtype=embeds.dtype) + embeds = embeds + pos_embedding_total + + return embeds + + + +@maybe_allow_in_graph +class CustomCogVideoXBlock(nn.Module): + r""" + Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model. + + Parameters: + dim (`int`): + The number of channels in the input and output. + num_attention_heads (`int`): + The number of heads to use for multi-head attention. + attention_head_dim (`int`): + The number of channels in each head. + time_embed_dim (`int`): + The number of channels in timestep embedding. + dropout (`float`, defaults to `0.0`): + The dropout probability to use. + activation_fn (`str`, defaults to `"gelu-approximate"`): + Activation function to be used in feed-forward. + attention_bias (`bool`, defaults to `False`): + Whether or not to use bias in attention projection layers. + qk_norm (`bool`, defaults to `True`): + Whether or not to use normalization after query and key projections in Attention. + norm_elementwise_affine (`bool`, defaults to `True`): + Whether to use learnable elementwise affine parameters for normalization. + norm_eps (`float`, defaults to `1e-5`): + Epsilon value for normalization layers. + final_dropout (`bool` defaults to `False`): + Whether to apply a final dropout after the last feed-forward layer. + ff_inner_dim (`int`, *optional*, defaults to `None`): + Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used. + ff_bias (`bool`, defaults to `True`): + Whether or not to use bias in Feed-forward layer. + attention_out_bias (`bool`, defaults to `True`): + Whether or not to use bias in Attention output projection layer. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + time_embed_dim: int, + dropout: float = 0.0, + activation_fn: str = "gelu-approximate", + attention_bias: bool = False, + qk_norm: bool = True, + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + final_dropout: bool = True, + ff_inner_dim: Optional[int] = None, + ff_bias: bool = True, + attention_out_bias: bool = True, + ): + super().__init__() + + # 1. Self Attention + self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True) + + self.attn1 = Attention( + query_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + qk_norm="layer_norm" if qk_norm else None, + eps=1e-6, + bias=attention_bias, + out_bias=attention_out_bias, + processor=CustomCogVideoXAttnProcessor2_0(), + ) + + # 2. 
Feed Forward + self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True) + + self.ff = FeedForward( + dim, + dropout=dropout, + activation_fn=activation_fn, + final_dropout=final_dropout, + inner_dim=ff_inner_dim, + bias=ff_bias, + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> torch.Tensor: + text_seq_length = encoder_hidden_states.size(1) + attention_kwargs = attention_kwargs or {} + + # norm & modulate + norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1( + hidden_states, encoder_hidden_states, temb + ) + + # attention + attn_hidden_states, attn_encoder_hidden_states = self.attn1( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + image_rotary_emb=image_rotary_emb, + **attention_kwargs, + ) + + hidden_states = hidden_states + gate_msa * attn_hidden_states + encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states + + # norm & modulate + norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2( + hidden_states, encoder_hidden_states, temb + ) + + # feed-forward + norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1) + ff_output = self.ff(norm_hidden_states) + + hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:] + encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length] + + return hidden_states, encoder_hidden_states + + +class CustomCogVideoXAttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on + query and key vectors, but does not include spatial normalization. 
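+
+    Compared to the stock processor, this variant also accepts an `attention_mask` and a
+    `notextinflow` flag: when `notextinflow` is True, text tokens are not concatenated with
+    the visual tokens and the rotary embedding is applied to every token rather than only
+    the visual ones.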
+ """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + def __call__( + self, + attn: Attention, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + notextinflow: Optional[bool] = False, + ) -> torch.Tensor: + text_seq_length = encoder_hidden_states.size(1) + + if not notextinflow: + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + batch_size, sequence_length, _ = hidden_states.shape + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # Apply RoPE if needed + if image_rotary_emb is not None: + from diffusers.models.embeddings import apply_rotary_emb + + if not notextinflow: + query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb) + if not attn.is_cross_attention: + key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb) + else: + query[:, :, :] = apply_rotary_emb(query[:, :, :], image_rotary_emb) + if not attn.is_cross_attention: + key[:, :, :] = apply_rotary_emb(key[:, :, :], image_rotary_emb) + + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if not notextinflow: + encoder_hidden_states, hidden_states = hidden_states.split( + [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1 + ) + + return hidden_states, encoder_hidden_states \ No newline at end of file diff --git a/finetune/modules/depth_warping/__init__.py b/finetune/modules/depth_warping/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/finetune/modules/depth_warping/camera/Camera.py b/finetune/modules/depth_warping/camera/Camera.py new file mode 100644 index 0000000000000000000000000000000000000000..a2633820185f4c6394fb5ff2739843356ea83788 --- /dev/null +++ b/finetune/modules/depth_warping/camera/Camera.py @@ -0,0 +1,70 @@ +import os +import random +import json +import torch + +import torch.nn as nn +import torchvision.transforms as transforms +import torchvision.transforms.functional as F +import numpy as np +from einops import rearrange, repeat + +class Camera(object): + def __init__(self, entry): + fx, fy, cx, cy = entry[1:5] + self.fx = fx + self.fy = fy + self.cx = cx + self.cy = cy + w2c_mat = np.array(entry[7:]).reshape(3, 4) + w2c_mat_4x4 = np.eye(4) + w2c_mat_4x4[:3, :] = w2c_mat + 
self.w2c_mat = w2c_mat_4x4 + self.c2w_mat = np.linalg.inv(w2c_mat_4x4) + +def load_cameras(path): + with open(path, 'r') as f: + poses = f.readlines() + poses = [pose.strip().split(' ') for pose in poses[1:]] + cam_params = [[float(x) for x in pose] for pose in poses] + cam_params = [Camera(cam_param) for cam_param in cam_params] + return cam_params + +def get_relative_pose(cam_params): + abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params] + abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params] + source_cam_c2w = abs_c2ws[0] + cam_to_origin = 0 + target_cam_c2w = np.array([ + [1, 0, 0, 0], + [0, 1, 0, -cam_to_origin], + [0, 0, 1, 0], + [0, 0, 0, 1] + ]) + abs2rel = target_cam_c2w @ abs_w2cs[0] + ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]] + ret_poses = np.array(ret_poses, dtype=np.float32) + return ret_poses + +def get_K(intrinsics, size, do_normalize=False): + def normalize_intrinsic(x, size): + h, w = size + x[:,:,0:1] = x[:,:,0:1] / w + x[:,:,1:2] = x[:,:,1:2] / h + return x + + b, _, t, _ = intrinsics.shape + K = torch.zeros((b, t, 9), dtype=intrinsics.dtype, device=intrinsics.device) + fx, fy, cx, cy = intrinsics.squeeze(1).chunk(4, dim=-1) + + K[:,:,0:1] = fx + K[:,:,2:3] = cx + K[:,:,4:5] = fy + K[:,:,5:6] = cy + K[:,:,8:9] = 1.0 + + K = rearrange(K, "b t (h w) -> b t h w", h=3, w=3) + if do_normalize: + K = normalize_intrinsic(K, size) + + return K \ No newline at end of file diff --git a/finetune/modules/depth_warping/camera/WarperPytorch.py b/finetune/modules/depth_warping/camera/WarperPytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..294f1f5acabd23c345aa2fca19ed71942db8b1e0 --- /dev/null +++ b/finetune/modules/depth_warping/camera/WarperPytorch.py @@ -0,0 +1,416 @@ +# Shree KRISHNAya Namaha +# Differentiable warper implemented in PyTorch. Warping is done on batches. +# Tested on PyTorch 1.8.1 +# Author: Nagabhushan S N +# Last Modified: 27/09/2021 +# Code from https://github.com/NagabhushanSN95/Pose-Warping + +import datetime +import time +import traceback +from pathlib import Path +from typing import Tuple, Optional + +import numpy +# import skimage.io +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +# import Imath +# import OpenEXR + +import pdb + +class Warper: + def __init__(self, resolution: tuple = None): + self.resolution = resolution + + def forward_warp(self, frame1: torch.Tensor, mask1: Optional[torch.Tensor], depth1: torch.Tensor, + transformation1: torch.Tensor, transformation2: torch.Tensor, intrinsic1: torch.Tensor, + intrinsic2: Optional[torch.Tensor], is_image=True) -> \ + Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Given a frame1 and global transformations transformation1 and transformation2, warps frame1 to next view using + bilinear splatting. + All arrays should be torch tensors with batch dimension and channel first + :param frame1: (b, 3, h, w). If frame1 is not in the range [-1, 1], either set is_image=False when calling + bilinear_splatting on frame within this function, or modify clipping in bilinear_splatting() + method accordingly. + :param mask1: (b, 1, h, w) - 1 for known, 0 for unknown. 
Optional + :param depth1: (b, 1, h, w) + :param transformation1: (b, 4, 4) extrinsic transformation matrix of first view: [R, t; 0, 1] + :param transformation2: (b, 4, 4) extrinsic transformation matrix of second view: [R, t; 0, 1] + :param intrinsic1: (b, 3, 3) camera intrinsic matrix + :param intrinsic2: (b, 3, 3) camera intrinsic matrix. Optional + """ + self.device = frame1.device + + if self.resolution is not None: + assert frame1.shape[2:4] == self.resolution + b, c, h, w = frame1.shape + if mask1 is None: + mask1 = torch.ones(size=(b, 1, h, w)).to(frame1) + if intrinsic2 is None: + intrinsic2 = intrinsic1.clone() + + assert frame1.shape == (b, 3, h, w) or frame1.shape == (b, 2, h, w) # flow b2hw + assert mask1.shape == (b, 1, h, w) + assert depth1.shape == (b, 1, h, w) + assert transformation1.shape == (b, 4, 4) + assert transformation2.shape == (b, 4, 4) + assert intrinsic1.shape == (b, 3, 3) + assert intrinsic2.shape == (b, 3, 3) + + frame1 = frame1.to(self.device) + mask1 = mask1.to(self.device) + depth1 = depth1.to(self.device) + transformation1 = transformation1.to(self.device) + transformation2 = transformation2.to(self.device) + intrinsic1 = intrinsic1.to(self.device) + intrinsic2 = intrinsic2.to(self.device) + + trans_points1 = self.compute_transformed_points(depth1, transformation1, transformation2, intrinsic1, + intrinsic2) + # trans_coordinates = trans_points1[:, :, :2, 0] / trans_points1[:, :, 2:3, 0] + trans_coordinates = trans_points1[:, :, :, :2, 0] / (trans_points1[:, :, :, 2:3, 0]+1e-7) + trans_depth1 = rearrange(trans_points1[:, :, :, 2:3, 0], "b h w c -> b c h w") + + grid = self.create_grid(b, h, w).to(trans_coordinates) + flow12 = rearrange(trans_coordinates, "b h w c -> b c h w") - grid + + warped_frame2, mask2 = self.bilinear_splatting(frame1, mask1, trans_depth1, flow12, None, is_image=is_image) + warped_depth2 = self.bilinear_splatting(trans_depth1, mask1, trans_depth1, flow12, None, is_image=False)[0] # [0][:, :, 0] + + return warped_frame2, mask2, warped_depth2, flow12 + + def forward_warp_displacement(self, depth1: torch.Tensor, flow1: torch.Tensor, + transformation1: torch.Tensor, transformation2: torch.Tensor, intrinsic1: torch.Tensor, intrinsic2: Optional[torch.Tensor],): + """ + Given a frame1 and global transformations transformation1 and transformation2, warps frame1 to next view using + bilinear splatting. + All arrays should be torch tensors with batch dimension and channel first + :param depth1: (b, 1, h, w) + :param flow1: (b, 2, h, w) + :param transformation1: (b, 4, 4) extrinsic transformation matrix of first view: [R, t; 0, 1] + :param transformation2: (b, 4, 4) extrinsic transformation matrix of second view: [R, t; 0, 1] + :param intrinsic1: (b, 3, 3) camera intrinsic matrix + :param intrinsic2: (b, 3, 3) camera intrinsic matrix. 
Optional + """ + self.device = flow1.device + + if self.resolution is not None: + assert flow1.shape[2:4] == self.resolution + b, c, h, w = flow1.shape + if intrinsic2 is None: + intrinsic2 = intrinsic1.clone() + + assert flow1.shape == (b, 2, h, w) + assert depth1.shape == (b, 1, h, w) + assert transformation1.shape == (b, 4, 4) + assert transformation2.shape == (b, 4, 4) + assert intrinsic1.shape == (b, 3, 3) + assert intrinsic2.shape == (b, 3, 3) + + depth1 = depth1.to(self.device) + flow1 = flow1.to(self.device) + transformation1 = transformation1.to(self.device) + transformation2 = transformation2.to(self.device) + intrinsic1 = intrinsic1.to(self.device) + intrinsic2 = intrinsic2.to(self.device) + + trans_points1 = self.compute_transformed_points(depth1, transformation1, transformation2, intrinsic1, intrinsic2) + trans_coordinates1 = trans_points1[:, :, :, :2, 0] / (trans_points1[:, :, :, 2:3, 0]+1e-7) + + trans_points2 = self.compute_transformed_points(depth1, transformation1, transformation2, intrinsic1, intrinsic2, flow1) + trans_coordinates2 = trans_points2[:, :, :, :2, 0] / (trans_points2[:, :, :, 2:3, 0]+1e-7) + + flow12_displacement = rearrange(trans_coordinates2 - trans_coordinates1, "b h w c -> b c h w") + + return flow12_displacement + + def compute_transformed_points(self, depth1: torch.Tensor, transformation1: torch.Tensor, transformation2: torch.Tensor, + intrinsic1: torch.Tensor, intrinsic2: Optional[torch.Tensor], flow1: Optional[torch.Tensor]=None): + """ + Computes transformed position for each pixel location + """ + if self.resolution is not None: + assert depth1.shape[2:4] == self.resolution + b, _, h, w = depth1.shape + if intrinsic2 is None: + intrinsic2 = intrinsic1.clone() + transformation = torch.bmm(transformation2, torch.linalg.inv(transformation1)).to(transformation1.dtype) # (b, 4, 4) + + x1d = torch.arange(0, w)[None] + y1d = torch.arange(0, h)[:, None] + x2d = x1d.repeat([h, 1]).to(depth1) # (h, w) + y2d = y1d.repeat([1, w]).to(depth1) # (h, w) + + ones_2d = torch.ones(size=(h, w)).to(depth1) # (h, w) + ones_4d = ones_2d[None, :, :, None, None].repeat([b, 1, 1, 1, 1]) # (b, h, w, 1, 1) + + + if flow1 is not None: + x4d = repeat(x2d[None, :, :, None], '1 h w c -> b h w c', b=b) + y4d = repeat(y2d[None, :, :, None], '1 h w c -> b h w c', b=b) + flow1_x4d = rearrange(flow1[:,:1].detach().clone(), "b c h w -> b h w c") + flow1_y4d = rearrange(flow1[:,1:].detach().clone(), "b c h w -> b h w c") + + x4d = x4d + flow1_x4d + y4d = y4d + flow1_y4d + + pos_vectors_homo = torch.stack([x4d, y4d, ones_4d.squeeze(-1)], dim=3) # (b, h, w, 3, 1) + else: + pos_vectors_homo = torch.stack([x2d, y2d, ones_2d], dim=2)[None, :, :, :, None] # (1, h, w, 3, 1) + + intrinsic1_inv = torch.linalg.inv(intrinsic1) # (b, 3, 3) + intrinsic1_inv_4d = intrinsic1_inv[:, None, None] # (b, 1, 1, 3, 3) + intrinsic2_4d = intrinsic2[:, None, None] # (b, 1, 1, 3, 3) + depth_4d = depth1[:, 0][:, :, :, None, None] # (b, h, w, 1, 1) + trans_4d = transformation[:, None, None] # (b, 1, 1, 4, 4) + + unnormalized_pos = torch.matmul(intrinsic1_inv_4d, pos_vectors_homo).to(transformation1.dtype) # (b, h, w, 3, 1) + world_points = depth_4d * unnormalized_pos # (b, h, w, 3, 1) + world_points_homo = torch.cat([world_points, ones_4d], dim=3) # (b, h, w, 4, 1) + trans_world_homo = torch.matmul(trans_4d, world_points_homo).to(transformation1.dtype) # (b, h, w, 4, 1) + trans_world = trans_world_homo[:, :, :, :3] # (b, h, w, 3, 1) + trans_norm_points = torch.matmul(intrinsic2_4d, 
trans_world).to(transformation1.dtype) # (b, h, w, 3, 1) + return trans_norm_points + + def bilinear_splatting(self, frame1: torch.Tensor, mask1: Optional[torch.Tensor], depth1: torch.Tensor, + flow12: torch.Tensor, flow12_mask: Optional[torch.Tensor], is_image: bool = False) -> \ + Tuple[torch.Tensor, torch.Tensor]: + """ + Bilinear splatting + :param frame1: (b,c,h,w) + :param mask1: (b,1,h,w): 1 for known, 0 for unknown. Optional + :param depth1: (b,1,h,w) + :param flow12: (b,2,h,w) + :param flow12_mask: (b,1,h,w): 1 for valid flow, 0 for invalid flow. Optional + :param is_image: if true, output will be clipped to (-1,1) range + :return: warped_frame2: (b,c,h,w) + mask2: (b,1,h,w): 1 for known and 0 for unknown + """ + if self.resolution is not None: + assert frame1.shape[2:4] == self.resolution + b, c, h, w = frame1.shape + if mask1 is None: + mask1 = torch.ones(size=(b, 1, h, w)).to(frame1) + if flow12_mask is None: + flow12_mask = torch.ones(size=(b, 1, h, w)).to(flow12) + grid = self.create_grid(b, h, w).to(frame1) + trans_pos = flow12 + grid + + trans_pos_offset = trans_pos + 1 + trans_pos_floor = torch.floor(trans_pos_offset).long() + trans_pos_ceil = torch.ceil(trans_pos_offset).long() + trans_pos_offset = torch.stack([ + torch.clamp(trans_pos_offset[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_offset[:, 1], min=0, max=h + 1)], dim=1) + trans_pos_floor = torch.stack([ + torch.clamp(trans_pos_floor[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_floor[:, 1], min=0, max=h + 1)], dim=1) + trans_pos_ceil = torch.stack([ + torch.clamp(trans_pos_ceil[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_ceil[:, 1], min=0, max=h + 1)], dim=1) + + prox_weight_nw = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \ + (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1])) + prox_weight_sw = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \ + (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1])) + prox_weight_ne = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \ + (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1])) + prox_weight_se = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \ + (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1])) + + sat_depth1 = torch.clamp(depth1, min=0, max=1000) + log_depth1 = torch.log(1 + sat_depth1) + depth_weights = torch.exp(log_depth1 / log_depth1.max() * 50) + + weight_nw = torch.moveaxis(prox_weight_nw * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2]) + weight_sw = torch.moveaxis(prox_weight_sw * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2]) + weight_ne = torch.moveaxis(prox_weight_ne * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2]) + weight_se = torch.moveaxis(prox_weight_se * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2]) + + warped_frame = torch.zeros(size=(b, h + 2, w + 2, c), dtype=torch.float32).to(frame1) + warped_weights = torch.zeros(size=(b, h + 2, w + 2, 1), dtype=torch.float32).to(frame1) + + frame1_cl = torch.moveaxis(frame1, [0, 1, 2, 3], [0, 3, 1, 2]) + batch_indices = torch.arange(b)[:, None, None].to(frame1.device) + warped_frame.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_floor[:, 0]), + frame1_cl * weight_nw, accumulate=True) + warped_frame.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]), + frame1_cl * weight_sw, accumulate=True) + warped_frame.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]), + frame1_cl * weight_ne, 
accumulate=True) + warped_frame.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]), + frame1_cl * weight_se, accumulate=True) + + warped_weights.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_floor[:, 0]), + weight_nw, accumulate=True) + warped_weights.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]), + weight_sw, accumulate=True) + warped_weights.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]), + weight_ne, accumulate=True) + warped_weights.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]), + weight_se, accumulate=True) + + warped_frame_cf = torch.moveaxis(warped_frame, [0, 1, 2, 3], [0, 2, 3, 1]) + warped_weights_cf = torch.moveaxis(warped_weights, [0, 1, 2, 3], [0, 2, 3, 1]) + cropped_warped_frame = warped_frame_cf[:, :, 1:-1, 1:-1] + cropped_weights = warped_weights_cf[:, :, 1:-1, 1:-1] + + mask = cropped_weights > 0 + zero_value = -1 if is_image else 0 + zero_tensor = torch.tensor(zero_value, dtype=frame1.dtype, device=frame1.device) + warped_frame2 = torch.where(mask, cropped_warped_frame / cropped_weights, zero_tensor) + mask2 = mask.to(frame1) + + if is_image: + assert warped_frame2.min() >= -1.1 # Allow for rounding errors + assert warped_frame2.max() <= 1.1 + warped_frame2 = torch.clamp(warped_frame2, min=-1, max=1) + return warped_frame2, mask2 + + def bilinear_interpolation(self, frame2: torch.Tensor, mask2: Optional[torch.Tensor], flow12: torch.Tensor, + flow12_mask: Optional[torch.Tensor], is_image: bool = False) -> \ + Tuple[torch.Tensor, torch.Tensor]: + """ + Bilinear interpolation + :param frame2: (b, c, h, w) + :param mask2: (b, 1, h, w): 1 for known, 0 for unknown. Optional + :param flow12: (b, 2, h, w) + :param flow12_mask: (b, 1, h, w): 1 for valid flow, 0 for invalid flow. 
Optional + :param is_image: if true, output will be clipped to (-1,1) range + :return: warped_frame1: (b, c, h, w) + mask1: (b, 1, h, w): 1 for known and 0 for unknown + """ + if self.resolution is not None: + assert frame2.shape[2:4] == self.resolution + b, c, h, w = frame2.shape + if mask2 is None: + mask2 = torch.ones(size=(b, 1, h, w)).to(frame2) + if flow12_mask is None: + flow12_mask = torch.ones(size=(b, 1, h, w)).to(flow12) + grid = self.create_grid(b, h, w).to(frame2) + trans_pos = flow12 + grid + + trans_pos_offset = trans_pos + 1 + trans_pos_floor = torch.floor(trans_pos_offset).long() + trans_pos_ceil = torch.ceil(trans_pos_offset).long() + trans_pos_offset = torch.stack([ + torch.clamp(trans_pos_offset[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_offset[:, 1], min=0, max=h + 1)], dim=1) + trans_pos_floor = torch.stack([ + torch.clamp(trans_pos_floor[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_floor[:, 1], min=0, max=h + 1)], dim=1) + trans_pos_ceil = torch.stack([ + torch.clamp(trans_pos_ceil[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_ceil[:, 1], min=0, max=h + 1)], dim=1) + + prox_weight_nw = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \ + (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1])) + prox_weight_sw = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \ + (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1])) + prox_weight_ne = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \ + (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1])) + prox_weight_se = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \ + (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1])) + + weight_nw = torch.moveaxis(prox_weight_nw * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2]) + weight_sw = torch.moveaxis(prox_weight_sw * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2]) + weight_ne = torch.moveaxis(prox_weight_ne * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2]) + weight_se = torch.moveaxis(prox_weight_se * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2]) + + frame2_offset = F.pad(frame2, [1, 1, 1, 1]) + mask2_offset = F.pad(mask2, [1, 1, 1, 1]) + bi = torch.arange(b)[:, None, None] + + f2_nw = frame2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_floor[:, 0]] + f2_sw = frame2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]] + f2_ne = frame2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]] + f2_se = frame2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]] + + m2_nw = mask2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_floor[:, 0]] + m2_sw = mask2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]] + m2_ne = mask2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]] + m2_se = mask2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]] + + nr = weight_nw * f2_nw * m2_nw + weight_sw * f2_sw * m2_sw + \ + weight_ne * f2_ne * m2_ne + weight_se * f2_se * m2_se + dr = weight_nw * m2_nw + weight_sw * m2_sw + weight_ne * m2_ne + weight_se * m2_se + + zero_value = -1 if is_image else 0 + zero_tensor = torch.tensor(zero_value, dtype=nr.dtype, device=nr.device) + warped_frame1 = torch.where(dr > 0, nr / dr, zero_tensor) + mask1 = (dr > 0).to(frame2) + + # Convert to channel first + warped_frame1 = torch.moveaxis(warped_frame1, [0, 1, 2, 3], [0, 2, 3, 1]) + mask1 = torch.moveaxis(mask1, [0, 1, 2, 3], [0, 2, 3, 1]) + + if is_image: + assert warped_frame1.min() >= -1.1 # Allow for rounding errors + assert warped_frame1.max() <= 1.1 + warped_frame1 = torch.clamp(warped_frame1, min=-1, max=1) + return 
warped_frame1, mask1 + + @staticmethod + def create_grid(b, h, w): + x_1d = torch.arange(0, w)[None] + y_1d = torch.arange(0, h)[:, None] + x_2d = x_1d.repeat([h, 1]) + y_2d = y_1d.repeat([1, w]) + grid = torch.stack([x_2d, y_2d], dim=0) + batch_grid = grid[None].repeat([b, 1, 1, 1]) + return batch_grid + + # @staticmethod + # def read_image(path: Path) -> torch.Tensor: + # image = skimage.io.imread(path.as_posix()) + # return image + + # @staticmethod + # def read_depth(path: Path) -> torch.Tensor: + # if path.suffix == '.png': + # depth = skimage.io.imread(path.as_posix()) + # elif path.suffix == '.npy': + # depth = numpy.load(path.as_posix()) + # elif path.suffix == '.npz': + # with numpy.load(path.as_posix()) as depth_data: + # depth = depth_data['depth'] + # elif path.suffix == '.exr': + # exr_file = OpenEXR.InputFile(path.as_posix()) + # raw_bytes = exr_file.channel('B', Imath.PixelType(Imath.PixelType.FLOAT)) + # depth_vector = numpy.frombuffer(raw_bytes, dtype=numpy.float32) + # height = exr_file.header()['displayWindow'].max.y + 1 - exr_file.header()['displayWindow'].min.y + # width = exr_file.header()['displayWindow'].max.x + 1 - exr_file.header()['displayWindow'].min.x + # depth = numpy.reshape(depth_vector, (height, width)) + # else: + # raise RuntimeError(f'Unknown depth format: {path.suffix}') + # return depth + + # @staticmethod + # def camera_intrinsic_transform(capture_width=1920, capture_height=1080, patch_start_point: tuple = (0, 0)): + # start_y, start_x = patch_start_point + # camera_intrinsics = numpy.eye(4) + # camera_intrinsics[0, 0] = 2100 + # camera_intrinsics[0, 2] = capture_width / 2.0 - start_x + # camera_intrinsics[1, 1] = 2100 + # camera_intrinsics[1, 2] = capture_height / 2.0 - start_y + # return camera_intrinsics + + # @staticmethod + # def get_device(device: str): + # """ + # Returns torch device object + # :param device: cpu/gpu0/gpu1 + # :return: + # """ + # if device == 'cpu': + # device = torch.device('cpu') + # elif device.startswith('gpu') and torch.cuda.is_available(): + # gpu_num = int(device[3:]) + # device = torch.device(f'cuda:{gpu_num}') + # else: + # device = torch.device('cpu') + # return device \ No newline at end of file diff --git a/finetune/modules/depth_warping/depth_anything_v2/depth_anything_wrapper.py b/finetune/modules/depth_warping/depth_anything_v2/depth_anything_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..bec044693eea14ea6ef924c5c4c298bb3a942105 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/depth_anything_wrapper.py @@ -0,0 +1,12 @@ +import os +from cameractrl.modules.depth_anything_v2.dpt import DepthAnythingV2 + +class MVSplat_wrapper(nn.Module): + def __init__( + self, + model_configs, + ckpt_path, + ): + super().__init__() + + depth_anything = DepthAnythingV2(model_configs) \ No newline at end of file diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..ec4499a18330523aa3564b16be70e813de000c94 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2.py @@ -0,0 +1,415 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
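+#
+# DINOv2 vision transformer backbone (DinoVisionTransformer); used in this repo as the
+# image encoder for the Depth Anything V2 depth head (see depth_anything_v2/dpt.py).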
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = 
nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + if self.register_tokens is not None: + nn.init.normal_(self.register_tokens, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0 + w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset + # w0, h0 = w0 + 0.1, h0 + 0.1 + + sqrt_N = math.sqrt(N) + sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2), + scale_factor=(sx, sy), + # (int(w0), int(h0)), # to solve the upsampling shape issue + mode="bicubic", + antialias=self.interpolate_antialias + ) + + assert int(w0) == patch_pos_embed.shape[-2] + assert int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + 
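+        # prepend the class token, then add (resolution-interpolated) positional embeddings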
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + if self.register_tokens is not None: + x = torch.cat( + ( + x[:, :1], + self.register_tokens.expand(x.shape[0], -1, -1), + x[:, 1:], + ), + dim=1, + ) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. 
If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_base(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_large(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def DINOv2(model_name): + model_zoo = { + "vits": vit_small, + "vitb": vit_base, + "vitl": vit_large, + "vitg": vit_giant2 + } + + return model_zoo[model_name]( + img_size=518, + patch_size=14, + init_values=1.0, + ffn_layer="mlp" if model_name != "vitg" else "swiglufused", + block_chunks=0, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1 + ) \ No newline at end of file diff --git 
a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/__init__.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8120f4bc83066cb3f825ce32daa3b437f88486f1 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock +from .attention import MemEffAttention diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/attention.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..815a2bf53dbec496f6a184ed7d03bcecb7124262 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/attention.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import memory_efficient_attention, unbind, fmha + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + assert attn_bias is None, "xFormers is required for nested tensors usage" + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + \ No newline at end of file diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/block.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/block.py new file mode 100644 index 
0000000000000000000000000000000000000000..25488f57cc0ad3c692f86b62555f6668e2a66db1 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/block.py @@ -0,0 +1,252 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +from typing import Callable, List, Any, Tuple, Dict + +import torch +from torch import nn, Tensor + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import fmha + from xformers.ops import scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Block(nn.Module): + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + 
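+    # keep a random subset of samples; the residual added back is rescaled by
+    # b / subset_size so its expectation matches the full-batch residual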
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor + ) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors): + outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: 
Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) + + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage" + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/drop_path.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/drop_path.py new file mode 100644 index 0000000000000000000000000000000000000000..af05625984dd14682cc96a63bf0c97bab1f123b1 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/drop_path.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/layer_scale.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/layer_scale.py new file mode 100644 index 0000000000000000000000000000000000000000..ca5daa52bd81d3581adeb2198ea5b7dba2a3aea1 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/layer_scale.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
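+#
+# LayerScale: multiplies the output of a residual branch by a learnable per-channel
+# gamma (initialized to a small value), following CaiT ("Going Deeper with Image Transformers").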
+ +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/mlp.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4b315f972f9a9f54aef1e4ef4e81b52976f018 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/mlp.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/patch_embed.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..574abe41175568d700a389b8b96d1ba554914779 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/patch_embed.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
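+        flatten_embedding: If True (default), return tokens as (B, N, D); otherwise keep the (B, H', W', D) spatial grid.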
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/swiglu_ffn.py b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/swiglu_ffn.py new file mode 100644 index 0000000000000000000000000000000000000000..b3324b266fb0a50ccf8c3a0ede2ae10ac4dfa03e --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/swiglu_ffn.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
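+#
+# SwiGLU feed-forward: w12 projects to 2*hidden features, the two halves are gated as
+# silu(x1) * x2, and w3 projects back to the output dimension.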
+ +from typing import Callable, Optional + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +try: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/finetune/modules/depth_warping/depth_anything_v2/dpt.py b/finetune/modules/depth_warping/depth_anything_v2/dpt.py new file mode 100644 index 0000000000000000000000000000000000000000..ccbd8cf7554ace0a3c6572249dc987471bf1b278 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/dpt.py @@ -0,0 +1,235 @@ +import cv2 +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.transforms as tf +from torchvision.transforms import Compose + +from .dinov2 import DINOv2 +from .util.blocks import FeatureFusionBlock, _make_scratch +from .util.transform import Resize, NormalizeImage, PrepareForNet + + +def _make_fusion_block(features, use_bn, size=None): + return FeatureFusionBlock( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + size=size, + ) + + +class ConvBlock(nn.Module): + def __init__(self, in_feature, out_feature): + super().__init__() + + self.conv_block = nn.Sequential( + nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(out_feature), + nn.ReLU(True) + ) + + def forward(self, x): + return self.conv_block(x) + + +class DPTHead(nn.Module): + def __init__( + self, + in_channels, + features=256, + use_bn=False, + out_channels=[256, 512, 1024, 1024], + use_clstoken=False + ): + super(DPTHead, self).__init__() + + self.use_clstoken = use_clstoken + + self.projects = nn.ModuleList([ + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + stride=1, + padding=0, + ) for out_channel in out_channels + ]) + + self.resize_layers = nn.ModuleList([ + nn.ConvTranspose2d( + in_channels=out_channels[0], + out_channels=out_channels[0], + kernel_size=4, + stride=4, + padding=0), + nn.ConvTranspose2d( + in_channels=out_channels[1], + out_channels=out_channels[1], + kernel_size=2, + stride=2, + padding=0), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], + out_channels=out_channels[3], + kernel_size=3, + stride=2, + padding=1) + ]) + + if use_clstoken: + self.readout_projects = nn.ModuleList() + for _ 
in range(len(self.projects)): + self.readout_projects.append( + nn.Sequential( + nn.Linear(2 * in_channels, in_channels), + nn.GELU())) + + self.scratch = _make_scratch( + out_channels, + features, + groups=1, + expand=False, + ) + + self.scratch.stem_transpose = None + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + head_features_1 = features + head_features_2 = 32 + + self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1) + self.scratch.output_conv2 = nn.Sequential( + nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), + nn.Sigmoid() + ) + + def forward(self, out_features, patch_h, patch_w): + out = [] + for i, x in enumerate(out_features): + if self.use_clstoken: + x, cls_token = x[0], x[1] + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + else: + x = x[0] + + x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w)) + + x = self.projects[i](x) + x = self.resize_layers[i](x) + + out.append(x) + + layer_1, layer_2, layer_3, layer_4 = out + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv1(path_1) + out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True) + out = self.scratch.output_conv2(out) + + return out + + +class DepthAnythingV2(nn.Module): + def __init__( + self, + encoder='vitl', + features=256, + out_channels=[256, 512, 1024, 1024], + use_bn=False, + use_clstoken=False, + max_depth=20.0 + ): + super(DepthAnythingV2, self).__init__() + + self.intermediate_layer_idx = { + 'vits': [2, 5, 8, 11], + 'vitb': [2, 5, 8, 11], + 'vitl': [4, 11, 17, 23], + 'vitg': [9, 19, 29, 39] + } + + self.max_depth = max_depth + + self.encoder = encoder + self.pretrained = DINOv2(model_name=encoder) + + self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken) + + def forward(self, x): + patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14 + + features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True) + + depth = self.depth_head(features, patch_h, patch_w) * self.max_depth + + return depth.squeeze(1) + + @torch.no_grad() + def infer_image(self, raw_image, input_size=518): + image, (h, w) = self.image2tensor(raw_image, input_size) + + depth = self.forward(image) + + depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0] + + return depth + # return depth.cpu().numpy() + + + # TODO. transform for torch.Tensor + # TODO. 
inference for torch.Tensor + # def image2tensor_pt(self, raw_image, input_size=518): + # transform = Compose([ + # tf + # ]) + + + def image2tensor(self, raw_image, input_size=518): + transform = Compose([ + Resize( + width=input_size, + height=input_size, + resize_target=False, + keep_aspect_ratio=True, + ensure_multiple_of=14, + resize_method='lower_bound', + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + ]) + + h, w = raw_image.shape[:2] + + # raw_image already has RGB order, [0,255] + # image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0 + image = raw_image / 255.0 + + image = transform({'image': image})['image'] + image = torch.from_numpy(image).unsqueeze(0) + + DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu' + image = image.to(DEVICE) + + return image, (h, w) diff --git a/finetune/modules/depth_warping/depth_anything_v2/util/blocks.py b/finetune/modules/depth_warping/depth_anything_v2/util/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..382ea183a40264056142afffc201c992a2b01d37 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/util/blocks.py @@ -0,0 +1,148 @@ +import torch.nn as nn + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + if len(in_shape) >= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + if len(in_shape) >= 4: + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups) + + return scratch + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups=1 + + self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + + self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + + if self.bn == True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn == True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn == True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + size=None + ): + """Init. 
+ + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups=1 + + self.expand = expand + out_features = features + if self.expand == True: + out_features = features // 2 + + self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) + + self.resConfUnit1 = ResidualConvUnit(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + self.size=size + + def forward(self, *xs, size=None): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + + output = self.resConfUnit2(output) + + if (size is None) and (self.size is None): + modifier = {"scale_factor": 2} + elif size is None: + modifier = {"size": self.size} + else: + modifier = {"size": size} + + output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners) + + output = self.out_conv(output) + + return output diff --git a/finetune/modules/depth_warping/depth_anything_v2/util/transform.py b/finetune/modules/depth_warping/depth_anything_v2/util/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..b14aacd44ea086b01725a9ca68bb49eadcf37d73 --- /dev/null +++ b/finetune/modules/depth_warping/depth_anything_v2/util/transform.py @@ -0,0 +1,158 @@ +import numpy as np +import cv2 + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. + + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". 
+ """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height) + new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height) + new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, sample): + width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0]) + + # resize sample + sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method) + + if self.__resize_target: + if "depth" in sample: + sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST) + + if "mask" in sample: + sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST) + + return sample + + +class NormalizeImage(object): + """Normlize image by given mean and std. + """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. 
+ """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + return sample \ No newline at end of file diff --git a/finetune/modules/depth_warping/depth_pro/__init__.py b/finetune/modules/depth_warping/depth_pro/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52080b686a64851c4bb62003884fbdeb55dced9a --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/__init__.py @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. +"""Depth Pro package.""" + +from .depth_pro import create_model_and_transforms # noqa +from .utils import load_rgb # noqa diff --git a/finetune/modules/depth_warping/depth_pro/cli/__init__.py b/finetune/modules/depth_warping/depth_pro/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..54ac5722c5db5e9a6846f12fea9efc00f3e385e5 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/cli/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. +"""Depth Pro CLI and tools.""" + +from .run import main as run_main # noqa diff --git a/finetune/modules/depth_warping/depth_pro/cli/run.py b/finetune/modules/depth_warping/depth_pro/cli/run.py new file mode 100644 index 0000000000000000000000000000000000000000..3545a99993810b7d602f63c057640645b215f2b2 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/cli/run.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +"""Sample script to run DepthPro. + +Copyright (C) 2024 Apple Inc. All Rights Reserved. +""" + + +import argparse +import logging +from pathlib import Path + +import numpy as np +import PIL.Image +import torch +from matplotlib import pyplot as plt +from tqdm import tqdm + +from depth_pro import create_model_and_transforms, load_rgb + +LOGGER = logging.getLogger(__name__) + + +def get_torch_device() -> torch.device: + """Get the Torch device.""" + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda:0") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + return device + + +def run(args): + """Run Depth Pro on a sample image.""" + if args.verbose: + logging.basicConfig(level=logging.INFO) + + # Load model. + model, transform = create_model_and_transforms( + device=get_torch_device(), + precision=torch.half, + ) + model.eval() + + image_paths = [args.image_path] + if args.image_path.is_dir(): + image_paths = args.image_path.glob("**/*") + relative_path = args.image_path + else: + relative_path = args.image_path.parent + + if not args.skip_display: + plt.ion() + fig = plt.figure() + ax_rgb = fig.add_subplot(121) + ax_disp = fig.add_subplot(122) + + for image_path in tqdm(image_paths): + # Load image and focal length from exif info (if found.). + try: + LOGGER.info(f"Loading image {image_path} ...") + image, _, f_px = load_rgb(image_path) + except Exception as e: + LOGGER.error(str(e)) + continue + # Run prediction. If `f_px` is provided, it is used to estimate the final metric depth, + # otherwise the model estimates `f_px` to compute the depth metricness. + prediction = model.infer(transform(image), f_px=f_px) + + # Extract the depth and focal length. 
+ depth = prediction["depth"].detach().cpu().numpy().squeeze() + if f_px is not None: + LOGGER.debug(f"Focal length (from exif): {f_px:0.2f}") + elif prediction["focallength_px"] is not None: + focallength_px = prediction["focallength_px"].detach().cpu().item() + LOGGER.info(f"Estimated focal length: {focallength_px}") + + inverse_depth = 1 / depth + # Visualize inverse depth instead of depth, clipped to [0.1m;250m] range for better visualization. + max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1) + min_invdepth_vizu = max(1 / 250, inverse_depth.min()) + inverse_depth_normalized = (inverse_depth - min_invdepth_vizu) / ( + max_invdepth_vizu - min_invdepth_vizu + ) + + # Save Depth as npz file. + if args.output_path is not None: + output_file = ( + args.output_path + / image_path.relative_to(relative_path).parent + / image_path.stem + ) + LOGGER.info(f"Saving depth map to: {str(output_file)}") + output_file.parent.mkdir(parents=True, exist_ok=True) + np.savez_compressed(output_file, depth=depth) + + # Save as color-mapped "turbo" jpg image. + cmap = plt.get_cmap("turbo") + color_depth = (cmap(inverse_depth_normalized)[..., :3] * 255).astype( + np.uint8 + ) + color_map_output_file = str(output_file) + ".jpg" + LOGGER.info(f"Saving color-mapped depth to: : {color_map_output_file}") + PIL.Image.fromarray(color_depth).save( + color_map_output_file, format="JPEG", quality=90 + ) + + # Display the image and estimated depth map. + if not args.skip_display: + ax_rgb.imshow(image) + ax_disp.imshow(inverse_depth_normalized, cmap="turbo") + fig.canvas.draw() + fig.canvas.flush_events() + + LOGGER.info("Done predicting depth!") + if not args.skip_display: + plt.show(block=True) + + +def main(): + """Run DepthPro inference example.""" + parser = argparse.ArgumentParser( + description="Inference scripts of DepthPro with PyTorch models." + ) + parser.add_argument( + "-i", + "--image-path", + type=Path, + default="./data/example.jpg", + help="Path to input image.", + ) + parser.add_argument( + "-o", + "--output-path", + type=Path, + help="Path to store output files.", + ) + parser.add_argument( + "--skip-display", + action="store_true", + help="Skip matplotlib display.", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Show verbose output." + ) + + run(parser.parse_args()) + + +if __name__ == "__main__": + main() diff --git a/finetune/modules/depth_warping/depth_pro/depth_pro.py b/finetune/modules/depth_warping/depth_pro/depth_pro.py new file mode 100644 index 0000000000000000000000000000000000000000..f31b4e16178c5e29a3ffcd1a2366fe585bc9c370 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/depth_pro.py @@ -0,0 +1,298 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. 
+# Depth Pro: Sharp Monocular Metric Depth in Less Than a Second + + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Mapping, Optional, Tuple, Union + +import torch +from torch import nn +from torchvision.transforms import ( + Compose, + ConvertImageDtype, + Lambda, + Normalize, + ToTensor, +) + +from .network.decoder import MultiresConvDecoder +from .network.encoder import DepthProEncoder +from .network.fov import FOVNetwork +from .network.vit_factory import VIT_CONFIG_DICT, ViTPreset, create_vit + + +@dataclass +class DepthProConfig: + """Configuration for DepthPro.""" + + patch_encoder_preset: ViTPreset + image_encoder_preset: ViTPreset + decoder_features: int + + checkpoint_uri: Optional[str] = None + fov_encoder_preset: Optional[ViTPreset] = None + use_fov_head: bool = True + + +DEFAULT_MONODEPTH_CONFIG_DICT = DepthProConfig( + patch_encoder_preset="dinov2l16_384", + image_encoder_preset="dinov2l16_384", + checkpoint_uri="./checkpoints/depth_pro.pt", + decoder_features=256, + use_fov_head=True, + fov_encoder_preset="dinov2l16_384", +) + + +def create_backbone_model( + preset: ViTPreset +) -> Tuple[nn.Module, ViTPreset]: + """Create and load a backbone model given a config. + + Args: + ---- + preset: A backbone preset to load pre-defind configs. + + Returns: + ------- + A Torch module and the associated config. + + """ + if preset in VIT_CONFIG_DICT: + config = VIT_CONFIG_DICT[preset] + model = create_vit(preset=preset, use_pretrained=False) + else: + raise KeyError(f"Preset {preset} not found.") + + return model, config + + +def create_model_and_transforms( + config: DepthProConfig = DEFAULT_MONODEPTH_CONFIG_DICT, + device: torch.device = torch.device("cpu"), + precision: torch.dtype = torch.float32, +) -> Tuple[DepthPro, Compose]: + """Create a DepthPro model and load weights from `config.checkpoint_uri`. + + Args: + ---- + config: The configuration for the DPT model architecture. + device: The optional Torch device to load the model onto, default runs on "cpu". + precision: The optional precision used for the model, default is FP32. + + Returns: + ------- + The Torch DepthPro model and associated Transform. 
+ + """ + patch_encoder, patch_encoder_config = create_backbone_model( + preset=config.patch_encoder_preset + ) + image_encoder, _ = create_backbone_model( + preset=config.image_encoder_preset + ) + + fov_encoder = None + if config.use_fov_head and config.fov_encoder_preset is not None: + fov_encoder, _ = create_backbone_model(preset=config.fov_encoder_preset) + + dims_encoder = patch_encoder_config.encoder_feature_dims + hook_block_ids = patch_encoder_config.encoder_feature_layer_ids + encoder = DepthProEncoder( + dims_encoder=dims_encoder, + patch_encoder=patch_encoder, + image_encoder=image_encoder, + hook_block_ids=hook_block_ids, + decoder_features=config.decoder_features, + ) + decoder = MultiresConvDecoder( + dims_encoder=[config.decoder_features] + list(encoder.dims_encoder), + dim_decoder=config.decoder_features, + ) + model = DepthPro( + encoder=encoder, + decoder=decoder, + last_dims=(32, 1), + use_fov_head=config.use_fov_head, + fov_encoder=fov_encoder, + ).to(device) + + if precision == torch.half: + model.half() + + transform = Compose( + [ + ToTensor(), + Lambda(lambda x: x.to(device)), + Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]), + ConvertImageDtype(precision), + ] + ) + + if config.checkpoint_uri is not None: + state_dict = torch.load(config.checkpoint_uri, map_location="cpu") + missing_keys, unexpected_keys = model.load_state_dict( + state_dict=state_dict, strict=True + ) + + if len(unexpected_keys) != 0: + raise KeyError( + f"Found unexpected keys when loading monodepth: {unexpected_keys}" + ) + + # fc_norm is only for the classification head, + # which we would not use. We only use the encoding. + missing_keys = [key for key in missing_keys if "fc_norm" not in key] + if len(missing_keys) != 0: + raise KeyError(f"Keys are missing when loading monodepth: {missing_keys}") + + return model, transform + + +class DepthPro(nn.Module): + """DepthPro network.""" + + def __init__( + self, + encoder: DepthProEncoder, + decoder: MultiresConvDecoder, + last_dims: tuple[int, int], + use_fov_head: bool = True, + fov_encoder: Optional[nn.Module] = None, + ): + """Initialize DepthPro. + + Args: + ---- + encoder: The DepthProEncoder backbone. + decoder: The MultiresConvDecoder decoder. + last_dims: The dimension for the last convolution layers. + use_fov_head: Whether to use the field-of-view head. + fov_encoder: A separate encoder for the field of view. + + """ + super().__init__() + + self.encoder = encoder + self.decoder = decoder + + dim_decoder = decoder.dim_decoder + self.head = nn.Sequential( + nn.Conv2d( + dim_decoder, dim_decoder // 2, kernel_size=3, stride=1, padding=1 + ), + nn.ConvTranspose2d( + in_channels=dim_decoder // 2, + out_channels=dim_decoder // 2, + kernel_size=2, + stride=2, + padding=0, + bias=True, + ), + nn.Conv2d( + dim_decoder // 2, + last_dims[0], + kernel_size=3, + stride=1, + padding=1, + ), + nn.ReLU(True), + nn.Conv2d(last_dims[0], last_dims[1], kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ) + + # Set the final convolution layer's bias to be 0. + self.head[4].bias.data.fill_(0) + + # Set the FOV estimation head. + if use_fov_head: + self.fov = FOVNetwork(num_features=dim_decoder, fov_encoder=fov_encoder) + + @property + def img_size(self) -> int: + """Return the internal image size of the network.""" + return self.encoder.img_size + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Decode by projection and fusion of multi-resolution encodings. + + Args: + ---- + x (torch.Tensor): Input image. 
+ + Returns: + ------- + The canonical inverse depth map [m] and the optional estimated field of view [deg]. + + """ + _, _, H, W = x.shape + assert H == self.img_size and W == self.img_size + + encodings = self.encoder(x) + features, features_0 = self.decoder(encodings) + canonical_inverse_depth = self.head(features) + + fov_deg = None + if hasattr(self, "fov"): + fov_deg = self.fov.forward(x, features_0.detach()) + + return canonical_inverse_depth, fov_deg + + @torch.no_grad() + def infer( + self, + x: torch.Tensor, + f_px: Optional[Union[float, torch.Tensor]] = None, + interpolation_mode="bilinear", + ) -> Mapping[str, torch.Tensor]: + """Infer depth and fov for a given image. + + If the image is not at network resolution, it is resized to 1536x1536 and + the estimated depth is resized to the original image resolution. + Note: if the focal length is given, the estimated value is ignored and the provided + focal length is use to generate the metric depth values. + + Args: + ---- + x (torch.Tensor): Input image + f_px (torch.Tensor): Optional focal length in pixels corresponding to `x`. + interpolation_mode (str): Interpolation function for downsampling/upsampling. + + Returns: + ------- + Tensor dictionary (torch.Tensor): depth [m], focallength [pixels]. + + """ + if len(x.shape) == 3: + x = x.unsqueeze(0) + _, _, H, W = x.shape + resize = H != self.img_size or W != self.img_size + + if resize: + x = nn.functional.interpolate( + x, + size=(self.img_size, self.img_size), + mode=interpolation_mode, + align_corners=False, + ) + + canonical_inverse_depth, fov_deg = self.forward(x) + if f_px is None: + f_px = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_deg.to(torch.float))) + + inverse_depth = canonical_inverse_depth * (W / f_px) + f_px = f_px.squeeze() + + if resize: + inverse_depth = nn.functional.interpolate( + inverse_depth, size=(H, W), mode=interpolation_mode, align_corners=False + ) + + depth = 1.0 / torch.clamp(inverse_depth, min=1e-4, max=1e4) + + return { + "depth": depth.squeeze(), + "focallength_px": f_px, + } diff --git a/finetune/modules/depth_warping/depth_pro/eval/boundary_metrics.py b/finetune/modules/depth_warping/depth_pro/eval/boundary_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..d7650dbb60b990ed66b4444a1bbb7f7eaaed1390 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/eval/boundary_metrics.py @@ -0,0 +1,332 @@ +from typing import List, Tuple + +import numpy as np + + +def connected_component(r: np.ndarray, c: np.ndarray) -> List[List[int]]: + """Find connected components in the given row and column indices. + + Args: + ---- + r (np.ndarray): Row indices. + c (np.ndarray): Column indices. + + Yields: + ------ + List[int]: Indices of connected components. + + """ + indices = [0] + for i in range(1, r.size): + if r[i] == r[indices[-1]] and c[i] == c[indices[-1]] + 1: + indices.append(i) + else: + yield indices + indices = [i] + yield indices + + +def nms_horizontal(ratio: np.ndarray, threshold: float) -> np.ndarray: + """Apply Non-Maximum Suppression (NMS) horizontally on the given ratio matrix. + + Args: + ---- + ratio (np.ndarray): Input ratio matrix. + threshold (float): Threshold for NMS. + + Returns: + ------- + np.ndarray: Binary mask after applying NMS. 
+ + """ + mask = np.zeros_like(ratio, dtype=bool) + r, c = np.nonzero(ratio > threshold) + if len(r) == 0: + return mask + for ids in connected_component(r, c): + values = [ratio[r[i], c[i]] for i in ids] + mi = np.argmax(values) + mask[r[ids[mi]], c[ids[mi]]] = True + return mask + + +def nms_vertical(ratio: np.ndarray, threshold: float) -> np.ndarray: + """Apply Non-Maximum Suppression (NMS) vertically on the given ratio matrix. + + Args: + ---- + ratio (np.ndarray): Input ratio matrix. + threshold (float): Threshold for NMS. + + Returns: + ------- + np.ndarray: Binary mask after applying NMS. + + """ + return np.transpose(nms_horizontal(np.transpose(ratio), threshold)) + + +def fgbg_depth( + d: np.ndarray, t: float +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Find foreground-background relations between neighboring pixels. + + Args: + ---- + d (np.ndarray): Depth matrix. + t (float): Threshold for comparison. + + Returns: + ------- + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating + left, top, right, and bottom foreground-background relations. + + """ + right_is_big_enough = (d[..., :, 1:] / d[..., :, :-1]) > t + left_is_big_enough = (d[..., :, :-1] / d[..., :, 1:]) > t + bottom_is_big_enough = (d[..., 1:, :] / d[..., :-1, :]) > t + top_is_big_enough = (d[..., :-1, :] / d[..., 1:, :]) > t + return ( + left_is_big_enough, + top_is_big_enough, + right_is_big_enough, + bottom_is_big_enough, + ) + + +def fgbg_depth_thinned( + d: np.ndarray, t: float +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Find foreground-background relations between neighboring pixels with Non-Maximum Suppression. + + Args: + ---- + d (np.ndarray): Depth matrix. + t (float): Threshold for NMS. + + Returns: + ------- + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating + left, top, right, and bottom foreground-background relations with NMS applied. + + """ + right_is_big_enough = nms_horizontal(d[..., :, 1:] / d[..., :, :-1], t) + left_is_big_enough = nms_horizontal(d[..., :, :-1] / d[..., :, 1:], t) + bottom_is_big_enough = nms_vertical(d[..., 1:, :] / d[..., :-1, :], t) + top_is_big_enough = nms_vertical(d[..., :-1, :] / d[..., 1:, :], t) + return ( + left_is_big_enough, + top_is_big_enough, + right_is_big_enough, + bottom_is_big_enough, + ) + + +def fgbg_binary_mask( + d: np.ndarray, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Find foreground-background relations between neighboring pixels in binary masks. + + Args: + ---- + d (np.ndarray): Binary depth matrix. + + Returns: + ------- + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating + left, top, right, and bottom foreground-background relations in binary masks. + + """ + assert d.dtype == bool + right_is_big_enough = d[..., :, 1:] & ~d[..., :, :-1] + left_is_big_enough = d[..., :, :-1] & ~d[..., :, 1:] + bottom_is_big_enough = d[..., 1:, :] & ~d[..., :-1, :] + top_is_big_enough = d[..., :-1, :] & ~d[..., 1:, :] + return ( + left_is_big_enough, + top_is_big_enough, + right_is_big_enough, + bottom_is_big_enough, + ) + + +def edge_recall_matting(pr: np.ndarray, gt: np.ndarray, t: float) -> float: + """Calculate edge recall for image matting. + + Args: + ---- + pr (np.ndarray): Predicted depth matrix. + gt (np.ndarray): Ground truth binary mask. + t (float): Threshold for NMS. + + Returns: + ------- + float: Edge recall value. 
+ + """ + assert gt.dtype == bool + ap, bp, cp, dp = fgbg_depth_thinned(pr, t) + ag, bg, cg, dg = fgbg_binary_mask(gt) + return 0.25 * ( + np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1) + + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1) + + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1) + + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1) + ) + + +def boundary_f1( + pr: np.ndarray, + gt: np.ndarray, + t: float, + return_p: bool = False, + return_r: bool = False, +) -> float: + """Calculate Boundary F1 score. + + Args: + ---- + pr (np.ndarray): Predicted depth matrix. + gt (np.ndarray): Ground truth depth matrix. + t (float): Threshold for comparison. + return_p (bool, optional): If True, return precision. Defaults to False. + return_r (bool, optional): If True, return recall. Defaults to False. + + Returns: + ------- + float: Boundary F1 score, or precision, or recall depending on the flags. + + """ + ap, bp, cp, dp = fgbg_depth(pr, t) + ag, bg, cg, dg = fgbg_depth(gt, t) + + r = 0.25 * ( + np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1) + + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1) + + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1) + + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1) + ) + p = 0.25 * ( + np.count_nonzero(ap & ag) / max(np.count_nonzero(ap), 1) + + np.count_nonzero(bp & bg) / max(np.count_nonzero(bp), 1) + + np.count_nonzero(cp & cg) / max(np.count_nonzero(cp), 1) + + np.count_nonzero(dp & dg) / max(np.count_nonzero(dp), 1) + ) + if r + p == 0: + return 0.0 + if return_p: + return p + if return_r: + return r + return 2 * (r * p) / (r + p) + + +def get_thresholds_and_weights( + t_min: float, t_max: float, N: int +) -> Tuple[np.ndarray, np.ndarray]: + """Generate thresholds and weights for the given range. + + Args: + ---- + t_min (float): Minimum threshold. + t_max (float): Maximum threshold. + N (int): Number of thresholds. + + Returns: + ------- + Tuple[np.ndarray, np.ndarray]: Array of thresholds and corresponding weights. + + """ + thresholds = np.linspace(t_min, t_max, N) + weights = thresholds / thresholds.sum() + return thresholds, weights + + +def invert_depth(depth: np.ndarray, eps: float = 1e-6) -> np.ndarray: + """Inverts a depth map with numerical stability. + + Args: + ---- + depth (np.ndarray): Depth map to be inverted. + eps (float): Minimum value to avoid division by zero (default is 1e-6). + + Returns: + ------- + np.ndarray: Inverted depth map. + + """ + inverse_depth = 1.0 / depth.clip(min=eps) + return inverse_depth + + +def SI_boundary_F1( + predicted_depth: np.ndarray, + target_depth: np.ndarray, + t_min: float = 1.05, + t_max: float = 1.25, + N: int = 10, +) -> float: + """Calculate Scale-Invariant Boundary F1 Score for depth-based ground-truth. + + Args: + ---- + predicted_depth (np.ndarray): Predicted depth matrix. + target_depth (np.ndarray): Ground truth depth matrix. + t_min (float, optional): Minimum threshold. Defaults to 1.05. + t_max (float, optional): Maximum threshold. Defaults to 1.25. + N (int, optional): Number of thresholds. Defaults to 10. + + Returns: + ------- + float: Scale-Invariant Boundary F1 Score. 
+ + """ + assert predicted_depth.ndim == target_depth.ndim == 2 + thresholds, weights = get_thresholds_and_weights(t_min, t_max, N) + f1_scores = np.array( + [ + boundary_f1(invert_depth(predicted_depth), invert_depth(target_depth), t) + for t in thresholds + ] + ) + return np.sum(f1_scores * weights) + + +def SI_boundary_Recall( + predicted_depth: np.ndarray, + target_mask: np.ndarray, + t_min: float = 1.05, + t_max: float = 1.25, + N: int = 10, + alpha_threshold: float = 0.1, +) -> float: + """Calculate Scale-Invariant Boundary Recall Score for mask-based ground-truth. + + Args: + ---- + predicted_depth (np.ndarray): Predicted depth matrix. + target_mask (np.ndarray): Ground truth binary mask. + t_min (float, optional): Minimum threshold. Defaults to 1.05. + t_max (float, optional): Maximum threshold. Defaults to 1.25. + N (int, optional): Number of thresholds. Defaults to 10. + alpha_threshold (float, optional): Threshold for alpha masking. Defaults to 0.1. + + Returns: + ------- + float: Scale-Invariant Boundary Recall Score. + + """ + assert predicted_depth.ndim == target_mask.ndim == 2 + thresholds, weights = get_thresholds_and_weights(t_min, t_max, N) + thresholded_target = target_mask > alpha_threshold + + recall_scores = np.array( + [ + edge_recall_matting( + invert_depth(predicted_depth), thresholded_target, t=float(t) + ) + for t in thresholds + ] + ) + weighted_recall = np.sum(recall_scores * weights) + return weighted_recall diff --git a/finetune/modules/depth_warping/depth_pro/eval/dis5k_sample_list.txt b/finetune/modules/depth_warping/depth_pro/eval/dis5k_sample_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..81da1dcdb786da7bbec861604f50d4f039f695ef --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/eval/dis5k_sample_list.txt @@ -0,0 +1,200 @@ +DIS5K/DIS-TE1/im/12#Graphics#4#TrafficSign#8245751856_821be14f86_o.jpg +DIS5K/DIS-TE1/im/13#Insect#4#Butterfly#16023994688_7ff8cdccb1_o.jpg +DIS5K/DIS-TE1/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205538.jpg +DIS5K/DIS-TE1/im/14#Kitchenware#8#SweetStand#4848284981_fc90f54b50_o.jpg +DIS5K/DIS-TE1/im/17#Non-motor Vehicle#4#Cart#15012855035_d10b57014f_o.jpg +DIS5K/DIS-TE1/im/2#Aircraft#5#Kite#13104545564_5afceec9bd_o.jpg +DIS5K/DIS-TE1/im/20#Sports#10#Skateboarding#8472763540_bb2390e928_o.jpg +DIS5K/DIS-TE1/im/21#Tool#14#Sword#32473146960_dcc6b77848_o.jpg +DIS5K/DIS-TE1/im/21#Tool#15#Tapeline#9680492386_2d2020f282_o.jpg +DIS5K/DIS-TE1/im/21#Tool#4#Flag#507752845_ef852100f0_o.jpg +DIS5K/DIS-TE1/im/21#Tool#6#Key#11966089533_3becd78b44_o.jpg +DIS5K/DIS-TE1/im/21#Tool#8#Scale#31946428472_d28def471b_o.jpg +DIS5K/DIS-TE1/im/22#Weapon#4#Rifle#8472656430_3eb908b211_o.jpg +DIS5K/DIS-TE1/im/8#Electronics#3#Earphone#1177468301_641df8c267_o.jpg +DIS5K/DIS-TE1/im/8#Electronics#9#MusicPlayer#2235782872_7d47847bb4_o.jpg +DIS5K/DIS-TE2/im/11#Furniture#13#Ladder#3878434417_2ed740586e_o.jpg +DIS5K/DIS-TE2/im/13#Insect#1#Ant#27047700955_3b3a1271f8_o.jpg +DIS5K/DIS-TE2/im/13#Insect#11#Spider#5567179191_38d1f65589_o.jpg +DIS5K/DIS-TE2/im/13#Insect#8#Locust#5237933769_e6687c05e4_o.jpg +DIS5K/DIS-TE2/im/14#Kitchenware#2#DishRack#70838854_40cf689da7_o.jpg +DIS5K/DIS-TE2/im/14#Kitchenware#8#SweetStand#8467929412_fef7f4275d_o.jpg +DIS5K/DIS-TE2/im/16#Music Instrument#2#Harp#28058219806_28e05ff24a_o.jpg +DIS5K/DIS-TE2/im/17#Non-motor Vehicle#1#BabyCarriage#29794777180_2e1695a0cf_o.jpg +DIS5K/DIS-TE2/im/19#Ship#3#Sailboat#22442908623_5977e3becf_o.jpg +DIS5K/DIS-TE2/im/2#Aircraft#5#Kite#44654358051_1400e71cc4_o.jpg 
+DIS5K/DIS-TE2/im/21#Tool#11#Stand#IMG_20210520_205442.jpg +DIS5K/DIS-TE2/im/21#Tool#17#Tripod#9318977876_34615ec9a0_o.jpg +DIS5K/DIS-TE2/im/5#Artifact#3#Handcraft#50860882577_8482143b1b_o.jpg +DIS5K/DIS-TE2/im/8#Electronics#10#Robot#3093360210_fee54dc5c5_o.jpg +DIS5K/DIS-TE2/im/8#Electronics#6#Microphone#47411477652_6da66cbc10_o.jpg +DIS5K/DIS-TE3/im/14#Kitchenware#4#Kitchenware#2451122898_ef883175dd_o.jpg +DIS5K/DIS-TE3/im/15#Machine#4#SewingMachine#9311164128_97ba1d3947_o.jpg +DIS5K/DIS-TE3/im/16#Music Instrument#2#Harp#7670920550_59e992fd7b_o.jpg +DIS5K/DIS-TE3/im/17#Non-motor Vehicle#1#BabyCarriage#8389984877_1fddf8715c_o.jpg +DIS5K/DIS-TE3/im/17#Non-motor Vehicle#3#Carriage#5947122724_98e0fc3d1f_o.jpg +DIS5K/DIS-TE3/im/2#Aircraft#2#Balloon#2487168092_641505883f_o.jpg +DIS5K/DIS-TE3/im/2#Aircraft#4#Helicopter#8401177591_06c71c8df2_o.jpg +DIS5K/DIS-TE3/im/20#Sports#1#Archery#12520003103_faa43ea3e0_o.jpg +DIS5K/DIS-TE3/im/21#Tool#11#Stand#IMG_20210709_221507.jpg +DIS5K/DIS-TE3/im/21#Tool#2#Clip#5656649687_63d0c6696d_o.jpg +DIS5K/DIS-TE3/im/21#Tool#6#Key#12878459244_6387a140ea_o.jpg +DIS5K/DIS-TE3/im/3#Aquatic#1#Lobster#109214461_f52b4b6093_o.jpg +DIS5K/DIS-TE3/im/4#Architecture#19#Windmill#20195851863_2627117e0e_o.jpg +DIS5K/DIS-TE3/im/5#Artifact#2#Cage#5821476369_ea23927487_o.jpg +DIS5K/DIS-TE3/im/8#Electronics#7#MobileHolder#49732997896_7f53c290b5_o.jpg +DIS5K/DIS-TE4/im/13#Insect#6#Centipede#15302179708_a267850881_o.jpg +DIS5K/DIS-TE4/im/17#Non-motor Vehicle#11#Tricycle#5771069105_a3aef6f665_o.jpg +DIS5K/DIS-TE4/im/17#Non-motor Vehicle#2#Bicycle#4245936196_fdf812dcb7_o.jpg +DIS5K/DIS-TE4/im/17#Non-motor Vehicle#9#ShoppingCart#4674052920_a5b7a2b236_o.jpg +DIS5K/DIS-TE4/im/18#Plant#1#Bonsai#3539420884_ca8973e2c0_o.jpg +DIS5K/DIS-TE4/im/2#Aircraft#6#Parachute#33590416634_9d6f2325e7_o.jpg +DIS5K/DIS-TE4/im/20#Sports#1#Archery#46924476515_0be1caa684_o.jpg +DIS5K/DIS-TE4/im/20#Sports#8#Racket#19337607166_dd1985fb59_o.jpg +DIS5K/DIS-TE4/im/21#Tool#6#Key#3193329588_839b0c74ce_o.jpg +DIS5K/DIS-TE4/im/5#Artifact#2#Cage#5821886526_0573ba2d0d_o.jpg +DIS5K/DIS-TE4/im/5#Artifact#3#Handcraft#50105138282_3c1d02c968_o.jpg +DIS5K/DIS-TE4/im/8#Electronics#1#Antenna#4305034305_874f21a701_o.jpg +DIS5K/DIS-TR/im/1#Accessories#1#Bag#15554964549_3105e51b6f_o.jpg +DIS5K/DIS-TR/im/1#Accessories#1#Bag#41104261980_098a6c4a56_o.jpg +DIS5K/DIS-TR/im/1#Accessories#2#Clothes#2284764037_871b2e8ca4_o.jpg +DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#1824643784_70d0134156_o.jpg +DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#3590020230_37b09a29b3_o.jpg +DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#4809652879_4da8a69f3b_o.jpg +DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#792204934_f9b28f99b4_o.jpg +DIS5K/DIS-TR/im/1#Accessories#5#Jewelry#13909132974_c4750c5fb7_o.jpg +DIS5K/DIS-TR/im/1#Accessories#7#Shoe#2483391615_9199ece8d6_o.jpg +DIS5K/DIS-TR/im/1#Accessories#8#Watch#4343266960_f6633b029b_o.jpg +DIS5K/DIS-TR/im/10#Frame#2#BicycleFrame#17897573_42964dd104_o.jpg +DIS5K/DIS-TR/im/10#Frame#5#Rack#15898634812_64807069ff_o.jpg +DIS5K/DIS-TR/im/10#Frame#5#Rack#23928546819_c184cb0b60_o.jpg +DIS5K/DIS-TR/im/11#Furniture#19#Shower#6189119596_77bcfe80ee_o.jpg +DIS5K/DIS-TR/im/11#Furniture#2#Bench#3263647075_9306e280b5_o.jpg +DIS5K/DIS-TR/im/11#Furniture#5#CoatHanger#12774091054_cd5ff520ef_o.jpg +DIS5K/DIS-TR/im/11#Furniture#6#DentalChair#13878156865_d0439dcb32_o.jpg +DIS5K/DIS-TR/im/11#Furniture#9#Easel#5861024714_2070cd480c_o.jpg +DIS5K/DIS-TR/im/12#Graphics#4#TrafficSign#40621867334_f3c32ec189_o.jpg 
+DIS5K/DIS-TR/im/13#Insect#1#Ant#3295038190_db5dd0d4f4_o.jpg +DIS5K/DIS-TR/im/13#Insect#10#Mosquito#24341339_a88a1dad4c_o.jpg +DIS5K/DIS-TR/im/13#Insect#11#Spider#27171518270_63b78069ff_o.jpg +DIS5K/DIS-TR/im/13#Insect#11#Spider#49925050281_fa727c154e_o.jpg +DIS5K/DIS-TR/im/13#Insect#2#Beatle#279616486_2f1e64f591_o.jpg +DIS5K/DIS-TR/im/13#Insect#3#Bee#43892067695_82cf3e536b_o.jpg +DIS5K/DIS-TR/im/13#Insect#6#Centipede#20874281788_3e15c90a1c_o.jpg +DIS5K/DIS-TR/im/13#Insect#7#Dragonfly#14106671120_1b824d77e4_o.jpg +DIS5K/DIS-TR/im/13#Insect#8#Locust#21637491048_676ef7c9f7_o.jpg +DIS5K/DIS-TR/im/13#Insect#9#Mantis#1381120202_9dff6987b2_o.jpg +DIS5K/DIS-TR/im/14#Kitchenware#1#Cup#12812517473_327d6474b8_o.jpg +DIS5K/DIS-TR/im/14#Kitchenware#10#WineGlass#6402491641_389275d4d1_o.jpg +DIS5K/DIS-TR/im/14#Kitchenware#3#Hydrovalve#3129932040_8c05825004_o.jpg +DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#2881934780_87d5218ebb_o.jpg +DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205527.jpg +DIS5K/DIS-TR/im/14#Kitchenware#6#Spoon#32989113501_b69eccf0df_o.jpg +DIS5K/DIS-TR/im/14#Kitchenware#8#SweetStand#2867322189_c56d1e0b87_o.jpg +DIS5K/DIS-TR/im/15#Machine#1#Gear#19217846720_f5f2807475_o.jpg +DIS5K/DIS-TR/im/15#Machine#2#Machine#1620160659_9571b7a7ab_o.jpg +DIS5K/DIS-TR/im/16#Music Instrument#2#Harp#6012801603_1a6e2c16a6_o.jpg +DIS5K/DIS-TR/im/16#Music Instrument#5#Trombone#8683292118_d223c17ccb_o.jpg +DIS5K/DIS-TR/im/16#Music Instrument#6#Trumpet#8393262740_b8c216142c_o.jpg +DIS5K/DIS-TR/im/16#Music Instrument#8#Violin#1511267391_40e4949d68_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#1#BabyCarriage#6989512997_38b3dbc88b_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#14627183228_b2d68cf501_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#2932226475_1b2403e549_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#5420155648_86459905b8_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#2#Bicycle#IMG_20210513_134904.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#3#Carriage#3311962551_6f211b7bd6_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#4#Cart#2609732026_baf7fff3a1_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#5#Handcart#5821282211_201cefeaf2_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#7#Mower#5779003232_3bb3ae531a_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#10051622843_ace07e32b8_o.jpg +DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#8075259294_f23e243849_o.jpg +DIS5K/DIS-TR/im/18#Plant#2#Tree#44800999741_e377e16dbb_o.jpg +DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#2631761913_3ac67d0223_o.jpg +DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#37707911566_e908a261b6_o.jpg +DIS5K/DIS-TR/im/2#Aircraft#3#HangGlider#2557220131_b8506920c5_o.jpg +DIS5K/DIS-TR/im/2#Aircraft#4#Helicopter#6215659280_5dbd9b4546_o.jpg +DIS5K/DIS-TR/im/2#Aircraft#6#Parachute#20185790493_e56fcaf8c6_o.jpg +DIS5K/DIS-TR/im/20#Sports#1#Archery#3871269982_ae4c59a7eb_o.jpg +DIS5K/DIS-TR/im/20#Sports#9#RockClimbing#9662433268_51299bc50e_o.jpg +DIS5K/DIS-TR/im/21#Tool#14#Sword#26258479365_2950d7fa37_o.jpg +DIS5K/DIS-TR/im/21#Tool#15#Tapeline#15505703447_e0fdeaa5a6_o.jpg +DIS5K/DIS-TR/im/21#Tool#4#Flag#26678602024_9b665742de_o.jpg +DIS5K/DIS-TR/im/21#Tool#4#Flag#5774823110_d603ce3cc8_o.jpg +DIS5K/DIS-TR/im/21#Tool#5#Hook#6867989814_dba18d673c_o.jpg +DIS5K/DIS-TR/im/22#Weapon#4#Rifle#4451713125_cd91719189_o.jpg +DIS5K/DIS-TR/im/3#Aquatic#2#Seadragon#4910944581_913139b238_o.jpg +DIS5K/DIS-TR/im/4#Architecture#12#Scaffold#3661448960_8aff24cc4d_o.jpg +DIS5K/DIS-TR/im/4#Architecture#13#Sculpture#6385318715_9a88d4eba7_o.jpg 
+DIS5K/DIS-TR/im/4#Architecture#17#Well#5011603479_75cf42808a_o.jpg +DIS5K/DIS-TR/im/5#Artifact#2#Cage#4892828841_7f1bc05682_o.jpg +DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#15404211628_9e9ff2ce2e_o.jpg +DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#3200169865_7c84cfcccf_o.jpg +DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#5859295071_c217e7c22f_o.jpg +DIS5K/DIS-TR/im/6#Automobile#10#SteeringWheel#17200338026_f1e2122d8e_o.jpg +DIS5K/DIS-TR/im/6#Automobile#3#Car#3780893425_1a7d275e09_o.jpg +DIS5K/DIS-TR/im/6#Automobile#5#Crane#15282506502_1b1132a7c3_o.jpg +DIS5K/DIS-TR/im/7#Electrical#1#Cable#16767791875_8e6df41752_o.jpg +DIS5K/DIS-TR/im/7#Electrical#1#Cable#3291433361_38747324c4_o.jpg +DIS5K/DIS-TR/im/7#Electrical#1#Cable#4195104238_12a754c61a_o.jpg +DIS5K/DIS-TR/im/7#Electrical#1#Cable#49645415132_61e5664ecf_o.jpg +DIS5K/DIS-TR/im/7#Electrical#1#Cable#IMG_20210521_232406.jpg +DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#3298312021_92f431e3e9_o.jpg +DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#47950134773_fbfff63f4e_o.jpg +DIS5K/DIS-TR/im/7#Electrical#11#VacuumCleaner#5448403677_6a29e21881_o.jpg +DIS5K/DIS-TR/im/7#Electrical#2#CeilingLamp#611568868_680ed5d39f_o.jpg +DIS5K/DIS-TR/im/7#Electrical#3#Fan#3391683115_990525a693_o.jpg +DIS5K/DIS-TR/im/7#Electrical#6#StreetLamp#150049122_0692266618_o.jpg +DIS5K/DIS-TR/im/7#Electrical#9#TransmissionTower#31433908671_7e7e277dfe_o.jpg +DIS5K/DIS-TR/im/8#Electronics#1#Antenna#8727884873_e0622ee5c4_o.jpg +DIS5K/DIS-TR/im/8#Electronics#2#Camcorder#4172690390_7e5f280ace_o.jpg +DIS5K/DIS-TR/im/8#Electronics#3#Earphone#413984555_f290febdf5_o.jpg +DIS5K/DIS-TR/im/8#Electronics#5#Headset#30574225373_3717ed9fa4_o.jpg +DIS5K/DIS-TR/im/8#Electronics#6#Microphone#538006482_4aae4f5bd6_o.jpg +DIS5K/DIS-TR/im/8#Electronics#9#MusicPlayer#1306012480_2ea80d2afd_o.jpg +DIS5K/DIS-TR/im/9#Entertainment#1#GymEquipment#33071754135_8f3195cbd1_o.jpg +DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#2305807849_be53d724ea_o.jpg +DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#3862040422_5bbf903204_o.jpg +DIS5K/DIS-TR/im/9#Entertainment#3#OutdoorFitnessEquipment#10814507005_3dacaa28b3_o.jpg +DIS5K/DIS-TR/im/9#Entertainment#4#FerrisWheel#81640293_4b0ee62040_o.jpg +DIS5K/DIS-TR/im/9#Entertainment#5#Swing#49867339188_08073f4b76_o.jpg +DIS5K/DIS-VD/im/1#Accessories#1#Bag#6815402415_e01c1a41e6_o.jpg +DIS5K/DIS-VD/im/1#Accessories#5#Jewelry#2744070193_1486582e8d_o.jpg +DIS5K/DIS-VD/im/10#Frame#1#BasketballHoop#IMG_20210521_232650.jpg +DIS5K/DIS-VD/im/10#Frame#5#Rack#6156611713_49ebf12b1e_o.jpg +DIS5K/DIS-VD/im/11#Furniture#11#Handrail#3276641240_1b84b5af85_o.jpg +DIS5K/DIS-VD/im/11#Furniture#13#Ladder#33423266_5391cf47e9_o.jpg +DIS5K/DIS-VD/im/11#Furniture#17#Table#3725111755_4fc101e7ab_o.jpg +DIS5K/DIS-VD/im/11#Furniture#2#Bench#35556410400_7235b58070_o.jpg +DIS5K/DIS-VD/im/11#Furniture#4#Chair#3301769985_e49de6739f_o.jpg +DIS5K/DIS-VD/im/11#Furniture#6#DentalChair#23811071619_2a95c3a688_o.jpg +DIS5K/DIS-VD/im/11#Furniture#9#Easel#8322807354_df6d56542e_o.jpg +DIS5K/DIS-VD/im/13#Insect#10#Mosquito#12391674863_0cdf430d3f_o.jpg +DIS5K/DIS-VD/im/13#Insect#7#Dragonfly#14693028899_344ea118f2_o.jpg +DIS5K/DIS-VD/im/14#Kitchenware#10#WineGlass#4450148455_8f460f541a_o.jpg +DIS5K/DIS-VD/im/14#Kitchenware#3#Hydrovalve#IMG_20210520_203410.jpg +DIS5K/DIS-VD/im/15#Machine#3#PlowHarrow#34521712846_df4babb024_o.jpg +DIS5K/DIS-VD/im/16#Music Instrument#5#Trombone#6222242743_e7189405cd_o.jpg +DIS5K/DIS-VD/im/17#Non-motor Vehicle#12#Wheel#25677578797_ea47e1d9e8_o.jpg +DIS5K/DIS-VD/im/17#Non-motor 
Vehicle#2#Bicycle#5153474856_21560b081b_o.jpg +DIS5K/DIS-VD/im/17#Non-motor Vehicle#7#Mower#16992510572_8a6ff27398_o.jpg +DIS5K/DIS-VD/im/19#Ship#2#Canoe#40571458163_7faf8b73d9_o.jpg +DIS5K/DIS-VD/im/2#Aircraft#1#Airplane#4270588164_66a619e834_o.jpg +DIS5K/DIS-VD/im/2#Aircraft#4#Helicopter#86789665_650b94b2ee_o.jpg +DIS5K/DIS-VD/im/20#Sports#14#Wakesurfing#5589577652_5061c168d2_o.jpg +DIS5K/DIS-VD/im/21#Tool#10#Spade#37018312543_63b21b0784_o.jpg +DIS5K/DIS-VD/im/21#Tool#14#Sword#24789047250_42df9bf422_o.jpg +DIS5K/DIS-VD/im/21#Tool#18#Umbrella#IMG_20210513_140445.jpg +DIS5K/DIS-VD/im/21#Tool#6#Key#43939732715_5a6e28b518_o.jpg +DIS5K/DIS-VD/im/22#Weapon#1#Cannon#12758066705_90b54295e7_o.jpg +DIS5K/DIS-VD/im/22#Weapon#4#Rifle#8019368790_fb6dc469a7_o.jpg +DIS5K/DIS-VD/im/3#Aquatic#5#Shrimp#2582833427_7a99e7356e_o.jpg +DIS5K/DIS-VD/im/4#Architecture#12#Scaffold#1013402687_590750354e_o.jpg +DIS5K/DIS-VD/im/4#Architecture#13#Sculpture#17176841759_272a3ed6e3_o.jpg +DIS5K/DIS-VD/im/4#Architecture#14#Stair#15079108505_0d11281624_o.jpg +DIS5K/DIS-VD/im/4#Architecture#19#Windmill#2928111082_ceb3051c04_o.jpg +DIS5K/DIS-VD/im/4#Architecture#3#Crack#3551574032_17dd106d31_o.jpg +DIS5K/DIS-VD/im/4#Architecture#5#GasStation#4564307581_c3069bdc62_o.jpg +DIS5K/DIS-VD/im/4#Architecture#8#ObservationTower#2704526950_d4f0ddc807_o.jpg +DIS5K/DIS-VD/im/5#Artifact#3#Handcraft#10873642323_1bafce3aa5_o.jpg +DIS5K/DIS-VD/im/6#Automobile#11#Tractor#8594504006_0c2c557d85_o.jpg +DIS5K/DIS-VD/im/8#Electronics#3#Earphone#8106454803_1178d867cc_o.jpg \ No newline at end of file diff --git a/finetune/modules/depth_warping/depth_pro/network/__init__.py b/finetune/modules/depth_warping/depth_pro/network/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..74882c0eacac7e9bde0e13008fab31037eae671d --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/network/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. +"""Depth Pro network blocks.""" diff --git a/finetune/modules/depth_warping/depth_pro/network/decoder.py b/finetune/modules/depth_warping/depth_pro/network/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..770665fcd3e47948388d5da43487d9e75dc0f3fc --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/network/decoder.py @@ -0,0 +1,206 @@ +"""Copyright (C) 2024 Apple Inc. All Rights Reserved. + +Dense Prediction Transformer Decoder architecture. + +Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413 +""" + +from __future__ import annotations + +from typing import Iterable + +import torch +from torch import nn + + +class MultiresConvDecoder(nn.Module): + """Decoder for multi-resolution encodings.""" + + def __init__( + self, + dims_encoder: Iterable[int], + dim_decoder: int, + ): + """Initialize multiresolution convolutional decoder. + + Args: + ---- + dims_encoder: Expected dims at each level from the encoder. + dim_decoder: Dim of decoder features. + + """ + super().__init__() + self.dims_encoder = list(dims_encoder) + self.dim_decoder = dim_decoder + self.dim_out = dim_decoder + + num_encoders = len(self.dims_encoder) + + # At the highest resolution, i.e. level 0, we apply projection w/ 1x1 convolution + # when the dimensions mismatch. Otherwise we do not do anything, which is + # the default behavior of monodepth. 
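+        # Lower-resolution levels (1..N-1) are handled by the 3x3 convolutions built in the
+        # loop after `conv0` below, so every encoder level is projected to the common
+        # `dim_decoder` width before fusion.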
+ conv0 = ( + nn.Conv2d(self.dims_encoder[0], dim_decoder, kernel_size=1, bias=False) + if self.dims_encoder[0] != dim_decoder + else nn.Identity() + ) + + convs = [conv0] + for i in range(1, num_encoders): + convs.append( + nn.Conv2d( + self.dims_encoder[i], + dim_decoder, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + self.convs = nn.ModuleList(convs) + + fusions = [] + for i in range(num_encoders): + fusions.append( + FeatureFusionBlock2d( + num_features=dim_decoder, + deconv=(i != 0), + batch_norm=False, + ) + ) + self.fusions = nn.ModuleList(fusions) + + def forward(self, encodings: torch.Tensor) -> torch.Tensor: + """Decode the multi-resolution encodings.""" + num_levels = len(encodings) + num_encoders = len(self.dims_encoder) + + if num_levels != num_encoders: + raise ValueError( + f"Got encoder output levels={num_levels}, expected levels={num_encoders+1}." + ) + + # Project features of different encoder dims to the same decoder dim. + # Fuse features from the lowest resolution (num_levels-1) + # to the highest (0). + features = self.convs[-1](encodings[-1]) + lowres_features = features + features = self.fusions[-1](features) + for i in range(num_levels - 2, -1, -1): + features_i = self.convs[i](encodings[i]) + features = self.fusions[i](features, features_i) + return features, lowres_features + + +class ResidualBlock(nn.Module): + """Generic implementation of residual blocks. + + This implements a generic residual block from + He et al. - Identity Mappings in Deep Residual Networks (2016), + https://arxiv.org/abs/1603.05027 + which can be further customized via factory functions. + """ + + def __init__(self, residual: nn.Module, shortcut: nn.Module | None = None) -> None: + """Initialize ResidualBlock.""" + super().__init__() + self.residual = residual + self.shortcut = shortcut + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Apply residual block.""" + delta_x = self.residual(x) + + if self.shortcut is not None: + x = self.shortcut(x) + + return x + delta_x + + +class FeatureFusionBlock2d(nn.Module): + """Feature fusion for DPT.""" + + def __init__( + self, + num_features: int, + deconv: bool = False, + batch_norm: bool = False, + ): + """Initialize feature fusion block. + + Args: + ---- + num_features: Input and output dimensions. + deconv: Whether to use deconv before the final output conv. + batch_norm: Whether to use batch normalization in resnet blocks. 
+ + """ + super().__init__() + + self.resnet1 = self._residual_block(num_features, batch_norm) + self.resnet2 = self._residual_block(num_features, batch_norm) + + self.use_deconv = deconv + if deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=num_features, + out_channels=num_features, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.out_conv = nn.Conv2d( + num_features, + num_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x0: torch.Tensor, x1: torch.Tensor | None = None) -> torch.Tensor: + """Process and fuse input features.""" + x = x0 + + if x1 is not None: + res = self.resnet1(x1) + x = self.skip_add.add(x, res) + + x = self.resnet2(x) + + if self.use_deconv: + x = self.deconv(x) + x = self.out_conv(x) + + return x + + @staticmethod + def _residual_block(num_features: int, batch_norm: bool): + """Create a residual block.""" + + def _create_block(dim: int, batch_norm: bool) -> list[nn.Module]: + layers = [ + nn.ReLU(False), + nn.Conv2d( + num_features, + num_features, + kernel_size=3, + stride=1, + padding=1, + bias=not batch_norm, + ), + ] + if batch_norm: + layers.append(nn.BatchNorm2d(dim)) + return layers + + residual = nn.Sequential( + *_create_block(dim=num_features, batch_norm=batch_norm), + *_create_block(dim=num_features, batch_norm=batch_norm), + ) + return ResidualBlock(residual) diff --git a/finetune/modules/depth_warping/depth_pro/network/encoder.py b/finetune/modules/depth_warping/depth_pro/network/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a3a3da17d47bf91662463520afaf413f08676c3b --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/network/encoder.py @@ -0,0 +1,332 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. +# DepthProEncoder combining patch and image encoders. + +from __future__ import annotations + +import math +from typing import Iterable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DepthProEncoder(nn.Module): + """DepthPro Encoder. + + An encoder aimed at creating multi-resolution encodings from Vision Transformers. + """ + + def __init__( + self, + dims_encoder: Iterable[int], + patch_encoder: nn.Module, + image_encoder: nn.Module, + hook_block_ids: Iterable[int], + decoder_features: int, + ): + """Initialize DepthProEncoder. + + The framework + 1. creates an image pyramid, + 2. generates overlapping patches with a sliding window at each pyramid level, + 3. creates batched encodings via vision transformer backbones, + 4. produces multi-resolution encodings. + + Args: + ---- + img_size: Backbone image resolution. + dims_encoder: Dimensions of the encoder at different layers. + patch_encoder: Backbone used for patches. + image_encoder: Backbone used for global image encoder. + hook_block_ids: Hooks to obtain intermediate features for the patch encoder model. + decoder_features: Number of feature output in the decoder. 
+ + """ + super().__init__() + + self.dims_encoder = list(dims_encoder) + self.patch_encoder = patch_encoder + self.image_encoder = image_encoder + self.hook_block_ids = list(hook_block_ids) + + patch_encoder_embed_dim = patch_encoder.embed_dim + image_encoder_embed_dim = image_encoder.embed_dim + + self.out_size = int( + patch_encoder.patch_embed.img_size[0] // patch_encoder.patch_embed.patch_size[0] + ) + + def _create_project_upsample_block( + dim_in: int, + dim_out: int, + upsample_layers: int, + dim_int: Optional[int] = None, + ) -> nn.Module: + if dim_int is None: + dim_int = dim_out + # Projection. + blocks = [ + nn.Conv2d( + in_channels=dim_in, + out_channels=dim_int, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + ] + + # Upsampling. + blocks += [ + nn.ConvTranspose2d( + in_channels=dim_int if i == 0 else dim_out, + out_channels=dim_out, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + for i in range(upsample_layers) + ] + + return nn.Sequential(*blocks) + + self.upsample_latent0 = _create_project_upsample_block( + dim_in=patch_encoder_embed_dim, + dim_int=self.dims_encoder[0], + dim_out=decoder_features, + upsample_layers=3, + ) + self.upsample_latent1 = _create_project_upsample_block( + dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[0], upsample_layers=2 + ) + + self.upsample0 = _create_project_upsample_block( + dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[1], upsample_layers=1 + ) + self.upsample1 = _create_project_upsample_block( + dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[2], upsample_layers=1 + ) + self.upsample2 = _create_project_upsample_block( + dim_in=patch_encoder_embed_dim, dim_out=self.dims_encoder[3], upsample_layers=1 + ) + + self.upsample_lowres = nn.ConvTranspose2d( + in_channels=image_encoder_embed_dim, + out_channels=self.dims_encoder[3], + kernel_size=2, + stride=2, + padding=0, + bias=True, + ) + self.fuse_lowres = nn.Conv2d( + in_channels=(self.dims_encoder[3] + self.dims_encoder[3]), + out_channels=self.dims_encoder[3], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + # Obtain intermediate outputs of the blocks. + self.patch_encoder.blocks[self.hook_block_ids[0]].register_forward_hook( + self._hook0 + ) + self.patch_encoder.blocks[self.hook_block_ids[1]].register_forward_hook( + self._hook1 + ) + + def _hook0(self, model, input, output): + self.backbone_highres_hook0 = output + + def _hook1(self, model, input, output): + self.backbone_highres_hook1 = output + + @property + def img_size(self) -> int: + """Return the full image size of the SPN network.""" + return self.patch_encoder.patch_embed.img_size[0] * 4 + + def _create_pyramid( + self, x: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Create a 3-level image pyramid.""" + # Original resolution: 1536 by default. + x0 = x + + # Middle resolution: 768 by default. + x1 = F.interpolate( + x, size=None, scale_factor=0.5, mode="bilinear", align_corners=False + ) + + # Low resolution: 384 by default, corresponding to the backbone resolution. 
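        # Side note on _create_project_upsample_block above: it is a 1x1 channel
        # projection followed by `upsample_layers` stride-2 transposed convolutions,
        # so the spatial size grows by 2**upsample_layers. A minimal standalone sketch
        # of that arithmetic (channel sizes here are illustrative, not the model's):
        #
        #   import torch
        #   import torch.nn as nn
        #
        #   def project_upsample(dim_in, dim_out, upsample_layers, dim_int=None):
        #       dim_int = dim_out if dim_int is None else dim_int
        #       layers = [nn.Conv2d(dim_in, dim_int, kernel_size=1, bias=False)]
        #       layers += [
        #           nn.ConvTranspose2d(dim_int if i == 0 else dim_out, dim_out,
        #                              kernel_size=2, stride=2, bias=False)
        #           for i in range(upsample_layers)
        #       ]
        #       return nn.Sequential(*layers)
        #
        #   block = project_upsample(dim_in=1024, dim_out=256, upsample_layers=3)
        #   print(block(torch.randn(1, 1024, 24, 24)).shape)  # (1, 256, 192, 192)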
+ x2 = F.interpolate( + x, size=None, scale_factor=0.25, mode="bilinear", align_corners=False + ) + + return x0, x1, x2 + + def split(self, x: torch.Tensor, overlap_ratio: float = 0.25) -> torch.Tensor: + """Split the input into small patches with sliding window.""" + patch_size = 384 + patch_stride = int(patch_size * (1 - overlap_ratio)) + + image_size = x.shape[-1] + steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + + x_patch_list = [] + for j in range(steps): + j0 = j * patch_stride + j1 = j0 + patch_size + + for i in range(steps): + i0 = i * patch_stride + i1 = i0 + patch_size + x_patch_list.append(x[..., j0:j1, i0:i1]) + + return torch.cat(x_patch_list, dim=0) + + def merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + """Merge the patched input into a image with sliding window.""" + steps = int(math.sqrt(x.shape[0] // batch_size)) + + idx = 0 + + output_list = [] + for j in range(steps): + output_row_list = [] + for i in range(steps): + output = x[batch_size * idx : batch_size * (idx + 1)] + + if j != 0: + output = output[..., padding:, :] + if i != 0: + output = output[..., :, padding:] + if j != steps - 1: + output = output[..., :-padding, :] + if i != steps - 1: + output = output[..., :, :-padding] + + output_row_list.append(output) + idx += 1 + + output_row = torch.cat(output_row_list, dim=-1) + output_list.append(output_row) + output = torch.cat(output_list, dim=-2) + return output + + def reshape_feature( + self, embeddings: torch.Tensor, width, height, cls_token_offset=1 + ): + """Discard class token and reshape 1D feature map to a 2D grid.""" + b, hw, c = embeddings.shape + + # Remove class token. + if cls_token_offset > 0: + embeddings = embeddings[:, cls_token_offset:, :] + + # Shape: (batch, height, width, dim) -> (batch, dim, height, width) + embeddings = embeddings.reshape(b, height, width, c).permute(0, 3, 1, 2) + return embeddings + + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: + """Encode input at multiple resolutions. + + Args: + ---- + x (torch.Tensor): Input image. + + Returns: + ------- + Multi resolution encoded features. + + """ + batch_size = x.shape[0] + + # Step 0: create a 3-level image pyramid. + x0, x1, x2 = self._create_pyramid(x) + + # Step 1: split to create batched overlapped mini-images at the backbone (BeiT/ViT/Dino) + # resolution. + # 5x5 @ 384x384 at the highest resolution (1536x1536). + x0_patches = self.split(x0, overlap_ratio=0.25) + # 3x3 @ 384x384 at the middle resolution (768x768). + x1_patches = self.split(x1, overlap_ratio=0.5) + # 1x1 # 384x384 at the lowest resolution (384x384). + x2_patches = x2 + + # Concatenate all the sliding window patches and form a batch of size (35=5x5+3x3+1x1). + x_pyramid_patches = torch.cat( + (x0_patches, x1_patches, x2_patches), + dim=0, + ) + + # Step 2: Run the backbone (BeiT) model and get the result of large batch size. + x_pyramid_encodings = self.patch_encoder(x_pyramid_patches) + x_pyramid_encodings = self.reshape_feature( + x_pyramid_encodings, self.out_size, self.out_size + ) + + # Step 3: merging. + # Merge highres latent encoding. 
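        # Worked check of the merged grid sizes used below. Each 384x384 patch yields
        # a 24x24 token grid (384 / patch size 16), and merge() trims `padding` tokens
        # from interior tile edges before stitching. A sketch of the arithmetic only:
        #
        #   def merged_size(steps, tile=24, padding=3):
        #       if steps == 1:
        #           return tile
        #       return 2 * (tile - padding) + (steps - 2) * (tile - 2 * padding)
        #
        #   merged_size(5, padding=3)  # 96: 5x5 patches from the 1536x1536 level
        #   merged_size(3, padding=6)  # 48: 3x3 patches from the 768x768 level
        #   merged_size(1)             # 24: the single 384x384 patch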
+ x_latent0_encodings = self.reshape_feature( + self.backbone_highres_hook0, + self.out_size, + self.out_size, + ) + x_latent0_features = self.merge( + x_latent0_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + x_latent1_encodings = self.reshape_feature( + self.backbone_highres_hook1, + self.out_size, + self.out_size, + ) + x_latent1_features = self.merge( + x_latent1_encodings[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # Split the 35 batch size from pyramid encoding back into 5x5+3x3+1x1. + x0_encodings, x1_encodings, x2_encodings = torch.split( + x_pyramid_encodings, + [len(x0_patches), len(x1_patches), len(x2_patches)], + dim=0, + ) + + # 96x96 feature maps by merging 5x5 @ 24x24 patches with overlaps. + x0_features = self.merge(x0_encodings, batch_size=batch_size, padding=3) + + # 48x84 feature maps by merging 3x3 @ 24x24 patches with overlaps. + x1_features = self.merge(x1_encodings, batch_size=batch_size, padding=6) + + # 24x24 feature maps. + x2_features = x2_encodings + + # Apply the image encoder model. + x_global_features = self.image_encoder(x2_patches) + x_global_features = self.reshape_feature( + x_global_features, self.out_size, self.out_size + ) + + # Upsample feature maps. + x_latent0_features = self.upsample_latent0(x_latent0_features) + x_latent1_features = self.upsample_latent1(x_latent1_features) + + x0_features = self.upsample0(x0_features) + x1_features = self.upsample1(x1_features) + x2_features = self.upsample2(x2_features) + + x_global_features = self.upsample_lowres(x_global_features) + x_global_features = self.fuse_lowres( + torch.cat((x2_features, x_global_features), dim=1) + ) + + return [ + x_latent0_features, + x_latent1_features, + x0_features, + x1_features, + x_global_features, + ] diff --git a/finetune/modules/depth_warping/depth_pro/network/fov.py b/finetune/modules/depth_warping/depth_pro/network/fov.py new file mode 100644 index 0000000000000000000000000000000000000000..5900286509ca9535d4d29679b88055b5b6aed938 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/network/fov.py @@ -0,0 +1,82 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. +# Field of View network architecture. + +from typing import Optional + +import torch +from torch import nn +from torch.nn import functional as F + + +class FOVNetwork(nn.Module): + """Field of View estimation network.""" + + def __init__( + self, + num_features: int, + fov_encoder: Optional[nn.Module] = None, + ): + """Initialize the Field of View estimation block. + + Args: + ---- + num_features: Number of features used. + fov_encoder: Optional encoder to bring additional network capacity. + + """ + super().__init__() + + # Create FOV head. 
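        # The head assembled below ends in a 6x6 convolution applied to a 6x6 map,
        # i.e. it regresses a single scalar per image. If that scalar is a horizontal
        # field of view in degrees (an assumption made here only for intuition), the
        # standard pinhole relation converts it to a focal length in pixels:
        #
        #   import math
        #
        #   def fov_to_fpx(fov_deg, width_px):
        #       return 0.5 * width_px / math.tan(0.5 * math.radians(fov_deg))
        #
        #   fov_to_fpx(60.0, 1536)  # ~1330 px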
+ fov_head0 = [ + nn.Conv2d( + num_features, num_features // 2, kernel_size=3, stride=2, padding=1 + ), # 128 x 24 x 24 + nn.ReLU(True), + ] + fov_head = [ + nn.Conv2d( + num_features // 2, num_features // 4, kernel_size=3, stride=2, padding=1 + ), # 64 x 12 x 12 + nn.ReLU(True), + nn.Conv2d( + num_features // 4, num_features // 8, kernel_size=3, stride=2, padding=1 + ), # 32 x 6 x 6 + nn.ReLU(True), + nn.Conv2d(num_features // 8, 1, kernel_size=6, stride=1, padding=0), + ] + if fov_encoder is not None: + self.encoder = nn.Sequential( + fov_encoder, nn.Linear(fov_encoder.embed_dim, num_features // 2) + ) + self.downsample = nn.Sequential(*fov_head0) + else: + fov_head = fov_head0 + fov_head + self.head = nn.Sequential(*fov_head) + + def forward(self, x: torch.Tensor, lowres_feature: torch.Tensor) -> torch.Tensor: + """Forward the fov network. + + Args: + ---- + x (torch.Tensor): Input image. + lowres_feature (torch.Tensor): Low resolution feature. + + Returns: + ------- + The field of view tensor. + + """ + if hasattr(self, "encoder"): + x = F.interpolate( + x, + size=None, + scale_factor=0.25, + mode="bilinear", + align_corners=False, + ) + x = self.encoder(x)[:, 1:].permute(0, 2, 1) + lowres_feature = self.downsample(lowres_feature) + x = x.reshape_as(lowres_feature) + lowres_feature + else: + x = lowres_feature + return self.head(x) diff --git a/finetune/modules/depth_warping/depth_pro/network/vit.py b/finetune/modules/depth_warping/depth_pro/network/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..c6c3768a1dcedccd99a58f9507f4edac3cde9da0 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/network/vit.py @@ -0,0 +1,123 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. + + +try: + from timm.layers import resample_abs_pos_embed +except ImportError as err: + print("ImportError: {0}".format(err)) +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint + + +def make_vit_b16_backbone( + model, + encoder_feature_dims, + encoder_feature_layer_ids, + vit_features, + start_index=1, + use_grad_checkpointing=False, +) -> nn.Module: + """Make a ViTb16 backbone for the DPT model.""" + if use_grad_checkpointing: + model.set_grad_checkpointing() + + vit_model = nn.Module() + vit_model.hooks = encoder_feature_layer_ids + vit_model.model = model + vit_model.features = encoder_feature_dims + vit_model.vit_features = vit_features + vit_model.model.start_index = start_index + vit_model.model.patch_size = vit_model.model.patch_embed.patch_size + vit_model.model.is_vit = True + vit_model.model.forward = vit_model.model.forward_features + + return vit_model + + +def forward_features_eva_fixed(self, x): + """Encode features.""" + x = self.patch_embed(x) + x, rot_pos_embed = self._pos_embed(x) + for blk in self.blocks: + if self.grad_checkpointing: + x = checkpoint(blk, x, rot_pos_embed) + else: + x = blk(x, rot_pos_embed) + x = self.norm(x) + return x + + +def resize_vit(model: nn.Module, img_size) -> nn.Module: + """Resample the ViT module to the given size.""" + patch_size = model.patch_embed.patch_size + model.patch_embed.img_size = img_size + grid_size = tuple([s // p for s, p in zip(img_size, patch_size)]) + model.patch_embed.grid_size = grid_size + + pos_embed = resample_abs_pos_embed( + model.pos_embed, + grid_size, # img_size + num_prefix_tokens=( + 0 if getattr(model, "no_embed_class", False) else model.num_prefix_tokens + ), + ) + model.pos_embed = torch.nn.Parameter(pos_embed) + + return model + + +def resize_patch_embed(model: 
nn.Module, new_patch_size=(16, 16)) -> nn.Module: + """Resample the ViT patch size to the given one.""" + # interpolate patch embedding + if hasattr(model, "patch_embed"): + old_patch_size = model.patch_embed.patch_size + + if ( + new_patch_size[0] != old_patch_size[0] + or new_patch_size[1] != old_patch_size[1] + ): + patch_embed_proj = model.patch_embed.proj.weight + patch_embed_proj_bias = model.patch_embed.proj.bias + use_bias = True if patch_embed_proj_bias is not None else False + _, _, h, w = patch_embed_proj.shape + + new_patch_embed_proj = torch.nn.functional.interpolate( + patch_embed_proj, + size=[new_patch_size[0], new_patch_size[1]], + mode="bicubic", + align_corners=False, + ) + new_patch_embed_proj = ( + new_patch_embed_proj * (h / new_patch_size[0]) * (w / new_patch_size[1]) + ) + + model.patch_embed.proj = nn.Conv2d( + in_channels=model.patch_embed.proj.in_channels, + out_channels=model.patch_embed.proj.out_channels, + kernel_size=new_patch_size, + stride=new_patch_size, + bias=use_bias, + ) + + if use_bias: + model.patch_embed.proj.bias = patch_embed_proj_bias + + model.patch_embed.proj.weight = torch.nn.Parameter(new_patch_embed_proj) + + model.patch_size = new_patch_size + model.patch_embed.patch_size = new_patch_size + model.patch_embed.img_size = ( + int( + model.patch_embed.img_size[0] + * new_patch_size[0] + / old_patch_size[0] + ), + int( + model.patch_embed.img_size[1] + * new_patch_size[1] + / old_patch_size[1] + ), + ) + + return model diff --git a/finetune/modules/depth_warping/depth_pro/network/vit_factory.py b/finetune/modules/depth_warping/depth_pro/network/vit_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..2cd899f650978043c2c83348670beaf597e9ca30 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/network/vit_factory.py @@ -0,0 +1,124 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. +# Factory functions to build and load ViT models. + + +from __future__ import annotations + +import logging +import types +from dataclasses import dataclass +from typing import Dict, List, Literal, Optional + +import timm +import torch +import torch.nn as nn + +from .vit import ( + forward_features_eva_fixed, + make_vit_b16_backbone, + resize_patch_embed, + resize_vit, +) + +LOGGER = logging.getLogger(__name__) + + +ViTPreset = Literal[ + "dinov2l16_384", +] + + +@dataclass +class ViTConfig: + """Configuration for ViT.""" + + in_chans: int + embed_dim: int + + img_size: int = 384 + patch_size: int = 16 + + # In case we need to rescale the backbone when loading from timm. + timm_preset: Optional[str] = None + timm_img_size: int = 384 + timm_patch_size: int = 16 + + # The following 2 parameters are only used by DPT. See dpt_factory.py. + encoder_feature_layer_ids: List[int] = None + """The layers in the Beit/ViT used to constructs encoder features for DPT.""" + encoder_feature_dims: List[int] = None + """The dimension of features of encoder layers from Beit/ViT features for DPT.""" + + +VIT_CONFIG_DICT: Dict[ViTPreset, ViTConfig] = { + "dinov2l16_384": ViTConfig( + in_chans=3, + embed_dim=1024, + encoder_feature_layer_ids=[5, 11, 17, 23], + encoder_feature_dims=[256, 512, 1024, 1024], + img_size=384, + patch_size=16, + timm_preset="vit_large_patch14_dinov2", + timm_img_size=518, + timm_patch_size=14, + ), +} + + +def create_vit( + preset: ViTPreset, + use_pretrained: bool = False, + checkpoint_uri: str | None = None, + use_grad_checkpointing: bool = False, +) -> nn.Module: + """Create and load a VIT backbone module. 
+ + Args: + ---- + preset: The VIT preset to load the pre-defined config. + use_pretrained: Load pretrained weights if True, default is False. + checkpoint_uri: Checkpoint to load the wights from. + use_grad_checkpointing: Use grandient checkpointing. + + Returns: + ------- + A Torch ViT backbone module. + + """ + config = VIT_CONFIG_DICT[preset] + + img_size = (config.img_size, config.img_size) + patch_size = (config.patch_size, config.patch_size) + + if "eva02" in preset: + model = timm.create_model(config.timm_preset, pretrained=use_pretrained) + model.forward_features = types.MethodType(forward_features_eva_fixed, model) + else: + model = timm.create_model( + config.timm_preset, pretrained=use_pretrained, dynamic_img_size=True + ) + model = make_vit_b16_backbone( + model, + encoder_feature_dims=config.encoder_feature_dims, + encoder_feature_layer_ids=config.encoder_feature_layer_ids, + vit_features=config.embed_dim, + use_grad_checkpointing=use_grad_checkpointing, + ) + if config.patch_size != config.timm_patch_size: + model.model = resize_patch_embed(model.model, new_patch_size=patch_size) + if config.img_size != config.timm_img_size: + model.model = resize_vit(model.model, img_size=img_size) + + if checkpoint_uri is not None: + state_dict = torch.load(checkpoint_uri, map_location="cpu") + missing_keys, unexpected_keys = model.load_state_dict( + state_dict=state_dict, strict=False + ) + + if len(unexpected_keys) != 0: + raise KeyError(f"Found unexpected keys when loading vit: {unexpected_keys}") + if len(missing_keys) != 0: + raise KeyError(f"Keys are missing when loading vit: {missing_keys}") + + LOGGER.info(model) + return model.model diff --git a/finetune/modules/depth_warping/depth_pro/utils.py b/finetune/modules/depth_warping/depth_pro/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0a401def2e1d6a2dd96b204e962569e9da5e0ef1 --- /dev/null +++ b/finetune/modules/depth_warping/depth_pro/utils.py @@ -0,0 +1,112 @@ +# Copyright (C) 2024 Apple Inc. All Rights Reserved. + +import logging +from pathlib import Path +from typing import Any, Dict, List, Tuple, Union + +import numpy as np +import pillow_heif +from PIL import ExifTags, Image, TiffTags +from pillow_heif import register_heif_opener + +register_heif_opener() +LOGGER = logging.getLogger(__name__) + + +def extract_exif(img_pil: Image) -> Dict[str, Any]: + """Return exif information as a dictionary. + + Args: + ---- + img_pil: A Pillow image. + + Returns: + ------- + A dictionary with extracted EXIF information. + + """ + # Get full exif description from get_ifd(0x8769): + # cf https://pillow.readthedocs.io/en/stable/releasenotes/8.2.0.html#image-getexif-exif-and-gps-ifd + img_exif = img_pil.getexif().get_ifd(0x8769) + exif_dict = {ExifTags.TAGS[k]: v for k, v in img_exif.items() if k in ExifTags.TAGS} + + tiff_tags = img_pil.getexif() + tiff_dict = { + TiffTags.TAGS_V2[k].name: v + for k, v in tiff_tags.items() + if k in TiffTags.TAGS_V2 + } + return {**exif_dict, **tiff_dict} + + +def fpx_from_f35(width: float, height: float, f_mm: float = 50) -> float: + """Convert a focal length given in mm (35mm film equivalent) to pixels.""" + return f_mm * np.sqrt(width**2.0 + height**2.0) / np.sqrt(36**2 + 24**2) + + +def load_rgb( + path: Union[Path, str], auto_rotate: bool = True, remove_alpha: bool = True +) -> Tuple[np.ndarray, List[bytes], float]: + """Load an RGB image. + + Args: + ---- + path: The url to the image to load. + auto_rotate: Rotate the image based on the EXIF data, default is True. 
+ remove_alpha: Remove the alpha channel, default is True. + + Returns: + ------- + img: The image loaded as a numpy array. + icc_profile: The color profile of the image. + f_px: The optional focal length in pixels, extracting from the exif data. + + """ + LOGGER.debug(f"Loading image {path} ...") + + path = Path(path) + if path.suffix.lower() in [".heic"]: + heif_file = pillow_heif.open_heif(path, convert_hdr_to_8bit=True) + img_pil = heif_file.to_pillow() + else: + img_pil = Image.open(path) + + img_exif = extract_exif(img_pil) + icc_profile = img_pil.info.get("icc_profile", None) + + # Rotate the image. + if auto_rotate: + exif_orientation = img_exif.get("Orientation", 1) + if exif_orientation == 3: + img_pil = img_pil.transpose(Image.ROTATE_180) + elif exif_orientation == 6: + img_pil = img_pil.transpose(Image.ROTATE_270) + elif exif_orientation == 8: + img_pil = img_pil.transpose(Image.ROTATE_90) + elif exif_orientation != 1: + LOGGER.warning(f"Ignoring image orientation {exif_orientation}.") + + img = np.array(img_pil) + # Convert to RGB if single channel. + if img.ndim < 3 or img.shape[2] == 1: + img = np.dstack((img, img, img)) + + if remove_alpha: + img = img[:, :, :3] + + LOGGER.debug(f"\tHxW: {img.shape[0]}x{img.shape[1]}") + + # Extract the focal length from exif data. + f_35mm = img_exif.get( + "FocalLengthIn35mmFilm", + img_exif.get( + "FocalLenIn35mmFilm", img_exif.get("FocalLengthIn35mmFormat", None) + ), + ) + if f_35mm is not None and f_35mm > 0: + LOGGER.debug(f"\tfocal length @ 35mm film: {f_35mm}mm") + f_px = fpx_from_f35(img.shape[1], img.shape[0], f_35mm) + else: + f_px = None + + return img, icc_profile, f_px diff --git a/finetune/modules/depth_warping/depth_warping.py b/finetune/modules/depth_warping/depth_warping.py new file mode 100644 index 0000000000000000000000000000000000000000..7ac2d565907a1e94aef0d5392052eaf502c646e6 --- /dev/null +++ b/finetune/modules/depth_warping/depth_warping.py @@ -0,0 +1,106 @@ +from transformers import pipeline +from PIL import Image +import requests +import torchvision +import os +from .camera.WarperPytorch import Warper +import numpy as np +from einops import rearrange, repeat +import torch +import torch.nn as nn +import torch.nn.functional as F +from .depth_anything_v2.dpt import DepthAnythingV2 + +import pdb + +def to_pil_image(x): + # x: c h w, [-1, 1] + x_np = ((x+1)/2*255).permute(1,2,0).detach().cpu().numpy().astype(np.uint8) + x_pil = Image.fromarray(x_np) + + return x_pil + +def to_npy(x): + return ((x+1)/2*255).permute(1,2,0).detach().cpu().numpy() + +def unnormalize_intrinsic(x, size): + h, w = size + x_ = x.detach().clone() + x_[:,0:1] = x[:,0:1].detach().clone() * w + x_[:,1:2] = x[:,1:2].detach().clone() * h + return x_ + +class DepthWarping_wrapper(nn.Module): + def __init__(self, + model_config, + ckpt_path,): + super().__init__() + + # self.depth_model = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf") + self.depth_model = DepthAnythingV2(**model_config) + self.depth_model.load_state_dict(torch.load(ckpt_path, map_location='cpu')) + self.depth_model = self.depth_model.eval() + self.warper = Warper() + + def get_input(self, batch): + # pdb.set_trace() + + b, v = batch["target"]["intrinsics"].shape[:2] + h, w = batch["context"]["image"].shape[-2:] + + + image = (batch["context"]["image"]) * 2 - 1 + image_ctxt = repeat(image, "b c h w -> (b v) c h w", v=v) + c2w_ctxt = repeat(batch["context"]["extrinsics"], "b t h w -> (b v t) h w", v=v) # No need to apply inverse as it is an eye 
matrix. + # c2w_trgt = rearrange(torch.inverse(batch["target"]["extrinsics"]), "b t h w -> (b t) h w") + c2w_trgt = rearrange(batch["target"]["extrinsics"], "b t h w -> (b t) h w") + intrinsics_ctxt = unnormalize_intrinsic(repeat(batch["context"]["intrinsics"], "b t h w -> (b v t) h w", v=v), size=(h,w)) + intrinsics_trgt = unnormalize_intrinsic(rearrange(batch["target"]["intrinsics"], "b t h w -> (b t) h w"), size=(h,w)) + + # image = image.squeeze(1) + # depth_ctxt = torch.stack([torch.tensor(self.depth_model.infer_image(to_npy(x))) for x in image], dim=0).to(image.device).unsqueeze(1) # B 1 H W + depth_ctxt = torch.stack([self.depth_model.infer_image(to_npy(x)) for x in image], dim=0).to(image.device).unsqueeze(1) # B 1 H W + + # depth_ctxt = torch.nn.functional.interpolate( + # depth_ctxt, + # size=(h,w), + # mode="bicubic", + # align_corners=False, + # ) + + return image_ctxt, c2w_ctxt, c2w_trgt, intrinsics_ctxt, intrinsics_trgt, depth_ctxt, batch['variable_intrinsic'] + + def forward(self, batch): + image_ctxt, c2w_ctxt, c2w_trgt, intrinsics_ctxt, intrinsics_trgt, depth_ctxt, variable_intrinsic = self.get_input(batch) + + with torch.cuda.amp.autocast(enabled=False): + + b, v = batch["target"]["intrinsics"].shape[:2] + # h, w = image_ctxt.shape[-2:] + + warped_trgt, mask_trgt, warped_depth_trgt, flow_f = self.warper.forward_warp( + frame1=image_ctxt, + mask1=None, + depth1=repeat(depth_ctxt, "b c h w -> (b t) c h w", t=v), + transformation1=c2w_ctxt, + transformation2=c2w_trgt, + intrinsic1=intrinsics_ctxt, + intrinsic2=intrinsics_trgt if variable_intrinsic else None) + + warped_src, mask_src, warped_depth_src, flow_b = self.warper.forward_warp( + frame1=warped_trgt, + mask1=None, + depth1=warped_depth_trgt, + transformation1=c2w_trgt, + transformation2=c2w_ctxt, + intrinsic1=intrinsics_trgt, + intrinsic2=None) + + # if use_backward_flow: + # mask = mask_trgt + # else: + # mask = mask_src + + return flow_f, flow_b, warped_trgt, depth_ctxt, warped_depth_trgt + + diff --git a/finetune/modules/utils.py b/finetune/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bd74370bdf2841a168d817f59b7bfc17fb63bade --- /dev/null +++ b/finetune/modules/utils.py @@ -0,0 +1,505 @@ +import importlib +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat + +import pdb + +class Camera(object): + def __init__(self, entry): + fx, fy, cx, cy = entry[:4] + self.fx = fx + self.fy = fy + self.cx = cx + self.cy = cy + w2c_mat = np.array(entry[6:]).reshape(3, 4) + w2c_mat_4x4 = np.eye(4) + w2c_mat_4x4[:3, :] = w2c_mat + self.w2c_mat = w2c_mat_4x4 + self.c2w_mat = np.linalg.inv(w2c_mat_4x4) + +def get_relative_pose(cam_params, zero_first_frame_scale): + abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params] + abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params] + source_cam_c2w = abs_c2ws[0] + if zero_first_frame_scale: + cam_to_origin = 0 + else: + cam_to_origin = np.linalg.norm(source_cam_c2w[:3, 3]) + target_cam_c2w = np.array([ + [1, 0, 0, 0], + [0, 1, 0, -cam_to_origin], + [0, 0, 1, 0], + [0, 0, 0, 1] + ]) + abs2rel = target_cam_c2w @ abs_w2cs[0] + ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]] + ret_poses = np.array(ret_poses, dtype=np.float32) + return ret_poses + +def get_K(intrinsics, size): + def normalize_intrinsic(x, size): + h, w = size + x[:,:,0:1] = x[:,:,0:1] / w + x[:,:,1:2] = x[:,:,1:2] / h + return x + + b, _, t, _ = intrinsics.shape + K = 
torch.zeros((b, t, 9), dtype=intrinsics.dtype, device=intrinsics.device) + fx, fy, cx, cy = intrinsics.squeeze(1).chunk(4, dim=-1) + + K[:,:,0:1] = fx + K[:,:,2:3] = cx + K[:,:,4:5] = fy + K[:,:,5:6] = cy + K[:,:,8:9] = 1.0 + + K = rearrange(K, "b t (h w) -> b t h w", h=3, w=3) + K = normalize_intrinsic(K, size) + + return K + +def get_camera_flow_generator_input(condition_image, camparams, device, speed=1.0): + """ + Args + - condition_image: [c h w], scale~[0,255] + - camparam: [b, 18] (fx, fy, cx, cy, 0, 0, 3x4 Rt matrix), W2C. + - intrinsic: [b, 1, t, 4] (fx, fy, cx, cy) + - c2w: [b, 1, t, 4, 4] + """ + + condition_image = condition_image.unsqueeze(0)/255. # bchw, scale~[0,1] + sample_size = condition_image.shape[2:] + + cam_params = [[float(x) for x in camparam] for camparam in camparams] + cam_params = [Camera(cam_param) for cam_param in cam_params] + + intrinsic = np.asarray([[cam_param.fx * sample_size[1], + cam_param.fy * sample_size[0], + cam_param.cx * sample_size[1], + cam_param.cy * sample_size[0]] + for cam_param in cam_params], dtype=np.float32) + + intrinsic = torch.as_tensor(intrinsic).unsqueeze(0).unsqueeze(0) # [1, 1, f, 4] + + c2w = get_relative_pose(cam_params, zero_first_frame_scale=True) + c2w[:, :3, -1] = c2w[:, :3, -1] * speed + c2w = torch.as_tensor(c2w) + + c2w = c2w.unsqueeze(0) + b = condition_image.shape[0] + t = c2w.shape[1] + K = get_K(intrinsic, size=condition_image.shape[2:]) # [b t 3 3] + c2w_dummy = repeat(torch.eye(4, dtype=c2w.dtype, device=device), "h w -> b 1 h w", b=c2w.shape[0]) + + t = 1 + assert t == 1, "We use single image setting in 3D estimation networks! Now, you use more than one image for the context view." + + batch = dict() + batch['context'] = { + 'image': condition_image, + 'intrinsics': K[:,:1], + 'extrinsics': c2w_dummy, + 'near': torch.ones((b, t), device=device), + 'far': torch.ones((b, t), device=device) * 100, + 'index': torch.arange(t).to(device) + } + + b, t = c2w.shape[:2] + + batch['target'] = { + 'intrinsics': K, + 'extrinsics': c2w, + 'near': torch.ones((b, t), device=device), + 'far': torch.ones((b, t), device=device) * 100, + 'index': repeat(torch.arange(t).to(device), "t -> b t", b=b) + } + + batch['scene'] = 'random' + batch['variable_intrinsic'] = None + return batch + +def to_zero_to_one(x): + return (x+1)/2 + + + +def instantiate_from_config(config, **additional_kwargs): + if not "target" in config: + if config == '__is_first_stage__': + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + + additional_kwargs.update(config.get("kwargs", dict())) + return get_obj_from_str(config["target"])(**additional_kwargs) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +def warp_image(image, flow, use_forward_flow=True): + """ + Args + image: context image (src view image) + flow: forward (src -> trgt) or backward optical flow (trgt -> src) + """ + assert image.ndim==4 and flow.ndim==4 + + h, w = flow.shape[2:] + if use_forward_flow: + flow = -flow + + # Create a mesh grid + meshgrid = torch.meshgrid(torch.arange(w), torch.arange(h), indexing='xy') + grid = torch.stack(meshgrid, dim=2).float().to(image.device) # Shape: (h, w, 2) + + # Apply flow to the grid + flow_map = repeat(grid, "h w c -> b h w c", b=flow.shape[0]) + flow.permute(0, 2, 3, 1) # Permute to 
match grid shape (h, w, 2) + + # Normalize the flow map to [-1, 1] range for grid_sample + flow_map[..., 0] = 2.0 * flow_map[..., 0] / max(w - 1, 1) - 1.0 + flow_map[..., 1] = 2.0 * flow_map[..., 1] / max(h - 1, 1) - 1.0 + + # Warp image using grid_sample + warped_image = F.grid_sample(image, flow_map, mode='bilinear', align_corners=True) + + # Create the unobserved mask + # observed_mask = (flow_map[..., 0] >= -1.0) & (flow_map[..., 0] <= 1.0) & (flow_map[..., 1] >= -1.0) & (flow_map[..., 1] <= 1.0) + + return warped_image + +def forward_bilinear_splatting(image, flow, mask=None): + """ + Forward warping (splatting) with bilinear interpolation for an entire batch at once. + + Args: + image: (B, 3, H, W) # source image + flow: (B, 2, H, W) # forward flow (dx, dy) + mask: (B, 1, H, W) # 1: valid, 0: invalid + + Returns: + warped: (B, 3, H, W) # forward-warped result + """ + + device = image.device + B, C_i, H, W = image.shape + + if mask is None: + mask = torch.ones(B, 1, H, W).to(device, flow.dtype) + + assert C_i == 3, f"image must have 3 channels (got {C_i})" + assert flow.shape == (B, 2, H, W), "flow must have shape (B,2,H,W)" # (BF)CHW, C=2 + assert mask.shape == (B, 1, H, W), "mask must have shape (B,1,H,W)" # (BF)CHW, C=1 + + # (B,3,H,W) -> (B,H,W,3) + image_bhwc = image.permute(0, 2, 3, 1).contiguous() # (B,H,W,3) + # (B,2,H,W) -> (B,H,W,2) + flow_bhwt = flow.permute(0, 2, 3, 1).contiguous() # (B,H,W,2) + # (B,1,H,W) -> (B,H,W) + mask_bhw = mask.view(B, H, W) # (B,H,W) + + # Flatten to 1D so the results can be accumulated with scatter_add later + # source pixel values (B*H*W, 3) + image_flat = image_bhwc.view(-1, C_i) + # flow (B*H*W, 2) + flow_flat = flow_bhwt.view(-1, 2) + # mask (B*H*W,) + mask_flat = mask_bhw.view(-1) + + # Build the (batch b, y, x) coordinates of every pixel as 1D tensors + b_grid = torch.arange(B, device=device).view(B,1,1).expand(-1,H,W) # (B,H,W) + y_grid = torch.arange(H, device=device).view(1,H,1).expand(B,-1,W) + x_grid = torch.arange(W, device=device).view(1,1,W).expand(B,H,-1) + + b_idx = b_grid.flatten() # (B*H*W) + y_idx = y_grid.flatten() + x_idx = x_grid.flatten() + + # Apply the flow: (x+dx, y+dy) + dx = flow_flat[:, 0] + dy = flow_flat[:, 1] + tx = x_idx + dx # float + ty = y_idx + dy # float + + # floor/ceil for bilinear interpolation + tx0 = tx.floor().long() + tx1 = tx0 + 1 + ty0 = ty.floor().long() + ty1 = ty0 + 1 + + alpha = tx - tx.floor() # (B*H*W) + beta = ty - ty.floor() + + # In-bounds check & mask + valid = ((mask_flat == 1) & + (tx0 >= 0) & (tx1 < W) & + (ty0 >= 0) & (ty1 < H)) + valid_idx = valid.nonzero(as_tuple=True) # (N,) + + # Index only the valid entries + v_b = b_idx[valid_idx] # (N,) + v_x0 = tx0[valid_idx] + v_x1 = tx1[valid_idx] + v_y0 = ty0[valid_idx] + v_y1 = ty1[valid_idx] + v_alpha = alpha[valid_idx] + v_beta = beta[valid_idx] + v_src = image_flat[valid_idx] # (N,3) + + # bilinear weights + w00 = (1 - v_alpha) * (1 - v_beta) + w01 = v_alpha * (1 - v_beta) + w10 = (1 - v_alpha) * v_beta + w11 = v_alpha * v_beta + + # Output buffer (B,H,W,3) and weight map (B,H,W) + warped_bhwc = torch.zeros_like(image_bhwc) # (B,H,W,3) + weight_map = torch.zeros((B, H, W), dtype=image.dtype, device=device) + + # Flatten back to (B*H*W) + warped_flat = warped_bhwc.view(-1, C_i) # (B*H*W,3) + weight_flat = weight_map.view(-1) # (B*H*W,) + + # Convert (b, y, x) to a flat (B,H,W) index + # offset_b = b*(H*W), then y*W + x + def flatten_index(b, y, x): + return b*(H*W) + (y * W) + x + + i00 = flatten_index(v_b, v_y0, v_x0) + i01 = flatten_index(v_b, v_y0, v_x1) + i10 = flatten_index(v_b, v_y1, v_x0) + i11 = flatten_index(v_b, v_y1, v_x1) + + # Accumulate via scatter_add + warped_flat.index_add_(0, i00, w00.unsqueeze(-1) * v_src) +
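    # Unlike warp_image() above, which gathers from the source via grid_sample and a
    # backward flow, this routine scatters ("splats") each source pixel to its
    # flow-displaced location and then normalizes by the accumulated bilinear weights.
    # Illustrative usage (a sketch; assumes this file is importable as
    # finetune.modules.utils):
    #
    #   import torch
    #   from finetune.modules.utils import forward_bilinear_splatting
    #
    #   img = torch.rand(2, 3, 64, 64)    # source frames
    #   flow = torch.zeros(2, 2, 64, 64)  # forward flow (dx, dy) in pixels
    #   flow[:, 0] = 5.0                  # shift everything 5 px to the right
    #   warped = forward_bilinear_splatting(img, flow)  # -> (2, 3, 64, 64)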
warped_flat.index_add_(0, i01, w01.unsqueeze(-1) * v_src) + warped_flat.index_add_(0, i10, w10.unsqueeze(-1) * v_src) + warped_flat.index_add_(0, i11, w11.unsqueeze(-1) * v_src) + + weight_flat.index_add_(0, i00, w00) + weight_flat.index_add_(0, i01, w01) + weight_flat.index_add_(0, i10, w10) + weight_flat.index_add_(0, i11, w11) + + # Divide the accumulated values by the weights to get the final colors + w_valid = (weight_flat > 0) + warped_flat[w_valid] /= weight_flat[w_valid].unsqueeze(-1) + + # Reshape back to (B,H,W,3), then permute to (B,3,H,W) + warped_bhwc = warped_flat.view(B, H, W, C_i) + warped = warped_bhwc.permute(0, 3, 1, 2).contiguous() # (B,3,H,W) + + return warped + + +def run_filtering(flow_f, flow_b, cycle_th=3.): + """ + Args: + flow_f: b 2 h w + flow_b: b 2 h w + cycle_th: distance threshold for inconsistency (e.g., 3.0 pixel) + Returns: + valid_mask: binary mask (0: Not consistent or 1: consistent), float, [b 1 h w] + """ + assert flow_f.ndim == 4 and flow_b.ndim == 4 + + device = flow_f.device + h, w = flow_f.shape[-2:] + num_imgs = flow_f.shape[0] + + flow_f = flow_f + flow_b = flow_b + + grid = repeat(gen_grid(h, w, device=device).permute(2, 0, 1)[None], "b c h w -> (b v) c h w", v=num_imgs) + + coord2 = flow_f + grid + coord2_normed = normalize_coords(coord2.permute(0, 2, 3, 1), h, w) + flow_21_sampled = F.grid_sample(flow_b, coord2_normed, align_corners=True) + map_i = flow_f + flow_21_sampled + fb_discrepancy = torch.norm(map_i.squeeze(), dim=1) + valid_mask = fb_discrepancy < cycle_th + + return valid_mask.unsqueeze(1).float() + + +def gen_grid(h, w, device, normalize=False, homogeneous=False): + if normalize: + lin_y = torch.linspace(-1., 1., steps=h, device=device) + lin_x = torch.linspace(-1., 1., steps=w, device=device) + else: + lin_y = torch.arange(0, h, device=device) + lin_x = torch.arange(0, w, device=device) + grid_y, grid_x = torch.meshgrid((lin_y, lin_x)) + grid = torch.stack((grid_x, grid_y), -1) + if homogeneous: + grid = torch.cat([grid, torch.ones_like(grid[..., :1])], dim=-1) + return grid # [h, w, 2 or 3] + + +def normalize_coords(coords, h, w, no_shift=False): + assert coords.shape[-1] == 2 + if no_shift: + return coords / torch.tensor([w-1., h-1.], device=coords.device) * 2 + else: + return coords / torch.tensor([w-1., h-1.], device=coords.device) * 2 - 1. + +#-------------------------------------------------------------------------------------------------------------- +# Code borrowed from https://github.com/ChristophReich1996/Optical-Flow-Visualization-PyTorch + +from typing import Optional, Union + +import torch +from math import pi as PI + + +def get_color_wheel(device: torch.device) -> torch.Tensor: + """ + Generates the color wheel.
+ :param device: (torch.device) Device to be used + :return: (torch.Tensor) Color wheel tensor of the shape [55, 3] + """ + # Set constants + RY: int = 15 + YG: int = 6 + GC: int = 4 + CB: int = 11 + BM: int = 13 + MR: int = 6 + # Init color wheel + color_wheel: torch.Tensor = torch.zeros((RY + YG + GC + CB + BM + MR, 3), dtype=torch.float32) + # Init counter + counter: int = 0 + # RY + color_wheel[0:RY, 0] = 255 + color_wheel[0:RY, 1] = torch.floor(255 * torch.arange(0, RY) / RY) + counter: int = counter + RY + # YG + color_wheel[counter:counter + YG, 0] = 255 - torch.floor(255 * torch.arange(0, YG) / YG) + color_wheel[counter:counter + YG, 1] = 255 + counter: int = counter + YG + # GC + color_wheel[counter:counter + GC, 1] = 255 + color_wheel[counter:counter + GC, 2] = torch.floor(255 * torch.arange(0, GC) / GC) + counter: int = counter + GC + # CB + color_wheel[counter:counter + CB, 1] = 255 - torch.floor(255 * torch.arange(CB) / CB) + color_wheel[counter:counter + CB, 2] = 255 + counter: int = counter + CB + # BM + color_wheel[counter:counter + BM, 2] = 255 + color_wheel[counter:counter + BM, 0] = torch.floor(255 * torch.arange(0, BM) / BM) + counter: int = counter + BM + # MR + color_wheel[counter:counter + MR, 2] = 255 - torch.floor(255 * torch.arange(MR) / MR) + color_wheel[counter:counter + MR, 0] = 255 + # To device + color_wheel: torch.Tensor = color_wheel.to(device) + return color_wheel + + +def _flow_hw_to_color(flow_vertical: torch.Tensor, flow_horizontal: torch.Tensor, + color_wheel: torch.Tensor, device: torch.device) -> torch.Tensor: + """ + Private function applies the flow color wheel to flow components (vertical and horizontal). + :param flow_vertical: (torch.Tensor) Vertical flow of the shape [height, width] + :param flow_horizontal: (torch.Tensor) Horizontal flow of the shape [height, width] + :param color_wheel: (torch.Tensor) Color wheel tensor of the shape [55, 3] + :param: device: (torch.device) Device to be used + :return: (torch.Tensor) Visualized flow of the shape [3, height, width] + """ + # Get shapes + _, height, width = flow_vertical.shape + # Init flow image + flow_image: torch.Tensor = torch.zeros(3, height, width, dtype=torch.float32, device=device) + # Get number of colors + number_of_colors: int = color_wheel.shape[0] + # Compute norm, angle and factors + flow_norm: torch.Tensor = (flow_vertical ** 2 + flow_horizontal ** 2).sqrt() + angle: torch.Tensor = torch.atan2(- flow_vertical, - flow_horizontal) / PI + fk: torch.Tensor = (angle + 1.) / 2. * (number_of_colors - 1.) + k0: torch.Tensor = torch.floor(fk).long() + k1: torch.Tensor = k0 + 1 + k1[k1 == number_of_colors] = 0 + f: torch.Tensor = fk - k0 + # Iterate over color components + for index in range(color_wheel.shape[1]): + # Get component of all colors + tmp: torch.Tensor = color_wheel[:, index] + # Get colors + color_0: torch.Tensor = tmp[k0] / 255. + color_1: torch.Tensor = tmp[k1] / 255. + # Compute color + color: torch.Tensor = (1. - f) * color_0 + f * color_1 + # Get color index + color_index: torch.Tensor = flow_norm <= 1 + # Set color saturation + color[color_index] = 1 - flow_norm[color_index] * (1. 
- color[color_index]) + color[~color_index] = color[~color_index] * 0.75 + # Set color in image + flow_image[index] = torch.floor(255 * color) + return flow_image + + +def flow_to_color(flow: torch.Tensor, clip_flow: Optional[Union[float, torch.Tensor]] = None, + normalize_over_video: bool = False) -> torch.Tensor: + """ + Function converts a given optical flow map into the classical color schema. + :param flow: (torch.Tensor) Optical flow tensor of the shape [batch size (optional), 2, height, width]. + :param clip_flow: (Optional[Union[float, torch.Tensor]]) Max value of flow values for clipping (default None). + :param normalize_over_video: (bool) If true scale is normalized over the whole video (batch). + :return: (torch.Tensor) Flow visualization (float tensor) with the shape [batch size (if used), 3, height, width]. + """ + # Check parameter types + assert torch.is_tensor(flow), "Given flow map must be a torch.Tensor, {} given".format(type(flow)) + assert torch.is_tensor(clip_flow) or isinstance(clip_flow, float) or clip_flow is None, \ + "Given clip_flow parameter must be a float, a torch.Tensor, or None, {} given".format(type(clip_flow)) + # Check shapes + assert flow.ndimension() in [3, 4], \ + "Given flow must be a 3D or 4D tensor, given tensor shape {}.".format(flow.shape) + if torch.is_tensor(clip_flow): + assert clip_flow.ndimension() == 0, \ + "Given clip_flow tensor must be a scalar, given tensor shape {}.".format(clip_flow.shape) + # Manage batch dimension + batch_dimension: bool = True + if flow.ndimension() == 3: + flow = flow[None] + batch_dimension: bool = False + # Save shape + batch_size, _, height, width = flow.shape + # Check flow dimension + assert flow.shape[1] == 2, "Flow dimension must have the shape 2 but tensor with {} given".format(flow.shape[1]) + # Save device + device: torch.device = flow.device + # Clip flow if utilized + if clip_flow is not None: + flow = flow.clip(max=clip_flow) + # Get horizontal and vertical flow + flow_vertical: torch.Tensor = flow[:, 0:1] + flow_horizontal: torch.Tensor = flow[:, 1:2] + # Get max norm of flow + flow_max_norm: torch.Tensor = (flow_vertical ** 2 + flow_horizontal ** 2).sqrt().view(batch_size, -1).max(dim=-1)[0] + flow_max_norm: torch.Tensor = flow_max_norm.view(batch_size, 1, 1, 1) + if normalize_over_video: + flow_max_norm: Tensor = flow_max_norm.max(dim=0, keepdim=True)[0] + # Normalize flow + flow_vertical: torch.Tensor = flow_vertical / (flow_max_norm + 1e-05) + flow_horizontal: torch.Tensor = flow_horizontal / (flow_max_norm + 1e-05) + # Get color wheel + color_wheel: torch.Tensor = get_color_wheel(device=device) + # Init flow image + flow_image = torch.zeros(batch_size, 3, height, width, device=device) + # Iterate over batch dimension + for index in range(batch_size): + flow_image[index] = _flow_hw_to_color(flow_vertical=flow_vertical[index], + flow_horizontal=flow_horizontal[index], color_wheel=color_wheel, + device=device) + return flow_image if batch_dimension else flow_image[0] \ No newline at end of file diff --git a/finetune/pipeline/flovd_FVSM_cogvideox_controlnet_pipeline.py b/finetune/pipeline/flovd_FVSM_cogvideox_controlnet_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..aa074d338ab10068479df48cd0cd1470f04c543e --- /dev/null +++ b/finetune/pipeline/flovd_FVSM_cogvideox_controlnet_pipeline.py @@ -0,0 +1,932 @@ +import inspect +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import numpy as np +import PIL +from PIL import 
Image +from torchvision import transforms +from einops import rearrange, repeat +from transformers import T5EncoderModel, T5Tokenizer +from diffusers.video_processor import VideoProcessor +from diffusers.utils.torch_utils import randn_tensor +from diffusers.models.embeddings import get_3d_rotary_pos_embed +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel +from diffusers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler, CogVideoXImageToVideoPipeline +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.cogvideo.pipeline_cogvideox import CogVideoXPipelineOutput, CogVideoXLoraLoaderMixin +from diffusers.utils import is_torch_xla_available + +from finetune.modules.cogvideox_controlnet import CogVideoXControlnet + +import pdb + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + + +def resize_for_crop(image, crop_h, crop_w): + img_h, img_w = image.shape[-2:] + if img_h >= crop_h and img_w >= crop_w: + coef = max(crop_h / img_h, crop_w / img_w) + elif img_h <= crop_h and img_w <= crop_w: + coef = max(crop_h / img_h, crop_w / img_w) + else: + coef = crop_h / img_h if crop_h > img_h else crop_w / img_w + out_h, out_w = int(img_h * coef), int(img_w * coef) + resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True) + return resized_image + + +def prepare_frames(input_images, video_size, do_resize=True, do_crop=True): + input_images = np.stack([np.array(x) for x in input_images]) + images_tensor = torch.from_numpy(input_images).permute(0, 3, 1, 2) / 127.5 - 1 + if do_resize: + images_tensor = [resize_for_crop(x, crop_h=video_size[0], crop_w=video_size[1]) for x in images_tensor] + if do_crop: + images_tensor = [transforms.functional.center_crop(x, video_size) for x in images_tensor] + if isinstance(images_tensor, list): + images_tensor = torch.stack(images_tensor) + return images_tensor.unsqueeze(0) + + +def get_resize_crop_region_for_grid(src, tgt_width, tgt_height): + tw = tgt_width + th = tgt_height + h, w = src + r = h / w + if r > (th / tw): + resize_height = th + resize_width = int(round(th / h * w)) + else: + resize_width = tw + resize_height = int(round(tw / w * h)) + + crop_top = int(round((th - resize_height) / 2.0)) + crop_left = int(round((tw - resize_width) / 2.0)) + + return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + + + +class FloVDCogVideoXControlnetImageToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): + r""" + Pipeline for image-to-video generation using CogVideoX. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. CogVideoX uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`CogVideoXTransformer3DModel`]): + A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents. 
+ scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded video latents. + """ + + _optional_components = [] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKLCogVideoX, + transformer: CogVideoXTransformer3DModel, + controlnet: CogVideoXControlnet, + scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + transformer=transformer, + controlnet=controlnet, + scheduler=scheduler, + ) + self.vae_scale_factor_spatial = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + self.vae_scale_factor_temporal = ( + self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4 + ) + self.vae_scaling_factor_image = ( + self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7 + ) + + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial) + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_videos_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos that should be generated per prompt. torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + + negative_prompt_embeds = self._get_t5_prompt_embeds( + prompt=negative_prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + return prompt_embeds, negative_prompt_embeds + + def prepare_latents( + self, + image: torch.Tensor, + batch_size: int = 1, + num_channels_latents: int = 16, + num_frames: int = 13, + height: int = 60, + width: int = 90, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.Tensor] = None, + ): + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1 + shape = ( + batch_size, + num_frames, + num_channels_latents, + height // self.vae_scale_factor_spatial, + width // self.vae_scale_factor_spatial, + ) + + # For CogVideoX1.5, the latent should add 1 for padding (Not use) + if self.transformer.config.patch_size_t is not None: + shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:] + + image = image.unsqueeze(2) # [B, C, F, H, W] + + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size) + ] + else: + image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image] + + image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W] + + if not self.vae.config.invert_scale_latents: + image_latents = self.vae_scaling_factor_image * image_latents + else: + # This is awkward but required because the CogVideoX team forgot to multiply the + # scaling factor during training :) + image_latents = 1 / self.vae_scaling_factor_image * image_latents + + padding_shape = ( + batch_size, + num_frames - 1, + num_channels_latents, + height // self.vae_scale_factor_spatial, + width // self.vae_scale_factor_spatial, + ) + + latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype) + image_latents = torch.cat([image_latents, latent_padding], dim=1) + + # Select the first frame along the second dimension + if self.transformer.config.patch_size_t is not None: + first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...] + image_latents = torch.cat([first_frame, image_latents], dim=1) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents, image_latents + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents + def decode_latents(self, latents: torch.Tensor) -> torch.Tensor: + latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + latents = 1 / self.vae_scaling_factor_image * latents + + frames = self.vae.decode(latents).sample + return frames + + # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, timesteps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + image, + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + latents=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections + def fuse_qkv_projections(self) -> None: + r"""Enables fused QKV projections.""" + self.fusing_transformer = True + self.transformer.fuse_qkv_projections() + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self) -> None: + r"""Disable QKV projection fusion if enabled.""" + if not self.fusing_transformer: + logger.warning("The Transformer was not initially fused for QKV projections. 
Doing nothing.") + else: + self.transformer.unfuse_qkv_projections() + self.fusing_transformer = False + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings + def _prepare_rotary_positional_embeddings( + self, + height: int, + width: int, + num_frames: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + + p = self.transformer.config.patch_size + p_t = self.transformer.config.patch_size_t + + base_size_width = self.transformer.config.sample_width // p + base_size_height = self.transformer.config.sample_height // p + + if p_t is None: + # CogVideoX 1.0 + grid_crops_coords = get_resize_crop_region_for_grid( + (grid_height, grid_width), base_size_width, base_size_height + ) + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=grid_crops_coords, + grid_size=(grid_height, grid_width), + temporal_size=num_frames, + ) + else: + # CogVideoX 1.5 + base_num_frames = (num_frames + p_t - 1) // p_t + + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=None, + grid_size=(grid_height, grid_width), + temporal_size=base_num_frames, + grid_type="slice", + max_size=(base_size_height, base_size_width), + ) + + freqs_cos = freqs_cos.to(device=device) + freqs_sin = freqs_sin.to(device=device) + return freqs_cos, freqs_sin + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + def __call__( + self, + image: PipelineImageInput, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 49, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 6, + use_dynamic_cfg: bool = False, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + flow_latent: Optional[torch.FloatTensor] = None, + valid_mask: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: str = "pil", + return_dict: bool = True, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 226, + controlnet_weights: Optional[Union[float, list, np.ndarray, torch.FloatTensor]] = 1.0, + controlnet_guidance_start: float = 0.0, + controlnet_guidance_end: float = 1.0, # default: 0.4 in argument and AC3D. + ) -> Union[CogVideoXPipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`PipelineImageInput`): + The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`. 
+ prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+ instead.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+ less than `1`).
+ height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial):
+ The height in pixels of the generated image. This is set to 480 by default for the best results.
+ width (`int`, *optional*, defaults to self.transformer.config.sample_width * self.vae_scale_factor_spatial):
+ The width in pixels of the generated image. This is set to 720 by default for the best results.
+ num_frames (`int`, defaults to `49`):
+ Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
+ contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
+ num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that
+ needs to be satisfied is that of divisibility mentioned above.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
+ guidance_scale (`float`, *optional*, defaults to `6`):
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text
+ `prompt`, usually at the expense of lower image quality.
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
+ The number of videos to generate per prompt.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated video.
Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `226`): + Maximum sequence length in encoded prompt. Must be consistent with + `self.transformer.config.max_text_seq_length` otherwise may lead to poor results. + + Examples: + + Returns: + [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`: + [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + # flow_latent : [B, F, C, H, W] (e.g., [B, 13, 16, 60, 90]) + + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial + width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial + num_frames = num_frames or self.transformer.config.sample_frames + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + image=image, + prompt=prompt, + height=height, + width=width, + negative_prompt=negative_prompt, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + latents=latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._current_timestep = None + self._attention_kwargs = attention_kwargs + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + negative_prompt=negative_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds.to(negative_prompt_embeds.device)], dim=0) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 5. Prepare latents + latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1 + + # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t + patch_size_t = self.transformer.config.patch_size_t + additional_frames = 0 + if patch_size_t is not None and latent_frames % patch_size_t != 0: + additional_frames = patch_size_t - latent_frames % patch_size_t + num_frames += additional_frames * self.vae_scale_factor_temporal + + image = self.video_processor.preprocess(image, height=height, width=width).to( + device, dtype=prompt_embeds.dtype + ) + + latent_channels = self.transformer.config.in_channels // 2 + latents, image_latents = self.prepare_latents( + image, + batch_size * num_videos_per_prompt, + latent_channels, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Create rotary embeds if required + image_rotary_emb = ( + self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device) + if self.transformer.config.use_rotary_positional_embeddings + else None + ) + + # 8. Create ofs embeds if required + ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0) + + # 8. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + # for DPM-solver++ + old_pred_original_sample = None + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents + latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2) + + latent_flow_input = torch.cat([flow_latent.to(image_latents.device)] * 2).contiguous() if do_classifier_free_guidance else flow_latent.to(image_latents.device) + + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + current_sampling_percent = i / len(timesteps) + + # valid mask + if valid_mask is not None: + if do_classifier_free_guidance: + valid_mask_input = torch.cat([valid_mask] * 2) + else: + valid_mask_input = valid_mask + else: + valid_mask_input = None + + # Controlnet feedforward + controlnet_states = [] + if (controlnet_guidance_start <= current_sampling_percent < controlnet_guidance_end): + # extract controlnet hidden state + controlnet_states = self.controlnet( + hidden_states=latent_model_input[:, :, :16, :, :], # only for video latent + encoder_hidden_states=prompt_embeds, + image_rotary_emb=image_rotary_emb, + controlnet_hidden_states=latent_flow_input, + timestep=timestep, + controlnet_valid_mask=valid_mask_input, + return_dict=False, + )[0] + if isinstance(controlnet_states, (tuple, list)): + controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states] # each state shape: torch.Size([2, 17550, 384]) + else: + controlnet_states = controlnet_states.to(dtype=self.transformer.dtype) + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + # ofs=ofs_emb, + image_rotary_emb=image_rotary_emb, + # attention_kwargs=attention_kwargs, + controlnet_states=controlnet_states, + controlnet_weights=controlnet_weights, + return_dict=False, + )[0] + noise_pred = noise_pred.float() + + # perform guidance + if use_dynamic_cfg: + self._guidance_scale = 1 + guidance_scale * ( + (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 + ) + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + else: + latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + latents, + **extra_step_kwargs, + return_dict=False, + ) + latents = latents.to(prompt_embeds.dtype) + + # call the callback, if provided + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", 
latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + self._current_timestep = None + + if not output_type == "latent": + # Discard any padding frames that were added for CogVideoX 1.5 + latents = latents[:, additional_frames:] + video = self.decode_latents(latents) + video = self.video_processor.postprocess_video(video=video, output_type=output_type) + else: + video = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return CogVideoXPipelineOutput(frames=video) \ No newline at end of file diff --git a/finetune/pipeline/flovd_OMSM_cogvideox_pipeline.py b/finetune/pipeline/flovd_OMSM_cogvideox_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..9200f952e23abd70df9f0c9df990eb6f998162e6 --- /dev/null +++ b/finetune/pipeline/flovd_OMSM_cogvideox_pipeline.py @@ -0,0 +1,336 @@ +import inspect +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing_extensions import override + +import PIL +import torch +from transformers import T5EncoderModel, T5Tokenizer + +from diffusers import ( + AutoencoderKLCogVideoX, + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXTransformer3DModel, +) + +from diffusers.pipelines.cogvideo.pipeline_output import CogVideoXPipelineOutput +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.cogvideo.pipeline_cogvideox_image2video import retrieve_timesteps +from diffusers.utils import is_torch_xla_available + +import pdb + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +class FloVDOMSMCogVideoXImageToVideoPipeline(CogVideoXImageToVideoPipeline): + @override + def __call__( + self, + image: PipelineImageInput, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 49, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 6, + use_dynamic_cfg: bool = False, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: str = "pil", + return_dict: bool = True, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 226, + ) -> Union[CogVideoXPipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`PipelineImageInput`): + The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. 
+ instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial): + The height in pixels of the generated image. This is set to 480 by default for the best results. + width (`int`, *optional*, defaults to self.transformer.config.sample_height * self.vae_scale_factor_spatial): + The width in pixels of the generated image. This is set to 720 by default for the best results. + num_frames (`int`, defaults to `48`): + Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will + contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where + num_seconds is 6 and fps is 8. However, since videos can be saved at any fps, the only condition that + needs to be satisfied is that of divisibility mentioned above. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `226`): + Maximum sequence length in encoded prompt. Must be consistent with + `self.transformer.config.max_text_seq_length` otherwise may lead to poor results. + + Examples: + + Returns: + [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] or `tuple`: + [`~pipelines.cogvideo.pipeline_output.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial + width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial + num_frames = num_frames or self.transformer.config.sample_frames + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + image=image, + prompt=prompt, + height=height, + width=width, + negative_prompt=negative_prompt, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + latents=latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._current_timestep = None + self._attention_kwargs = attention_kwargs + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + negative_prompt=negative_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds.to(negative_prompt_embeds.device)], dim=0) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 5. Prepare latents + latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1 + + # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t + patch_size_t = self.transformer.config.patch_size_t + additional_frames = 0 + if patch_size_t is not None and latent_frames % patch_size_t != 0: + additional_frames = patch_size_t - latent_frames % patch_size_t + num_frames += additional_frames * self.vae_scale_factor_temporal + + image = self.video_processor.preprocess(image, height=height, width=width).to( + device, dtype=prompt_embeds.dtype + ) + + latent_channels = self.transformer.config.in_channels // 2 + latents, image_latents = self.prepare_latents( + image, + batch_size * num_videos_per_prompt, + latent_channels, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Create rotary embeds if required + image_rotary_emb = ( + self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device) + if self.transformer.config.use_rotary_positional_embeddings + else None + ) + + # 8. Create ofs embeds if required + ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0) + + # 8. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + # for DPM-solver++ + old_pred_original_sample = None + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents + # latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2) + + latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents + latent_model_input = torch.cat([latent_model_input, latent_image_input], dim=2) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + ofs=ofs_emb, + image_rotary_emb=image_rotary_emb, + attention_kwargs=attention_kwargs, + return_dict=False, + )[0] + noise_pred = noise_pred.float() + + # perform guidance + if use_dynamic_cfg: + self._guidance_scale = 1 + guidance_scale * ( + (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 + ) + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + else: + latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + latents, + **extra_step_kwargs, + return_dict=False, + ) + latents = latents.to(prompt_embeds.dtype) + + # call the callback, if provided + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + self._current_timestep = None + + if not output_type == "latent": + # Discard any padding frames that were added for CogVideoX 1.5 + latents = latents[:, additional_frames:] + video = self.decode_latents(latents) + video = self.video_processor.postprocess_video(video=video, output_type=output_type) + else: + video = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return CogVideoXPipelineOutput(frames=video) \ No newline at end of file diff --git a/finetune/schemas/__init__.py b/finetune/schemas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..73f547bf29d5596b842b09c448cdd2f5abaae074 --- /dev/null +++ b/finetune/schemas/__init__.py 
@@ -0,0 +1,6 @@ +from .args import Args +from .components import Components +from .state import State + + +__all__ = ["Args", "State", "Components"] diff --git a/finetune/schemas/args.py b/finetune/schemas/args.py new file mode 100644 index 0000000000000000000000000000000000000000..0e79deefc3fec9c03f26e3ba3ca26c9ddd5cda59 --- /dev/null +++ b/finetune/schemas/args.py @@ -0,0 +1,291 @@ +import argparse +import datetime +import logging +from pathlib import Path +from typing import Any, List, Literal, Tuple + +from pydantic import BaseModel, ValidationInfo, field_validator + + +class Args(BaseModel): + ########## Model ########## + model_path: Path + model_name: str + model_type: Literal["i2v", "t2v", "i2vFlow"] # i2vFlow for FloVD + training_type: Literal["lora", "sft", "controlnet"] = "lora" + additional_save_blocks: List[str] | None = None + depth_ckpt_path: str + + ########## Output ########## + output_dir: Path = Path("train_results/{:%Y-%m-%d-%H-%M-%S}".format(datetime.datetime.now())) + report_to: Literal["tensorboard", "wandb", "all"] | None = None + tracker_name: str = "finetrainer-cogvideo" + run_name: str = "CogVideoX" + + ########## Data ########### + data_root: Path + caption_column: Path + image_column: Path | None = None + video_column: Path + + ########## Training ######### + resume_from_checkpoint: Path | None = None + + seed: int | None = None + train_epochs: int + train_steps: int | None = None + checkpointing_steps: int = 200 + checkpointing_limit: int = 10 + + batch_size: int + gradient_accumulation_steps: int = 1 + + train_resolution: Tuple[int, int, int] # shape: (frames, height, width) + + #### deprecated args: video_resolution_buckets + # if use bucket for training, should not be None + # Note1: At least one frame rate in the bucket must be less than or equal to the frame rate of any video in the dataset + # Note2: For cogvideox, cogvideox1.5 + # The frame rate set in the bucket must be an integer multiple of 8 (spatial_compression_rate[4] * path_t[2] = 8) + # The height and width set in the bucket must be an integer multiple of 8 (temporal_compression_rate[8]) + # video_resolution_buckets: List[Tuple[int, int, int]] | None = None + + mixed_precision: Literal["no", "fp16", "bf16"] + + learning_rate: float = 2e-5 + optimizer: str = "adamw" + beta1: float = 0.9 + beta2: float = 0.95 + beta3: float = 0.98 + epsilon: float = 1e-8 + weight_decay: float = 1e-4 + max_grad_norm: float = 1.0 + + lr_scheduler: str = "constant_with_warmup" + lr_warmup_steps: int = 100 + lr_num_cycles: int = 1 + lr_power: float = 1.0 + + num_workers: int = 8 + pin_memory: bool = True + + gradient_checkpointing: bool = True + enable_slicing: bool = True + enable_tiling: bool = True + nccl_timeout: int = 1800 + + ########## Lora ########## + rank: int = 128 + lora_alpha: int = 64 + target_modules: List[str] = ["to_q", "to_k", "to_v", "to_out.0"] + + ########## Validation ########## + do_validation: bool = False + validation_steps: int | None # if set, should be a multiple of checkpointing_steps + validation_dir: Path | None # if set do_validation, should not be None + validation_prompts: str | None # if set do_validation, should not be None + validation_images: str | None # if set do_validation and model_type == i2v, should not be None + validation_videos: str | None # if set do_validation and model_type == v2v, should not be None + gen_fps: int = 15 + max_scene: int = 8 + + ########## Controlnet ########## + controlnet_transformer_num_layers: int = 8 + controlnet_input_channels: int = 16 + 
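+ # These mirror the ControlNet knobs used by the FloVD pipeline above: the ControlNet branch is only
+ # evaluated while controlnet_guidance_start <= i / len(timesteps) < controlnet_guidance_end in the
+ # denoising loop, and controlnet_weights scales how strongly its states are passed to the transformer.
+ # For example, controlnet_guidance_end = 0.4 restricts flow guidance to the first 40% of sampling steps.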
controlnet_weights: float = 1.0 + controlnet_guidance_start: float = 0.0 + controlnet_guidance_end: float = 1.0 + controlnet_out_proj_dim_factor: int = 64 + controlnet_out_proj_zero_init: bool = True + enable_time_sampling: bool = True + time_sampling_type: str = 'truncated_normal' + time_sampling_mean: float = 0.95 + time_sampling_std: float = 0.1 + use_valid_mask: bool = False + notextinflow: bool = False + + #### deprecated args: gen_video_resolution + # 1. If set do_validation, should not be None + # 2. Suggest selecting the bucket from `video_resolution_buckets` that is closest to the resolution you have chosen for fine-tuning + # or the resolution recommended by the model + # 3. Note: For cogvideox, cogvideox1.5 + # The frame rate set in the bucket must be an integer multiple of 8 (spatial_compression_rate[4] * path_t[2] = 8) + # The height and width set in the bucket must be an integer multiple of 8 (temporal_compression_rate[8]) + # gen_video_resolution: Tuple[int, int, int] | None # shape: (frames, height, width) + + @field_validator("image_column") + def validate_image_column(cls, v: str | None, info: ValidationInfo) -> str | None: + values = info.data + if values.get("model_type") == "i2v" and not v: + logging.warning( + "No `image_column` specified for i2v model. Will automatically extract first frames from videos as conditioning images." + ) + return v + + @field_validator("validation_dir", "validation_prompts") + def validate_validation_required_fields(cls, v: Any, info: ValidationInfo) -> Any: + values = info.data + if values.get("do_validation") and not v: + field_name = info.field_name + raise ValueError(f"{field_name} must be specified when do_validation is True") + return v + + @field_validator("validation_images") + def validate_validation_images(cls, v: str | None, info: ValidationInfo) -> str | None: + values = info.data + if values.get("do_validation") and values.get("model_type") == "i2v" and not v: + raise ValueError("validation_images must be specified when do_validation is True and model_type is i2v") + return v + + @field_validator("validation_videos") + def validate_validation_videos(cls, v: str | None, info: ValidationInfo) -> str | None: + values = info.data + if values.get("do_validation") and values.get("model_type") == "v2v" and not v: + raise ValueError("validation_videos must be specified when do_validation is True and model_type is v2v") + return v + + @field_validator("validation_steps") + def validate_validation_steps(cls, v: int | None, info: ValidationInfo) -> int | None: + values = info.data + if values.get("do_validation"): + if v is None: + raise ValueError("validation_steps must be specified when do_validation is True") + if values.get("checkpointing_steps") and v % values["checkpointing_steps"] != 0: + raise ValueError("validation_steps must be a multiple of checkpointing_steps") + return v + + @field_validator("train_resolution") + def validate_train_resolution(cls, v: Tuple[int, int, int], info: ValidationInfo) -> str: + try: + frames, height, width = v + + # Check if (frames - 1) is multiple of 8 + if (frames - 1) % 8 != 0: + raise ValueError("Number of frames - 1 must be a multiple of 8") + + # Check resolution for cogvideox-5b models + model_name = info.data.get("model_name", "") + if model_name in ["cogvideox-5b-i2v", "cogvideox-5b-t2v"]: + if (height, width) != (480, 720): + raise ValueError("For cogvideox-5b models, height must be 480 and width must be 720") + + return v + + except ValueError as e: + if ( + str(e) == "not enough values 
to unpack (expected 3, got 0)" + or str(e) == "invalid literal for int() with base 10" + ): + raise ValueError("train_resolution must be in format 'frames x height x width'") + raise e + + @field_validator("mixed_precision") + def validate_mixed_precision(cls, v: str, info: ValidationInfo) -> str: + if v == "fp16" and "cogvideox-2b" not in str(info.data.get("model_path", "")).lower(): + logging.warning( + "All CogVideoX models except cogvideox-2b were trained with bfloat16. " + "Using fp16 precision may lead to training instability." + ) + return v + + @classmethod + def parse_args(cls): + """Parse command line arguments and return Args instance""" + parser = argparse.ArgumentParser() + # Required arguments + parser.add_argument("--model_path", type=str, required=True) + parser.add_argument("--model_name", type=str, required=True) + parser.add_argument("--model_type", type=str, required=True) + parser.add_argument("--depth_ckpt_path", type=str, required=False, default="./ckpt/others/depth_anything_v2_metric_hypersim_vitb.pth", help="Path to the checkpoint of the depth estimation networks") + parser.add_argument("--training_type", type=str, required=True) + parser.add_argument("--additional_save_blocks", type=str, required=False, default=None) + parser.add_argument("--output_dir", type=str, required=True) + parser.add_argument("--data_root", type=str, required=True) + parser.add_argument("--caption_column", type=str, required=True) + parser.add_argument("--video_column", type=str, required=True) + parser.add_argument("--train_resolution", type=str, required=True) + parser.add_argument("--report_to", type=str, required=True) + parser.add_argument("--run_name", type=str, required=False, default='CogVideoX') + + # Training hyperparameters + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--train_epochs", type=int, default=10) + parser.add_argument("--train_steps", type=int, default=None) + parser.add_argument("--gradient_accumulation_steps", type=int, default=1) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--learning_rate", type=float, default=2e-5) + parser.add_argument("--optimizer", type=str, default="adamw") + parser.add_argument("--beta1", type=float, default=0.9) + parser.add_argument("--beta2", type=float, default=0.95) + parser.add_argument("--beta3", type=float, default=0.98) + parser.add_argument("--epsilon", type=float, default=1e-8) + parser.add_argument("--weight_decay", type=float, default=1e-4) + parser.add_argument("--max_grad_norm", type=float, default=1.0) + + # Learning rate scheduler + parser.add_argument("--lr_scheduler", type=str, default="constant_with_warmup") + parser.add_argument("--lr_warmup_steps", type=int, default=100) + parser.add_argument("--lr_num_cycles", type=int, default=1) + parser.add_argument("--lr_power", type=float, default=1.0) + + # Data loading + parser.add_argument("--num_workers", type=int, default=8) + parser.add_argument("--pin_memory", type=bool, default=True) + parser.add_argument("--image_column", type=str, default=None) + + # Model configuration + parser.add_argument("--mixed_precision", type=str, default="no") + parser.add_argument("--gradient_checkpointing", type=bool, default=True) + parser.add_argument("--enable_slicing", type=bool, default=True) + parser.add_argument("--enable_tiling", type=bool, default=True) + parser.add_argument("--nccl_timeout", type=int, default=1800) + + # LoRA parameters + parser.add_argument("--rank", type=int, default=128) + 
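+ # With peft's LoraConfig the LoRA update is scaled by lora_alpha / rank, so the defaults here
+ # (rank=128 above, lora_alpha=64 below) apply the adapters at a scaling of 0.5.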
parser.add_argument("--lora_alpha", type=int, default=64) + parser.add_argument("--target_modules", type=str, nargs="+", default=["to_q", "to_k", "to_v", "to_out.0"]) + + # Checkpointing + parser.add_argument("--checkpointing_steps", type=int, default=200) + parser.add_argument("--checkpointing_limit", type=int, default=10) + parser.add_argument("--resume_from_checkpoint", type=str, default=None) + + # Validation + parser.add_argument("--do_validation", type=lambda x: x.lower() == 'true', default=False) + parser.add_argument("--validation_steps", type=int, default=None) + parser.add_argument("--validation_dir", type=str, default=None) + parser.add_argument("--validation_prompts", type=str, default=None) + parser.add_argument("--validation_images", type=str, default=None) + parser.add_argument("--validation_videos", type=str, default=None) + parser.add_argument("--gen_fps", type=int, default=15) + parser.add_argument("--max_scene", type=int, default=8) + + # Controlnet + parser.add_argument("--controlnet_transformer_num_layers", type=int, default=8) + parser.add_argument("--controlnet_input_channels", type=int, default=16) + parser.add_argument("--controlnet_weights", type=float, default=1.0) + parser.add_argument("--controlnet_guidance_start", type=float, default=0.0) + parser.add_argument("--controlnet_guidance_end", type=float, default=1.0) + parser.add_argument("--controlnet_out_proj_dim_factor", type=int, default=64) + parser.add_argument("--controlnet_out_proj_zero_init", type=bool, default=True) + parser.add_argument("--enable_time_sampling", type=bool, default=True) + # parser.add_argument("--enable_time_sampling", type=lambda x: x.lower() == 'true', default=False) + parser.add_argument("--time_sampling_type", type=str, default='truncated_normal') + parser.add_argument("--time_sampling_mean", type=float, default=0.95) + parser.add_argument("--time_sampling_std", type=float, default=0.1) + parser.add_argument("--use_valid_mask", type=bool, default=False) + parser.add_argument("--notextinflow", type=bool, default=False) + + args = parser.parse_args() + + # Convert video_resolution_buckets string to list of tuples + frames, height, width = args.train_resolution.split("x") + args.train_resolution = (int(frames), int(height), int(width)) + + if args.additional_save_blocks is not None: + args.additional_save_blocks = args.additional_save_blocks.split(',') + if not args.training_type == 'lora': + # Use additional trainable blocks only for 'lora' setting + assert args.additional_save_blocks is None + + return cls(**vars(args)) diff --git a/finetune/schemas/components.py b/finetune/schemas/components.py new file mode 100644 index 0000000000000000000000000000000000000000..8ae06f787171c9a925bd03cccea6ff3d0bca8ffa --- /dev/null +++ b/finetune/schemas/components.py @@ -0,0 +1,29 @@ +from typing import Any + +from pydantic import BaseModel + + +class Components(BaseModel): + # pipeline cls + pipeline_cls: Any = None + + # Tokenizers + tokenizer: Any = None + tokenizer_2: Any = None + tokenizer_3: Any = None + + # Text encoders + text_encoder: Any = None + text_encoder_2: Any = None + text_encoder_3: Any = None + + # Autoencoder + vae: Any = None + + # Denoiser + transformer: Any = None + controlnet: Any = None + unet: Any = None + + # Scheduler + scheduler: Any = None diff --git a/finetune/schemas/state.py b/finetune/schemas/state.py new file mode 100644 index 0000000000000000000000000000000000000000..68d26a7de6ebc65e6d9e28cd4261135f037b11fa --- /dev/null +++ b/finetune/schemas/state.py @@ -0,0 
+1,36 @@ +from pathlib import Path +from typing import Any, Dict, List + +import torch +from pydantic import BaseModel + + +class State(BaseModel): + model_config = {"arbitrary_types_allowed": True} + + train_frames: int + train_height: int + train_width: int + + transformer_config: Dict[str, Any] = None + + weight_dtype: torch.dtype = torch.float32 # dtype for mixed precision training + num_trainable_parameters: int = 0 + overwrote_max_train_steps: bool = False + num_update_steps_per_epoch: int = 0 + total_batch_size_count: int = 0 + + generator: torch.Generator | None = None + + validation_prompts: List[str] = [] + validation_images: List[Path | None] = [] + validation_videos: List[Path | None] = [] + + # WJ: Added.. + validation_prompt_embeddings: List[Path | None] = [] + validation_video_latents: List[Path | None] = [] + validation_flow_latents: List[Path | None] = [] + validation_valid_masks: List[Path | None] = [] + validation_valid_masks_interp: List[Path | None] = [] + + using_deepspeed: bool = False diff --git a/finetune/scripts/extract_images.py b/finetune/scripts/extract_images.py new file mode 100644 index 0000000000000000000000000000000000000000..42ce8e21136d543964946d7ca4e9579b74017975 --- /dev/null +++ b/finetune/scripts/extract_images.py @@ -0,0 +1,57 @@ +import argparse +import os +from pathlib import Path + +import cv2 + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--datadir", type=str, required=True, help="Root directory containing videos.txt and video subdirectory" + ) + return parser.parse_args() + + +args = parse_args() + +# Create data/images directory if it doesn't exist +data_dir = Path(args.datadir) +image_dir = data_dir / "images" +image_dir.mkdir(exist_ok=True) + +# Read videos.txt +videos_file = data_dir / "videos.txt" +with open(videos_file, "r") as f: + video_paths = [line.strip() for line in f.readlines() if line.strip()] + +# Process each video file and collect image paths +image_paths = [] +for video_rel_path in video_paths: + video_path = data_dir / video_rel_path + + # Open video + cap = cv2.VideoCapture(str(video_path)) + + # Read first frame + ret, frame = cap.read() + if not ret: + print(f"Failed to read video: {video_path}") + continue + + # Save frame as PNG with same name as video + image_name = f"images/{video_path.stem}.png" + image_path = data_dir / image_name + cv2.imwrite(str(image_path), frame) + + # Release video capture + cap.release() + + print(f"Extracted first frame from {video_path} to {image_path}") + image_paths.append(image_name) + +# Write images.txt +images_file = data_dir / "images.txt" +with open(images_file, "w") as f: + for path in image_paths: + f.write(f"{path}\n") diff --git a/finetune/train.py b/finetune/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5109ac505218fa7bc83d7748340ef00db9574811 --- /dev/null +++ b/finetune/train.py @@ -0,0 +1,20 @@ +import sys +from pathlib import Path + + +sys.path.append(str(Path(__file__).parent.parent)) + +from finetune.models.utils import get_model_cls +from finetune.schemas import Args + +import pdb + +def main(): + args = Args.parse_args() + trainer_cls = get_model_cls(args.model_name, args.training_type) + trainer = trainer_cls(args) + trainer.fit() + + +if __name__ == "__main__": + main() diff --git a/finetune/trainer.py b/finetune/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..d1edecc3cdcf1838a8513a22f5e8f4d76bce7902 --- /dev/null +++ b/finetune/trainer.py @@ -0,0 +1,946 @@ +import 
os +import hashlib +import json +import logging +import math +import datetime +from datetime import timedelta +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import diffusers +import torch +import transformers +import wandb +from accelerate.accelerator import Accelerator, DistributedType +from accelerate.logging import get_logger +from accelerate.utils import ( + DistributedDataParallelKwargs, + InitProcessGroupKwargs, + ProjectConfiguration, + gather_object, + set_seed, + broadcast_object_list, +) +from diffusers.optimization import get_scheduler +from diffusers.pipelines import DiffusionPipeline +from diffusers.utils.export_utils import export_to_video +from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from tqdm import tqdm +from safetensors.torch import save_file, load_file + +from finetune.constants import LOG_LEVEL, LOG_NAME +from finetune.datasets import I2VDatasetWithResize, T2VDatasetWithResize, I2VFlowDataset +from finetune.datasets.utils import ( + load_images, + load_prompts, + load_videos, + preprocess_image_with_resize, + preprocess_video_with_resize, +) +from finetune.schemas import Args, Components, State +from finetune.utils import ( + cast_training_params, + free_memory, + get_intermediate_ckpt_path, + get_latest_ckpt_path_to_resume_from, + get_memory_statistics, + get_optimizer, + string_to_filename, + unload_model, + unwrap_model, +) + +from tqdm import tqdm +import pdb + +logger = get_logger(LOG_NAME, LOG_LEVEL) + +_DTYPE_MAP = { + "fp32": torch.float32, + "fp16": torch.float16, # FP16 is Only Support for CogVideoX-2B + "bf16": torch.bfloat16, +} + + +class Trainer: + # If set, should be a list of components to unload (refer to `Components``) + UNLOAD_LIST: List[str] = None + + def __init__(self, args: Args) -> None: + self.args = args + self.state = State( + weight_dtype=self.__get_training_dtype(), + train_frames=self.args.train_resolution[0], + train_height=self.args.train_resolution[1], + train_width=self.args.train_resolution[2], + ) + + self.components: Components = self.load_components() + self.accelerator: Accelerator = None + self.dataset: Dataset = None + self.data_loader: DataLoader = None + + self.optimizer = None + self.lr_scheduler = None + + self._init_distributed() + self._init_logging() + + self.state.using_deepspeed = self.accelerator.state.deepspeed_plugin is not None + + + def _init_distributed(self): + project_dir = Path(self.args.output_dir) + logging_dir = project_dir / "tmp_logs" + project_config = ProjectConfiguration(project_dir=self.args.output_dir, logging_dir=logging_dir) + + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + init_process_group_kwargs = InitProcessGroupKwargs( + backend="nccl", timeout=timedelta(seconds=self.args.nccl_timeout) + ) + mixed_precision = "no" if torch.backends.mps.is_available() else self.args.mixed_precision + report_to = None if self.args.report_to.lower() == "none" else self.args.report_to + + accelerator = Accelerator( + project_config=project_config, + gradient_accumulation_steps=self.args.gradient_accumulation_steps, + mixed_precision=mixed_precision, + log_with=report_to, + kwargs_handlers=[ddp_kwargs, init_process_group_kwargs], + ) + + run_id = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") if accelerator.is_main_process else "" + [run_id] = broadcast_object_list([run_id]) + + final_out_dir = project_dir / f"{self.args.run_name}-{run_id}" + 
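+ # e.g. <output_dir>/CogVideoX-2025-01-01T12-00-00, identical on every rank thanks to the run_id broadcast above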
final_log_dir = final_out_dir / "logs" + + if accelerator.is_main_process: + final_log_dir.mkdir(parents=True, exist_ok=True) + accelerator.wait_for_everyone() + + self.args.output_dir = final_out_dir + accelerator.project_configuration.project_dir = final_out_dir + accelerator.project_configuration.logging_dir = final_log_dir + + accelerator.init_trackers( + project_name=self.args.model_name, + config=vars(self.args), + init_kwargs={ + "wandb": { + "dir": final_log_dir, + "name": self.args.run_name, + } + } + ) + + # Disable AMP for MPS. + if torch.backends.mps.is_available(): + accelerator.native_amp = False + + self.accelerator = accelerator + + if self.args.seed is not None: + set_seed(self.args.seed) + + def _init_logging(self) -> None: + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=LOG_LEVEL, + ) + if self.accelerator.is_local_main_process: + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + logger.info("Initialized Trainer") + logger.info(f"Accelerator state: \n{self.accelerator.state}", main_process_only=False) + + + def check_setting(self) -> None: + # Check for unload_list + if self.UNLOAD_LIST is None: + logger.warning( + "\033[91mNo unload_list specified for this Trainer. All components will be loaded to GPU during training.\033[0m" + ) + else: + for name in self.UNLOAD_LIST: + if name not in self.components.model_fields: + raise ValueError(f"Invalid component name in unload_list: {name}") + + def prepare_models(self) -> None: + logger.info("Initializing models") + + if self.components.vae is not None: + if self.args.enable_slicing: + self.components.vae.enable_slicing() + if self.args.enable_tiling: + self.components.vae.enable_tiling() + + self.state.transformer_config = self.components.transformer.config + + def prepare_dataset(self) -> None: + logger.info("Initializing dataset and dataloader") + + if self.args.model_type == "i2v": + self.dataset = I2VDatasetWithResize( + **(self.args.model_dump()), + device=self.accelerator.device, + max_num_frames=self.state.train_frames, + height=self.state.train_height, + width=self.state.train_width, + trainer=self, + ) + elif self.args.model_type == "t2v": + self.dataset = T2VDatasetWithResize( + **(self.args.model_dump()), + device=self.accelerator.device, + max_num_frames=self.state.train_frames, + height=self.state.train_height, + width=self.state.train_width, + trainer=self, + ) + elif self.args.model_type == "i2vFlow": + self.dataset = I2VFlowDataset( + **(self.args.model_dump()), + device=self.accelerator.device, + max_num_frames=self.state.train_frames, + height=self.state.train_height, + width=self.state.train_width, + trainer=self, + ) + else: + raise ValueError(f"Invalid model type: {self.args.model_type}") + + # Prepare VAE and text encoder for encoding + if self.args.training_type == "controlnet": + self.components.transformer.requires_grad_(False) + self.components.vae.requires_grad_(False) + self.components.text_encoder.requires_grad_(False) + self.components.vae = self.components.vae.to(self.accelerator.device, dtype=self.state.weight_dtype) + self.components.text_encoder = self.components.text_encoder.to( + self.accelerator.device, dtype=self.state.weight_dtype + ) + + if not self.args.model_type == "i2vFlow": + # Precompute latent for video and prompt embedding + 
logger.info("Precomputing latent for video and prompt embedding ...") + tmp_data_loader = torch.utils.data.DataLoader( + self.dataset, + collate_fn=self.collate_fn, + batch_size=1, + num_workers=0, + pin_memory=self.args.pin_memory, + ) + tmp_data_loader = self.accelerator.prepare_data_loader(tmp_data_loader) + for _ in tqdm(tmp_data_loader, desc="prepare dataloader"): + ... + self.accelerator.wait_for_everyone() + logger.info("Precomputing latent for video and prompt embedding ... Done") + + unload_model(self.components.vae) + unload_model(self.components.text_encoder) + free_memory() + + self.data_loader = torch.utils.data.DataLoader( + self.dataset, + collate_fn=self.collate_fn, + batch_size=self.args.batch_size, + num_workers=self.args.num_workers, + pin_memory=self.args.pin_memory, + shuffle=True, + ) + + def set_additional_trainable_parameters(self, block_names): + + # # Set requires_grad as True for trainable parameters of selected blocks! + # for block_name in block_names: + # if hasattr(self.components.transformer, block_name): + # block = getattr(self.components.transformer, block_name) + # for param in block.parameters(): + # param.requires_grad_(True) + # else: + # raise ValueError(f"Model has no attribute '{block_name}'") + + # raise NotImplementedError + self.components.transformer.patch_embed.proj.requires_grad_(True) + + def prepare_trainable_parameters(self): + logger.info("Initializing trainable parameters") + + # For mixed precision training we cast all non-trainable weights to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + weight_dtype = self.state.weight_dtype + + if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16: + # due to pytorch#99272, MPS does not yet support bfloat16. + raise ValueError( + "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead." + ) + + # For LoRA, we freeze all the parameters + # For SFT, we train all the parameters in transformer model + for attr_name, component in vars(self.components).items(): + if hasattr(component, "requires_grad_"): + if self.args.training_type == "sft" and attr_name == "transformer": + component.requires_grad_(True) + elif self.args.training_type == "controlnet" and attr_name == "controlnet": + component.requires_grad_(True) + if self.args.notextinflow: + component.patch_embed.text_proj.requires_grad_(False) + else: + component.requires_grad_(False) + + if self.args.training_type == "lora": + transformer_lora_config = LoraConfig( + r=self.args.rank, + lora_alpha=self.args.lora_alpha, + init_lora_weights=True, + target_modules=self.args.target_modules, + ) + self.components.transformer.add_adapter(transformer_lora_config) + self.__prepare_saving_loading_hooks(transformer_lora_config, block_names=self.args.additional_save_blocks) + + # Add trainable blocks + self.set_additional_trainable_parameters(block_names=self.args.additional_save_blocks) + + # Load components needed for training to GPU (except transformer), and cast them to the specified data type + # ignore_list = ["transformer"] + self.UNLOAD_LIST # ?? 
+ ignore_list = self.UNLOAD_LIST + self.__move_components_to_device(dtype=weight_dtype, ignore_list=ignore_list) + + if self.args.gradient_checkpointing: + self.components.transformer.enable_gradient_checkpointing() + if self.args.training_type == "controlnet": + self.components.controlnet.enable_gradient_checkpointing() + + def prepare_optimizer(self) -> None: + logger.info("Initializing optimizer and lr scheduler") + + # Make sure the trainable params are in float32 + if self.args.training_type == "sft" or self.args.training_type == "lora": + cast_training_params([self.components.transformer], dtype=torch.float32) + # For LoRA, we only want to train the LoRA weights + # For SFT, we want to train all the parameters + trainable_parameters = list(filter(lambda p: p.requires_grad, self.components.transformer.parameters())) + trainable_parameters_name = [p[0] for p in filter(lambda p: p[1].requires_grad, self.components.transformer.named_parameters())] + elif self.args.training_type == "controlnet": + cast_training_params([self.components.controlnet], dtype=torch.float32) + trainable_parameters = list(filter(lambda p: p.requires_grad, self.components.controlnet.parameters())) + trainable_parameters_name = [p[0] for p in filter(lambda p: p[1].requires_grad, self.components.controlnet.named_parameters())] + else: + raise NotImplementedError("Choose training_type among 'sft', 'lora', 'controlnet'") + + + # import pdb + # pdb.set_trace() + print("-"*200) + print(f"Training type: {self.args.training_type}") + print(f"Trainable parameters: {trainable_parameters_name}") + print("-"*200) + + + trainable_parameters_with_lr = { + "params": trainable_parameters, + "lr": self.args.learning_rate, + } + params_to_optimize = [trainable_parameters_with_lr] + self.state.num_trainable_parameters = sum(p.numel() for p in trainable_parameters) + + use_deepspeed_opt = ( + self.accelerator.state.deepspeed_plugin is not None + and "optimizer" in self.accelerator.state.deepspeed_plugin.deepspeed_config + ) + optimizer = get_optimizer( + params_to_optimize=params_to_optimize, + optimizer_name=self.args.optimizer, + learning_rate=self.args.learning_rate, + beta1=self.args.beta1, + beta2=self.args.beta2, + beta3=self.args.beta3, + epsilon=self.args.epsilon, + weight_decay=self.args.weight_decay, + use_deepspeed=use_deepspeed_opt, + ) + + num_update_steps_per_epoch = math.ceil(len(self.data_loader) / self.args.gradient_accumulation_steps) + if self.args.train_steps is None: + self.args.train_steps = self.args.train_epochs * num_update_steps_per_epoch + self.state.overwrote_max_train_steps = True + + use_deepspeed_lr_scheduler = ( + self.accelerator.state.deepspeed_plugin is not None + and "scheduler" in self.accelerator.state.deepspeed_plugin.deepspeed_config + ) + total_training_steps = self.args.train_steps * self.accelerator.num_processes + num_warmup_steps = self.args.lr_warmup_steps * self.accelerator.num_processes + + if use_deepspeed_lr_scheduler: + from accelerate.utils import DummyScheduler + + lr_scheduler = DummyScheduler( + name=self.args.lr_scheduler, + optimizer=optimizer, + total_num_steps=total_training_steps, + num_warmup_steps=num_warmup_steps, + ) + else: + lr_scheduler = get_scheduler( + name=self.args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=total_training_steps, + num_cycles=self.args.lr_num_cycles, + power=self.args.lr_power, + ) + + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + + def prepare_for_training(self) -> None: + if 
self.args.training_type == "sft" or self.args.training_type == "lora": + self.components.transformer, self.optimizer, self.data_loader, self.lr_scheduler = self.accelerator.prepare( + self.components.transformer, self.optimizer, self.data_loader, self.lr_scheduler + ) + elif self.args.training_type == "controlnet": + self.components.controlnet, self.optimizer, self.data_loader, self.lr_scheduler = self.accelerator.prepare( + self.components.controlnet, self.optimizer, self.data_loader, self.lr_scheduler + ) + # self.components.transformer.to(self.accelerator.device, dtype=self.state.weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(self.data_loader) / self.args.gradient_accumulation_steps) + if self.state.overwrote_max_train_steps: + self.args.train_steps = self.args.train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + self.args.train_epochs = math.ceil(self.args.train_steps / num_update_steps_per_epoch) + self.state.num_update_steps_per_epoch = num_update_steps_per_epoch + + def prepare_for_validation(self): + validation_prompts = load_prompts(self.args.validation_dir / self.args.validation_prompts) + + if self.args.validation_images is not None: + validation_images = load_images(self.args.validation_dir / self.args.validation_images) + else: + validation_images = [None] * len(validation_prompts) + + if self.args.validation_videos is not None: + validation_videos = load_videos(self.args.validation_dir / self.args.validation_videos) + else: + validation_videos = [None] * len(validation_prompts) + + self.state.validation_prompts = validation_prompts + self.state.validation_images = validation_images + self.state.validation_videos = validation_videos + + self.validate(0) + + def prepare_trackers(self) -> None: + logger.info("Initializing trackers") + + tracker_name = self.args.tracker_name or "finetrainers-experiment" + self.accelerator.init_trackers(tracker_name, config=self.args.model_dump()) + + def load_state_single_gpu(self, resume_from_checkpoint_path) -> None: + state_dict_path = resume_from_checkpoint_path / "pytorch_model" / "mp_rank_00_model_states.pt" + state_dict = torch.load(state_dict_path)['module'] + if self.args.training_type == "controlnet": + controlnet_ = unwrap_model(self.accelerator, self.components.controlnet) + controlnet_.load_state_dict(state_dict) + + def train(self) -> None: + # try: + logger.info("Starting training") + + memory_statistics = get_memory_statistics() + logger.info(f"Memory before training start: {json.dumps(memory_statistics, indent=4)}") + + self.state.total_batch_size_count = ( + self.args.batch_size * self.accelerator.num_processes * self.args.gradient_accumulation_steps + ) + info = { + "trainable parameters": self.state.num_trainable_parameters, + "total samples": len(self.dataset), + "train epochs": self.args.train_epochs, + "train steps": self.args.train_steps, + "batches per device": self.args.batch_size, + "total batches observed per epoch": len(self.data_loader), + "train batch size total count": self.state.total_batch_size_count, + "gradient accumulation steps": self.args.gradient_accumulation_steps, + } + logger.info(f"Training configuration: {json.dumps(info, indent=4)}") + + global_step = 0 + first_epoch = 0 + initial_global_step = 0 + + # Potentially load in the weights and states from a previous save + ( + resume_from_checkpoint_path, + initial_global_step, + 
global_step, + first_epoch, + ) = get_latest_ckpt_path_to_resume_from( + resume_from_checkpoint=self.args.resume_from_checkpoint, + num_update_steps_per_epoch=self.state.num_update_steps_per_epoch, + ) + + # print(f"Before out_proj weight sum: {self.components.controlnet.out_projectors[0].weight.sum()}") + if resume_from_checkpoint_path is not None: + self.accelerator.load_state(resume_from_checkpoint_path) + # try: + # self.accelerator.load_state(resume_from_checkpoint_path) + # except: + # print("[Error] deepspeed.runtime.zero.utils.ZeRORuntimeException. We sidestep this issue for the case using single gpu.") + # self.load_state_single_gpu(resume_from_checkpoint_path) + + # print(f"After out_proj weight sum: {self.components.controlnet.out_projectors[0].weight.sum()}") + + + progress_bar = tqdm( + range(0, self.args.train_steps), + initial=initial_global_step, + desc="Training steps", + disable=not self.accelerator.is_local_main_process, + ) + + accelerator = self.accelerator + generator = torch.Generator(device=accelerator.device) + if self.args.seed is not None: + generator = generator.manual_seed(self.args.seed) + self.state.generator = generator + + last_validated_step = -1 + if global_step != 0: + last_validated_step = global_step + + free_memory() + for epoch in range(first_epoch, self.args.train_epochs): + logger.debug(f"Starting epoch ({epoch + 1}/{self.args.train_epochs})") + + if self.args.training_type == "sft" or self.args.training_type == "lora": + self.components.transformer.train() + models_to_accumulate = [self.components.transformer] + elif self.args.training_type == "controlnet": + self.components.controlnet.train() + models_to_accumulate = [self.components.controlnet] + + for step, batch in enumerate(self.data_loader): + logger.debug(f"Starting step {step + 1}") + logs = {} + + with accelerator.accumulate(models_to_accumulate): + # These weighting schemes use a uniform timestep sampling and instead post-weight the loss + loss = self.compute_loss(batch) + accelerator.backward(loss) + + if accelerator.sync_gradients: + if accelerator.distributed_type == DistributedType.DEEPSPEED: + if self.args.training_type == "sft" or self.args.training_type == "lora": + grad_norm = self.components.transformer.get_global_grad_norm() + elif self.args.training_type == "controlnet": + grad_norm = self.components.controlnet.get_global_grad_norm() + # In some cases the grad norm may not return a float + if torch.is_tensor(grad_norm): + grad_norm = grad_norm.item() + else: + if self.args.training_type == "sft" or self.args.training_type == "lora": + param_to_clip = self.components.transformer.parameters() + elif self.args.training_type == "controlnet": + param_to_clip = self.components.controlnet.parameters() + grad_norm = accelerator.clip_grad_norm_( + param_to_clip, self.args.max_grad_norm + ) + if torch.is_tensor(grad_norm): + grad_norm = grad_norm.item() + + logs["grad_norm"] = grad_norm + + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + self.__maybe_save_checkpoint(global_step) + + + logs["loss"] = loss.detach().item() + logs["lr"] = self.lr_scheduler.get_last_lr()[0] + progress_bar.set_postfix(logs) + + # Maybe run validation + should_run_validation = ( + self.args.do_validation and + global_step % self.args.validation_steps == 0 and + global_step != 0 and + global_step != last_validated_step # 
prevent duplicate validation + ) + + if should_run_validation: + del loss + free_memory() + self.validate(global_step) + should_run_validation = False + last_validated_step = global_step + + accelerator.log(logs, step=global_step) + + if global_step >= self.args.train_steps: + break + + memory_statistics = get_memory_statistics() + logger.info(f"Memory after epoch {epoch + 1}: {json.dumps(memory_statistics, indent=4)}") + + accelerator.wait_for_everyone() + self.__maybe_save_checkpoint(global_step, must_save=True) + if self.args.do_validation: + free_memory() + self.validate(global_step) + + del self.components + free_memory() + memory_statistics = get_memory_statistics() + logger.info(f"Memory after training end: {json.dumps(memory_statistics, indent=4)}") + + accelerator.end_training() + # except Exception as e: + # logger.info(f"Error message: {e}") + + def validate(self, step: int) -> None: + logger.info("Starting validation") + + accelerator = self.accelerator + num_validation_samples = len(self.state.validation_prompts) + + if num_validation_samples == 0: + logger.warning("No validation samples found. Skipping validation.") + return + + self.components.transformer.eval() + torch.set_grad_enabled(False) + + memory_statistics = get_memory_statistics() + logger.info(f"Memory before validation start: {json.dumps(memory_statistics, indent=4)}") + + ##### Initialize pipeline ##### + pipe = self.initialize_pipeline() + + if self.state.using_deepspeed: + # Can't using model_cpu_offload in deepspeed, + # so we need to move all components in pipe to device + # pipe.to(self.accelerator.device, dtype=self.state.weight_dtype) + self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=["transformer"]) + else: + # if not using deepspeed, use model_cpu_offload to further reduce memory usage + # Or use pipe.enable_sequential_cpu_offload() to further reduce memory usage + pipe.enable_model_cpu_offload(device=self.accelerator.device) + + # Convert all model weights to training dtype + # Note, this will change LoRA weights in self.components.transformer to training dtype, rather than keep them in fp32 + pipe = pipe.to(dtype=self.state.weight_dtype) + + ################################# + + all_processes_artifacts = [] + for i in range(num_validation_samples): + if self.state.using_deepspeed and self.accelerator.deepspeed_plugin.zero_stage != 3: + # Skip current validation on all processes but one + if i % accelerator.num_processes != accelerator.process_index: + continue + + prompt = self.state.validation_prompts[i] + image = self.state.validation_images[i] + video = self.state.validation_videos[i] + + if image is not None: + image = preprocess_image_with_resize(image, self.state.train_height, self.state.train_width) + # Convert image tensor (C, H, W) to PIL images + image = image.to(torch.uint8) + image = image.permute(1, 2, 0).cpu().numpy() + image = Image.fromarray(image) + + if video is not None: + video = preprocess_video_with_resize( + video, self.state.train_frames, self.state.train_height, self.state.train_width + ) + # Convert video tensor (F, C, H, W) to list of PIL images + video = video.round().clamp(0, 255).to(torch.uint8) + video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video] + + logger.debug( + f"Validating sample {i + 1}/{num_validation_samples} on process {accelerator.process_index}. 
Prompt: {prompt}", + main_process_only=False, + ) + validation_artifacts = self.validation_step({"prompt": prompt, "image": image, "video": video}, pipe) + + if ( + self.state.using_deepspeed + and self.accelerator.deepspeed_plugin.zero_stage == 3 + and not accelerator.is_main_process + ): + continue + + prompt_filename = string_to_filename(prompt)[:25] + # Calculate hash of reversed prompt as a unique identifier + reversed_prompt = prompt[::-1] + hash_suffix = hashlib.md5(reversed_prompt.encode()).hexdigest()[:5] + + artifacts = { + "image": {"type": "image", "value": image}, + "video": {"type": "video", "value": video}, + } + for i, (artifact_type, artifact_value) in enumerate(validation_artifacts): + artifacts.update({f"artifact_{i}": {"type": artifact_type, "value": artifact_value}}) + logger.debug( + f"Validation artifacts on process {accelerator.process_index}: {list(artifacts.keys())}", + main_process_only=False, + ) + + for key, value in list(artifacts.items()): + artifact_type = value["type"] + artifact_value = value["value"] + if artifact_type not in ["image", "video"] or artifact_value is None: + continue + + extension = "png" if artifact_type == "image" else "mp4" + filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}.{extension}" + validation_path = self.args.output_dir / "validation_res" + validation_path.mkdir(parents=True, exist_ok=True) + filename = str(validation_path / filename) + + if artifact_type == "image": + logger.debug(f"Saving image to {filename}") + artifact_value.save(filename) + artifact_value = wandb.Image(filename) + elif artifact_type == "video": + logger.debug(f"Saving video to {filename}") + export_to_video(artifact_value, filename, fps=self.args.gen_fps) + artifact_value = wandb.Video(filename, caption=prompt) + + all_processes_artifacts.append(artifact_value) + + all_artifacts = gather_object(all_processes_artifacts) + + if accelerator.is_main_process: + tracker_key = "validation" + for tracker in accelerator.trackers: + if tracker.name == "wandb": + image_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Image)] + video_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Video)] + tracker.log( + { + tracker_key: {"images": image_artifacts, "videos": video_artifacts}, + }, + step=step, + ) + + ########## Clean up ########## + if self.state.using_deepspeed: + del pipe + # Unload models except those needed for training + self.__move_components_to_cpu(unload_list=self.UNLOAD_LIST) + else: + pipe.remove_all_hooks() + del pipe + # Load models except those not needed for training + self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=self.UNLOAD_LIST) + self.components.transformer.to(self.accelerator.device, dtype=self.state.weight_dtype) + + # Change trainable weights back to fp32 to keep with dtype after prepare the model + cast_training_params([self.components.transformer], dtype=torch.float32) + + free_memory() + accelerator.wait_for_everyone() + ################################ + + memory_statistics = get_memory_statistics() + logger.info(f"Memory after validation end: {json.dumps(memory_statistics, indent=4)}") + torch.cuda.reset_peak_memory_stats(accelerator.device) + + torch.set_grad_enabled(True) + self.components.transformer.train() + + def fit(self): + self.check_setting() + self.prepare_models() + self.prepare_dataset() + self.prepare_trainable_parameters() + self.prepare_optimizer() + self.prepare_for_training() + 
self.prepare_trackers() # prepare for the first validation before training. + if self.args.do_validation: + self.prepare_for_validation() + self.train() + + def collate_fn(self, examples: List[Dict[str, Any]]): + raise NotImplementedError + + def load_components(self) -> Components: + raise NotImplementedError + + def initialize_pipeline(self) -> DiffusionPipeline: + raise NotImplementedError + + def encode_video(self, video: torch.Tensor) -> torch.Tensor: + # shape of input video: [B, C, F, H, W], where B = 1 + # shape of output video: [B, C', F', H', W'], where B = 1 + raise NotImplementedError + + def encode_text(self, text: str) -> torch.Tensor: + # shape of output text: [batch size, sequence length, embedding dimension] + raise NotImplementedError + + def compute_loss(self, batch) -> torch.Tensor: + raise NotImplementedError + + def validation_step(self) -> List[Tuple[str, Image.Image | List[Image.Image]]]: + raise NotImplementedError + + def __get_training_dtype(self) -> torch.dtype: + if self.args.mixed_precision == "no": + return _DTYPE_MAP["fp32"] + elif self.args.mixed_precision == "fp16": + return _DTYPE_MAP["fp16"] + elif self.args.mixed_precision == "bf16": + return _DTYPE_MAP["bf16"] + else: + raise ValueError(f"Invalid mixed precision: {self.args.mixed_precision}") + + def __move_components_to_device(self, dtype, ignore_list: List[str] = []): + ignore_list = set(ignore_list) + components = self.components.model_dump() + for name, component in components.items(): + if not isinstance(component, type) and hasattr(component, "to"): + if name not in ignore_list: + setattr(self.components, name, component.to(self.accelerator.device, dtype=dtype)) + + def __move_components_to_cpu(self, unload_list: List[str] = []): + unload_list = set(unload_list) + components = self.components.model_dump() + for name, component in components.items(): + if not isinstance(component, type) and hasattr(component, "to"): + if name in unload_list: + setattr(self.components, name, component.to("cpu")) + + def __prepare_saving_loading_hooks(self, transformer_lora_config, block_names=[]): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if self.accelerator.is_main_process: + transformer_lora_layers_to_save = None + + for model in models: + if isinstance( + unwrap_model(self.accelerator, model), + type(unwrap_model(self.accelerator, self.components.transformer)), + ): + model = unwrap_model(self.accelerator, model) + # 1) Set Lora weight + transformer_lora_layers_to_save = get_peft_model_state_dict(model) + + # 2) Set Other weight designated by block_names + if len(block_names) != 0: + tensor_dict = {} + for block_name in block_names: + if hasattr(model, block_name): + block = getattr(model, block_name) + for k, v in block.state_dict().items(): + tensor_dict[f"{block_name}.{k}"] = v + else: + raise ValueError(f"Model has no attribute '{block_name}'") + else: + raise ValueError(f"Unexpected save model: {model.__class__}") + + # make sure to pop weight so that corresponding model is not saved again + if weights: + weights.pop() + + # 1) Save Lora weight + self.components.pipeline_cls.save_lora_weights( + output_dir, + transformer_lora_layers=transformer_lora_layers_to_save, + ) + # 2) Save Other weight + if len(block_names) != 0: + save_path = os.path.join(output_dir, "selected_blocks.safetensors") + save_file(tensor_dict, save_path) + + def load_model_hook(models, input_dir): + if not 
self.accelerator.distributed_type == DistributedType.DEEPSPEED: + while len(models) > 0: + model = models.pop() + if isinstance( + unwrap_model(self.accelerator, model), + type(unwrap_model(self.accelerator, self.components.transformer)), + ): + transformer_ = unwrap_model(self.accelerator, model) + else: + raise ValueError(f"Unexpected save model: {unwrap_model(self.accelerator, model).__class__}") + else: + transformer_ = unwrap_model(self.accelerator, self.components.transformer).__class__.from_pretrained( + self.args.model_path, subfolder="transformer" + ) + transformer_.add_adapter(transformer_lora_config) + + # 1) Load Lora weight + lora_state_dict = self.components.pipeline_cls.lora_state_dict(input_dir) + transformer_state_dict = { + f'{k.replace("transformer.", "")}': v + for k, v in lora_state_dict.items() + if k.startswith("transformer.") + } + incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default") + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) + + # 2) Load Other weight + load_path = os.path.join(input_dir, "selected_blocks.safetensors") + if os.path.exists(load_path): + tensor_dict = load_file(load_path) + + block_state_dicts = {} + for k, v in tensor_dict.items(): + block_name, param_name = k.split(".", 1) + if block_name not in block_state_dicts: + block_state_dicts[block_name] = {} + block_state_dicts[block_name][param_name] = v + + for block_name, state_dict in block_state_dicts.items(): + if hasattr(transformer_, block_name): + getattr(transformer_, block_name).load_state_dict(state_dict) + else: + raise ValueError(f"Transformer has no attribute '{block_name}'") + + # 3) Set optimizer state for desired device/dtype + for state in self.optimizer.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.to(device=self.accelerator.device, dtype=torch.float32) + + self.accelerator.register_save_state_pre_hook(save_model_hook) + self.accelerator.register_load_state_pre_hook(load_model_hook) + + def __maybe_save_checkpoint(self, global_step: int, must_save: bool = False): + if self.accelerator.distributed_type == DistributedType.DEEPSPEED or self.accelerator.is_main_process: + if must_save or global_step % self.args.checkpointing_steps == 0: + # for training + save_path = get_intermediate_ckpt_path( + checkpointing_limit=self.args.checkpointing_limit, + step=global_step, + output_dir=self.args.output_dir, + ) + self.accelerator.save_state(save_path, safe_serialization=True) diff --git a/finetune/training_scripts/train_FVSM_controlnet.sh b/finetune/training_scripts/train_FVSM_controlnet.sh new file mode 100644 index 0000000000000000000000000000000000000000..682614d695e828461f5a4b296caf9311f57dbb4c --- /dev/null +++ b/finetune/training_scripts/train_FVSM_controlnet.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash + +# Prevent tokenizer parallelism issues +export TOKENIZERS_PARALLELISM=false +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +# Model Configuration +MODEL_ARGS=( + --model_path "THUDM/CogVideoX-5b-I2V" + --model_name "cogvideox-flovd" # ["cogvideox-i2v" or "cogvideox-flovd"] + --model_type "i2vFlow" # ["t2v", "i2v", "i2vFlow"] + --training_type "controlnet" + # --additional_save_blocks "patch_embed" # additional blocks to 
update when using lora. e.g., "patch_embed,text_proj" +) + +# Output Configuration +OUTPUT_ARGS=( + --output_dir "absolute/path/to/output" + --report_to "wandb" + --run_name "FloVD_CogVideoX_controlnet" +) + +# Data Configuration +DATA_ARGS=( + --data_root "absolute/path/to/whole_data" + --caption_column "prompt.txt" # Do not need + --video_column "videos.txt" # Do not need + # --image_column "images.txt" # comment this line will use first frame of video as image conditioning + --train_resolution "49x480x720" # (frames x height x width), frames should be 8N+1 +) + +# Training Configuration +TRAIN_ARGS=( + --train_epochs 10 # number of training epochs + --seed 42 # random seed + --batch_size 1 + --gradient_accumulation_steps 2 + --mixed_precision "bf16" # ["no", "fp16"] # Only CogVideoX-2B supports fp16 training + --learning_rate 1e-5 +) + +# System Configuration +SYSTEM_ARGS=( + --num_workers 8 + --pin_memory True + --nccl_timeout 1800 +) + +# Checkpointing Configuration +CHECKPOINT_ARGS=( + --checkpointing_steps 2000 # save checkpoint every x steps + --checkpointing_limit 2 # maximum number of checkpoints to keep, after which the oldest one is deleted + # --resume_from_checkpoint /path/to/ckpt # if you want to resume from a checkpoint, otherwise, comment this line +) + +# Validation Configuration +VALIDATION_ARGS=( + --do_validation true # ["true", "false"] + --validation_dir "absolute/path/to/whole_data" + --validation_steps 2000 # should be multiple of checkpointing_steps + --validation_prompts "prompts.txt" # Do not need + --validation_images "images.txt" # Do not need + --gen_fps 16 + --max_scene 4 +) + +# Controlnet Configuration +CONTROLNET_ARGS=( + --controlnet_transformer_num_layers 6 + --controlnet_input_channels 16 + --controlnet_weights 1.0 + --controlnet_guidance_start 0.0 + --controlnet_guidance_end 0.4 + --controlnet_out_proj_dim_factor 64 + --enable_time_sampling false + --time_sampling_type "truncated_normal" + --time_sampling_mean 0.95 + --time_sampling_std 0.1 + --notextinflow true +) + + +# Combine all arguments and launch training +accelerate launch --config_file accelerate_config.yaml train.py \ + "${MODEL_ARGS[@]}" \ + "${OUTPUT_ARGS[@]}" \ + "${DATA_ARGS[@]}" \ + "${TRAIN_ARGS[@]}" \ + "${SYSTEM_ARGS[@]}" \ + "${CHECKPOINT_ARGS[@]}" \ + "${VALIDATION_ARGS[@]}" \ + "${CONTROLNET_ARGS[@]}" diff --git a/finetune/training_scripts/train_OMSM.sh b/finetune/training_scripts/train_OMSM.sh new file mode 100644 index 0000000000000000000000000000000000000000..0097ac010f07295aad55f6e696b4cb763d79e610 --- /dev/null +++ b/finetune/training_scripts/train_OMSM.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# Prevent tokenizer parallelism issues +export TOKENIZERS_PARALLELISM=false + +# Model Configuration +MODEL_ARGS=( + --model_path "THUDM/CogVideoX-5b-I2V" + --model_name "cogvideox-flovd-omsm" # ["cogvideox-i2v"] + --model_type "i2vFlow" # ["t2v", "i2v", "i2vFlow"] + --training_type "lora" + --additional_save_blocks "patch_embed" # additional blocks to update when using lora. 
e.g., "patch_embed,text_proj" +) + +# Output Configuration +OUTPUT_ARGS=( + --output_dir "absolute/path/to/output" + --report_to "wandb" + --run_name "CogVideoX_OMSM" +) + +# Data Configuration +DATA_ARGS=( + --data_root "absolute/path/to/whole_data" + --caption_column "prompt.txt" # Do not need + --video_column "videos.txt" # Do not need + # --image_column "images.txt" # comment this line will use first frame of video as image conditioning + --train_resolution "49x480x720" # (frames x height x width), frames should be 8N+1 +) + +# Lora Configuration +LORA_ARGS=( + --target_modules "to_q" "to_k" "to_v" "to_out.0" "norm1.linear" "norm2.linear" "ff.net.2" +) + +# Training Configuration +TRAIN_ARGS=( + --train_epochs 10 # number of training epochs + --seed 42 # random seed + --batch_size 1 + --gradient_accumulation_steps 2 + --mixed_precision "bf16" # ["no", "fp16"] # Only CogVideoX-2B supports fp16 training +) + +# System Configuration +SYSTEM_ARGS=( + --num_workers 8 + --pin_memory True + --nccl_timeout 1800 +) + +# Checkpointing Configuration +CHECKPOINT_ARGS=( + --checkpointing_steps 2000 # save checkpoint every x steps + --checkpointing_limit 2 # maximum number of checkpoints to keep, after which the oldest one is deleted + # --resume_from_checkpoint "/path/to/ckpt" # if you want to resume from a checkpoint, otherwise, comment this line +) + +# Validation Configuration +VALIDATION_ARGS=( + --do_validation true # ["true", "false"] + --validation_dir "absolute/path/to/whole_data" + --validation_steps 2000 # should be multiple of checkpointing_steps + --validation_prompts "prompts.txt" + --validation_images "images.txt" + --gen_fps 16 + --max_scene 4 +) + +# Combine all arguments and launch training +accelerate launch train.py \ + "${MODEL_ARGS[@]}" \ + "${OUTPUT_ARGS[@]}" \ + "${DATA_ARGS[@]}" \ + "${TRAIN_ARGS[@]}" \ + "${SYSTEM_ARGS[@]}" \ + "${CHECKPOINT_ARGS[@]}" \ + "${VALIDATION_ARGS[@]}" \ + "${LORA_ARGS[@]}" \ diff --git a/finetune/training_scripts/train_OMSM_Curated.sh b/finetune/training_scripts/train_OMSM_Curated.sh new file mode 100644 index 0000000000000000000000000000000000000000..c2b73995f3d86d2cb1e7098c3551d95c8fdc1410 --- /dev/null +++ b/finetune/training_scripts/train_OMSM_Curated.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +# Prevent tokenizer parallelism issues +export TOKENIZERS_PARALLELISM=false + +# Model Configuration +MODEL_ARGS=( + --model_path "THUDM/CogVideoX-5b-I2V" + --model_name "cogvideox-flovd-omsm" # ["cogvideox-i2v"] + --model_type "i2vFlow" # ["t2v", "i2v", "i2vFlow"] + --training_type "lora" + --additional_save_blocks "patch_embed" # additional blocks to update when using lora. 
e.g., "patch_embed,text_proj" +) + +# Output Configuration +OUTPUT_ARGS=( + --output_dir "absolute/path/to/output" + --report_to "wandb" + --run_name "CogVideoX_OMSM_Curated" +) + +# Data Configuration +DATA_ARGS=( + --data_root "absolute/path/to/curated_data" + --caption_column "prompt.txt" # Do not need + --video_column "videos.txt" # Do not need + # --image_column "images.txt" # comment this line will use first frame of video as image conditioning + --train_resolution "49x480x720" # (frames x height x width), frames should be 8N+1 +) + +# Lora Configuration +LORA_ARGS=( + --target_modules "to_q" "to_k" "to_v" "to_out.0" "norm1.linear" "norm2.linear" "ff.net.2" +) + +# Training Configuration +TRAIN_ARGS=( + --train_epochs 15 # number of training epochs + --seed 42 # random seed + --batch_size 1 + --gradient_accumulation_steps 2 + --mixed_precision "bf16" # ["no", "fp16"] # Only CogVideoX-2B supports fp16 training +) + +# System Configuration +SYSTEM_ARGS=( + --num_workers 8 + --pin_memory True + --nccl_timeout 1800 +) + +# Checkpointing Configuration +CHECKPOINT_ARGS=( + --checkpointing_steps 2000 # save checkpoint every x steps + --checkpointing_limit 2 # maximum number of checkpoints to keep, after which the oldest one is deleted + --resume_from_checkpoint "absolute/path/to/OMSM_Pretrained" # if you want to resume from a checkpoint, otherwise, comment this line +) + +# Validation Configuration +VALIDATION_ARGS=( + --do_validation true # ["true", "false"] + --validation_dir "absolute/path/to/curated_data" + --validation_steps 2000 # should be multiple of checkpointing_steps + --validation_prompts "prompts.txt" # Do not need + --validation_images "images.txt" # Do not need + --gen_fps 16 + --max_scene 4 +) + +# Combine all arguments and launch training +accelerate launch train.py \ + "${MODEL_ARGS[@]}" \ + "${OUTPUT_ARGS[@]}" \ + "${DATA_ARGS[@]}" \ + "${TRAIN_ARGS[@]}" \ + "${SYSTEM_ARGS[@]}" \ + "${CHECKPOINT_ARGS[@]}" \ + "${VALIDATION_ARGS[@]}" \ + "${LORA_ARGS[@]}" \ diff --git a/finetune/utils/__init__.py b/finetune/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9ff4912bfa79426ac231e1450cd95990a0ae6fa1 --- /dev/null +++ b/finetune/utils/__init__.py @@ -0,0 +1,5 @@ +from .checkpointing import * +from .file_utils import * +from .memory_utils import * +from .optimizer_utils import * +from .torch_utils import * diff --git a/finetune/utils/checkpointing.py b/finetune/utils/checkpointing.py new file mode 100644 index 0000000000000000000000000000000000000000..775038c03cdf0d0d24c2a7b9c2fdc05c21c1ca5e --- /dev/null +++ b/finetune/utils/checkpointing.py @@ -0,0 +1,55 @@ +import os +from pathlib import Path +from typing import Tuple + +from accelerate.logging import get_logger + +from finetune.constants import LOG_LEVEL, LOG_NAME + +from ..utils.file_utils import delete_files, find_files + + +logger = get_logger(LOG_NAME, LOG_LEVEL) + + +def get_latest_ckpt_path_to_resume_from( + resume_from_checkpoint: str | None, num_update_steps_per_epoch: int +) -> Tuple[str | None, int, int, int]: + if resume_from_checkpoint is None: + initial_global_step = 0 + global_step = 0 + first_epoch = 0 + resume_from_checkpoint_path = None + else: + resume_from_checkpoint_path = Path(resume_from_checkpoint) + if not resume_from_checkpoint_path.exists(): + logger.info(f"Checkpoint '{resume_from_checkpoint}' does not exist. 
Starting a new training run.") + initial_global_step = 0 + global_step = 0 + first_epoch = 0 + resume_from_checkpoint_path = None + else: + logger.info(f"Resuming from checkpoint {resume_from_checkpoint}") + global_step = int(resume_from_checkpoint_path.name.split("-")[1]) + + initial_global_step = global_step + first_epoch = global_step // num_update_steps_per_epoch + + return resume_from_checkpoint_path, initial_global_step, global_step, first_epoch + + +def get_intermediate_ckpt_path(checkpointing_limit: int, step: int, output_dir: str) -> str: + # before saving state, check if this save would set us over the `checkpointing_limit` + if checkpointing_limit is not None: + checkpoints = find_files(output_dir, prefix="checkpoint") + + # before we save the new checkpoint, we need to have at_most `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= checkpointing_limit: + num_to_remove = len(checkpoints) - checkpointing_limit + 1 + checkpoints_to_remove = checkpoints[0:num_to_remove] + delete_files(checkpoints_to_remove) + + logger.info(f"Checkpointing at step {step}") + save_path = os.path.join(output_dir, f"checkpoint-{step}") + logger.info(f"Saving state to {save_path}") + return save_path diff --git a/finetune/utils/file_utils.py b/finetune/utils/file_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..38b11051e127e94357f068166b76542819d030fa --- /dev/null +++ b/finetune/utils/file_utils.py @@ -0,0 +1,48 @@ +import logging +import os +import shutil +from pathlib import Path +from typing import Any, Dict, List, Union + +from accelerate.logging import get_logger + +from finetune.constants import LOG_LEVEL, LOG_NAME + + +logger = get_logger(LOG_NAME, LOG_LEVEL) + + +def find_files(dir: Union[str, Path], prefix: str = "checkpoint") -> List[str]: + if not isinstance(dir, Path): + dir = Path(dir) + if not dir.exists(): + return [] + checkpoints = os.listdir(dir.as_posix()) + checkpoints = [c for c in checkpoints if c.startswith(prefix)] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + checkpoints = [dir / c for c in checkpoints] + return checkpoints + + +def delete_files(dirs: Union[str, List[str], Path, List[Path]]) -> None: + if not isinstance(dirs, list): + dirs = [dirs] + dirs = [Path(d) if isinstance(d, str) else d for d in dirs] + logger.info(f"Deleting files: {dirs}") + for dir in dirs: + if not dir.exists(): + continue + shutil.rmtree(dir, ignore_errors=True) + + +def string_to_filename(s: str) -> str: + return ( + s.replace(" ", "-") + .replace("/", "-") + .replace(":", "-") + .replace(".", "-") + .replace(",", "-") + .replace(";", "-") + .replace("!", "-") + .replace("?", "-") + ) diff --git a/finetune/utils/memory_utils.py b/finetune/utils/memory_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0c88d70c430b9a2b3b23ea42eef69c75ed17f233 --- /dev/null +++ b/finetune/utils/memory_utils.py @@ -0,0 +1,64 @@ +import gc +from typing import Any, Dict, Union + +import torch +from accelerate.logging import get_logger + +from finetune.constants import LOG_LEVEL, LOG_NAME + + +logger = get_logger(LOG_NAME, LOG_LEVEL) + + +def get_memory_statistics(precision: int = 3) -> Dict[str, Any]: + memory_allocated = None + memory_reserved = None + max_memory_allocated = None + max_memory_reserved = None + + if torch.cuda.is_available(): + device = torch.cuda.current_device() + memory_allocated = torch.cuda.memory_allocated(device) + memory_reserved = torch.cuda.memory_reserved(device) + max_memory_allocated = 
torch.cuda.max_memory_allocated(device) + max_memory_reserved = torch.cuda.max_memory_reserved(device) + + elif torch.mps.is_available(): + memory_allocated = torch.mps.current_allocated_memory() + + else: + logger.warning("No CUDA, MPS, or ROCm device found. Memory statistics are not available.") + + return { + "memory_allocated": round(bytes_to_gigabytes(memory_allocated), ndigits=precision), + "memory_reserved": round(bytes_to_gigabytes(memory_reserved), ndigits=precision), + "max_memory_allocated": round(bytes_to_gigabytes(max_memory_allocated), ndigits=precision), + "max_memory_reserved": round(bytes_to_gigabytes(max_memory_reserved), ndigits=precision), + } + + +def bytes_to_gigabytes(x: int) -> float: + if x is not None: + return x / 1024**3 + + +def free_memory() -> None: + if torch.cuda.is_available(): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + + # TODO(aryan): handle non-cuda devices + + +def unload_model(model): + model.to("cpu") + + +def make_contiguous(x: Union[torch.Tensor, Dict[str, torch.Tensor]]) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + if isinstance(x, torch.Tensor): + return x.contiguous() + elif isinstance(x, dict): + return {k: make_contiguous(v) for k, v in x.items()} + else: + return x diff --git a/finetune/utils/optimizer_utils.py b/finetune/utils/optimizer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d24aa3f6443f3199d7f6f3b7b002266137673c8d --- /dev/null +++ b/finetune/utils/optimizer_utils.py @@ -0,0 +1,180 @@ +import inspect + +import torch +from accelerate.logging import get_logger + +from finetune.constants import LOG_LEVEL, LOG_NAME + + +logger = get_logger(LOG_NAME, LOG_LEVEL) + + +def get_optimizer( + params_to_optimize, + optimizer_name: str = "adam", + learning_rate: float = 1e-3, + beta1: float = 0.9, + beta2: float = 0.95, + beta3: float = 0.98, + epsilon: float = 1e-8, + weight_decay: float = 1e-4, + prodigy_decouple: bool = False, + prodigy_use_bias_correction: bool = False, + prodigy_safeguard_warmup: bool = False, + use_8bit: bool = False, + use_4bit: bool = False, + use_torchao: bool = False, + use_deepspeed: bool = False, + use_cpu_offload_optimizer: bool = False, + offload_gradients: bool = False, +) -> torch.optim.Optimizer: + optimizer_name = optimizer_name.lower() + + # Use DeepSpeed optimzer + if use_deepspeed: + from accelerate.utils import DummyOptim + + return DummyOptim( + params_to_optimize, + lr=learning_rate, + betas=(beta1, beta2), + eps=epsilon, + weight_decay=weight_decay, + ) + + if use_8bit and use_4bit: + raise ValueError("Cannot set both `use_8bit` and `use_4bit` to True.") + + if (use_torchao and (use_8bit or use_4bit)) or use_cpu_offload_optimizer: + try: + import torchao + + torchao.__version__ + except ImportError: + raise ImportError( + "To use optimizers from torchao, please install the torchao library: `USE_CPP=0 pip install torchao`." + ) + + if not use_torchao and use_4bit: + raise ValueError("4-bit Optimizers are only supported with torchao.") + + # Optimizer creation + supported_optimizers = ["adam", "adamw", "prodigy", "came"] + if optimizer_name not in supported_optimizers: + logger.warning( + f"Unsupported choice of optimizer: {optimizer_name}. Supported optimizers include {supported_optimizers}. Defaulting to `AdamW`." 
+ ) + optimizer_name = "adamw" + + if (use_8bit or use_4bit) and optimizer_name not in ["adam", "adamw"]: + raise ValueError("`use_8bit` and `use_4bit` can only be used with the Adam and AdamW optimizers.") + + if use_8bit: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + if optimizer_name == "adamw": + if use_torchao: + from torchao.prototype.low_bit_optim import AdamW4bit, AdamW8bit + + optimizer_class = AdamW8bit if use_8bit else AdamW4bit if use_4bit else torch.optim.AdamW + else: + optimizer_class = bnb.optim.AdamW8bit if use_8bit else torch.optim.AdamW + + init_kwargs = { + "betas": (beta1, beta2), + "eps": epsilon, + "weight_decay": weight_decay, + } + + elif optimizer_name == "adam": + if use_torchao: + from torchao.prototype.low_bit_optim import Adam4bit, Adam8bit + + optimizer_class = Adam8bit if use_8bit else Adam4bit if use_4bit else torch.optim.Adam + else: + optimizer_class = bnb.optim.Adam8bit if use_8bit else torch.optim.Adam + + init_kwargs = { + "betas": (beta1, beta2), + "eps": epsilon, + "weight_decay": weight_decay, + } + + elif optimizer_name == "prodigy": + try: + import prodigyopt + except ImportError: + raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`") + + optimizer_class = prodigyopt.Prodigy + + if learning_rate <= 0.1: + logger.warning( + "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0" + ) + + init_kwargs = { + "lr": learning_rate, + "betas": (beta1, beta2), + "beta3": beta3, + "eps": epsilon, + "weight_decay": weight_decay, + "decouple": prodigy_decouple, + "use_bias_correction": prodigy_use_bias_correction, + "safeguard_warmup": prodigy_safeguard_warmup, + } + + elif optimizer_name == "came": + try: + import came_pytorch + except ImportError: + raise ImportError("To use CAME, please install the came-pytorch library: `pip install came-pytorch`") + + optimizer_class = came_pytorch.CAME + + init_kwargs = { + "lr": learning_rate, + "eps": (1e-30, 1e-16), + "betas": (beta1, beta2, beta3), + "weight_decay": weight_decay, + } + + if use_cpu_offload_optimizer: + from torchao.prototype.low_bit_optim import CPUOffloadOptimizer + + if "fused" in inspect.signature(optimizer_class.__init__).parameters: + init_kwargs.update({"fused": True}) + + optimizer = CPUOffloadOptimizer( + params_to_optimize, optimizer_class=optimizer_class, offload_gradients=offload_gradients, **init_kwargs + ) + else: + optimizer = optimizer_class(params_to_optimize, **init_kwargs) + + return optimizer + + +def gradient_norm(parameters): + norm = 0 + for param in parameters: + if param.grad is None: + continue + local_norm = param.grad.detach().data.norm(2) + norm += local_norm.item() ** 2 + norm = norm**0.5 + return norm + + +def max_gradient(parameters): + max_grad_value = float("-inf") + for param in parameters: + if param.grad is None: + continue + local_max_grad = param.grad.detach().data.abs().max() + max_grad_value = max(max_grad_value, local_max_grad.item()) + return max_grad_value diff --git a/finetune/utils/torch_utils.py b/finetune/utils/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..867a8bf093fab3864fbbc3741056665a133b24ee --- /dev/null +++ b/finetune/utils/torch_utils.py @@ -0,0 +1,52 @@ +from typing import Dict, List, Optional, Union + +import torch +from accelerate import Accelerator +from diffusers.utils.torch_utils 
import is_compiled_module + + +def unwrap_model(accelerator: Accelerator, model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + +def align_device_and_dtype( + x: Union[torch.Tensor, Dict[str, torch.Tensor]], + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, +): + if isinstance(x, torch.Tensor): + if device is not None: + x = x.to(device) + if dtype is not None: + x = x.to(dtype) + elif isinstance(x, dict): + if device is not None: + x = {k: align_device_and_dtype(v, device, dtype) for k, v in x.items()} + if dtype is not None: + x = {k: align_device_and_dtype(v, device, dtype) for k, v in x.items()} + return x + + +def expand_tensor_to_dims(tensor, ndim): + while len(tensor.shape) < ndim: + tensor = tensor.unsqueeze(-1) + return tensor + + +def cast_training_params(model: Union[torch.nn.Module, List[torch.nn.Module]], dtype=torch.float32): + """ + Casts the training parameters of the model to the specified data type. + + Args: + model: The PyTorch model whose parameters will be cast. + dtype: The data type to which the model parameters will be cast. + """ + if not isinstance(model, list): + model = [model] + for m in model: + for param in m.parameters(): + # only upcast trainable parameters into fp32 + if param.requires_grad: + param.data = param.to(dtype) diff --git a/inference/cli_demo.py b/inference/cli_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..37dfcfc75e120635aef0034407ef35667c927368 --- /dev/null +++ b/inference/cli_demo.py @@ -0,0 +1,237 @@ +""" +This script demonstrates how to generate a video using the CogVideoX model with the Hugging Face `diffusers` pipeline. +The script supports different types of video generation, including text-to-video (t2v), image-to-video (i2v), +and video-to-video (v2v), depending on the input data and different weight. + +- text-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- video-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- image-to-video: THUDM/CogVideoX-5b-I2V or THUDM/CogVideoX1.5-5b-I2V + +Running the Script: +To run the script, use the following command with appropriate arguments: + +```bash +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" +``` + +You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory + +Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths. 
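+
+For image-to-video generation, the same script can be pointed at an I2V checkpoint (e.g. THUDM/CogVideoX-5b-I2V) and given a conditioning image; the image path below is only a placeholder:
+
+```bash
+$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX-5b-I2V --generate_type "i2v" --image_or_video_path path/to/image.png
+```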
+ +""" + +import argparse +import logging +from typing import Literal, Optional + +import torch + +from diffusers import ( + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXPipeline, + CogVideoXVideoToVideoPipeline, +) +from diffusers.utils import export_to_video, load_image, load_video + + +logging.basicConfig(level=logging.INFO) + +# Recommended resolution for each model (width, height) +RESOLUTION_MAP = { + # cogvideox1.5-* + "cogvideox1.5-5b-i2v": (768, 1360), + "cogvideox1.5-5b": (768, 1360), + # cogvideox-* + "cogvideox-5b-i2v": (480, 720), + "cogvideox-5b": (480, 720), + "cogvideox-2b": (480, 720), +} + + +def generate_video( + prompt: str, + model_path: str, + lora_path: str = None, + lora_rank: int = 128, + num_frames: int = 81, + width: Optional[int] = None, + height: Optional[int] = None, + output_path: str = "./output.mp4", + image_or_video_path: str = "", + num_inference_steps: int = 50, + guidance_scale: float = 6.0, + num_videos_per_prompt: int = 1, + dtype: torch.dtype = torch.bfloat16, + generate_type: str = Literal["t2v", "i2v", "v2v"], # i2v: image to video, v2v: video to video + seed: int = 42, + fps: int = 16, +): + """ + Generates a video based on the given prompt and saves it to the specified path. + + Parameters: + - prompt (str): The description of the video to be generated. + - model_path (str): The path of the pre-trained model to be used. + - lora_path (str): The path of the LoRA weights to be used. + - lora_rank (int): The rank of the LoRA weights. + - output_path (str): The path where the generated video will be saved. + - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality. + - num_frames (int): Number of frames to generate. CogVideoX1.0 generates 49 frames for 6 seconds at 8 fps, while CogVideoX1.5 produces either 81 or 161 frames, corresponding to 5 seconds or 10 seconds at 16 fps. + - width (int): The width of the generated video, applicable only for CogVideoX1.5-5B-I2V + - height (int): The height of the generated video, applicable only for CogVideoX1.5-5B-I2V + - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt. + - num_videos_per_prompt (int): Number of videos to generate per prompt. + - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). + - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').· + - seed (int): The seed for reproducibility. + - fps (int): The frames per second for the generated video. + """ + + # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). + # add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload() + # function to use Multi GPUs. + + image = None + video = None + + model_name = model_path.split("/")[-1].lower() + desired_resolution = RESOLUTION_MAP[model_name] + if width is None or height is None: + height, width = desired_resolution + logging.info(f"\033[1mUsing default resolution {desired_resolution} for {model_name}\033[0m") + elif (height, width) != desired_resolution: + if generate_type == "i2v": + # For i2v models, use user-defined width and height + logging.warning( + f"\033[1;31mThe width({width}) and height({height}) are not recommended for {model_name}. 
The best resolution is {desired_resolution}.\033[0m" + ) + else: + # Otherwise, use the recommended width and height + logging.warning( + f"\033[1;31m{model_name} is not supported for custom resolution. Setting back to default resolution {desired_resolution}.\033[0m" + ) + height, width = desired_resolution + + if generate_type == "i2v": + pipe = CogVideoXImageToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) + image = load_image(image=image_or_video_path) + elif generate_type == "t2v": + pipe = CogVideoXPipeline.from_pretrained(model_path, torch_dtype=dtype) + else: + pipe = CogVideoXVideoToVideoPipeline.from_pretrained(model_path, torch_dtype=dtype) + video = load_video(image_or_video_path) + + # If you're using with lora, add this code + if lora_path: + pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1") + pipe.fuse_lora(components=["transformer"], lora_scale=1 / lora_rank) + + # 2. Set Scheduler. + # Can be changed to `CogVideoXDPMScheduler` or `CogVideoXDDIMScheduler`. + # We recommend using `CogVideoXDDIMScheduler` for CogVideoX-2B. + # using `CogVideoXDPMScheduler` for CogVideoX-5B / CogVideoX-5B-I2V. + + # pipe.scheduler = CogVideoXDDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + + # 3. Enable CPU offload for the model. + # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference + # and enable to("cuda") + # pipe.to("cuda") + + # pipe.enable_model_cpu_offload() + pipe.enable_sequential_cpu_offload() + pipe.vae.enable_slicing() + pipe.vae.enable_tiling() + + # 4. Generate the video frames based on the prompt. + # `num_frames` is the Number of frames to generate. 
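+    # Each pipeline call below returns a batch of videos; `.frames[0]` selects the frame list (PIL
+    # images) of the first generated video, which `export_to_video` then writes to `output_path`
+    # at the requested `fps`.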
+ if generate_type == "i2v": + video_generate = pipe( + height=height, + width=width, + prompt=prompt, + image=image, + # The path of the image, the resolution of video will be the same as the image for CogVideoX1.5-5B-I2V, otherwise it will be 720 * 480 + num_videos_per_prompt=num_videos_per_prompt, # Number of videos to generate per prompt + num_inference_steps=num_inference_steps, # Number of inference steps + num_frames=num_frames, # Number of frames to generate + use_dynamic_cfg=True, # This id used for DPM scheduler, for DDIM scheduler, it should be False + guidance_scale=guidance_scale, + generator=torch.Generator().manual_seed(seed), # Set the seed for reproducibility + ).frames[0] + elif generate_type == "t2v": + video_generate = pipe( + height=height, + width=width, + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + use_dynamic_cfg=True, + guidance_scale=guidance_scale, + generator=torch.Generator().manual_seed(seed), + ).frames[0] + else: + video_generate = pipe( + height=height, + width=width, + prompt=prompt, + video=video, # The path of the video to be used as the background of the video + num_videos_per_prompt=num_videos_per_prompt, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + use_dynamic_cfg=True, + guidance_scale=guidance_scale, + generator=torch.Generator().manual_seed(seed), # Set the seed for reproducibility + ).frames[0] + export_to_video(video_generate, output_path, fps=fps) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX") + parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated") + parser.add_argument( + "--image_or_video_path", + type=str, + default=None, + help="The path of the image to be used as the background of the video", + ) + parser.add_argument( + "--model_path", type=str, default="THUDM/CogVideoX1.5-5B", help="Path of the pre-trained model use" + ) + parser.add_argument("--lora_path", type=str, default=None, help="The path of the LoRA weights to be used") + parser.add_argument("--lora_rank", type=int, default=128, help="The rank of the LoRA weights") + parser.add_argument("--output_path", type=str, default="./output.mp4", help="The path save generated video") + parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") + parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps") + parser.add_argument("--num_frames", type=int, default=81, help="Number of steps for the inference process") + parser.add_argument("--width", type=int, default=None, help="The width of the generated video") + parser.add_argument("--height", type=int, default=None, help="The height of the generated video") + parser.add_argument("--fps", type=int, default=16, help="The frames per second for the generated video") + parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") + parser.add_argument("--generate_type", type=str, default="t2v", help="The type of video generation") + parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation") + parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") + + args = parser.parse_args() + dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 + generate_video( + prompt=args.prompt, + 
model_path=args.model_path, + lora_path=args.lora_path, + lora_rank=args.lora_rank, + output_path=args.output_path, + num_frames=args.num_frames, + width=args.width, + height=args.height, + image_or_video_path=args.image_or_video_path, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + num_videos_per_prompt=args.num_videos_per_prompt, + dtype=dtype, + generate_type=args.generate_type, + seed=args.seed, + fps=args.fps, + ) diff --git a/inference/flovd_ddp_demo.py b/inference/flovd_ddp_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..4e4fa8ce55e5b8b2234dd4c1b0f9dd266696ca70 --- /dev/null +++ b/inference/flovd_ddp_demo.py @@ -0,0 +1,817 @@ +""" +This script demonstrates how to generate a video using the CogVideoX model with the Hugging Face `diffusers` pipeline. +The script supports different types of video generation, including text-to-video (t2v), image-to-video (i2v), +and video-to-video (v2v), depending on the input data and different weight. + +- text-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- video-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- image-to-video: THUDM/CogVideoX-5b-I2V or THUDM/CogVideoX1.5-5b-I2V + +Running the Script: +To run the script, use the following command with appropriate arguments: + +```bash +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" +``` + +You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory + +Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths. 
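+
+Note: this file adapts the demo above for FloVD. It loads the OMSM (LoRA) and FVSM (controlnet)
+pipelines, iterates over a preprocessed dataset with a DistributedSampler, and writes generated
+videos, flow videos, and flow-warped videos under `--output_path`. See the argument parser at the
+bottom for the DDP options (`--launcher pytorch|slurm`, `--port`).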
+ +""" +from typing import TYPE_CHECKING, Any, Dict, List, Tuple +import argparse +import logging +import os +import sys +from typing import Literal, Optional +from pathlib import Path +import json +from datetime import timedelta +import random +from safetensors.torch import load_file, save_file +from tqdm import tqdm +from einops import rearrange, repeat +import math +import numpy as np +from PIL import Image + +import torch + +from diffusers import ( + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXPipeline, + CogVideoXVideoToVideoPipeline, + AutoencoderKLCogVideoX, + CogVideoXTransformer3DModel, +) +from diffusers.utils import export_to_video, load_image, load_video +from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict + +sys.path.append(os.path.abspath(os.path.join(sys.path[0], "../"))) +from finetune.pipeline.flovd_FVSM_cogvideox_controlnet_pipeline import FloVDCogVideoXControlnetImageToVideoPipeline +from finetune.pipeline.flovd_OMSM_cogvideox_pipeline import FloVDOMSMCogVideoXImageToVideoPipeline +from finetune.schemas import Components, Args +from finetune.modules.cogvideox_controlnet import CogVideoXControlnet +from finetune.modules.cogvideox_custom_model import CustomCogVideoXTransformer3DModel +from transformers import AutoTokenizer, T5EncoderModel + +from finetune.modules.camera_sampler import SampleManualCam +from finetune.modules.camera_flow_generator import CameraFlowGenerator +from finetune.modules.utils import get_camera_flow_generator_input, forward_bilinear_splatting, flow_to_color +from finetune.modules.depth_warping.depth_warping import unnormalize_intrinsic + +from finetune.datasets.utils import ( + preprocess_image_with_resize, + preprocess_video_with_resize, +) + + +from torch.utils.data import Dataset +from torchvision import transforms + +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler + +import pdb +sys.path.append(os.path.abspath(os.path.join(sys.path[-1], 'finetune'))) # for camera flow generator + + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +logging.basicConfig(level=logging.INFO) + +# Recommended resolution for each model (width, height) +RESOLUTION_MAP = { + # cogvideox1.5-* + "cogvideox1.5-5b-i2v": (768, 1360), + "cogvideox1.5-5b": (768, 1360), + # cogvideox-* + "cogvideox-5b-i2v": (480, 720), + "cogvideox-5b": (480, 720), + "cogvideox-2b": (480, 720), +} + + + + +def init_dist(launcher="slurm", backend='nccl', port=29500, **kwargs): + """Initializes distributed environment.""" + if launcher == 'pytorch': + rank = int(os.environ['RANK']) + num_gpus = torch.cuda.device_count() + local_rank = rank % num_gpus + torch.cuda.set_device(local_rank) + dist.init_process_group(backend=backend, timeout=timedelta(minutes=30), **kwargs) + + elif launcher == 'slurm': + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + num_gpus = torch.cuda.device_count() + local_rank = proc_id % num_gpus + torch.cuda.set_device(local_rank) + addr = subprocess.getoutput( + f'scontrol show hostname {node_list} | head -n1') + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['RANK'] = str(proc_id) + port = os.environ.get('PORT', port) + os.environ['MASTER_PORT'] = str(port) + dist.init_process_group(backend=backend, timeout=timedelta(minutes=30)) + + else: + raise NotImplementedError(f'Not implemented launcher type: `{launcher}`!') + # 
https://github.com/pytorch/pytorch/issues/98763 + # torch.cuda.set_device(local_rank) + + return local_rank + + +def load_cogvideox_flovd_FVSM_controlnet_pipeline(controlnet_path, backbone_path, device, dtype): + controlnet_sd = torch.load(controlnet_path, map_location='cpu')['module'] + + tokenizer = AutoTokenizer.from_pretrained(backbone_path, subfolder="tokenizer") + text_encoder = T5EncoderModel.from_pretrained(backbone_path, subfolder="text_encoder") + transformer = CustomCogVideoXTransformer3DModel.from_pretrained(backbone_path, subfolder="transformer") + vae = AutoencoderKLCogVideoX.from_pretrained(backbone_path, subfolder="vae") + scheduler = CogVideoXDPMScheduler.from_pretrained(backbone_path, subfolder="scheduler") + + additional_kwargs = { + 'num_layers': 6, + 'out_proj_dim_factor': 64, + 'out_proj_dim_zero_init': True, + 'notextinflow': True, + } + controlnet = CogVideoXControlnet.from_pretrained(backbone_path, subfolder="transformer", **additional_kwargs) + controlnet.eval() + + missing, unexpected = controlnet.load_state_dict(controlnet_sd) + + if len(missing) != 0 or len(unexpected) != 0: + print(f"Missing keys : {missing}") + print(f"Unexpected keys : {unexpected}") + + pipe = FloVDCogVideoXControlnetImageToVideoPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + transformer=transformer, + controlnet=controlnet, + scheduler=scheduler, + ) + + # pipe.enable_model_cpu_offload(device=device) + pipe = pipe.to(device, dtype) + + return pipe + +def load_cogvideox_flovd_OMSM_lora_pipeline(omsm_path, backbone_path, transformer_lora_config, device, dtype): + tokenizer = AutoTokenizer.from_pretrained(backbone_path, subfolder="tokenizer") + text_encoder = T5EncoderModel.from_pretrained(backbone_path, subfolder="text_encoder") + transformer = CogVideoXTransformer3DModel.from_pretrained(backbone_path, subfolder="transformer") + vae = AutoencoderKLCogVideoX.from_pretrained(backbone_path, subfolder="vae") + scheduler = CogVideoXDPMScheduler.from_pretrained(backbone_path, subfolder="scheduler") + + # 1) Load Lora weight + transformer.add_adapter(transformer_lora_config) + + lora_state_dict = FloVDOMSMCogVideoXImageToVideoPipeline.lora_state_dict(omsm_path) + transformer_state_dict = { + f'{k.replace("transformer.", "")}': v + for k, v in lora_state_dict.items() + if k.startswith("transformer.") + } + incompatible_keys = set_peft_model_state_dict(transformer, transformer_state_dict, adapter_name="default") + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. 
" + ) + + # 2) Load Other weight + load_path = os.path.join(omsm_path, "selected_blocks.safetensors") + if os.path.exists(load_path): + tensor_dict = load_file(load_path) + + block_state_dicts = {} + for k, v in tensor_dict.items(): + block_name, param_name = k.split(".", 1) + if block_name not in block_state_dicts: + block_state_dicts[block_name] = {} + block_state_dicts[block_name][param_name] = v + + for block_name, state_dict in block_state_dicts.items(): + if hasattr(transformer, block_name): + getattr(transformer, block_name).load_state_dict(state_dict) + else: + raise ValueError(f"Transformer has no attribute '{block_name}'") + + + pipe = FloVDOMSMCogVideoXImageToVideoPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + transformer=transformer, + scheduler=scheduler, + ) + + # pipe.load_lora_weights(omsm_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1") + # pipe.fuse_lora(components=["transformer"], lora_scale=1.0) + + # pipe.enable_model_cpu_offload(device=device) + pipe = pipe.to(device, dtype) + + return pipe + + +class I2VFlowDataset_Inference(Dataset): + def __init__( + self, + max_num_frames: int, + height: int, + width: int, + data_root: str, + max_num_videos: int = None, + ) -> None: + + self.train_resolution = (int(max_num_frames), int(height), int(width)) + + data_root = Path(data_root) + metadata_path = data_root / "metadata_reformat.jsonl" + assert metadata_path.is_file(), "For this dataset type, you need metadata.jsonl in the root path" + + metadata = [] + with open(metadata_path, "r") as f: + for line in f: + metadata.append( json.loads(line) ) + + metadata = random.sample(metadata, max_num_videos) + + self.prompts = [x["prompt"] for x in metadata] + self.prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata] + self.videos = [data_root / "video_latent" / "x".join(str(x) for x in self.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata] + self.images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata] + self.flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata] + self.masks = [data_root / "valid_mask" / (x["hash_code"] + '.bin') for x in metadata] + + self.max_num_frames = max_num_frames + self.height = height + self.width = width + + self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)]) + self.__image_transforms = self.__frame_transforms + + self.length = len(self.videos) + + print(f"Dataset size: {self.length}") + + def __len__(self) -> int: + return self.length + + def load_data_pair(self, index): + prompt_embedding_path = self.prompt_embeddings[index] + encoded_video_path = self.videos[index] + encoded_flow_path = self.flows[index] + + prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"] + encoded_video = load_file(encoded_video_path)["encoded_video"] # CFHW + encoded_flow = load_file(encoded_flow_path)["encoded_flow_f"] # CFHW + + return prompt_embedding, encoded_video, encoded_flow + + def __getitem__(self, index: int) -> Dict[str, Any]: + while True: + try: + prompt_embedding, encoded_video, encoded_flow = self.load_data_pair(index) + break + except Exception as e: + print(f"Error loading {self.prompt_embeddings[index]}: {str(e)}") + index = random.randint(0, self.length - 1) + + image_path = self.images[index] + prompt = self.prompts[index] + + _, image = self.preprocess(None, image_path) + image = 
self.image_transform(image) + + + # shape of encoded_video: [C, F, H, W] + # shape and scale of image: [C, H, W], [-1,1] + return { + "image": image, + "prompt": prompt, + "prompt_embedding": prompt_embedding, + "encoded_video": encoded_video, + "encoded_flow": encoded_flow, + "video_metadata": { + "num_frames": encoded_video.shape[1], + "height": encoded_video.shape[2], + "width": encoded_video.shape[3], + }, + } + + def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]: + if video_path is not None: + video = preprocess_video_with_resize(video_path, self.max_num_frames, self.height, self.width) + else: + video = None + if image_path is not None: + image = preprocess_image_with_resize(image_path, self.height, self.width) + else: + image = None + return video, image + + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + return torch.stack([self.__frame_transforms(f) for f in frames], dim=0) + + def image_transform(self, image: torch.Tensor) -> torch.Tensor: + return self.__image_transforms(image) + +def initialize_flow_generator(target): + depth_estimator_kwargs = { + "target": target, + "kwargs": { + "ckpt_path": '/workspace/workspace/checkpoints/depth_anything/depth_anything_v2_metric_hypersim_vitb.pth', + "model_config": { + "max_depth": 20, + "encoder": 'vitb', + "features": 128, + "out_channels": [96, 192, 384, 768], + } + + } + } + + return CameraFlowGenerator(depth_estimator_kwargs) + +def integrate_flow(camera_flow, object_flow, depth_ctxt, camera_flow_generator, camera_flow_generator_input): + # camera_flow: (BF)CHW + # object_flow: (BF)CHW + # depth_ctxt: B1HW + + B, F = camera_flow_generator_input["target"]["intrinsics"].shape[:2] + H, W = object_flow.shape[-2:] + + c2w_ctxt = repeat(camera_flow_generator_input["context"]["extrinsics"], "b t h w -> (b v t) h w", v=F) # No need to apply inverse as it is an eye matrix. 
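+    # Shape bookkeeping (illustrative): extrinsics arrive as (B, T, 4, 4) and intrinsics as (B, T, 3, 3).
+    # The single identity context pose is repeated F times, the target W2C poses are inverted to C2W and
+    # flattened to ((B*F), 4, 4), and the context intrinsics are unnormalized to pixel units so that the
+    # depth-warping module can warp each frame independently.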
+ c2w_trgt = rearrange(torch.inverse(camera_flow_generator_input["target"]["extrinsics"]), "b t h w -> (b t) h w") + intrinsics_ctxt = unnormalize_intrinsic(repeat(camera_flow_generator_input["context"]["intrinsics"], "b t h w -> (b v t) h w", v=F), size=(H, W)) + + with torch.cuda.amp.autocast(enabled=False): + warped_object_flow = camera_flow_generator.depth_warping_module.warper.forward_warp_displacement( + depth1=repeat(depth_ctxt, "b c h w -> (b f) c h w", f=F), + flow1=object_flow, + transformation1=c2w_ctxt, + transformation2=c2w_trgt, + intrinsic1=intrinsics_ctxt, + intrinsic2=None, + ) + + integrated_flow = camera_flow + warped_object_flow + + return integrated_flow + +def save_flow(flow, filename, fps=16): + # flow: (BF)CHW, arbitrary scale + flow_RGB = flow_to_color(flow) # BF,C,H,W (B=1) + + frame_list = [] + for frame in flow_RGB: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + export_to_video(frame_list, filename, fps=fps) + +def save_flow_warped_video(image, flow, filename, fps=16): + # image: CHW, 0~255 scale + # flow: (BF)CHW, arbitrary scale + warped_video = forward_bilinear_splatting(repeat(image, 'c h w -> f c h w', f=flow.size(0)), flow.to(torch.float)) + + frame_list = [] + for frame in warped_video: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + export_to_video(frame_list, filename, fps=fps) + +def generate_video( + # prompt: str, + launcher: str, + port: int, + data_root: str, + fvsm_path: str, + omsm_path: str, + num_frames: int = 81, + width: Optional[int] = None, + height: Optional[int] = None, + output_path: str = "./output.mp4", + image_path: str = "", + num_inference_steps: int = 50, + guidance_scale: float = 6.0, + num_videos_per_prompt: int = 1, + dtype: torch.dtype = torch.bfloat16, + seed: int = 42, + fps: int = 16, + controlnet_guidance_end: float = 0.4, + max_num_videos: int = None, + use_dynamic_cfg: bool = False, + pose_type: str = "manual", + speed: float = 0.5, + use_flow_integration: bool = False, +): + """ + Generates a video based on the given prompt and saves it to the specified path. + + Parameters: + - prompt (str): The description of the video to be generated. + - lora_path (str): The path of the LoRA weights to be used. + - lora_rank (int): The rank of the LoRA weights. + - output_path (str): The path where the generated video will be saved. + - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality. + - num_frames (int): Number of frames to generate. CogVideoX1.0 generates 49 frames for 6 seconds at 8 fps, while CogVideoX1.5 produces either 81 or 161 frames, corresponding to 5 seconds or 10 seconds at 16 fps. + - width (int): The width of the generated video, applicable only for CogVideoX1.5-5B-I2V + - height (int): The height of the generated video, applicable only for CogVideoX1.5-5B-I2V + - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt. + - num_videos_per_prompt (int): Number of videos to generate per prompt. + - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). + - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').· + - seed (int): The seed for reproducibility. + - fps (int): The frames per second for the generated video. 
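+    - data_root (str): Root of the preprocessed dataset; `I2VFlowDataset_Inference` expects `metadata_reformat.jsonl` plus the precomputed prompt embeddings, video latents, first frames, and flow latents.
+    - fvsm_path (str) / omsm_path (str): Checkpoint paths for the FVSM controlnet and the OMSM LoRA weights.
+    - launcher (str): Distributed launcher, either 'pytorch' or 'slurm'.
+    - max_num_videos (int): Number of dataset samples drawn for inference.
+    - controlnet_guidance_end (float): Fraction of the denoising schedule (starting from 0.0) during which the FVSM controlnet conditioning is applied.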
+ """ + + # Distributed + local_rank = init_dist(launcher=launcher, port=port) + global_rank = dist.get_rank() + num_processes = dist.get_world_size() + is_main_process = global_rank == 0 + + torch.manual_seed(seed) + random.seed(seed) + + if is_main_process: + os.makedirs(os.path.join(output_path, 'generated_videos'), exist_ok=True) + os.makedirs(os.path.join(output_path, 'generated_flow_videos'), exist_ok=True) + os.makedirs(os.path.join(output_path, 'flow_warped_videos'), exist_ok=True) + + # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). + # add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload() + # function to use Multi GPUs. + + image = None + video = None + + model_name = "cogvideox-5b-i2v".lower() + desired_resolution = RESOLUTION_MAP[model_name] + if width is None or height is None: + height, width = desired_resolution + logging.info(f"\033[1mUsing default resolution {desired_resolution} for {model_name}\033[0m") + elif (height, width) != desired_resolution: + if generate_type == "i2v": + # For i2v models, use user-defined width and height + logging.warning( + f"\033[1;31mThe width({width}) and height({height}) are not recommended for {model_name}. The best resolution is {desired_resolution}.\033[0m" + ) + + """ + # Prepare Dataset Class.. + """ + # image = load_image(image=image_or_video_path) + + # prompt + # first image + # camera parameters + dataset = I2VFlowDataset_Inference( + max_num_frames=num_frames, + height=height, + width=width, + data_root=data_root, + max_num_videos=max_num_videos, + ) + + + distributed_sampler = DistributedSampler( + dataset, + num_replicas=num_processes, + rank=global_rank, + shuffle=False, + seed=seed, + ) + + # DataLoaders creation: + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=1, + shuffle=False, + sampler=distributed_sampler, + num_workers=4, + pin_memory=True, + drop_last=False, + ) + + + """ + # Prepare Pipeline + """ + transformer_lora_config = LoraConfig( + r=128, + lora_alpha=64, + init_lora_weights=True, + target_modules=["to_q", "to_k", "to_v", "to_out.0", "norm1.linear", "norm2.linear", "ff.net.2"], + ) + + print(f'Constructing pipeline') + pipe_omsm = load_cogvideox_flovd_OMSM_lora_pipeline(omsm_path, backbone_path="THUDM/CogVideoX-5b-I2V", transformer_lora_config=transformer_lora_config, device=local_rank, dtype=dtype) + pipe_fvsm = load_cogvideox_flovd_FVSM_controlnet_pipeline(fvsm_path, backbone_path="THUDM/CogVideoX-5b-I2V", device=local_rank, dtype=dtype) + print(f'Done loading pipeline') + + assert pose_type in ['re10k', 'manual'], "Choose other pose_type between ['re10k', 'manual']" + if pose_type == 're10k': + root_path = "./manual_poses_re10k" + else: + root_path = "./manual_poses" + + CameraSampler = SampleManualCam(pose_type=pose_type, root_path=root_path) + camera_flow_generator_target = 'finetune.modules.depth_warping.depth_warping.DepthWarping_wrapper' + camera_flow_generator = initialize_flow_generator(camera_flow_generator_target).to(local_rank) + #-------------------------------------------------------------------------------------------------------- + + + # 2. Set Scheduler. + # Can be changed to `CogVideoXDPMScheduler` or `CogVideoXDDIMScheduler`. + # We recommend using `CogVideoXDDIMScheduler` for CogVideoX-2B. + # using `CogVideoXDPMScheduler` for CogVideoX-5B / CogVideoX-5B-I2V. 
+ + # pipe.scheduler = CogVideoXDDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + pipe_fvsm.scheduler = CogVideoXDPMScheduler.from_config(pipe_fvsm.scheduler.config, timestep_spacing="trailing") + pipe_omsm.scheduler = CogVideoXDPMScheduler.from_config(pipe_omsm.scheduler.config, timestep_spacing="trailing") + + # 3. Enable CPU offload for the model. + # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference + # and enable to("cuda") + # pipe.to("cuda") + + # pipe_fvsm.enable_model_cpu_offload() + # pipe_omsm.enable_model_cpu_offload() + # pipe_fvsm.enable_sequential_cpu_offload() + # pipe_omsm.enable_sequential_cpu_offload() + + pipe_fvsm.vae.enable_slicing() + pipe_fvsm.vae.enable_tiling() + pipe_omsm.vae.enable_slicing() + pipe_omsm.vae.enable_tiling() + + dataloader.sampler.set_epoch(1) + dist.barrier() + + output_video_path = os.path.join(output_path, 'generated_videos') + output_flow_path = os.path.join(output_path, 'generated_flow_videos') + output_warped_video_path = os.path.join(output_path, 'flow_warped_videos') + + data_iter = iter(dataloader) + for step in tqdm(range(0, len(dataloader))): + batch = next(data_iter) + + prompt = batch["prompt"][0] + image = batch["image"].to(local_rank) + prompt_embedding = batch["prompt_embedding"].to(local_rank) + prompt_short = prompt[:20].strip() + + # if step < 10: + # step += 1 + # continue + + # Get Camera flow + camparam, cam_name = CameraSampler.sample() # W2C + image_torch_255 = ((image.detach().clone()+1)/2. * 255.).squeeze(0) + camera_flow_generator_input = get_camera_flow_generator_input(image_torch_255, camparam, device=local_rank, speed=speed) + image_torch = ((image_torch_255.unsqueeze(0) / 255.) * 2. - 1.).to(local_rank) + + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=True, dtype=dtype): + + # camera_flow, log_dict = camera_flow_generator(image_torch, camera_flow_generator_input) + # camera_flow = camera_flow.to(local_rank, dtype) + + # camera_flow_latent = rearrange(encode_flow(camera_flow, pipe_omsm.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(local_rank, dtype) + + flow_latent = pipe_omsm( + num_frames=num_frames, + height=height, + width=width, + prompt=None, + prompt_embeds=prompt_embedding, + image=image, + generator=torch.Generator().manual_seed(seed), + num_inference_steps=num_inference_steps, + use_dynamic_cfg=use_dynamic_cfg, + output_type='latent' + ).frames[0] + object_flow = decode_flow(flow_latent.detach().clone().unsqueeze(0).to(local_rank), pipe_omsm.vae, flow_scale_factor=[60, 36]) # BF,C,H,W + + if use_flow_integration: + # Integrate camera (from 3D warping) and object (from OMSM) flow maps + # Using segmentation model will be implemented later.. + + camera_flow, log_dict = camera_flow_generator(image_torch, camera_flow_generator_input) + camera_flow = camera_flow.to(local_rank, dtype) + + integrated_flow = integrate_flow(camera_flow, object_flow, log_dict['depth_ctxt'], camera_flow_generator, camera_flow_generator_input) + integrated_flow_latent = rearrange(encode_flow(integrated_flow, pipe_omsm.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(local_rank, dtype) + else: + integrated_flow_latent = rearrange(flow_latent, '(b f) c h w -> b f c h w', b=image.size(0)) + + # 4. Generate the video frames based on the prompt. + # `num_frames` is the Number of frames to generate. 
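+                # FVSM stage: the flow latent prepared above (object flow from OMSM, optionally fused with the
+                # camera flow) conditions the controlnet branch; `controlnet_guidance_start`/`_end` restrict that
+                # conditioning to the early portion of the denoising schedule.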
+ video_generate = pipe_fvsm( + num_frames=num_frames, + height=height, + width=width, + prompt=None, + prompt_embeds=prompt_embedding, + image=image, + flow_latent=integrated_flow_latent, + valid_mask=None, + generator=torch.Generator().manual_seed(seed), + num_inference_steps=num_inference_steps, + controlnet_guidance_start = 0.0, + controlnet_guidance_end = controlnet_guidance_end, + use_dynamic_cfg=use_dynamic_cfg, + ).frames[0] + + # Save logs + # 1) Synthesized flow (object_flow) + save_path = os.path.join(output_flow_path, f"{prompt_short}_DCFG-{use_dynamic_cfg}_ContGuide-{controlnet_guidance_end}_{cam_name}.mp4") + save_flow(object_flow, filename=save_path, fps=fps) + + # 2) Flow-Warped Video + save_path = os.path.join(output_warped_video_path, f"{prompt_short}_DCFG-{use_dynamic_cfg}_ContGuide-{controlnet_guidance_end}_{cam_name}.mp4") + save_flow_warped_video(image_torch_255, object_flow, filename=save_path, fps=fps) + + # 3) Flow-Cond. Synthesized Video + save_path = os.path.join(output_video_path, f"{prompt_short}_DCFG-{use_dynamic_cfg}_ContGuide-{controlnet_guidance_end}_{cam_name}.mp4") + export_to_video(video_generate, save_path, fps=fps) + + dist.barrier() + + step += 1 + + +#-------------------------------------------------------------------------------------------------- +def encode_video(video: torch.Tensor, vae) -> torch.Tensor: + # shape of input video: [B, C, F, H, W] + video = video.to(vae.device, dtype=vae.dtype) + latent_dist = vae.encode(video).latent_dist + latent = latent_dist.sample() * vae.config.scaling_factor + return latent + +def encode_flow(flow, vae, flow_scale_factor): + # flow: BF,C,H,W + # flow_scale_factor [sf_x, sf_y] + assert flow.ndim == 4 + num_frames, _, height, width = flow.shape + + # Normalize optical flow + # ndim: 4 -> 5 + flow = rearrange(flow, '(b f) c h w -> b f c h w', b=1) + flow_norm = adaptive_normalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + # ndim: 5 -> 4 + flow_norm = rearrange(flow_norm, 'b f c h w -> (b f) c h w', b=1) + + # Duplicate mean value for third channel + num_frames, _, H, W = flow_norm.shape + flow_norm_extended = torch.empty((num_frames, 3, height, width)).to(flow_norm) + flow_norm_extended[:,:2] = flow_norm + flow_norm_extended[:,-1:] = flow_norm.mean(dim=1, keepdim=True) + flow_norm_extended = rearrange(flow_norm_extended, '(b f) c h w -> b c f h w', f=num_frames) + + return encode_video(flow_norm_extended, vae) + + +def decode_flow(flow_latent, vae, flow_scale_factor): + flow_latent = flow_latent.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + flow_latent = 1 / vae.config.scaling_factor * flow_latent + + flow = vae.decode(flow_latent).sample # BCFHW + + # discard third channel (which is a mean value of f_x and f_y) + flow = flow[:,:2].detach().clone() + + # Unnormalize optical flow + flow = rearrange(flow, 'b c f h w -> b f c h w') + flow = adaptive_unnormalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + flow = rearrange(flow, 'b f c h w -> (b f) c h w') + return flow # BF,C,H,W + +def adaptive_normalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + b, f, c, h, w = flow.shape + + max_clip_x = math.sqrt(w/sf_x) * 1.0 + max_clip_y = math.sqrt(h/sf_y) * 1.0 + + flow_norm = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_x_norm = torch.sign(flow_x) * 
torch.sqrt(torch.abs(flow_x)/sf_x + 1e-7) + flow_y_norm = torch.sign(flow_y) * torch.sqrt(torch.abs(flow_y)/sf_y + 1e-7) + + flow_norm[:, :, 0] = torch.clamp(flow_x_norm, min=-max_clip_x, max=max_clip_x) + flow_norm[:, :, 1] = torch.clamp(flow_y_norm, min=-max_clip_y, max=max_clip_y) + + return flow_norm + + +def adaptive_unnormalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + + flow_orig = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_orig[:, :, 0] = torch.sign(flow_x) * sf_x * (flow_x**2 - 1e-7) + flow_orig[:, :, 1] = torch.sign(flow_y) * sf_y * (flow_y**2 - 1e-7) + + return flow_orig + +#-------------------------------------------------------------------------------------------------- + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX") + # parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated") + parser.add_argument("--image_path", type=str, default=None, help="The path of the image to be used as the background of the video",) + parser.add_argument("--data_root", type=str, required=True, help="The path of the dataset root",) + parser.add_argument("--fvsm_path", type=str, required=True, help="Path of the pre-trained model use") + parser.add_argument("--omsm_path", type=str, required=True, help="Path of the pre-trained model use") + parser.add_argument("--output_path", type=str, default="./output.mp4", help="The path save generated video") + parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") + parser.add_argument("--num_inference_steps", type=int, default=50, help="Inference steps") + parser.add_argument("--num_frames", type=int, default=49, help="Number of steps for the inference process") + parser.add_argument("--width", type=int, default=None, help="The width of the generated video") + parser.add_argument("--height", type=int, default=None, help="The height of the generated video") + parser.add_argument("--fps", type=int, default=16, help="The frames per second for the generated video") + parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") + parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation") + parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") + parser.add_argument("--controlnet_guidance_end", type=float, default=0.4, help="Controlnet guidance end during sampling") + parser.add_argument("--max_num_videos", type=int, default=None, help="# of videos for inference") + parser.add_argument("--use_dynamic_cfg", action='store_true') + parser.add_argument("--pose_type", type=str, default='manual', help="pose type in the inference time") + parser.add_argument("--speed", type=float, default=0.5, help="pose type in the inference time") + parser.add_argument("--use_flow_integration", action='store_true') + + + # DDP args + parser.add_argument("--launcher", type=str, choices=["pytorch", "slurm"], default="pytorch") + parser.add_argument("--world_size", default=1, type=int, + help="number of the distributed processes.") + parser.add_argument('--local-rank', type=int, default=-1, + help='Replica rank on the current node. 
This field is required ' + 'by `torch.distributed.launch`.') + parser.add_argument("--global_seed", default=42, type=int, + help="seed") + parser.add_argument("--port", type=int) + parser.add_argument("--local_rank", type=int, help="Local rank. Necessary for using the torch.distributed.launch utility.") + + + args = parser.parse_args() + dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 + + + generate_video( + # prompt=args.prompt, + launcher=args.launcher, + port=args.port, + data_root=args.data_root, + fvsm_path=args.fvsm_path, + omsm_path=args.omsm_path, + output_path=args.output_path, + num_frames=args.num_frames, + width=args.width, + height=args.height, + image_path=args.image_path, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + num_videos_per_prompt=args.num_videos_per_prompt, + dtype=dtype, + seed=args.seed, + fps=args.fps, + controlnet_guidance_end=args.controlnet_guidance_end, + max_num_videos=args.max_num_videos, + use_dynamic_cfg=args.use_dynamic_cfg, + pose_type=args.pose_type, + speed=args.speed, + use_flow_integration=args.use_flow_integration, + ) diff --git a/inference/flovd_demo.py b/inference/flovd_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..34ffc955e8459599a1296cefa9110668318c9c77 --- /dev/null +++ b/inference/flovd_demo.py @@ -0,0 +1,594 @@ +""" +This script demonstrates how to generate a video using the CogVideoX model with the Hugging Face `diffusers` pipeline. +The script supports different types of video generation, including text-to-video (t2v), image-to-video (i2v), +and video-to-video (v2v), depending on the input data and different weight. + +- text-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- video-to-video: THUDM/CogVideoX-5b, THUDM/CogVideoX-2b or THUDM/CogVideoX1.5-5b +- image-to-video: THUDM/CogVideoX-5b-I2V or THUDM/CogVideoX1.5-5b-I2V + +Running the Script: +To run the script, use the following command with appropriate arguments: + +```bash +$ python cli_demo.py --prompt "A girl riding a bike." --model_path THUDM/CogVideoX1.5-5b --generate_type "t2v" +``` + +You can change `pipe.enable_sequential_cpu_offload()` to `pipe.enable_model_cpu_offload()` to speed up inference, but this will use more GPU memory + +Additional options are available to specify the model path, guidance scale, number of inference steps, video generation type, and output paths. 
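+
+Note: this is the single-GPU FloVD demo. It takes `--prompt`, `--image_path`, `--fvsm_path`, and
+`--omsm_path`, samples a camera trajectory (optionally chosen by name with `--cam_pose_name`), and
+saves the result under `<output_path>/generated_videos`.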
+ +""" +from typing import TYPE_CHECKING, Any, Dict, List, Tuple +import argparse +import logging +import os +import sys +from typing import Literal, Optional +from pathlib import Path +import json +from datetime import timedelta +import random +from safetensors.torch import load_file, save_file +from tqdm import tqdm +from einops import rearrange, repeat +import math +import numpy as np +from PIL import Image + +import torch + +from diffusers import ( + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXPipeline, + CogVideoXVideoToVideoPipeline, + AutoencoderKLCogVideoX, + CogVideoXTransformer3DModel, +) +from diffusers.utils import export_to_video, load_image, load_video +from peft import LoraConfig, get_peft_model_state_dict, set_peft_model_state_dict + +sys.path.append(os.path.abspath(os.path.join(sys.path[0], "../"))) +from finetune.pipeline.flovd_FVSM_cogvideox_controlnet_pipeline import FloVDCogVideoXControlnetImageToVideoPipeline +from finetune.pipeline.flovd_OMSM_cogvideox_pipeline import FloVDOMSMCogVideoXImageToVideoPipeline +from finetune.schemas import Components, Args +from finetune.modules.cogvideox_controlnet import CogVideoXControlnet +from finetune.modules.cogvideox_custom_model import CustomCogVideoXTransformer3DModel +from transformers import AutoTokenizer, T5EncoderModel + +from finetune.modules.camera_sampler import SampleManualCam +from finetune.modules.camera_flow_generator import CameraFlowGenerator +from finetune.modules.utils import get_camera_flow_generator_input, forward_bilinear_splatting, flow_to_color +from finetune.modules.depth_warping.depth_warping import unnormalize_intrinsic + +from finetune.datasets.utils import ( + preprocess_image_with_resize, + preprocess_video_with_resize, +) + + +from torch.utils.data import Dataset +from torchvision import transforms + + +import pdb +sys.path.append(os.path.abspath(os.path.join(sys.path[-1], 'finetune'))) # for camera flow generator + + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +logging.basicConfig(level=logging.INFO) + +# Recommended resolution for each model (width, height) +RESOLUTION_MAP = { + # cogvideox1.5-* + "cogvideox1.5-5b-i2v": (768, 1360), + "cogvideox1.5-5b": (768, 1360), + # cogvideox-* + "cogvideox-5b-i2v": (480, 720), + "cogvideox-5b": (480, 720), + "cogvideox-2b": (480, 720), +} + + + + +def load_cogvideox_flovd_FVSM_controlnet_pipeline(controlnet_path, backbone_path, device, dtype): + controlnet_sd = torch.load(controlnet_path, map_location='cpu')['module'] + + tokenizer = AutoTokenizer.from_pretrained(backbone_path, subfolder="tokenizer") + text_encoder = T5EncoderModel.from_pretrained(backbone_path, subfolder="text_encoder") + transformer = CustomCogVideoXTransformer3DModel.from_pretrained(backbone_path, subfolder="transformer") + vae = AutoencoderKLCogVideoX.from_pretrained(backbone_path, subfolder="vae") + scheduler = CogVideoXDPMScheduler.from_pretrained(backbone_path, subfolder="scheduler") + + additional_kwargs = { + 'num_layers': 6, + 'out_proj_dim_factor': 64, + 'out_proj_dim_zero_init': True, + 'notextinflow': True, + } + controlnet = CogVideoXControlnet.from_pretrained(backbone_path, subfolder="transformer", **additional_kwargs) + controlnet.eval() + + missing, unexpected = controlnet.load_state_dict(controlnet_sd) + + if len(missing) != 0 or len(unexpected) != 0: + print(f"Missing keys : {missing}") + print(f"Unexpected keys : {unexpected}") + + pipe = FloVDCogVideoXControlnetImageToVideoPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + 
vae=vae, + transformer=transformer, + controlnet=controlnet, + scheduler=scheduler, + ) + + # pipe.enable_model_cpu_offload(device=device) + pipe = pipe.to(device, dtype) + + return pipe + +def load_cogvideox_flovd_OMSM_lora_pipeline(omsm_path, backbone_path, transformer_lora_config, device, dtype): + tokenizer = AutoTokenizer.from_pretrained(backbone_path, subfolder="tokenizer") + text_encoder = T5EncoderModel.from_pretrained(backbone_path, subfolder="text_encoder") + transformer = CogVideoXTransformer3DModel.from_pretrained(backbone_path, subfolder="transformer") + vae = AutoencoderKLCogVideoX.from_pretrained(backbone_path, subfolder="vae") + scheduler = CogVideoXDPMScheduler.from_pretrained(backbone_path, subfolder="scheduler") + + # 1) Load Lora weight + transformer.add_adapter(transformer_lora_config) + + lora_state_dict = FloVDOMSMCogVideoXImageToVideoPipeline.lora_state_dict(omsm_path) + transformer_state_dict = { + f'{k.replace("transformer.", "")}': v + for k, v in lora_state_dict.items() + if k.startswith("transformer.") + } + incompatible_keys = set_peft_model_state_dict(transformer, transformer_state_dict, adapter_name="default") + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) + + # 2) Load Other weight + load_path = os.path.join(omsm_path, "selected_blocks.safetensors") + if os.path.exists(load_path): + tensor_dict = load_file(load_path) + + block_state_dicts = {} + for k, v in tensor_dict.items(): + block_name, param_name = k.split(".", 1) + if block_name not in block_state_dicts: + block_state_dicts[block_name] = {} + block_state_dicts[block_name][param_name] = v + + for block_name, state_dict in block_state_dicts.items(): + if hasattr(transformer, block_name): + getattr(transformer, block_name).load_state_dict(state_dict) + else: + raise ValueError(f"Transformer has no attribute '{block_name}'") + + + pipe = FloVDOMSMCogVideoXImageToVideoPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + transformer=transformer, + scheduler=scheduler, + ) + + # pipe.load_lora_weights(omsm_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1") + # pipe.fuse_lora(components=["transformer"], lora_scale=1.0) + + # pipe.enable_model_cpu_offload(device=device) + pipe = pipe.to(device, dtype) + + return pipe + + +def initialize_flow_generator(target, ckpt_path): + depth_estimator_kwargs = { + "target": target, + "kwargs": { + "ckpt_path": ckpt_path, # './ckpt/others/depth_anything_v2_metric_hypersim_vitb.pth', + "model_config": { + "max_depth": 20, + "encoder": 'vitb', + "features": 128, + "out_channels": [96, 192, 384, 768], + } + + } + } + + return CameraFlowGenerator(depth_estimator_kwargs) + +def integrate_flow(camera_flow, object_flow, depth_ctxt, camera_flow_generator, camera_flow_generator_input): + # camera_flow: (BF)CHW + # object_flow: (BF)CHW + # depth_ctxt: B1HW + + B, F = camera_flow_generator_input["target"]["intrinsics"].shape[:2] + H, W = object_flow.shape[-2:] + + c2w_ctxt = repeat(camera_flow_generator_input["context"]["extrinsics"], "b t h w -> (b v t) h w", v=F) # No need to apply inverse as it is an eye matrix. 
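+    # The forward warping below is wrapped in `torch.cuda.amp.autocast(enabled=False)`, presumably to keep
+    # the pose and displacement computation in full float32 precision.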
+ c2w_trgt = rearrange(torch.inverse(camera_flow_generator_input["target"]["extrinsics"]), "b t h w -> (b t) h w") + intrinsics_ctxt = unnormalize_intrinsic(repeat(camera_flow_generator_input["context"]["intrinsics"], "b t h w -> (b v t) h w", v=F), size=(H, W)) + + with torch.cuda.amp.autocast(enabled=False): + warped_object_flow = camera_flow_generator.depth_warping_module.warper.forward_warp_displacement( + depth1=repeat(depth_ctxt, "b c h w -> (b f) c h w", f=F), + flow1=object_flow, + transformation1=c2w_ctxt, + transformation2=c2w_trgt, + intrinsic1=intrinsics_ctxt, + intrinsic2=None, + ) + + integrated_flow = camera_flow + warped_object_flow + + return integrated_flow + +def save_flow(flow, filename, fps=16): + # flow: (BF)CHW, arbitrary scale + flow_RGB = flow_to_color(flow) # BF,C,H,W (B=1) + + frame_list = [] + for frame in flow_RGB: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + export_to_video(frame_list, filename, fps=fps) + +def save_flow_warped_video(image, flow, filename, fps=16): + # image: CHW, 0~255 scale + # flow: (BF)CHW, arbitrary scale + warped_video = forward_bilinear_splatting(repeat(image, 'c h w -> f c h w', f=flow.size(0)), flow.to(torch.float)) + + frame_list = [] + for frame in warped_video: + frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).astype(np.uint8).clip(0,255) + frame_list.append(Image.fromarray(frame)) + + export_to_video(frame_list, filename, fps=fps) + +def generate_video( + prompt: str, + fvsm_path: str, + omsm_path: str, + num_frames: int = 81, + width: Optional[int] = None, + height: Optional[int] = None, + output_path: str = "./output.mp4", + image_path: str = "", + num_inference_steps: int = 50, + guidance_scale: float = 6.0, + num_videos_per_prompt: int = 1, + dtype: torch.dtype = torch.bfloat16, + seed: int = 42, + fps: int = 16, + controlnet_guidance_end: float = 0.4, + use_dynamic_cfg: bool = False, + pose_type: str = "manual", + speed: float = 0.5, + use_flow_integration: bool = False, + cam_pose_name: str = None, + depth_ckpt_path: str = "./ckpt/others/depth_anything_v2_metric_hypersim_vitb.pth", +): + """ + Generates a video based on the given prompt and saves it to the specified path. + + Parameters: + - prompt (str): The description of the video to be generated. + - lora_path (str): The path of the LoRA weights to be used. + - lora_rank (int): The rank of the LoRA weights. + - output_path (str): The path where the generated video will be saved. + - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality. + - num_frames (int): Number of frames to generate. CogVideoX1.0 generates 49 frames for 6 seconds at 8 fps, while CogVideoX1.5 produces either 81 or 161 frames, corresponding to 5 seconds or 10 seconds at 16 fps. + - width (int): The width of the generated video, applicable only for CogVideoX1.5-5B-I2V + - height (int): The height of the generated video, applicable only for CogVideoX1.5-5B-I2V + - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt. + - num_videos_per_prompt (int): Number of videos to generate per prompt. + - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). + - generate_type (str): The type of video generation (e.g., 't2v', 'i2v', 'v2v').· + - seed (int): The seed for reproducibility. + - fps (int): The frames per second for the generated video. 
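+    - controlnet_guidance_end (float): Fraction of the denoising schedule (from 0.0) during which the FVSM controlnet conditioning is applied.
+    - pose_type (str): Source of camera poses, either 're10k' or 'manual'.
+    - cam_pose_name (str): Name of the camera trajectory to use, passed to `SampleManualCam.sample`.
+    - speed (float): Camera-motion speed factor passed to the camera-flow generator input.
+    - use_flow_integration (bool): If True, fuse the 3D-warped camera flow with the OMSM object flow before conditioning FVSM.
+    - depth_ckpt_path (str): Path to the Depth Anything V2 metric-depth checkpoint used by the camera-flow generator.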
+ """ + + local_rank = 'cuda' + + torch.manual_seed(seed) + random.seed(seed) + + os.makedirs(os.path.join(output_path, 'generated_videos'), exist_ok=True) + + # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). + # add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload() + # function to use Multi GPUs. + + image = None + video = None + + model_name = "cogvideox-5b-i2v".lower() + desired_resolution = RESOLUTION_MAP[model_name] + if width is None or height is None: + height, width = desired_resolution + logging.info(f"\033[1mUsing default resolution {desired_resolution} for {model_name}\033[0m") + elif (height, width) != desired_resolution: + if generate_type == "i2v": + # For i2v models, use user-defined width and height + logging.warning( + f"\033[1;31mThe width({width}) and height({height}) are not recommended for {model_name}. The best resolution is {desired_resolution}.\033[0m" + ) + + + + """ + # Prepare Pipeline + """ + transformer_lora_config = LoraConfig( + r=128, + lora_alpha=64, + init_lora_weights=True, + target_modules=["to_q", "to_k", "to_v", "to_out.0", "norm1.linear", "norm2.linear", "ff.net.2"], + ) + + print(f'Constructing pipeline') + pipe_omsm = load_cogvideox_flovd_OMSM_lora_pipeline(omsm_path, backbone_path="THUDM/CogVideoX-5b-I2V", transformer_lora_config=transformer_lora_config, device=local_rank, dtype=dtype) + pipe_fvsm = load_cogvideox_flovd_FVSM_controlnet_pipeline(fvsm_path, backbone_path="THUDM/CogVideoX-5b-I2V", device=local_rank, dtype=dtype) + print(f'Done loading pipeline') + + """ + # Prepare inputs + """ + image = load_image(image=image_path) + + assert pose_type in ['re10k', 'manual'], "Choose other pose_type between ['re10k', 'manual']" + if pose_type == 're10k': + root_path = "./assets/re10k_poses" + else: + root_path = "./assets/manual_poses" + + CameraSampler = SampleManualCam(pose_type=pose_type, root_path=root_path) + camera_flow_generator_target = 'finetune.modules.depth_warping.depth_warping.DepthWarping_wrapper' + camera_flow_generator = initialize_flow_generator(camera_flow_generator_target, ckpt_path=depth_ckpt_path).to(local_rank) + #-------------------------------------------------------------------------------------------------------- + + + # 2. Set Scheduler. + # Can be changed to `CogVideoXDPMScheduler` or `CogVideoXDDIMScheduler`. + # We recommend using `CogVideoXDDIMScheduler` for CogVideoX-2B. + # using `CogVideoXDPMScheduler` for CogVideoX-5B / CogVideoX-5B-I2V. + + pipe_fvsm.scheduler = CogVideoXDPMScheduler.from_config(pipe_fvsm.scheduler.config, timestep_spacing="trailing") + pipe_omsm.scheduler = CogVideoXDPMScheduler.from_config(pipe_omsm.scheduler.config, timestep_spacing="trailing") + + # 3. Enable CPU offload for the model. + # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference + # and enable to("cuda") + # pipe.to("cuda") + + # pipe_fvsm.enable_model_cpu_offload() + # pipe_omsm.enable_model_cpu_offload() + # pipe_fvsm.enable_sequential_cpu_offload() + # pipe_omsm.enable_sequential_cpu_offload() + + pipe_fvsm.vae.enable_slicing() + pipe_fvsm.vae.enable_tiling() + pipe_omsm.vae.enable_slicing() + pipe_omsm.vae.enable_tiling() + + + output_video_path = os.path.join(output_path, 'generated_videos') + + """ + # Inference time + """ + image = rearrange((torch.tensor(np.array(image)).to(torch.float) / 255. * 2. 
- 1.).unsqueeze(0), 'b h w c -> b c h w') + image = image.to(local_rank) + prompt_short = prompt[:30].strip().replace(" ", "_") + + # Get Camera flow + camparam, cam_name = CameraSampler.sample(name=cam_pose_name) # W2C + image_torch_255 = ((image.detach().clone()+1)/2. * 255.).squeeze(0) + camera_flow_generator_input = get_camera_flow_generator_input(image_torch_255, camparam, device=local_rank, speed=speed) + image_torch = ((image_torch_255.unsqueeze(0) / 255.) * 2. - 1.).to(local_rank) + + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=True, dtype=dtype): + + flow_latent = pipe_omsm( + num_frames=num_frames, + height=height, + width=width, + prompt=prompt, + image=image, + generator=torch.Generator().manual_seed(seed), + num_inference_steps=num_inference_steps, + use_dynamic_cfg=use_dynamic_cfg, + output_type='latent' + ).frames[0] + object_flow = decode_flow(flow_latent.detach().clone().unsqueeze(0).to(local_rank), pipe_omsm.vae, flow_scale_factor=[60, 36]) # BF,C,H,W + + if use_flow_integration: + # Integrate camera (from 3D warping) and object (from OMSM) flow maps + # Using segmentation model will be implemented later.. + + camera_flow, log_dict = camera_flow_generator(image_torch, camera_flow_generator_input) + camera_flow = camera_flow.to(local_rank, dtype) + + integrated_flow = integrate_flow(camera_flow, object_flow, log_dict['depth_ctxt'], camera_flow_generator, camera_flow_generator_input) + integrated_flow_latent = rearrange(encode_flow(integrated_flow, pipe_omsm.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(local_rank, dtype) + else: + integrated_flow_latent = rearrange(flow_latent, '(b f) c h w -> b f c h w', b=image.size(0)) + + # 4. Generate the video frames based on the prompt. + # `num_frames` is the Number of frames to generate. 
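+            # FloVD runs in two stages: the OMSM pipeline above synthesizes an object-motion flow latent from
+            # the first frame and prompt, and the FVSM controlnet pipeline below generates the video conditioned
+            # on that flow latent (or on the camera/object integrated flow when --use_flow_integration is set).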
+ video_generate = pipe_fvsm( + num_frames=num_frames, + height=height, + width=width, + prompt=prompt, + image=image, + flow_latent=integrated_flow_latent, + valid_mask=None, + generator=torch.Generator().manual_seed(seed), + num_inference_steps=num_inference_steps, + controlnet_guidance_start = 0.0, + controlnet_guidance_end = controlnet_guidance_end, + use_dynamic_cfg=use_dynamic_cfg, + ).frames[0] + + # Save Result + save_path = os.path.join(output_video_path, f"{prompt_short}_{cam_name}.mp4") + export_to_video(video_generate, save_path, fps=fps) + + + +#-------------------------------------------------------------------------------------------------- +def encode_video(video: torch.Tensor, vae) -> torch.Tensor: + # shape of input video: [B, C, F, H, W] + video = video.to(vae.device, dtype=vae.dtype) + latent_dist = vae.encode(video).latent_dist + latent = latent_dist.sample() * vae.config.scaling_factor + return latent + +def encode_flow(flow, vae, flow_scale_factor): + # flow: BF,C,H,W + # flow_scale_factor [sf_x, sf_y] + assert flow.ndim == 4 + num_frames, _, height, width = flow.shape + + # Normalize optical flow + # ndim: 4 -> 5 + flow = rearrange(flow, '(b f) c h w -> b f c h w', b=1) + flow_norm = adaptive_normalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + # ndim: 5 -> 4 + flow_norm = rearrange(flow_norm, 'b f c h w -> (b f) c h w', b=1) + + # Duplicate mean value for third channel + num_frames, _, H, W = flow_norm.shape + flow_norm_extended = torch.empty((num_frames, 3, height, width)).to(flow_norm) + flow_norm_extended[:,:2] = flow_norm + flow_norm_extended[:,-1:] = flow_norm.mean(dim=1, keepdim=True) + flow_norm_extended = rearrange(flow_norm_extended, '(b f) c h w -> b c f h w', f=num_frames) + + return encode_video(flow_norm_extended, vae) + + +def decode_flow(flow_latent, vae, flow_scale_factor): + flow_latent = flow_latent.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + flow_latent = 1 / vae.config.scaling_factor * flow_latent + + flow = vae.decode(flow_latent).sample # BCFHW + + # discard third channel (which is a mean value of f_x and f_y) + flow = flow[:,:2].detach().clone() + + # Unnormalize optical flow + flow = rearrange(flow, 'b c f h w -> b f c h w') + flow = adaptive_unnormalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + flow = rearrange(flow, 'b f c h w -> (b f) c h w') + return flow # BF,C,H,W + +def adaptive_normalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + b, f, c, h, w = flow.shape + + max_clip_x = math.sqrt(w/sf_x) * 1.0 + max_clip_y = math.sqrt(h/sf_y) * 1.0 + + flow_norm = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_x_norm = torch.sign(flow_x) * torch.sqrt(torch.abs(flow_x)/sf_x + 1e-7) + flow_y_norm = torch.sign(flow_y) * torch.sqrt(torch.abs(flow_y)/sf_y + 1e-7) + + flow_norm[:, :, 0] = torch.clamp(flow_x_norm, min=-max_clip_x, max=max_clip_x) + flow_norm[:, :, 1] = torch.clamp(flow_y_norm, min=-max_clip_y, max=max_clip_y) + + return flow_norm + + +def adaptive_unnormalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + + flow_orig = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_orig[:, :, 0] = 
torch.sign(flow_x) * sf_x * (flow_x**2 - 1e-7) + flow_orig[:, :, 1] = torch.sign(flow_y) * sf_y * (flow_y**2 - 1e-7) + + return flow_orig + +#-------------------------------------------------------------------------------------------------- + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX") + parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated") + parser.add_argument("--image_path", type=str, default=None, help="The path of the image to be used as the first frame of the video",) + parser.add_argument("--fvsm_path", type=str, required=True, help="Path of the pre-trained FVSM checkpoint to use") + parser.add_argument("--omsm_path", type=str, required=True, help="Path of the pre-trained OMSM checkpoint directory to use") + parser.add_argument("--output_path", type=str, default="./output.mp4", help="The path where the generated video will be saved") + parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") + parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of steps for the inference process") + parser.add_argument("--num_frames", type=int, default=49, help="Number of frames to generate") + parser.add_argument("--width", type=int, default=None, help="The width of the generated video") + parser.add_argument("--height", type=int, default=None, help="The height of the generated video") + parser.add_argument("--fps", type=int, default=16, help="The frames per second for the generated video") + parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") + parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation") + parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") + parser.add_argument("--controlnet_guidance_end", type=float, default=0.4, help="Controlnet guidance end during sampling") + parser.add_argument("--use_dynamic_cfg", action='store_true') + parser.add_argument("--pose_type", type=str, default='manual', help="Camera pose type at inference time ('re10k' or 'manual')") + parser.add_argument("--speed", type=float, default=0.5, help="Camera motion speed used when generating the camera flow") + parser.add_argument("--use_flow_integration", action='store_true') + parser.add_argument("--cam_pose_name", type=str, required=False, default=None, help="Camera trajectory name") + parser.add_argument("--depth_ckpt_path", type=str, required=False, default="./ckpt/others/depth_anything_v2_metric_hypersim_vitb.pth", help="Path to the depth estimator checkpoint") + + args = parser.parse_args() + dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 + + + generate_video( + prompt=args.prompt, + fvsm_path=args.fvsm_path, + omsm_path=args.omsm_path, + output_path=args.output_path, + num_frames=args.num_frames, + width=args.width, + height=args.height, + image_path=args.image_path, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + num_videos_per_prompt=args.num_videos_per_prompt, + dtype=dtype, + seed=args.seed, + fps=args.fps, + controlnet_guidance_end=args.controlnet_guidance_end, + use_dynamic_cfg=args.use_dynamic_cfg, + pose_type=args.pose_type, + speed=args.speed, + use_flow_integration=args.use_flow_integration, + cam_pose_name=args.cam_pose_name, + depth_ckpt_path=args.depth_ckpt_path + ) diff --git a/inference/flovd_fvsm_ddp_demo.py b/inference/flovd_fvsm_ddp_demo.py new file mode 100644 index 
0000000000000000000000000000000000000000..12153d9cee6255834e93c295e0d9a42d78b7d51b --- /dev/null +++ b/inference/flovd_fvsm_ddp_demo.py @@ -0,0 +1,640 @@ +""" +This script runs FloVD FVSM (camera-flow ControlNet) inference on top of the CogVideoX-5B-I2V backbone with the Hugging Face `diffusers` pipeline, distributed over multiple GPUs with DDP. +For every dataset sample it loads a first frame, a precomputed prompt embedding and latents, samples a camera trajectory, converts the trajectory into an optical-flow latent via depth warping, and generates a video conditioned on that flow. + +Running the Script: +To run the script, use the following command with appropriate arguments: + +```bash +$ torchrun --nproc_per_node <num_gpus> inference/flovd_fvsm_ddp_demo.py --data_root <path/to/data_root> --model_path <path/to/FVSM_checkpoint.pt> --output_path ./results +``` + +The `pipe.enable_model_cpu_offload()` / `pipe.enable_sequential_cpu_offload()` calls are commented out below; uncomment one of them to trade inference speed for lower GPU memory usage. + +Additional options are available to specify the guidance scale, number of inference steps, camera pose type, and output paths. + +""" +from typing import TYPE_CHECKING, Any, Dict, List, Tuple +import argparse +import logging +import os +import sys +import subprocess # used by the slurm branch of init_dist +from typing import Literal, Optional +from pathlib import Path +import json +from datetime import timedelta +import random +from safetensors.torch import load_file, save_file +from tqdm import tqdm +from einops import rearrange, repeat +import math + +import torch + +from diffusers import ( + CogVideoXDPMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXPipeline, + CogVideoXVideoToVideoPipeline, + AutoencoderKLCogVideoX +) +from diffusers.utils import export_to_video, load_image, load_video + +sys.path.append(os.path.abspath(os.path.join(sys.path[0], "../"))) +from finetune.pipeline.flovd_FVSM_cogvideox_controlnet_pipeline import FloVDCogVideoXControlnetImageToVideoPipeline +from finetune.schemas import Components, Args +from finetune.modules.cogvideox_controlnet import CogVideoXControlnet +from finetune.modules.cogvideox_custom_model import CustomCogVideoXTransformer3DModel +from transformers import AutoTokenizer, T5EncoderModel + +from finetune.modules.camera_sampler import SampleManualCam +from finetune.modules.camera_flow_generator import CameraFlowGenerator +from finetune.modules.utils import get_camera_flow_generator_input, forward_bilinear_splatting + +from finetune.datasets.utils import ( + preprocess_image_with_resize, + preprocess_video_with_resize, +) + + +from torch.utils.data import Dataset +from torchvision import transforms + +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler + +import pdb +sys.path.append(os.path.abspath(os.path.join(sys.path[-1], 'finetune'))) # for camera flow generator + + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +logging.basicConfig(level=logging.INFO) + +# Recommended resolution for each model (width, height) +RESOLUTION_MAP = { + # cogvideox1.5-* + "cogvideox1.5-5b-i2v": (768, 1360), + "cogvideox1.5-5b": (768, 1360), + # cogvideox-* + "cogvideox-5b-i2v": (480, 720), + "cogvideox-5b": (480, 720), + "cogvideox-2b": (480, 720), +} + + + + +def init_dist(launcher="slurm", backend='nccl', port=29500, **kwargs): + """Initializes distributed environment.""" + if launcher == 'pytorch': + rank = int(os.environ['RANK']) + num_gpus = torch.cuda.device_count() + local_rank = rank 
% num_gpus + torch.cuda.set_device(local_rank) + dist.init_process_group(backend=backend, timeout=timedelta(minutes=30), **kwargs) + + elif launcher == 'slurm': + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + num_gpus = torch.cuda.device_count() + local_rank = proc_id % num_gpus + torch.cuda.set_device(local_rank) + addr = subprocess.getoutput( + f'scontrol show hostname {node_list} | head -n1') + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['RANK'] = str(proc_id) + port = os.environ.get('PORT', port) + os.environ['MASTER_PORT'] = str(port) + dist.init_process_group(backend=backend, timeout=timedelta(minutes=30)) + + else: + raise NotImplementedError(f'Not implemented launcher type: `{launcher}`!') + # https://github.com/pytorch/pytorch/issues/98763 + # torch.cuda.set_device(local_rank) + + return local_rank + + +def load_cogvideox_flovd_FVSM_controlnet_pipeline(controlnet_path, backbone_path, device, dtype): + controlnet_sd = torch.load(controlnet_path)['module'] + + tokenizer = AutoTokenizer.from_pretrained(backbone_path, subfolder="tokenizer") + text_encoder = T5EncoderModel.from_pretrained(backbone_path, subfolder="text_encoder") + transformer = CustomCogVideoXTransformer3DModel.from_pretrained(backbone_path, subfolder="transformer") + vae = AutoencoderKLCogVideoX.from_pretrained(backbone_path, subfolder="vae") + scheduler = CogVideoXDPMScheduler.from_pretrained(backbone_path, subfolder="scheduler") + + additional_kwargs = { + 'num_layers': 6, + 'out_proj_dim_factor': 64, + 'out_proj_dim_zero_init': True, + 'notextinflow': True, + } + controlnet = CogVideoXControlnet.from_pretrained(backbone_path, subfolder="transformer", **additional_kwargs) + controlnet.eval() + + missing, unexpected = controlnet.load_state_dict(controlnet_sd) + + if len(missing) != 0 or len(unexpected) != 0: + print(f"Missing keys : {missing}") + print(f"Unexpected keys : {unexpected}") + + pipe = FloVDCogVideoXControlnetImageToVideoPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + transformer=transformer, + controlnet=controlnet, + scheduler=scheduler, + ) + + # pipe.enable_model_cpu_offload(device=device) + pipe = pipe.to(device, dtype) + + return pipe + +class I2VFlowDataset_Inference(Dataset): + def __init__( + self, + max_num_frames: int, + height: int, + width: int, + data_root: str, + max_num_videos: int = None, + ) -> None: + + self.train_resolution = (int(max_num_frames), int(height), int(width)) + + data_root = Path(data_root) + metadata_path = data_root / "metadata_revised.jsonl" + assert metadata_path.is_file(), "For this dataset type, you need metadata.jsonl in the root path" + + metadata = [] + with open(metadata_path, "r") as f: + for line in f: + metadata.append( json.loads(line) ) + + metadata = random.sample(metadata, max_num_videos) + + self.prompts = [x["prompt"] for x in metadata] + self.prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata] + self.videos = [data_root / "video_latent" / "x".join(str(x) for x in self.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata] + self.images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata] + self.flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata] + self.masks = [data_root / "valid_mask" / (x["hash_code"] + '.bin') for x in metadata] + + 
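For reference, the path expressions in `I2VFlowDataset_Inference.__init__` above imply a `data_root` laid out as sketched below. This is only a reconstruction from those expressions; the hash code and the `49x480x720` folder name are illustrative and depend on how the dataset was preprocessed:

```python
from pathlib import Path

data_root = Path("/path/to/data_root")   # illustrative
hash_code = "0123456789abcdef"           # illustrative "hash_code" value from metadata_revised.jsonl
train_resolution = (49, 480, 720)        # (frames, height, width), i.e. the requested num_frames/height/width

expected_files = [
    data_root / "metadata_revised.jsonl",                                  # one JSON object per line with "prompt" and "hash_code"
    data_root / "prompt_embeddings_revised" / f"{hash_code}.safetensors",  # key "prompt_embedding"
    data_root / "video_latent" / "x".join(map(str, train_resolution)) / f"{hash_code}.safetensors",  # key "encoded_video"
    data_root / "first_frames" / f"{hash_code}.png",
    data_root / "flow_direct_f_latent" / f"{hash_code}.safetensors",       # key "encoded_flow_f"
    data_root / "valid_mask" / f"{hash_code}.bin",
]
for path in expected_files:
    print(path, path.exists())
```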
self.max_num_frames = max_num_frames + self.height = height + self.width = width + + self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)]) + self.__image_transforms = self.__frame_transforms + + self.length = len(self.videos) + + print(f"Dataset size: {self.length}") + + def __len__(self) -> int: + return self.length + + def load_data_pair(self, index): + prompt_embedding_path = self.prompt_embeddings[index] + encoded_video_path = self.videos[index] + encoded_flow_path = self.flows[index] + + prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"] + encoded_video = load_file(encoded_video_path)["encoded_video"] # CFHW + encoded_flow = load_file(encoded_flow_path)["encoded_flow_f"] # CFHW + + return prompt_embedding, encoded_video, encoded_flow + + def __getitem__(self, index: int) -> Dict[str, Any]: + while True: + try: + prompt_embedding, encoded_video, encoded_flow = self.load_data_pair(index) + break + except Exception as e: + print(f"Error loading {self.prompt_embeddings[index]}: {str(e)}") + index = random.randint(0, self.length - 1) + + image_path = self.images[index] + prompt = self.prompts[index] + + _, image = self.preprocess(None, image_path) + image = self.image_transform(image) + + + # shape of encoded_video: [C, F, H, W] + # shape and scale of image: [C, H, W], [-1,1] + return { + "image": image, + "prompt": prompt, + "prompt_embedding": prompt_embedding, + "encoded_video": encoded_video, + "encoded_flow": encoded_flow, + "video_metadata": { + "num_frames": encoded_video.shape[1], + "height": encoded_video.shape[2], + "width": encoded_video.shape[3], + }, + } + + def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]: + if video_path is not None: + video = preprocess_video_with_resize(video_path, self.max_num_frames, self.height, self.width) + else: + video = None + if image_path is not None: + image = preprocess_image_with_resize(image_path, self.height, self.width) + else: + image = None + return video, image + + def video_transform(self, frames: torch.Tensor) -> torch.Tensor: + return torch.stack([self.__frame_transforms(f) for f in frames], dim=0) + + def image_transform(self, image: torch.Tensor) -> torch.Tensor: + return self.__image_transforms(image) + +def initialize_flow_generator(target): + depth_estimator_kwargs = { + "target": target, + "kwargs": { + "ckpt_path": '/workspace/workspace/checkpoints/depth_anything/depth_anything_v2_metric_hypersim_vitb.pth', + "model_config": { + "max_depth": 20, + "encoder": 'vitb', + "features": 128, + "out_channels": [96, 192, 384, 768], + } + + } + } + + return CameraFlowGenerator(depth_estimator_kwargs) + +def generate_video( + # prompt: str, + launcher: str, + port: int, + data_root: str, + model_path: str, + num_frames: int = 81, + width: Optional[int] = None, + height: Optional[int] = None, + output_path: str = "./output.mp4", + image_path: str = "", + num_inference_steps: int = 50, + guidance_scale: float = 6.0, + num_videos_per_prompt: int = 1, + dtype: torch.dtype = torch.bfloat16, + seed: int = 42, + fps: int = 16, + controlnet_guidance_end: float = 0.4, + max_num_videos: int = None, + use_dynamic_cfg: bool = False, + pose_type: str = "manual", + speed: float = 0.5, +): + """ + Generates a video based on the given prompt and saves it to the specified path. + + Parameters: + - prompt (str): The description of the video to be generated. + - model_path (str): The path of the pre-trained model to be used. 
+ - output_path (str): The path where the generated video will be saved. + - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality. + - num_frames (int): Number of frames to generate. CogVideoX1.0 generates 49 frames for 6 seconds at 8 fps, while CogVideoX1.5 produces either 81 or 161 frames, corresponding to 5 seconds or 10 seconds at 16 fps. + - width (int): The width of the generated video, applicable only for CogVideoX1.5-5B-I2V + - height (int): The height of the generated video, applicable only for CogVideoX1.5-5B-I2V + - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt. + - num_videos_per_prompt (int): Number of videos to generate per prompt. + - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). + - seed (int): The seed for reproducibility. + - fps (int): The frames per second for the generated video. + """ + + # Distributed + local_rank = init_dist(launcher=launcher, port=port) + global_rank = dist.get_rank() + num_processes = dist.get_world_size() + is_main_process = global_rank == 0 + + torch.manual_seed(seed) + random.seed(seed) + + if is_main_process: + os.makedirs(os.path.join(output_path, 'generated_videos'), exist_ok=True) + + # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). + # add device_map="balanced" in the from_pretrained function and remove the enable_model_cpu_offload() + # function to use Multi GPUs. + + image = None + video = None + + model_name = "cogvideox-5b-i2v".lower() + desired_resolution = RESOLUTION_MAP[model_name] + if width is None or height is None: + height, width = desired_resolution + logging.info(f"\033[1mUsing default resolution {desired_resolution} for {model_name}\033[0m") + elif (height, width) != desired_resolution: + # This is an I2V pipeline, so keep the user-defined width and height but warn about the recommended resolution. + logging.warning( + f"\033[1;31mThe width({width}) and height({height}) are not recommended for {model_name}. The best resolution is {desired_resolution}.\033[0m" + ) + + """ + # Prepare Dataset Class.. 
+ """ + # image = load_image(image=image_or_video_path) + + # prompt + # first image + # camera parameters + dataset = I2VFlowDataset_Inference( + max_num_frames=num_frames, + height=height, + width=width, + data_root=data_root, + max_num_videos=max_num_videos, + ) + + + distributed_sampler = DistributedSampler( + dataset, + num_replicas=num_processes, + rank=global_rank, + shuffle=False, + seed=seed, + ) + + # DataLoaders creation: + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=1, + shuffle=False, + sampler=distributed_sampler, + num_workers=4, + pin_memory=True, + drop_last=False, + ) + + + """ + # Prepare Pipeline + """ + print(f'Constructing pipeline') + pipe = load_cogvideox_flovd_FVSM_controlnet_pipeline(model_path, backbone_path="THUDM/CogVideoX-5b-I2V", device=local_rank, dtype=dtype) + + assert pose_type in ['re10k', 'manual'], "Choose other pose_type between ['re10k', 'manual']" + if pose_type == 're10k': + root_path = "./manual_poses_re10k" + else: + root_path = "./manual_poses" + + CameraSampler = SampleManualCam(pose_type=pose_type, root_path=root_path) + camera_flow_generator_target = 'finetune.modules.depth_warping.depth_warping.DepthWarping_wrapper' + camera_flow_generator = initialize_flow_generator(camera_flow_generator_target).to(local_rank) + #-------------------------------------------------------------------------------------------------------- + + + # 2. Set Scheduler. + # Can be changed to `CogVideoXDPMScheduler` or `CogVideoXDDIMScheduler`. + # We recommend using `CogVideoXDDIMScheduler` for CogVideoX-2B. + # using `CogVideoXDPMScheduler` for CogVideoX-5B / CogVideoX-5B-I2V. + + # pipe.scheduler = CogVideoXDDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + + # 3. Enable CPU offload for the model. + # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference + # and enable to("cuda") + # pipe.to("cuda") + + # pipe.enable_model_cpu_offload() + # pipe.enable_sequential_cpu_offload() + pipe.vae.enable_slicing() + pipe.vae.enable_tiling() + + dataloader.sampler.set_epoch(1) + dist.barrier() + + output_path = os.path.join(output_path, 'generated_videos') + + data_iter = iter(dataloader) + for step in tqdm(range(0, len(dataloader))): + batch = next(data_iter) + + prompt = batch["prompt"][0] + image = batch["image"].to(local_rank) + prompt_embedding = batch["prompt_embedding"].to(local_rank) + prompt_short = prompt[:20].strip() + + # if step < 10: + # step += 1 + # continue + + # Get Camera flow + camparam, cam_name = CameraSampler.sample() + image_torch = ((image.detach().clone()+1)/2. * 255.).squeeze(0) + camera_flow_generator_input = get_camera_flow_generator_input(image_torch, camparam, device=local_rank, speed=speed) + image_torch = ((image_torch.unsqueeze(0) / 255.) * 2. - 1.).to(local_rank) + + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=True, dtype=dtype): + camera_flow, log_dict = camera_flow_generator(image_torch, camera_flow_generator_input) + camera_flow = camera_flow.to(local_rank, dtype) + + camera_flow_latent = rearrange(encode_flow(camera_flow, pipe.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(local_rank, dtype) + + # 4. Generate the video frames based on the prompt. + # `num_frames` is the Number of frames to generate. 
+ video_generate = pipe( + num_frames=num_frames, + height=height, + width=width, + prompt=None, + prompt_embeds=prompt_embedding, + image=image, + flow_latent=camera_flow_latent, + valid_mask=None, + generator=torch.Generator().manual_seed(seed), + num_inference_steps=50, + controlnet_guidance_start = 0.0, + controlnet_guidance_end = controlnet_guidance_end, + use_dynamic_cfg=use_dynamic_cfg, + ).frames[0] + + + save_path = os.path.join(output_path, f"{prompt_short}_DCFG-{use_dynamic_cfg}_ContGuide-{controlnet_guidance_end}_{cam_name}.mp4") + export_to_video(video_generate, save_path, fps=fps) + + dist.barrier() + + step += 1 + + +#-------------------------------------------------------------------------------------------------- +def encode_video(video: torch.Tensor, vae) -> torch.Tensor: + # shape of input video: [B, C, F, H, W] + video = video.to(vae.device, dtype=vae.dtype) + latent_dist = vae.encode(video).latent_dist + latent = latent_dist.sample() * vae.config.scaling_factor + return latent + +def encode_flow(flow, vae, flow_scale_factor): + # flow: BF,C,H,W + # flow_scale_factor [sf_x, sf_y] + assert flow.ndim == 4 + num_frames, _, height, width = flow.shape + + # Normalize optical flow + # ndim: 4 -> 5 + flow = rearrange(flow, '(b f) c h w -> b f c h w', b=1) + flow_norm = adaptive_normalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + # ndim: 5 -> 4 + flow_norm = rearrange(flow_norm, 'b f c h w -> (b f) c h w', b=1) + + # Duplicate mean value for third channel + num_frames, _, H, W = flow_norm.shape + flow_norm_extended = torch.empty((num_frames, 3, height, width)).to(flow_norm) + flow_norm_extended[:,:2] = flow_norm + flow_norm_extended[:,-1:] = flow_norm.mean(dim=1, keepdim=True) + flow_norm_extended = rearrange(flow_norm_extended, '(b f) c h w -> b c f h w', f=num_frames) + + return encode_video(flow_norm_extended, vae) + + +def decode_flow(flow_latent, vae, flow_scale_factor): + flow_latent = flow_latent.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + flow_latent = 1 / vae.config.scaling_factor * flow_latent + + flow = vae.decode(flow_latent).sample # BCFHW + + # discard third channel (which is a mean value of f_x and f_y) + flow = flow[:,:2].detach().clone() + + # Unnormalize optical flow + flow = rearrange(flow, 'b c f h w -> b f c h w') + flow = adaptive_unnormalize(flow, flow_scale_factor[0], flow_scale_factor[1]) + + flow = rearrange(flow, 'b f c h w -> (b f) c h w') + return flow # BF,C,H,W + +def adaptive_normalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + b, f, c, h, w = flow.shape + + max_clip_x = math.sqrt(w/sf_x) * 1.0 + max_clip_y = math.sqrt(h/sf_y) * 1.0 + + flow_norm = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + flow_y = flow[:, :, 1].detach().clone() + + flow_x_norm = torch.sign(flow_x) * torch.sqrt(torch.abs(flow_x)/sf_x + 1e-7) + flow_y_norm = torch.sign(flow_y) * torch.sqrt(torch.abs(flow_y)/sf_y + 1e-7) + + flow_norm[:, :, 0] = torch.clamp(flow_x_norm, min=-max_clip_x, max=max_clip_x) + flow_norm[:, :, 1] = torch.clamp(flow_y_norm, min=-max_clip_y, max=max_clip_y) + + return flow_norm + + +def adaptive_unnormalize(flow, sf_x, sf_y): + # x: BFCHW, optical flow + assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)' + assert sf_x is not None and sf_y is not None + + flow_orig = flow.detach().clone() + flow_x = flow[:, :, 0].detach().clone() + 
flow_y = flow[:, :, 1].detach().clone() + + flow_orig[:, :, 0] = torch.sign(flow_x) * sf_x * (flow_x**2 - 1e-7) + flow_orig[:, :, 1] = torch.sign(flow_y) * sf_y * (flow_y**2 - 1e-7) + + return flow_orig + +#-------------------------------------------------------------------------------------------------- + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX") + # parser.add_argument("--prompt", type=str, required=True, help="The description of the video to be generated") + parser.add_argument("--image_path", type=str, default=None, help="The path of the image to be used as the background of the video",) + parser.add_argument("--data_root", type=str, required=True, help="The path of the dataset root",) + parser.add_argument("--model_path", type=str, default="THUDM/CogVideoX1.5-5B", help="Path of the pre-trained FVSM controlnet checkpoint to use") + parser.add_argument("--output_path", type=str, default="./output.mp4", help="The directory where the generated videos will be saved") + parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") + parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of steps for the inference process") + parser.add_argument("--num_frames", type=int, default=49, help="Number of frames to generate") + parser.add_argument("--width", type=int, default=None, help="The width of the generated video") + parser.add_argument("--height", type=int, default=None, help="The height of the generated video") + parser.add_argument("--fps", type=int, default=16, help="The frames per second for the generated video") + parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") + parser.add_argument("--dtype", type=str, default="bfloat16", help="The data type for computation") + parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") + parser.add_argument("--controlnet_guidance_end", type=float, default=0.4, help="Controlnet guidance end during sampling") + parser.add_argument("--max_num_videos", type=int, default=None, help="Number of videos to sample from the dataset for inference") + parser.add_argument("--use_dynamic_cfg", action='store_true') + parser.add_argument("--pose_type", type=str, default='manual', help="Camera pose type at inference time ('re10k' or 'manual')") + parser.add_argument("--speed", type=float, default=0.5, help="Camera motion speed used when generating the camera flow") + + + # DDP args + parser.add_argument("--launcher", type=str, choices=["pytorch", "slurm"], default="pytorch") + parser.add_argument("--world_size", default=1, type=int, + help="number of the distributed processes.") + parser.add_argument('--local-rank', type=int, default=-1, + help='Replica rank on the current node. This field is required ' + 'by `torch.distributed.launch`.') + parser.add_argument("--global_seed", default=42, type=int, + help="seed") + parser.add_argument("--port", type=int) + parser.add_argument("--local_rank", type=int, help="Local rank. 
Necessary for using the torch.distributed.launch utility.") + + + args = parser.parse_args() + dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 + + + generate_video( + # prompt=args.prompt, + launcher=args.launcher, + port=args.port, + data_root=args.data_root, + model_path=args.model_path, + output_path=args.output_path, + num_frames=args.num_frames, + width=args.width, + height=args.height, + image_path=args.image_path, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + num_videos_per_prompt=args.num_videos_per_prompt, + dtype=dtype, + seed=args.seed, + fps=args.fps, + controlnet_guidance_end=args.controlnet_guidance_end, + max_num_videos=args.max_num_videos, + use_dynamic_cfg=args.use_dynamic_cfg, + pose_type=args.pose_type, + speed=args.speed, + ) diff --git a/inference/inference_scripts/flovd_demo.sh b/inference/inference_scripts/flovd_demo.sh new file mode 100644 index 0000000000000000000000000000000000000000..0b222613410540c1e63d04ef013a54ed1c6376d4 --- /dev/null +++ b/inference/inference_scripts/flovd_demo.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +FVSM_PATH="path/to/FVSM/blahblah.pt" # e.g., ./ckpt/FVSM/FloVD_FVSM_Controlnet.pt +OMSM_PATH="path/to/OMSM/directory" # e.g., ./ckpt/OMMSM/ +DEPTH_CKPT_PATH="path/to/depth_estimator.pth" # e.g., './ckpt/others/depth_anything_v2_metric_hypersim_vitb.pth' + +POSE_TYPE="re10k" # Choose between ['re10k', 'manual'] +OUTPUT_PATH="./results/" +CONTROLNET_GUIDANCE_END=0.4 +SPEED=1.0 + +PROMPT="A chef in a white coat and glasses preparing a meal in a professional kitchen. He is seen standing at a counter, surrounded by various ingredients and cookware, including bowls, plates, and utensils. The chef is focused on his task, carefully selecting and arranging the ingredients on the counter. The kitchen is well-equipped with stainless steel appliances, such as a refrigerator, oven, and microwave, and features a ventilation system to ensure a clean and efficient environment. The chef's meticulous preparation process is captured as he carefully selects and organizes the ingredients, indicating a high level of culinary skill and attention to detail." +IMAGE_PATH="./assets/example_image/input1.png" +CAM_POSE_NAME="1593596b99e2dde9.txt" + +python inference/flovd_demo.py \ + --prompt "$PROMPT" \ + --image_path $IMAGE_PATH \ + --fvsm_path $FVSM_PATH \ + --omsm_path $OMSM_PATH \ + --output_path $OUTPUT_PATH \ + --controlnet_guidance_end $CONTROLNET_GUIDANCE_END \ + --use_flow_integration \ + --pose_type $POSE_TYPE \ + --speed $SPEED \ + --cam_pose_name $CAM_POSE_NAME \ + --depth_ckpt_path $DEPTH_CKPT_PATH \ + + +PROMPT="A stunning and untouched coastal landscape. It begins with a view of a rugged rock formation emerging from the sea, surrounded by churning waves. As the video progresses, the scene shifts to a solitary rock formation standing in the midst of the sea, with the waves crashing against the shore. The natural beauty of the coastline is highlighted by the absence of any human activity. The video then presents a panoramic view of the coastline, revealing the dynamic interplay between the sea and the land. The scene is characterized by the presence of a solitary rock formation, which stands out against the backdrop of the sea and the overcast sky. The video concludes with a view of the coastline, emphasizing the natural beauty and the absence of any human activity." 
+IMAGE_PATH="./assets/example_image/input2.png" +CAM_POSE_NAME="6b6d20c6a46b9fe9.txt" + +python inference/flovd_demo.py \ + --prompt "$PROMPT" \ + --image_path $IMAGE_PATH \ + --fvsm_path $FVSM_PATH \ + --omsm_path $OMSM_PATH \ + --output_path $OUTPUT_PATH \ + --controlnet_guidance_end $CONTROLNET_GUIDANCE_END \ + --use_flow_integration \ + --pose_type $POSE_TYPE \ + --speed $SPEED \ + --cam_pose_name $CAM_POSE_NAME \ + --depth_ckpt_path $DEPTH_CKPT_PATH \ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..09bc849926ce8a8868cf7853a4f58e9229176bfa --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,27 @@ +[tool.ruff] +line-length = 119 + +[tool.ruff.lint] +# Never enforce `E501` (line length violations). +ignore = ["C901", "E501", "E741", "F402", "F823"] +select = ["C", "E", "F", "I", "W"] + +# Ignore import violations in all `__init__.py` files. +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401", "F403", "F811"] + +[tool.ruff.lint.isort] +lines-after-imports = 2 + +[tool.ruff.format] +# Like Black, use double quotes for strings. +quote-style = "double" + +# Like Black, indent with spaces, rather than tabs. +indent-style = "space" + +# Like Black, respect magic trailing commas. +skip-magic-trailing-comma = false + +# Like Black, automatically detect the appropriate line ending. +line-ending = "auto" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..585dfabc88b1aadcfc40d8a33f154f2caa2917f5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,21 @@ +diffusers>=0.32.1 +accelerate>=1.1.1 +transformers>=4.46.2 +numpy==1.26.0 +torch==2.6.0 +torchvision>=0.20.0 +sentencepiece>=0.2.0 +SwissArmyTransformer>=0.4.12 +gradio>=5.5.0 +imageio>=2.35.1 +imageio-ffmpeg>=0.5.1 +openai>=1.54.0 +moviepy>=2.0.0 +scikit-video>=1.1.11 +pydantic>=2.10.3 +xformers==0.0.29.post3 +wandb +peft +opencv-python +decord +pyav \ No newline at end of file diff --git a/results/generated_videos/A_chef_in_a_white_coat_and_gla_1593596b99e2dde9.txt.mp4 b/results/generated_videos/A_chef_in_a_white_coat_and_gla_1593596b99e2dde9.txt.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..92e3a96175fe28b9a0b596860b36d9f0685dc9c0 --- /dev/null +++ b/results/generated_videos/A_chef_in_a_white_coat_and_gla_1593596b99e2dde9.txt.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f1e2d59f05e0fceab6dadfb6e74c8991c07799731b96f17f0a00e0da07014f2 +size 363832 diff --git a/results/generated_videos/A_stunning_and_untouched_coast_6b6d20c6a46b9fe9.txt.mp4 b/results/generated_videos/A_stunning_and_untouched_coast_6b6d20c6a46b9fe9.txt.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d8869f66546b5681bfe0be53a56ff8c4d8a3b582 --- /dev/null +++ b/results/generated_videos/A_stunning_and_untouched_coast_6b6d20c6a46b9fe9.txt.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:163d6f436ec896de6be1ae026242195708b97c99a4b3e2e4de41cde1abfb2dff +size 812892 diff --git a/tools/cam_visualization.sh b/tools/cam_visualization.sh new file mode 100644 index 0000000000000000000000000000000000000000..52484f72d320c8a6260904b9523272830e9017cf --- /dev/null +++ b/tools/cam_visualization.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +#--------------------------------------------------------------------------- +# RE10K + +TARGET=8135673a5a3e3d17 + +python tools/visualize_trajectory.py \ + --pose_file_path 
./assets/re10k_poses/${TARGET}.txt \ + --relative_c2w \ + --base_xval 0.05 \ + --zval 0.15 \ + --x_min -1.0 \ + --x_max 1.0 \ + --y_min -1.0 \ + --y_max 1.0 \ + --z_min -1.0 \ + --z_max 1.0 \ + + +#--------------------------------------------------------------------------- +# Manual poses + +TARGET=camera_U + +python tools/visualize_trajectory.py \ + --pose_file_path ./assets/manual_poses/${TARGET}.txt \ + --relative_c2w \ + --base_xval 0.05 \ + --zval 0.15 \ + --x_min -1.0 \ + --x_max 1.0 \ + --y_min -1.0 \ + --y_max 1.0 \ + --z_min -1.0 \ + --z_max 1.0 \ \ No newline at end of file diff --git a/tools/caption/README.md b/tools/caption/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e81691a653cac98af1f0e41356649e8ec74f97d0 --- /dev/null +++ b/tools/caption/README.md @@ -0,0 +1,67 @@ +# Video Caption + +Typically, most video data does not come with corresponding descriptive text, so it is necessary to convert the video +data into textual descriptions to provide the essential training data for text-to-video models. + +## Update and News +- 🔥🔥 **News**: ```2024/9/19```: The caption model used in the CogVideoX training process to convert video data into text + descriptions, [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption), is now open-source. Feel + free to download and use it. + + +## Video Caption via CogVLM2-Caption + +🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) + +CogVLM2-Caption is a video captioning model used to generate training data for the CogVideoX model. + +### Install +```shell +pip install -r requirements.txt +``` + +### Usage + +```shell +python video_caption.py +``` + +Example: +
+<img src="./assests/CogVLM2-Caption-example.png" alt="CogVLM2-Caption example output"> +
+ +## Video Caption via CogVLM2-Video + +[Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) + +CogVLM2-Video is a versatile video understanding model equipped with timestamp-based question answering capabilities. +Users can input prompts such as `Please describe this video in detail.` to the model to obtain a detailed video caption: +
+<img src="./assests/cogvlm2-video-example.png" alt="CogVLM2-Video example output"> +
+ +Users can use the provided [code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) to load the model or configure a RESTful API to generate video captions. + +## Citation + +🌟 If you find our work helpful, please leave us a star and cite our paper. + +CogVLM2-Caption: +``` +@article{yang2024cogvideox, + title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, + author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, + journal={arXiv preprint arXiv:2408.06072}, + year={2024} +} +``` +CogVLM2-Video: +``` +@article{hong2024cogvlm2, + title={CogVLM2: Visual Language Models for Image and Video Understanding}, + author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, + journal={arXiv preprint arXiv:2408.16500}, + year={2024} +} +``` \ No newline at end of file diff --git a/tools/caption/README_ja.md b/tools/caption/README_ja.md new file mode 100644 index 0000000000000000000000000000000000000000..25c6cce6181367200f3fa7c4616ab9c9a0f9b147 --- /dev/null +++ b/tools/caption/README_ja.md @@ -0,0 +1,65 @@ +# ビデオキャプション + +通常、ほとんどのビデオデータには対応する説明文が付いていないため、ビデオデータをテキストの説明に変換して、テキストからビデオへのモデルに必要なトレーニングデータを提供する必要があります。 + +## 更新とニュース +- 🔥🔥 **ニュース**: ```2024/9/19```:CogVideoX + のトレーニングプロセスで、ビデオデータをテキストに変換するためのキャプションモデル [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption) + がオープンソース化されました。ぜひダウンロードしてご利用ください。 +## CogVLM2-Captionによるビデオキャプション + +🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) + +CogVLM2-Captionは、CogVideoXモデルのトレーニングデータを生成するために使用されるビデオキャプションモデルです。 + +### インストール +```shell +pip install -r requirements.txt +``` + +### 使用方法 +```shell +python video_caption.py +``` + +例: +
+<img src="./assests/CogVLM2-Caption-example.png" alt="CogVLM2-Caption example output"> +
+ + + +## CogVLM2-Video を使用したビデオキャプション + +[Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) + + +CogVLM2-Video は、タイムスタンプベースの質問応答機能を備えた多機能なビデオ理解モデルです。ユーザーは `このビデオを詳細に説明してください。` などのプロンプトをモデルに入力して、詳細なビデオキャプションを取得できます: +
+<img src="./assests/cogvlm2-video-example.png" alt="CogVLM2-Video example output"> +
+ +ユーザーは提供された[コード](https://github.com/THUDM/CogVLM2/tree/main/video_demo)を使用してモデルをロードするか、RESTful API を構成してビデオキャプションを生成できます。 + +## Citation + +🌟 If you find our work helpful, please leave us a star and cite our paper. + +CogVLM2-Caption: +``` +@article{yang2024cogvideox, + title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, + author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, + journal={arXiv preprint arXiv:2408.06072}, + year={2024} +} +``` +CogVLM2-Video: +``` +@article{hong2024cogvlm2, + title={CogVLM2: Visual Language Models for Image and Video Understanding}, + author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, + journal={arXiv preprint arXiv:2408.16500}, + year={2024} +} +``` diff --git a/tools/caption/README_zh.md b/tools/caption/README_zh.md new file mode 100644 index 0000000000000000000000000000000000000000..f6da7a61c8621b1028a337056813bb5abcec9874 --- /dev/null +++ b/tools/caption/README_zh.md @@ -0,0 +1,67 @@ +# 视频Caption + +通常,大多数视频数据不带有相应的描述性文本,因此需要将视频数据转换为文本描述,以提供必要的训练数据用于文本到视频模型。 + +## 项目更新 +- 🔥🔥 **News**: ```2024/9/19```: CogVideoX 训练过程中用于将视频数据转换为文本描述的 Caption + 模型 [CogVLM2-Caption](https://huggingface.co/THUDM/cogvlm2-llama3-caption) + 已经开源。欢迎前往下载并使用。 + +## 通过 CogVLM2-Caption 模型生成视频Caption + +🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-llama3-caption) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-llama3-caption/) + +CogVLM2-Caption是用于生成CogVideoX模型训练数据的视频caption模型。 + +### 安装依赖 +```shell +pip install -r requirements.txt +``` + +### 运行caption模型 + +```shell +python video_caption.py +``` + +示例: +
+<img src="./assests/CogVLM2-Caption-example.png" alt="CogVLM2-Caption example output"> +
+ +## 通过 CogVLM2-Video 模型生成视频Caption + +[Code](https://github.com/THUDM/CogVLM2/tree/main/video_demo) | 🤗 [Hugging Face](https://huggingface.co/THUDM/cogvlm2-video-llama3-chat) | 🤖 [ModelScope](https://modelscope.cn/models/ZhipuAI/cogvlm2-video-llama3-chat) | 📑 [Blog](https://cogvlm2-video.github.io/) | [💬 Online Demo](http://cogvlm2-online.cogviewai.cn:7868/) + +CogVLM2-Video 是一个多功能的视频理解模型,具备基于时间戳的问题回答能力。用户可以输入诸如 `Describe this video in detail.` 的提示语给模型,以获得详细的视频Caption: + + +
+<img src="./assests/cogvlm2-video-example.png" alt="CogVLM2-Video example output"> +
+ +用户可以使用提供的[代码](https://github.com/THUDM/CogVLM2/tree/main/video_demo)加载模型或配置 RESTful API 来生成视频Caption。 + + +## Citation + +🌟 If you find our work helpful, please leave us a star and cite our paper. + +CogVLM2-Caption: +``` +@article{yang2024cogvideox, + title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer}, + author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others}, + journal={arXiv preprint arXiv:2408.06072}, + year={2024} +} +``` +CogVLM2-Video: +``` +@article{hong2024cogvlm2, + title={CogVLM2: Visual Language Models for Image and Video Understanding}, + author={Hong, Wenyi and Wang, Weihan and Ding, Ming and Yu, Wenmeng and Lv, Qingsong and Wang, Yan and Cheng, Yean and Huang, Shiyu and Ji, Junhui and Xue, Zhao and others}, + journal={arXiv preprint arXiv:2408.16500}, + year={2024} +} +``` \ No newline at end of file diff --git a/tools/caption/assests/CogVLM2-Caption-example.png b/tools/caption/assests/CogVLM2-Caption-example.png new file mode 100644 index 0000000000000000000000000000000000000000..4842a1919c0377cc267471b45ead659b7361eafd --- /dev/null +++ b/tools/caption/assests/CogVLM2-Caption-example.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dbac04d2eb724c28edb44f8a784f0058f0231980deabae4c0f063c9d60d77c3 +size 1186449 diff --git a/tools/caption/assests/cogvlm2-video-example.png b/tools/caption/assests/cogvlm2-video-example.png new file mode 100644 index 0000000000000000000000000000000000000000..f0c2e6bae875945ca6db64d64edea2eefa4914c9 --- /dev/null +++ b/tools/caption/assests/cogvlm2-video-example.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93ffadabb7b0b32fbdce9c3bfdff68e2b1fe9af2277708828e58757ea81a568b +size 1419122 diff --git a/tools/caption/requirements.txt b/tools/caption/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce2e17e68c9bbd675e8a4efb6f23c99d09d115cc --- /dev/null +++ b/tools/caption/requirements.txt @@ -0,0 +1,23 @@ +decord>=0.6.0 +#根据https://download.pytorch.org/whl/torch/,python版本为[3.8,3.11] +torch==2.1.0 +torchvision== 0.16.0 +pytorchvideo==0.1.5 +xformers +transformers==4.42.4 +#git+https://github.com/huggingface/transformers.git +huggingface-hub>=0.23.0 +pillow +chainlit>=1.0 +pydantic>=2.7.1 +timm>=0.9.16 +openai>=1.30.1 +loguru>=0.7.2 +pydantic>=2.7.1 +einops +sse-starlette>=2.1.0 +flask +gunicorn +gevent +requests +gradio \ No newline at end of file diff --git a/tools/caption/video_caption.py b/tools/caption/video_caption.py new file mode 100644 index 0000000000000000000000000000000000000000..1110fca90f60dc52de2eacce40eea730581638f9 --- /dev/null +++ b/tools/caption/video_caption.py @@ -0,0 +1,108 @@ +import io + +import argparse +import numpy as np +import torch +from decord import cpu, VideoReader, bridge +from transformers import AutoModelForCausalLM, AutoTokenizer + +MODEL_PATH = "THUDM/cogvlm2-llama3-caption" + +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' +TORCH_TYPE = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[ + 0] >= 8 else torch.float16 + +parser = argparse.ArgumentParser(description="CogVLM2-Video CLI Demo") +parser.add_argument('--quant', type=int, choices=[4, 8], help='Enable 4-bit or 8-bit precision loading', default=0) +args = parser.parse_args([]) + + +def load_video(video_data, strategy='chat'): + bridge.set_bridge('torch') + mp4_stream = 
video_data + num_frames = 24 + decord_vr = VideoReader(io.BytesIO(mp4_stream), ctx=cpu(0)) + + frame_id_list = None + total_frames = len(decord_vr) + if strategy == 'base': + clip_end_sec = 60 + clip_start_sec = 0 + start_frame = int(clip_start_sec * decord_vr.get_avg_fps()) + end_frame = min(total_frames, + int(clip_end_sec * decord_vr.get_avg_fps())) if clip_end_sec is not None else total_frames + frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int) + elif strategy == 'chat': + timestamps = decord_vr.get_frame_timestamp(np.arange(total_frames)) + timestamps = [i[0] for i in timestamps] + max_second = round(max(timestamps)) + 1 + frame_id_list = [] + for second in range(max_second): + closest_num = min(timestamps, key=lambda x: abs(x - second)) + index = timestamps.index(closest_num) + frame_id_list.append(index) + if len(frame_id_list) >= num_frames: + break + + video_data = decord_vr.get_batch(frame_id_list) + video_data = video_data.permute(3, 0, 1, 2) + return video_data + + +tokenizer = AutoTokenizer.from_pretrained( + MODEL_PATH, + trust_remote_code=True, +) + +model = AutoModelForCausalLM.from_pretrained( + MODEL_PATH, + torch_dtype=TORCH_TYPE, + trust_remote_code=True +).eval().to(DEVICE) + + +def predict(prompt, video_data, temperature): + strategy = 'chat' + + video = load_video(video_data, strategy=strategy) + + history = [] + query = prompt + inputs = model.build_conversation_input_ids( + tokenizer=tokenizer, + query=query, + images=[video], + history=history, + template_version=strategy + ) + inputs = { + 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'), + 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'), + 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'), + 'images': [[inputs['images'][0].to('cuda').to(TORCH_TYPE)]], + } + gen_kwargs = { + "max_new_tokens": 2048, + "pad_token_id": 128002, + "top_k": 1, + "do_sample": False, + "top_p": 0.1, + "temperature": temperature, + } + with torch.no_grad(): + outputs = model.generate(**inputs, **gen_kwargs) + outputs = outputs[:, inputs['input_ids'].shape[1]:] + response = tokenizer.decode(outputs[0], skip_special_tokens=True) + return response + + +def test(): + prompt = "Please describe this video in detail." + temperature = 0.1 + video_data = open('test.mp4', 'rb').read() + response = predict(prompt, video_data, temperature) + print(response) + + +if __name__ == '__main__': + test() diff --git a/tools/convert_weight_deepspeed2hf.py b/tools/convert_weight_deepspeed2hf.py new file mode 100644 index 0000000000000000000000000000000000000000..3c5ed88c8bed2044cf981cdfe6d2aacdb112e3e8 --- /dev/null +++ b/tools/convert_weight_deepspeed2hf.py @@ -0,0 +1,848 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + 
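For orientation, the discovery helpers above (`get_model_state_file`, `get_optim_files`, `get_model_state_files`) expect a DeepSpeed checkpoint folder containing one model-states shard plus one optimizer-states shard per data-parallel rank. A small sketch; the folder name is hypothetical, and only the glob patterns and the stage-dependent model-state file names come from the code:

```python
import glob
import os

ds_checkpoint_dir = "outputs/checkpoint-1000"  # hypothetical ZeRO checkpoint folder
# ZeRO-1/2 checkpoints carry mp_rank_00_model_states.pt; ZeRO-3 uses zero_pp_rank_0_mp_rank_00_model_states.pt.
print(sorted(glob.glob(os.path.join(ds_checkpoint_dir, "*_model_states.pt"))))
# One *_optim_states.pt file per rank; parse_optim_states() requires the count to match the saved world size.
print(sorted(glob.glob(os.path.join(ds_checkpoint_dir, "*_optim_states.pt"))))
```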
frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = 
zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. 
+ """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + 
state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode ::
+
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
+        for name, lazy_tensor in state_dict.items():
+            tensor = lazy_tensor.contiguous()  # to cpu
+            print(name, tensor)
+            # del tensor to release memory if it is no longer in use
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    if lazy_mode:
+        return state_dict
+    else:
+        return to_torch_tensor(state_dict)
+
+
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=False,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_dir``: directory to the pytorch fp32 state_dict output files
+        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+        - ``exclude_frozen_parameters``: exclude frozen parameters
+    """
+
+    # Dependency pre-check
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            print('If you want to use `safe_serialization`, please `pip install safetensors`')
+            raise
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+            raise
+
+    # Convert zero checkpoint to state_dict
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
+                                                          tag,
+                                                          exclude_frozen_parameters,
+                                                          lazy_mode=True)
+
+    # Shard the model if it is too big.
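+    # The lazy state_dict still holds GatheredTensor placeholders at this point. Shard boundaries
+    # are planned on an empty (shape/dtype-only) copy of it, and each shard is then materialized,
+    # written to disk and released in turn, so peak CPU memory stays close to a single shard.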
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. 
+
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
+def convert_zero_checkpoint_to_bf16_state_dict(checkpoint_dir,
+                                               output_dir,
+                                               max_shard_size="5GB",
+                                               safe_serialization=True,
+                                               tag=None,
+                                               exclude_frozen_parameters=False):
+    """
+    Convert a ZeRO 2 or ZeRO 3 DeepSpeed checkpoint to BF16 and write it to the given output
+    directory, using the following naming scheme:
+      - if there is only one shard:
+          diffusion_pytorch_model.safetensors
+      - if there is more than one shard:
+          diffusion_pytorch_model-00001-of-0000X.safetensors
+          diffusion_pytorch_model-00002-of-0000X.safetensors
+          ...
+          diffusion_pytorch_model.safetensors.index.json
+    """
+
+    if safe_serialization:
+        try:
+            from safetensors.torch import save_file
+        except ImportError:
+            raise ImportError("You need `pip install safetensors` to use safetensors.")
+    if max_shard_size is not None:
+        try:
+            from huggingface_hub import split_torch_state_dict_into_shards
+        except ImportError:
+            raise ImportError("You need `pip install huggingface_hub` to use the sharding feature.")
+
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(
+        checkpoint_dir,
+        tag=tag,
+        exclude_frozen_parameters=exclude_frozen_parameters,
+        lazy_mode=True
+    )
+
+    state_dict = to_torch_tensor(state_dict, return_empty_tensor=False)
+
+    for key, tensor in state_dict.items():
+        state_dict[key] = tensor.to(torch.bfloat16)
+
+    if safe_serialization:
+        filename_pattern = "diffusion_pytorch_model{suffix}.safetensors"
+    else:
+        filename_pattern = "diffusion_pytorch_model{suffix}.bin"
+
+    empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
+    state_dict_split = split_torch_state_dict_into_shards(
+        empty_state_dict,
+        filename_pattern=filename_pattern,
+        max_shard_size=max_shard_size
+    )
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    filename_to_tensors = list(state_dict_split.filename_to_tensors.items())
+    for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+        shard_state_dict = {t_name: state_dict[t_name] for t_name in tensors}
+        shard_state_dict = to_torch_tensor(shard_state_dict)
+
+        # Save
+        output_path = os.path.join(output_dir, shard_file)
+        if safe_serialization:
+            save_file(shard_state_dict, output_path, metadata={"format": "pt"})
+        else:
+            torch.save(shard_state_dict, output_path)
+        for t_name in shard_state_dict.keys():
+            del state_dict[t_name]
+        del shard_state_dict
+        gc.collect()
+
+
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
+        index_path = os.path.join(output_dir, "diffusion_pytorch_model.safetensors.index.json")
+        with open(index_path, "w", encoding="utf-8") as f:
+            f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
+    else:
+        only_filename = list(state_dict_split.filename_to_tensors.keys())[0]
+        old_path = os.path.join(output_dir, only_filename)
+        new_path = os.path.join(output_dir, "diffusion_pytorch_model.safetensors" if safe_serialization
+                                else "diffusion_pytorch_model.bin")
+        if old_path != new_path:
+            os.rename(old_path, new_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument("output_dir",
+                        type=str,
+                        help="directory to the pytorch fp32 state_dict output files "
+                        "(e.g. 
path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_bf16_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/tools/convert_weight_sat2hf.py b/tools/convert_weight_sat2hf.py new file mode 100644 index 0000000000000000000000000000000000000000..b70af1ab2f66bbe7bf14a79b40187507b1516685 --- /dev/null +++ b/tools/convert_weight_sat2hf.py @@ -0,0 +1,358 @@ +""" + +The script demonstrates how to convert the weights of the CogVideoX model from SAT to Hugging Face format. +This script supports the conversion of the following models: +- CogVideoX-2B +- CogVideoX-5B, CogVideoX-5B-I2V +- CogVideoX1.1-5B, CogVideoX1.1-5B-I2V + +Original Script: +https://github.com/huggingface/diffusers/blob/main/scripts/convert_cogvideox_to_diffusers.py + +""" +import argparse +from typing import Any, Dict + +import torch +from transformers import T5EncoderModel, T5Tokenizer + +from diffusers import ( + AutoencoderKLCogVideoX, + CogVideoXDDIMScheduler, + CogVideoXImageToVideoPipeline, + CogVideoXPipeline, + CogVideoXTransformer3DModel, +) + + +def reassign_query_key_value_inplace(key: str, state_dict: Dict[str, Any]): + to_q_key = key.replace("query_key_value", "to_q") + to_k_key = key.replace("query_key_value", "to_k") + to_v_key = key.replace("query_key_value", "to_v") + to_q, to_k, to_v = torch.chunk(state_dict[key], chunks=3, dim=0) + state_dict[to_q_key] = to_q + state_dict[to_k_key] = to_k + state_dict[to_v_key] = to_v + state_dict.pop(key) + + +def reassign_query_key_layernorm_inplace(key: str, state_dict: Dict[str, Any]): + layer_id, weight_or_bias = key.split(".")[-2:] + + if "query" in key: + new_key = f"transformer_blocks.{layer_id}.attn1.norm_q.{weight_or_bias}" + elif "key" in key: + new_key = f"transformer_blocks.{layer_id}.attn1.norm_k.{weight_or_bias}" + + state_dict[new_key] = state_dict.pop(key) + + +def reassign_adaln_norm_inplace(key: str, state_dict: Dict[str, Any]): + layer_id, _, weight_or_bias = key.split(".")[-3:] + + weights_or_biases = state_dict[key].chunk(12, dim=0) + norm1_weights_or_biases = torch.cat(weights_or_biases[0:3] + weights_or_biases[6:9]) + norm2_weights_or_biases = torch.cat(weights_or_biases[3:6] + weights_or_biases[9:12]) + + norm1_key = f"transformer_blocks.{layer_id}.norm1.linear.{weight_or_bias}" + state_dict[norm1_key] = norm1_weights_or_biases + + norm2_key = 
f"transformer_blocks.{layer_id}.norm2.linear.{weight_or_bias}" + state_dict[norm2_key] = norm2_weights_or_biases + + state_dict.pop(key) + + +def remove_keys_inplace(key: str, state_dict: Dict[str, Any]): + state_dict.pop(key) + + +def replace_up_keys_inplace(key: str, state_dict: Dict[str, Any]): + key_split = key.split(".") + layer_index = int(key_split[2]) + replace_layer_index = 4 - 1 - layer_index + + key_split[1] = "up_blocks" + key_split[2] = str(replace_layer_index) + new_key = ".".join(key_split) + + state_dict[new_key] = state_dict.pop(key) + + +TRANSFORMER_KEYS_RENAME_DICT = { + "transformer.final_layernorm": "norm_final", + "transformer": "transformer_blocks", + "attention": "attn1", + "mlp": "ff.net", + "dense_h_to_4h": "0.proj", + "dense_4h_to_h": "2", + ".layers": "", + "dense": "to_out.0", + "input_layernorm": "norm1.norm", + "post_attn1_layernorm": "norm2.norm", + "time_embed.0": "time_embedding.linear_1", + "time_embed.2": "time_embedding.linear_2", + "ofs_embed.0": "ofs_embedding.linear_1", + "ofs_embed.2": "ofs_embedding.linear_2", + "mixins.patch_embed": "patch_embed", + "mixins.final_layer.norm_final": "norm_out.norm", + "mixins.final_layer.linear": "proj_out", + "mixins.final_layer.adaLN_modulation.1": "norm_out.linear", + "mixins.pos_embed.pos_embedding": "patch_embed.pos_embedding", # Specific to CogVideoX-5b-I2V +} + +TRANSFORMER_SPECIAL_KEYS_REMAP = { + "query_key_value": reassign_query_key_value_inplace, + "query_layernorm_list": reassign_query_key_layernorm_inplace, + "key_layernorm_list": reassign_query_key_layernorm_inplace, + "adaln_layer.adaLN_modulations": reassign_adaln_norm_inplace, + "embed_tokens": remove_keys_inplace, + "freqs_sin": remove_keys_inplace, + "freqs_cos": remove_keys_inplace, + "position_embedding": remove_keys_inplace, +} + +VAE_KEYS_RENAME_DICT = { + "block.": "resnets.", + "down.": "down_blocks.", + "downsample": "downsamplers.0", + "upsample": "upsamplers.0", + "nin_shortcut": "conv_shortcut", + "encoder.mid.block_1": "encoder.mid_block.resnets.0", + "encoder.mid.block_2": "encoder.mid_block.resnets.1", + "decoder.mid.block_1": "decoder.mid_block.resnets.0", + "decoder.mid.block_2": "decoder.mid_block.resnets.1", +} + +VAE_SPECIAL_KEYS_REMAP = { + "loss": remove_keys_inplace, + "up.": replace_up_keys_inplace, +} + +TOKENIZER_MAX_LENGTH = 226 + + +def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]: + state_dict = saved_dict + if "model" in saved_dict.keys(): + state_dict = state_dict["model"] + if "module" in saved_dict.keys(): + state_dict = state_dict["module"] + if "state_dict" in saved_dict.keys(): + state_dict = state_dict["state_dict"] + return state_dict + + +def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]: + state_dict[new_key] = state_dict.pop(old_key) + + +def convert_transformer( + ckpt_path: str, + num_layers: int, + num_attention_heads: int, + use_rotary_positional_embeddings: bool, + i2v: bool, + dtype: torch.dtype, + init_kwargs: Dict[str, Any], +): + PREFIX_KEY = "model.diffusion_model." 
+ + original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True)) + transformer = CogVideoXTransformer3DModel( + in_channels=32 if i2v else 16, + num_layers=num_layers, + num_attention_heads=num_attention_heads, + use_rotary_positional_embeddings=use_rotary_positional_embeddings, + ofs_embed_dim=512 if (i2v and init_kwargs["patch_size_t"] is not None) else None, # CogVideoX1.5-5B-I2V + use_learned_positional_embeddings=i2v and init_kwargs["patch_size_t"] is None, # CogVideoX-5B-I2V + **init_kwargs, + ).to(dtype=dtype) + + for key in list(original_state_dict.keys()): + new_key = key[len(PREFIX_KEY) :] + for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items(): + new_key = new_key.replace(replace_key, rename_key) + update_state_dict_inplace(original_state_dict, key, new_key) + + for key in list(original_state_dict.keys()): + for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items(): + if special_key not in key: + continue + handler_fn_inplace(key, original_state_dict) + + transformer.load_state_dict(original_state_dict, strict=True) + return transformer + + +def convert_vae(ckpt_path: str, scaling_factor: float, version: str, dtype: torch.dtype): + init_kwargs = {"scaling_factor": scaling_factor} + if version == "1.5": + init_kwargs.update({"invert_scale_latents": True}) + + original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True)) + vae = AutoencoderKLCogVideoX(**init_kwargs).to(dtype=dtype) + + for key in list(original_state_dict.keys()): + new_key = key[:] + for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items(): + new_key = new_key.replace(replace_key, rename_key) + update_state_dict_inplace(original_state_dict, key, new_key) + + for key in list(original_state_dict.keys()): + for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items(): + if special_key not in key: + continue + handler_fn_inplace(key, original_state_dict) + + vae.load_state_dict(original_state_dict, strict=True) + return vae + + +def get_transformer_init_kwargs(version: str): + if version == "1.0": + vae_scale_factor_spatial = 8 + init_kwargs = { + "patch_size": 2, + "patch_size_t": None, + "patch_bias": True, + "sample_height": 480 // vae_scale_factor_spatial, + "sample_width": 720 // vae_scale_factor_spatial, + "sample_frames": 49, + } + + elif version == "1.5": + vae_scale_factor_spatial = 8 + init_kwargs = { + "patch_size": 2, + "patch_size_t": 2, + "patch_bias": False, + "sample_height": 768 // vae_scale_factor_spatial, + "sample_width": 1360 // vae_scale_factor_spatial, + "sample_frames": 81, + } + else: + raise ValueError("Unsupported version of CogVideoX.") + + return init_kwargs + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint" + ) + parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint") + parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved") + parser.add_argument("--fp16", action="store_true", default=False, help="Whether to save the model weights in fp16") + parser.add_argument("--bf16", action="store_true", default=False, help="Whether to save the model weights in bf16") + parser.add_argument( + "--push_to_hub", action="store_true", default=False, help="Whether to push to HF Hub after saving" + ) + parser.add_argument( + "--text_encoder_cache_dir", type=str, 
default=None, help="Path to text encoder cache directory" + ) + parser.add_argument( + "--typecast_text_encoder", + action="store_true", + default=False, + help="Whether or not to apply fp16/bf16 precision to text_encoder", + ) + # For CogVideoX-2B, num_layers is 30. For 5B, it is 42 + parser.add_argument("--num_layers", type=int, default=30, help="Number of transformer blocks") + # For CogVideoX-2B, num_attention_heads is 30. For 5B, it is 48 + parser.add_argument("--num_attention_heads", type=int, default=30, help="Number of attention heads") + # For CogVideoX-2B, use_rotary_positional_embeddings is False. For 5B, it is True + parser.add_argument( + "--use_rotary_positional_embeddings", action="store_true", default=False, help="Whether to use RoPE or not" + ) + # For CogVideoX-2B, scaling_factor is 1.15258426. For 5B, it is 0.7 + parser.add_argument("--scaling_factor", type=float, default=1.15258426, help="Scaling factor in the VAE") + # For CogVideoX-2B, snr_shift_scale is 3.0. For 5B, it is 1.0 + parser.add_argument("--snr_shift_scale", type=float, default=3.0, help="Scaling factor in the VAE") + parser.add_argument( + "--i2v", + action="store_true", + default=False, + help="Whether the model to be converted is the Image-to-Video version of CogVideoX.", + ) + parser.add_argument( + "--version", + choices=["1.0", "1.5"], + default="1.0", + help="Which version of CogVideoX to use for initializing default modeling parameters.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + transformer = None + vae = None + + if args.fp16 and args.bf16: + raise ValueError("You cannot pass both --fp16 and --bf16 at the same time.") + + dtype = torch.float16 if args.fp16 else torch.bfloat16 if args.bf16 else torch.float32 + + if args.transformer_ckpt_path is not None: + init_kwargs = get_transformer_init_kwargs(args.version) + transformer = convert_transformer( + args.transformer_ckpt_path, + args.num_layers, + args.num_attention_heads, + args.use_rotary_positional_embeddings, + args.i2v, + dtype, + init_kwargs, + ) + if args.vae_ckpt_path is not None: + # Keep VAE in float32 for better quality + vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, args.version, torch.float32) + + text_encoder_id = "google/t5-v1_1-xxl" + tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH) + text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir) + + if args.typecast_text_encoder: + text_encoder = text_encoder.to(dtype=dtype) + + # Apparently, the conversion does not work anymore without this :shrug: + for param in text_encoder.parameters(): + param.data = param.data.contiguous() + + scheduler = CogVideoXDDIMScheduler.from_config( + { + "snr_shift_scale": args.snr_shift_scale, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "beta_start": 0.00085, + "clip_sample": False, + "num_train_timesteps": 1000, + "prediction_type": "v_prediction", + "rescale_betas_zero_snr": True, + "set_alpha_to_one": True, + "timestep_spacing": "trailing", + } + ) + if args.i2v: + pipeline_cls = CogVideoXImageToVideoPipeline + else: + pipeline_cls = CogVideoXPipeline + + pipe = pipeline_cls( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + transformer=transformer, + scheduler=scheduler, + ) + + # We don't use variant here because the model must be run in fp16 (2B) or bf16 (5B). 
It would be weird
+    # for users to specify variant when the default is not fp32 and they want to run with the correct default (which
+    # is either fp16/bf16 here).
+
+    # This is necessary for users with insufficient memory,
+    # such as those using Colab and notebooks, as it can save some memory used for model loading.
+    pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", push_to_hub=args.push_to_hub)
diff --git a/tools/export_sat_lora_weight.py b/tools/export_sat_lora_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..9340d8f2996d55f67a72ddc8c92541465d612c9b
--- /dev/null
+++ b/tools/export_sat_lora_weight.py
@@ -0,0 +1,83 @@
+from typing import Any, Dict
+import torch
+import argparse
+from diffusers.loaders.lora_base import LoraBaseMixin
+from diffusers.models.modeling_utils import load_state_dict
+
+
+def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
+    state_dict = saved_dict
+    if "model" in saved_dict.keys():
+        state_dict = state_dict["model"]
+    if "module" in saved_dict.keys():
+        state_dict = state_dict["module"]
+    if "state_dict" in saved_dict.keys():
+        state_dict = state_dict["state_dict"]
+    return state_dict
+
+LORA_KEYS_RENAME = {
+
+    'attention.query_key_value.matrix_A.0': 'attn1.to_q.lora_A.weight',
+    'attention.query_key_value.matrix_A.1': 'attn1.to_k.lora_A.weight',
+    'attention.query_key_value.matrix_A.2': 'attn1.to_v.lora_A.weight',
+    'attention.query_key_value.matrix_B.0': 'attn1.to_q.lora_B.weight',
+    'attention.query_key_value.matrix_B.1': 'attn1.to_k.lora_B.weight',
+    'attention.query_key_value.matrix_B.2': 'attn1.to_v.lora_B.weight',
+    'attention.dense.matrix_A.0': 'attn1.to_out.0.lora_A.weight',
+    'attention.dense.matrix_B.0': 'attn1.to_out.0.lora_B.weight'
+}
+
+
+
+PREFIX_KEY = "model.diffusion_model."
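+# export_lora_weight below strips PREFIX_KEY from every checkpoint key, renames the SAT LoRA
+# suffixes listed in LORA_KEYS_RENAME to their diffusers equivalents, and rewrites the SAT
+# "layers" segment to "transformer_blocks" so the result can be loaded as a diffusers LoRA.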
+SAT_UNIT_KEY = "layers" +LORA_PREFIX_KEY = "transformer_blocks" + + + +def export_lora_weight(ckpt_path,lora_save_directory): + + merge_original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", mmap=True)) + + + lora_state_dict = {} + for key in list(merge_original_state_dict.keys()): + new_key = key[len(PREFIX_KEY) :] + for special_key, lora_keys in LORA_KEYS_RENAME.items(): + if new_key.endswith(special_key): + new_key = new_key.replace(special_key, lora_keys) + new_key = new_key.replace(SAT_UNIT_KEY, LORA_PREFIX_KEY) + + lora_state_dict[new_key] = merge_original_state_dict[key] + + + + # final length should be 240 + if len(lora_state_dict) != 240: + raise ValueError("lora_state_dict length is not 240") + + lora_state_dict.keys() + + LoraBaseMixin.write_lora_layers( + state_dict=lora_state_dict, + save_directory=lora_save_directory, + is_main_process=True, + weight_name=None, + save_function=None, + safe_serialization=True + ) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--sat_pt_path", type=str, required=True, help="Path to original sat transformer checkpoint" + ) + parser.add_argument("--lora_save_directory", type=str, required=True, help="Path where converted lora should be saved") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + + export_lora_weight(args.sat_pt_path, args.lora_save_directory) diff --git a/tools/generate_camparam.py b/tools/generate_camparam.py new file mode 100644 index 0000000000000000000000000000000000000000..5cff477593c2c3d521a262d3c96436563d2b1aad --- /dev/null +++ b/tools/generate_camparam.py @@ -0,0 +1,68 @@ +import os +import argparse +import numpy as np +from einops import rearrange, repeat +import pdb + +# Generate W2C extrinsic parameters + +def translation_matrix(direction, length, num_frame): + assert len(direction)==3, "direction should be [1, 0, 0] or [0, 1, 0] or ..." 
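+    # Each output row packs 6 fixed intrinsic values (presumably normalized fx, fy, cx, cy and two
+    # zeros) followed by the flattened 3x4 [R|T] world-to-camera extrinsic for that frame. The
+    # rotation stays identity; only the translation ramps linearly from 0 to `length` along `direction`.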
+ + K = np.array([0.474812, 0.844111, 0.500000, 0.500000, abs(0.000000), abs(0.000000)]) + R = np.array([[1.0, abs(0.0), abs(0.0)], + [abs(0.0), 1.0, abs(0.0)], + [abs(0.0), abs(0.0), 1.0]]) + + T = (repeat(np.array(direction), 'n -> n f', f=num_frame) * np.linspace(abs(0.), length, num_frame)).transpose(1,0) + extrinsic = np.concatenate([repeat(R, 'h w -> f h w', f=num_frame), T[:,:,None]], axis=-1) + camparam = np.concatenate([repeat(K, 'n -> f n', f=num_frame), rearrange(extrinsic, 'f h w -> f (h w)')], axis=-1) + + return camparam + +def main(args): + os.makedirs(args.output_path, exist_ok=True) + + + length = 1.5 + + # right + direction = [1., abs(0.), abs(0.)] + camparam_right = translation_matrix(direction, length, args.num_frame).astype(np.float32) + np.savetxt(os.path.join(args.output_path, 'camera_R.txt'), camparam_right, fmt='%1.6f') + + # left + direction = [-1., abs(0.), abs(0.)] + camparam_left = translation_matrix(direction, length, args.num_frame) + np.savetxt(os.path.join(args.output_path, 'camera_L.txt'), camparam_left, fmt='%1.6f') + + # up + direction = [abs(0.), -1.0, abs(0.)] + camparam_up = translation_matrix(direction, length, args.num_frame) + np.savetxt(os.path.join(args.output_path, 'camera_U.txt'), camparam_up, fmt='%1.6f') + + # down + direction = [abs(0.), 1.0, abs(0.)] + camparam_down = translation_matrix(direction, length, args.num_frame) + np.savetxt(os.path.join(args.output_path, 'camera_D.txt'), camparam_down, fmt='%1.6f') + + # in + direction = [abs(0.), abs(0.), 1.0] + camparam_in = translation_matrix(direction, length, args.num_frame) + np.savetxt(os.path.join(args.output_path, 'camera_I.txt'), camparam_in, fmt='%1.6f') + + # out + direction = [abs(0.), abs(0.), -1.0] + camparam_out = translation_matrix(direction, length, args.num_frame) + np.savetxt(os.path.join(args.output_path, 'camera_O.txt'), camparam_out, fmt='%1.6f') + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output_path", type=str, required=True) + parser.add_argument("--num_frame", type=int, default=49) + + args = parser.parse_args() + main(args) + \ No newline at end of file diff --git a/tools/generate_camparam_pan_tilt.py b/tools/generate_camparam_pan_tilt.py new file mode 100644 index 0000000000000000000000000000000000000000..cccd9d39e7518e704d082e5b0f605d22a5b08cbb --- /dev/null +++ b/tools/generate_camparam_pan_tilt.py @@ -0,0 +1,118 @@ +import os +import argparse +import numpy as np +from einops import rearrange, repeat +import pdb + + +def generate_rotation_extrinsics(direction: str, angle: float, num_frame: int): + """ + Generate extrinsic camera matrices with rotation only (no translation), + allowing both positive and negative directions. 
+ + Args: + direction (str): '+x', '-x', '+y', '-y', '+z', or '-z' + angle (float): total rotation angle in degrees + num_frame (int): number of frames to interpolate the rotation + + Returns: + List[np.ndarray]: List of 3x4 extrinsic matrices (rotation | zero translation) + """ + assert direction[0] in ('+', '-'), "direction must start with '+' or '-'" + assert direction[1] in ('x', 'y', 'z'), "direction must be along x, y, or z" + + axis = direction[1] + sign = 1 if direction[0] == '+' else -1 + angle_rad = np.deg2rad(angle) * sign + step = angle_rad / (num_frame - 1) + + extrinsics = [] + for i in range(num_frame): + theta = step * i + print(theta) + if axis == 'x': + R = np.array([ + [1, 0, 0], + [0, np.cos(theta), -np.sin(theta)], + [0, np.sin(theta), np.cos(theta)], + ]) + elif axis == 'y': + R = np.array([ + [np.cos(theta), 0, np.sin(theta)], + [0, 1, 0], + [-np.sin(theta), 0, np.cos(theta)], + ]) + elif axis == 'z': + R = np.array([ + [np.cos(theta), -np.sin(theta), 0], + [np.sin(theta), np.cos(theta), 0], + [0, 0, 1], + ]) + + Rt = np.hstack([R, np.zeros((3, 1))]) # 3x4 matrix + extrinsics.append(Rt) + + extrinsics = np.stack(extrinsics) + + K = np.array([0.474812, 0.844111, 0.500000, 0.500000, abs(0.000000), abs(0.000000)]) + camparam = np.concatenate([repeat(K, 'n -> f n', f=num_frame), rearrange(extrinsics, 'f h w -> f (h w)')], axis=-1) + + return camparam + +def main(args): + os.makedirs(args.output_path, exist_ok=True) + + angle = 90 + + # tilt up + direction = ('+', 'x') + camparam_tilt_up = generate_rotation_extrinsics(direction, angle, args.num_frame).astype(np.float32) + np.savetxt(os.path.join(args.output_path, f'Tilt_Up_{angle:01f}.txt'), camparam_tilt_up, fmt='%1.6f') + + # tilt down + direction = ('-', 'x') + camparam_tilt_down = generate_rotation_extrinsics(direction, angle, args.num_frame).astype(np.float32) + np.savetxt(os.path.join(args.output_path, f'Tilt_Down_{angle:01f}.txt'), camparam_tilt_down, fmt='%1.6f') + + # pan right + direction = ('+', 'y') + camparam_pan_right = generate_rotation_extrinsics(direction, angle, args.num_frame).astype(np.float32) + np.savetxt(os.path.join(args.output_path, f'Pan_Right_{angle:01f}.txt'), camparam_pan_right, fmt='%1.6f') + + # pan left + direction = ('-', 'y') + camparam_pan_left = generate_rotation_extrinsics(direction, angle, args.num_frame).astype(np.float32) + np.savetxt(os.path.join(args.output_path, f'Pan_Left_{angle:01f}.txt'), camparam_pan_left, fmt='%1.6f') + + # Spin clockwise + direction = ('+', 'z') + camparam_spin_clockwise = generate_rotation_extrinsics(direction, angle, args.num_frame).astype(np.float32) + np.savetxt(os.path.join(args.output_path, f'Spin_Clockwise_{angle:01f}.txt'), camparam_spin_clockwise, fmt='%1.6f') + + # Spin anticlockwise + direction = ('-', 'z') + camparam_spin_anticlockwise = generate_rotation_extrinsics(direction, angle, args.num_frame).astype(np.float32) + np.savetxt(os.path.join(args.output_path, f'Spin_AntiClockwise_{angle:01f}.txt'), camparam_spin_anticlockwise, fmt='%1.6f') + + + + + + + # right + # direction = [1., abs(0.), abs(0.)] + # camparam_right = translation_matrix(direction, length, args.num_frame).astype(np.float32) + # np.savetxt(os.path.join(args.output_path, 'camera_R.txt'), camparam_right, fmt='%1.6f') + + + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--output_path", type=str, required=True) + parser.add_argument("--num_frame", type=int, default=49) + + args = parser.parse_args() + main(args) + \ No newline at end of 
file
diff --git a/tools/llm_flux_cogvideox/generate.sh b/tools/llm_flux_cogvideox/generate.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c455273d90d4468e2d1bf86053855345f3ee6411
--- /dev/null
+++ b/tools/llm_flux_cogvideox/generate.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+NUM_VIDEOS=10
+INFERENCE_STEPS=50
+GUIDANCE_SCALE=7.0
+OUTPUT_DIR_PREFIX="outputs/gpu_"
+LOG_DIR_PREFIX="logs/gpu_"
+
+VIDEO_MODEL_PATH="/share/official_pretrains/hf_home/CogVideoX-5b-I2V"
+LLM_MODEL_PATH="/share/home/zyx/Models/Meta-Llama-3.1-8B-Instruct"
+IMAGE_MODEL_PATH="/share/home/zyx/Models/FLUX.1-dev"
+
+#VIDEO_MODEL_PATH="THUDM/CogVideoX-5B-I2V"
+#LLM_MODEL_PATH="THUDM/glm-4-9b-chat"
+#IMAGE_MODEL_PATH="black-forest-labs/FLUX.1-dev"
+
+CUDA_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
+
+IFS=',' read -r -a GPU_ARRAY <<< "$CUDA_DEVICES"
+
+for i in "${!GPU_ARRAY[@]}"
+do
+  GPU=${GPU_ARRAY[$i]}
+  echo "Starting task on GPU $GPU..."
+  CUDA_VISIBLE_DEVICES=$GPU nohup python3 llm_flux_cogvideox.py \
+    --caption_generator_model_id $LLM_MODEL_PATH \
+    --image_generator_model_id $IMAGE_MODEL_PATH \
+    --model_path $VIDEO_MODEL_PATH \
+    --num_videos $NUM_VIDEOS \
+    --image_generator_num_inference_steps $INFERENCE_STEPS \
+    --guidance_scale $GUIDANCE_SCALE \
+    --use_dynamic_cfg \
+    --output_dir ${OUTPUT_DIR_PREFIX}${GPU} \
+    > ${LOG_DIR_PREFIX}${GPU}.log 2>&1 &
+done
\ No newline at end of file
diff --git a/tools/llm_flux_cogvideox/gradio_page.py b/tools/llm_flux_cogvideox/gradio_page.py
new file mode 100644
index 0000000000000000000000000000000000000000..588c46931fbfef5d23662d4fb925e28f8ac62956
--- /dev/null
+++ b/tools/llm_flux_cogvideox/gradio_page.py
@@ -0,0 +1,194 @@
+import os
+import gradio as gr
+import gc
+import random
+import torch
+import numpy as np
+from PIL import Image
+import transformers
+from diffusers import CogVideoXImageToVideoPipeline, CogVideoXDPMScheduler, DiffusionPipeline
+from diffusers.utils import export_to_video
+from transformers import AutoTokenizer
+from datetime import datetime, timedelta
+import threading
+import time
+from moviepy import VideoFileClip
+
+torch.set_float32_matmul_precision("high")
+
+# Set default values
+caption_generator_model_id = "/share/home/zyx/Models/Meta-Llama-3.1-8B-Instruct"
+image_generator_model_id = "/share/home/zyx/Models/FLUX.1-dev"
+video_generator_model_id = "/share/official_pretrains/hf_home/CogVideoX-5b-I2V"
+seed = 1337
+
+os.makedirs("./output", exist_ok=True)
+os.makedirs("./gradio_tmp", exist_ok=True)
+
+tokenizer = AutoTokenizer.from_pretrained(caption_generator_model_id, trust_remote_code=True)
+caption_generator = transformers.pipeline(
+    "text-generation",
+    model=caption_generator_model_id,
+    device_map="balanced",
+    model_kwargs={
+        "local_files_only": True,
+        "torch_dtype": torch.bfloat16,
+    },
+    trust_remote_code=True,
+    tokenizer=tokenizer
+)
+
+image_generator = DiffusionPipeline.from_pretrained(
+    image_generator_model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="balanced"
+)
+# image_generator.to("cuda")
+
+video_generator = CogVideoXImageToVideoPipeline.from_pretrained(
+    video_generator_model_id,
+    torch_dtype=torch.bfloat16,
+    device_map="balanced"
+)
+
+video_generator.vae.enable_slicing()
+video_generator.vae.enable_tiling()
+
+video_generator.scheduler = CogVideoXDPMScheduler.from_config(
+    video_generator.scheduler.config, timestep_spacing="trailing"
+)
+
+# Define prompts
+SYSTEM_PROMPT = """
+You are part of a team of people that create videos using generative models.
You use a video-generation model that can generate a video about anything you describe. + +For example, if you respond with "A beautiful morning in the woods with the sun peaking through the trees", the video generation model will create a video of exactly as described. Your task is to summarize the descriptions of videos provided by users and create detailed prompts to feed into the generative model. + +There are a few rules to follow: +- You will only ever output a single video description per request. +- If the user mentions to summarize the prompt in [X] words, make sure not to exceed the limit. + +Your responses should just be the video generation prompt. Here are examples: +- "A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting." +- "A street artist, clad in a worn-out denim jacket and a colorful bandana, stands before a vast concrete wall in the heart of the city, holding a can of spray paint, spray-painting a colorful bird on a mottled wall." +""".strip() + +USER_PROMPT = """ +Could you generate a prompt for a video generation model? Please limit the prompt to [{0}] words. +""".strip() + + +def generate_caption(prompt): + num_words = random.choice([25, 50, 75, 100]) + user_prompt = USER_PROMPT.format(num_words) + + messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": prompt + "\n" + user_prompt}, + ] + + response = caption_generator( + messages, + max_new_tokens=226, + return_full_text=False + ) + caption = response[0]["generated_text"] + if caption.startswith("\"") and caption.endswith("\""): + caption = caption[1:-1] + return caption + + +def generate_image(caption, progress=gr.Progress(track_tqdm=True)): + image = image_generator( + prompt=caption, + height=480, + width=720, + num_inference_steps=30, + guidance_scale=3.5, + ).images[0] + return image, image # One for output One for State + + +def generate_video( + caption, + image, + progress=gr.Progress(track_tqdm=True) +): + generator = torch.Generator().manual_seed(seed) + video_frames = video_generator( + image=image, + prompt=caption, + height=480, + width=720, + num_frames=49, + num_inference_steps=50, + guidance_scale=6, + use_dynamic_cfg=True, + generator=generator, + ).frames[0] + video_path = save_video(video_frames) + gif_path = convert_to_gif(video_path) + return video_path, gif_path + + +def save_video(tensor): + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + video_path = f"./output/{timestamp}.mp4" + os.makedirs(os.path.dirname(video_path), exist_ok=True) + export_to_video(tensor, video_path, fps=8) + return video_path + + +def convert_to_gif(video_path): + clip = VideoFileClip(video_path) + clip = clip.with_fps(8) + clip = clip.resized(height=240) + gif_path = video_path.replace(".mp4", ".gif") + clip.write_gif(gif_path, fps=8) + return gif_path + + +def delete_old_files(): + while True: + now = datetime.now() + cutoff = now - timedelta(minutes=10) + directories = ["./output", "./gradio_tmp"] + + for directory in directories: + for filename in os.listdir(directory): + 
file_path = os.path.join(directory, filename) + if os.path.isfile(file_path): + file_mtime = datetime.fromtimestamp(os.path.getmtime(file_path)) + if file_mtime < cutoff: + os.remove(file_path) + time.sleep(600) + + +threading.Thread(target=delete_old_files, daemon=True).start() + +with gr.Blocks() as demo: + gr.Markdown(""" +
+ LLM + FLUX + CogVideoX-I2V Space 🤗 +
+ """) + with gr.Row(): + with gr.Column(): + prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here", lines=5) + generate_caption_button = gr.Button("Generate Caption") + caption = gr.Textbox(label="Caption", placeholder="Caption will appear here", lines=5) + generate_image_button = gr.Button("Generate Image") + image_output = gr.Image(label="Generated Image") + state_image = gr.State() + generate_caption_button.click(fn=generate_caption, inputs=prompt, outputs=caption) + generate_image_button.click(fn=generate_image, inputs=caption, outputs=[image_output, state_image]) + with gr.Column(): + video_output = gr.Video(label="Generated Video", width=720, height=480) + download_video_button = gr.File(label="📥 Download Video", visible=False) + download_gif_button = gr.File(label="📥 Download GIF", visible=False) + generate_video_button = gr.Button("Generate Video from Image") + generate_video_button.click(fn=generate_video, inputs=[caption, state_image], + outputs=[video_output, download_gif_button]) + +if __name__ == "__main__": + demo.launch() diff --git a/tools/llm_flux_cogvideox/llm_flux_cogvideox.py b/tools/llm_flux_cogvideox/llm_flux_cogvideox.py new file mode 100644 index 0000000000000000000000000000000000000000..8e97888fb4bb6a45acdbf3873b450ea2ddbb33ed --- /dev/null +++ b/tools/llm_flux_cogvideox/llm_flux_cogvideox.py @@ -0,0 +1,257 @@ +""" +The original experimental code for this project can be found at: + +https://gist.github.com/a-r-r-o-w/d070cce059ab4ceab3a9f289ff83c69c + +By using this code, description prompts will be generated through a local large language model, and images will be +generated using the black-forest-labs/FLUX.1-dev model, followed by video generation via CogVideoX. +The entire process utilizes open-source solutions, without the need for any API keys. + +You can use the generate.sh file in the same folder to automate running this code +for batch generation of videos and images. + +bash generate.sh + +""" + +import argparse +import gc +import json +import os +import pathlib +import random +from typing import Any, Dict + +from transformers import AutoTokenizer + +os.environ["TORCH_LOGS"] = "+dynamo,recompiles,graph_breaks" +os.environ["TORCHDYNAMO_VERBOSE"] = "1" + +import numpy as np +import torch +import transformers +from diffusers import CogVideoXImageToVideoPipeline, CogVideoXDPMScheduler, DiffusionPipeline +from diffusers.utils.logging import get_logger +from diffusers.utils import export_to_video + +torch.set_float32_matmul_precision("high") + +logger = get_logger(__name__) + +SYSTEM_PROMPT = """ +You are part of a team of people that create videos using generative models. You use a video-generation model that can generate a video about anything you describe. + +For example, if you respond with "A beautiful morning in the woods with the sun peaking through the trees", the video generation model will create a video of exactly as described. You task is to summarize the descriptions of videos provided to by users, and create details prompts to feed into the generative model. + +There are a few rules to follow: +- You will only ever output a single video description per request. +- If the user mentions to summarize the prompt in [X] words, make sure to not exceed the limit. + +You responses should just be the video generation prompt. Here are examples: +- “A lone figure stands on a city rooftop at night, gazing up at the full moon. The moon glows brightly, casting a gentle light over the quiet cityscape. 
Below, the windows of countless homes shine with warm lights, creating a contrast between the bustling life below and the peaceful solitude above. The scene captures the essence of the Mid-Autumn Festival, where despite the distance, the figure feels connected to loved ones through the shared beauty of the moonlit sky.” +- "A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting." +- "A street artist, clad in a worn-out denim jacket and a colorful banana, stands before a vast concrete wall in the heart, holding a can of spray paint, spray-painting a colorful bird on a mottled wall" +""".strip() + +USER_PROMPT = """ +Could you generate a prompt for a video generation model? +Please limit the prompt to [{0}] words. +""".strip() + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--num_videos", + type=int, + default=5, + help="Number of unique videos you would like to generate." + ) + parser.add_argument( + "--model_path", + type=str, + default="THUDM/CogVideoX-5B", + help="The path of Image2Video CogVideoX-5B", + ) + parser.add_argument( + "--caption_generator_model_id", + type=str, + default="THUDM/glm-4-9b-chat", + help="Caption generation model. default GLM-4-9B", + ) + parser.add_argument( + "--caption_generator_cache_dir", + type=str, + default=None, + help="Cache directory for caption generation model." + ) + parser.add_argument( + "--image_generator_model_id", + type=str, + default="black-forest-labs/FLUX.1-dev", + help="Image generation model." + ) + parser.add_argument( + "--image_generator_cache_dir", + type=str, + default=None, + help="Cache directory for image generation model." + ) + parser.add_argument( + "--image_generator_num_inference_steps", + type=int, + default=50, + help="Caption generation model." + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=7, + help="Guidance scale to be use for generation." + ) + parser.add_argument( + "--use_dynamic_cfg", + action="store_true", + help="Whether or not to use cosine dynamic guidance for generation [Recommended].", + ) + parser.add_argument( + "--output_dir", + type=str, + default="outputs/", + help="Location where generated images and videos should be stored.", + ) + parser.add_argument( + "--compile", + action="store_true", + help="Whether or not to compile the transformer of image and video generators." + ) + parser.add_argument( + "--enable_vae_tiling", + action="store_true", + help="Whether or not to use VAE tiling when encoding/decoding." + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Seed for reproducibility." 
+ ) + return parser.parse_args() + + +def reset_memory(): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + torch.cuda.reset_accumulated_memory_stats() + + +@torch.no_grad() +def main(args: Dict[str, Any]) -> None: + output_dir = pathlib.Path(args.output_dir) + os.makedirs(output_dir.as_posix(), exist_ok=True) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + reset_memory() + tokenizer = AutoTokenizer.from_pretrained(args.caption_generator_model_id, trust_remote_code=True) + caption_generator = transformers.pipeline( + "text-generation", + model=args.caption_generator_model_id, + device_map="auto", + model_kwargs={ + "local_files_only": True, + "cache_dir": args.caption_generator_cache_dir, + "torch_dtype": torch.bfloat16, + }, + trust_remote_code=True, + tokenizer=tokenizer + ) + + captions = [] + for i in range(args.num_videos): + num_words = random.choice([50, 75, 100]) + user_prompt = USER_PROMPT.format(num_words) + + messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ] + + outputs = caption_generator(messages, max_new_tokens=226) + caption = outputs[0]["generated_text"][-1]["content"] + if caption.startswith("\"") and caption.endswith("\""): + caption = caption[1:-1] + captions.append(caption) + logger.info(f"Generated caption: {caption}") + + with open(output_dir / "captions.json", "w") as file: + json.dump(captions, file) + + del caption_generator + reset_memory() + + image_generator = DiffusionPipeline.from_pretrained( + args.image_generator_model_id, + cache_dir=args.image_generator_cache_dir, + torch_dtype=torch.bfloat16 + ) + image_generator.to("cuda") + + if args.compile: + image_generator.transformer = torch.compile(image_generator.transformer, mode="max-autotune", fullgraph=True) + + if args.enable_vae_tiling: + image_generator.vae.enable_tiling() + + images = [] + for index, caption in enumerate(captions): + image = image_generator( + prompt=caption, + height=480, + width=720, + num_inference_steps=args.image_generator_num_inference_steps, + guidance_scale=3.5, + ).images[0] + filename = caption[:25].replace(".", "_").replace("'", "_").replace('"', "_").replace(",", "_") + image.save(output_dir / f"{index}_{filename}.png") + images.append(image) + + del image_generator + reset_memory() + + video_generator = CogVideoXImageToVideoPipeline.from_pretrained( + args.model_path, torch_dtype=torch.bfloat16).to("cuda") + video_generator.scheduler = CogVideoXDPMScheduler.from_config( + video_generator.scheduler.config, + timestep_spacing="trailing") + + if args.compile: + video_generator.transformer = torch.compile(video_generator.transformer, mode="max-autotune", fullgraph=True) + + if args.enable_vae_tiling: + video_generator.vae.enable_tiling() + + generator = torch.Generator().manual_seed(args.seed) + for index, (caption, image) in enumerate(zip(captions, images)): + video = video_generator( + image=image, + prompt=caption, + height=480, + width=720, + num_frames=49, + num_inference_steps=50, + guidance_scale=args.guidance_scale, + use_dynamic_cfg=args.use_dynamic_cfg, + generator=generator, + ).frames[0] + filename = caption[:25].replace(".", "_").replace("'", "_").replace('"', "_").replace(",", "_") + export_to_video(video, output_dir / f"{index}_{filename}.mp4", fps=8) + + +if __name__ == "__main__": + args = get_args() + main(args) diff --git a/tools/load_cogvideox_lora.py b/tools/load_cogvideox_lora.py new 
file mode 100644 index 0000000000000000000000000000000000000000..1b129755f1a261e55ff6e9a118179cecbbbd4f52 --- /dev/null +++ b/tools/load_cogvideox_lora.py @@ -0,0 +1,125 @@
+# Copyright 2024 The HuggingFace Team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+import random
+import time
+from diffusers.utils import export_to_video
+from diffusers.image_processor import VaeImageProcessor
+from datetime import datetime, timedelta
+from diffusers import CogVideoXPipeline, CogVideoXDDIMScheduler, CogVideoXDPMScheduler
+import os
+import torch
+import argparse
+
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--pretrained_model_name_or_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+    )
+    parser.add_argument(
+        "--lora_weights_path",
+        type=str,
+        default=None,
+        required=True,
+        help="Path to the LoRA weights.",
+    )
+    parser.add_argument(
+        "--lora_r",
+        type=int,
+        default=128,
+        help="""LoRA rank. The defaults used for training are 128 for the 2B transformer and 256 for the 5B transformer.
+        Together with --lora_alpha it determines the LoRA scale (lora_scale = lora_alpha / lora_r), which is used for
+        stable learning and to prevent underflow. In the SAT training framework, alpha is set to 1 by default.
+        A higher rank gives more expressive capability but requires more memory and training time;
+        increasing it blindly is not always better.
+        """,
+    )
+    parser.add_argument(
+        "--lora_alpha",
+        type=int,
+        default=1,
+        help="""LoRA alpha. Together with --lora_r it determines the LoRA scale applied to the loaded weights
+        (lora_scale = lora_alpha / lora_r). In the SAT training framework, alpha is set to 1 by default,
+        so the scale defaults to 1 / lora_r.
+ """, + ) + parser.add_argument( + "--prompt", + type=str, + help="prompt", + ) + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="The output directory where the model predictions and checkpoints will be written.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + pipe = CogVideoXPipeline.from_pretrained(args.pretrained_model_name_or_path, torch_dtype=torch.bfloat16).to(device) + pipe.load_lora_weights(args.lora_weights_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="cogvideox-lora") + # pipe.fuse_lora(lora_scale=args.lora_alpha/args.lora_r, ['transformer']) + lora_scaling=args.lora_alpha/args.lora_r + pipe.set_adapters(["cogvideox-lora"], [lora_scaling]) + + + pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + + os.makedirs(args.output_dir, exist_ok=True) + + latents = pipe( + prompt=args.prompt, + num_videos_per_prompt=1, + num_inference_steps=50, + num_frames=49, + use_dynamic_cfg=True, + output_type="pt", + guidance_scale=3.0, + generator=torch.Generator(device="cpu").manual_seed(42), + ).frames + batch_size = latents.shape[0] + batch_video_frames = [] + for batch_idx in range(batch_size): + pt_image = latents[batch_idx] + pt_image = torch.stack([pt_image[i] for i in range(pt_image.shape[0])]) + + image_np = VaeImageProcessor.pt_to_numpy(pt_image) + image_pil = VaeImageProcessor.numpy_to_pil(image_np) + batch_video_frames.append(image_pil) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + video_path = f"{args.output_dir}/{timestamp}.mp4" + os.makedirs(os.path.dirname(video_path), exist_ok=True) + tensor = batch_video_frames[0] + fps=math.ceil((len(batch_video_frames[0]) - 1) / 6) + + export_to_video(tensor, video_path, fps=fps) \ No newline at end of file diff --git a/tools/parallel_inference/parallel_inference_xdit.py b/tools/parallel_inference/parallel_inference_xdit.py new file mode 100644 index 0000000000000000000000000000000000000000..d0b802981c967337210c34716ba1465f923a056c --- /dev/null +++ b/tools/parallel_inference/parallel_inference_xdit.py @@ -0,0 +1,109 @@ +""" +This is a parallel inference script for CogVideo. The original script +can be found from the xDiT project at + +https://github.com/xdit-project/xDiT/blob/main/examples/cogvideox_example.py + +By using this code, the inference process is parallelized on multiple GPUs, +and thus speeded up. + +Usage: +1. pip install xfuser +2. mkdir results +3. run the following command to generate video +torchrun --nproc_per_node=4 parallel_inference_xdit.py \ + --model --ulysses_degree 1 --ring_degree 2 \ + --use_cfg_parallel --height 480 --width 720 --num_frames 9 \ + --prompt 'A small dog.' 
+ +You can also use the run.sh file in the same folder to automate running this +code for batch generation of videos, by running: + +sh ./run.sh + +""" + +import time +import torch +import torch.distributed +from diffusers import AutoencoderKLTemporalDecoder +from xfuser import xFuserCogVideoXPipeline, xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + get_data_parallel_rank, + get_data_parallel_world_size, + get_runtime_state, + is_dp_last_group, +) +from diffusers.utils import export_to_video + + +def main(): + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + + # Check if ulysses_degree is valid + num_heads = 30 + if engine_args.ulysses_degree > 0 and num_heads % engine_args.ulysses_degree != 0: + raise ValueError( + f"ulysses_degree ({engine_args.ulysses_degree}) must be a divisor of the number of heads ({num_heads})" + ) + + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + + pipe = xFuserCogVideoXPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + engine_config=engine_config, + torch_dtype=torch.bfloat16, + ) + if args.enable_sequential_cpu_offload: + pipe.enable_model_cpu_offload(gpu_id=local_rank) + else: + device = torch.device(f"cuda:{local_rank}") + pipe = pipe.to(device) + + # Always enable tiling and slicing to avoid VAE OOM while batch size > 1 + pipe.vae.enable_slicing() + pipe.vae.enable_tiling() + + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=input_config.num_inference_steps, + generator=torch.Generator().manual_seed(input_config.seed), + guidance_scale=6, + use_dynamic_cfg=True, + ).frames[0] + + end_time = time.time() + elapsed_time = end_time - start_time + peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + + parallel_info = ( + f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" + f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" + f"tp{engine_args.tensor_parallel_degree}_" + f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" + ) + if is_dp_last_group(): + world_size = get_data_parallel_world_size() + resolution = f"{input_config.width}x{input_config.height}" + output_filename = f"results/cogvideox_{parallel_info}_{resolution}.mp4" + export_to_video(output, output_filename, fps=8) + print(f"output saved to {output_filename}") + + if get_world_group().rank == get_world_group().world_size - 1: + print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB") + get_runtime_state().destory_distributed_env() + + +if __name__ == "__main__": + main() diff --git a/tools/parallel_inference/run.sh b/tools/parallel_inference/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f9d5a8136a3873fa4118fc15954cf379a521b3f --- /dev/null +++ b/tools/parallel_inference/run.sh @@ -0,0 +1,51 @@ +set -x + +export PYTHONPATH=$PWD:$PYTHONPATH + +# Select the model type +# The model is downloaded to a specified location on disk, +# or you can simply use the model's ID on Hugging Face, +# which will then be downloaded to the default cache path on Hugging Face. 
+ +export MODEL_TYPE="CogVideoX" +# Configuration for different model types +# script, model_id, inference_step +declare -A MODEL_CONFIGS=( + ["CogVideoX"]="parallel_inference_xdit.py /cfs/dit/CogVideoX-2b 20" +) + +if [[ -v MODEL_CONFIGS[$MODEL_TYPE] ]]; then + IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}" + export SCRIPT MODEL_ID INFERENCE_STEP +else + echo "Invalid MODEL_TYPE: $MODEL_TYPE" + exit 1 +fi + +mkdir -p ./results + +# task args +if [ "$MODEL_TYPE" = "CogVideoX" ]; then + TASK_ARGS="--height 480 --width 720 --num_frames 9" +fi + +# CogVideoX asserts sp_degree == ulysses_degree*ring_degree <= 2. Also, do not set the pipefusion degree. +if [ "$MODEL_TYPE" = "CogVideoX" ]; then +N_GPUS=4 +PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1" +CFG_ARGS="--use_cfg_parallel" +fi + + +torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \ +--model $MODEL_ID \ +$PARALLEL_ARGS \ +$TASK_ARGS \ +$PIPEFUSION_ARGS \ +$OUTPUT_ARGS \ +--num_inference_steps $INFERENCE_STEP \ +--warmup_steps 0 \ +--prompt "A small dog." \ +$CFG_ARGS \ +$PARALLLEL_VAE \ +$COMPILE_FLAG diff --git a/tools/replicate/cog.yaml b/tools/replicate/cog.yaml new file mode 100644 index 0000000000000000000000000000000000000000..19d71ad37e824ed017aa112f23d3cf10ebc59b02 --- /dev/null +++ b/tools/replicate/cog.yaml @@ -0,0 +1,37 @@ +# Configuration for Cog ⚙️ +# Reference: https://cog.run/yaml + +build: + # set to true if your model requires a GPU + gpu: true + + # a list of ubuntu apt packages to install + system_packages: + - "libgl1-mesa-glx" + - "libglib2.0-0" + + # python version in the form '3.11' or '3.11.4' + python_version: "3.11" + + # a list of packages in the format == + python_packages: + - diffusers>=0.30.3 + - accelerate>=0.34.2 + - transformers>=4.44.2 + - numpy==1.26.0 + - torch>=2.4.0 + - torchvision>=0.19.0 + - sentencepiece>=0.2.0 + - SwissArmyTransformer>=0.4.12 + - imageio>=2.35.1 + - imageio-ffmpeg>=0.5.1 + - openai>=1.45.0 + - moviepy>=2.0.0 + - pillow==9.5.0 + - pydantic==1.10.7 + run: + - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget + +# predict.py defines how predictions are run on your model +predict: "predict_t2v.py:Predictor" +# predict: "predict_i2v.py:Predictor" diff --git a/tools/replicate/predict_i2v.py b/tools/replicate/predict_i2v.py new file mode 100644 index 0000000000000000000000000000000000000000..5e457961d3b1be28dc53e331bd4b41021098a4ce --- /dev/null +++ b/tools/replicate/predict_i2v.py @@ -0,0 +1,89 @@ +# Prediction interface for Cog ⚙️ +# https://cog.run/python + +import os +import subprocess +import time +import torch +from diffusers import CogVideoXImageToVideoPipeline +from diffusers.utils import export_to_video, load_image +from cog import BasePredictor, Input, Path + + +MODEL_CACHE = "model_cache_i2v" +MODEL_URL = ( + f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar" +) +os.environ["HF_DATASETS_OFFLINE"] = "1" +os.environ["TRANSFORMERS_OFFLINE"] = "1" +os.environ["HF_HOME"] = MODEL_CACHE +os.environ["TORCH_HOME"] = MODEL_CACHE +os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE +os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE +os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE + + +def download_weights(url, dest): + start = time.time() + print("downloading url: ", url) + print("downloading to: ", dest) + subprocess.check_call(["pget", "-x", url, dest], close_fds=False) + print("downloading took: ", time.time() - start) + + +class 
Predictor(BasePredictor): + def setup(self) -> None: + """Load the model into memory to make running multiple predictions efficient""" + + if not os.path.exists(MODEL_CACHE): + download_weights(MODEL_URL, MODEL_CACHE) + + # model_id: THUDM/CogVideoX-5b-I2V + self.pipe = CogVideoXImageToVideoPipeline.from_pretrained( + MODEL_CACHE, torch_dtype=torch.bfloat16 + ).to("cuda") + + self.pipe.enable_model_cpu_offload() + self.pipe.vae.enable_tiling() + + def predict( + self, + prompt: str = Input( + description="Input prompt", default="Starry sky slowly rotating." + ), + image: Path = Input(description="Input image"), + num_inference_steps: int = Input( + description="Number of denoising steps", ge=1, le=500, default=50 + ), + guidance_scale: float = Input( + description="Scale for classifier-free guidance", ge=1, le=20, default=6 + ), + num_frames: int = Input( + description="Number of frames for the output video", default=49 + ), + seed: int = Input( + description="Random seed. Leave blank to randomize the seed", default=None + ), + ) -> Path: + """Run a single prediction on the model""" + + if seed is None: + seed = int.from_bytes(os.urandom(2), "big") + print(f"Using seed: {seed}") + + img = load_image(image=str(image)) + + video = self.pipe( + prompt=prompt, + image=img, + num_videos_per_prompt=1, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + guidance_scale=guidance_scale, + generator=torch.Generator(device="cuda").manual_seed(seed), + ).frames[0] + + out_path = "/tmp/out.mp4" + + export_to_video(video, out_path, fps=8) + return Path(out_path) diff --git a/tools/replicate/predict_t2v.py b/tools/replicate/predict_t2v.py new file mode 100644 index 0000000000000000000000000000000000000000..cadeee25641e922da5c9be32c2006caed0e2845d --- /dev/null +++ b/tools/replicate/predict_t2v.py @@ -0,0 +1,87 @@ +# Prediction interface for Cog ⚙️ +# https://cog.run/python + +import os +import subprocess +import time +import torch +from diffusers import CogVideoXPipeline +from diffusers.utils import export_to_video +from cog import BasePredictor, Input, Path + + +MODEL_CACHE = "model_cache" +MODEL_URL = ( + f"https://weights.replicate.delivery/default/THUDM/CogVideo/{MODEL_CACHE}.tar" +) +os.environ["HF_DATASETS_OFFLINE"] = "1" +os.environ["TRANSFORMERS_OFFLINE"] = "1" +os.environ["HF_HOME"] = MODEL_CACHE +os.environ["TORCH_HOME"] = MODEL_CACHE +os.environ["HF_DATASETS_CACHE"] = MODEL_CACHE +os.environ["TRANSFORMERS_CACHE"] = MODEL_CACHE +os.environ["HUGGINGFACE_HUB_CACHE"] = MODEL_CACHE + + +def download_weights(url, dest): + start = time.time() + print("downloading url: ", url) + print("downloading to: ", dest) + subprocess.check_call(["pget", "-x", url, dest], close_fds=False) + print("downloading took: ", time.time() - start) + + +class Predictor(BasePredictor): + def setup(self) -> None: + """Load the model into memory to make running multiple predictions efficient""" + + if not os.path.exists(MODEL_CACHE): + download_weights(MODEL_URL, MODEL_CACHE) + + # model_id: THUDM/CogVideoX-5b + self.pipe = CogVideoXPipeline.from_pretrained( + MODEL_CACHE, + torch_dtype=torch.bfloat16, + ).to("cuda") + + self.pipe.enable_model_cpu_offload() + self.pipe.vae.enable_tiling() + + def predict( + self, + prompt: str = Input( + description="Input prompt", + default="A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. 
Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance.", + ), + num_inference_steps: int = Input( + description="Number of denoising steps", ge=1, le=500, default=50 + ), + guidance_scale: float = Input( + description="Scale for classifier-free guidance", ge=1, le=20, default=6 + ), + num_frames: int = Input( + description="Number of frames for the output video", default=49 + ), + seed: int = Input( + description="Random seed. Leave blank to randomize the seed", default=None + ), + ) -> Path: + """Run a single prediction on the model""" + + if seed is None: + seed = int.from_bytes(os.urandom(2), "big") + print(f"Using seed: {seed}") + + video = self.pipe( + prompt=prompt, + num_videos_per_prompt=1, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + guidance_scale=guidance_scale, + generator=torch.Generator(device="cuda").manual_seed(seed), + ).frames[0] + + out_path = "/tmp/out.mp4" + + export_to_video(video, out_path, fps=8) + return Path(out_path) diff --git a/tools/venhancer/README.md b/tools/venhancer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cc6f45c632b1310908e305e398e7fcb9c36ac29e --- /dev/null +++ b/tools/venhancer/README.md @@ -0,0 +1,98 @@ +# Enhance CogVideoX Generated Videos with VEnhancer + +This tutorial will guide you through using the VEnhancer tool to enhance videos generated by CogVideoX, including +achieving higher frame rates and higher resolutions. + +## Model Introduction + +VEnhancer implements spatial super-resolution, temporal super-resolution (frame interpolation), and video refinement in +a unified framework. It can flexibly adapt to different upsampling factors (e.g., 1x~8x) for spatial or temporal +super-resolution. Additionally, it provides flexible control to modify the refinement strength, enabling it to handle +diverse video artifacts. + +VEnhancer follows the design of ControlNet, copying the architecture and weights of the multi-frame encoder and middle +block from a pre-trained video diffusion model to build a trainable conditional network. This video ControlNet accepts +low-resolution keyframes and noisy full-frame latents as inputs. In addition to the time step t and prompt, our proposed +video-aware conditioning also includes noise augmentation level σ and downscaling factor s as additional network +conditioning inputs. + +## Hardware Requirements + ++ Operating System: Linux (requires xformers dependency) ++ Hardware: NVIDIA GPU with at least 60GB of VRAM per card. Machines such as H100, A100 are recommended. + +## Quick Start + +1. Clone the repository and install dependencies as per the official instructions: + +```shell +git clone https://github.com/Vchitect/VEnhancer.git +cd VEnhancer +## Torch and other dependencies can use those from CogVideoX. If you need to create a new environment, use the following commands: +pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 + +## Install required dependencies +pip install -r requirements.txt +``` + +Where: + +- `input_path` is the path to the input video +- `prompt` is the description of the video content. The prompt used by this tool should be shorter, not exceeding 77 + words. 
You may need to simplify the prompt that was used to generate the CogVideoX video.
+- `target_fps` is the target frame rate of the output video. Typically, 16 fps is already smooth; 24 fps is the default
+  value.
+- `up_scale` is recommended to be set to 2, 3, or 4. The target resolution is limited to roughly 2K and below.
+- `noise_aug` depends on the quality of the input video. Lower quality needs a higher noise level, which corresponds to
+  stronger refinement: use 250~300 for very low-quality videos and <= 200 for good ones.
+- `steps` controls the number of sampling steps. If you want fewer steps, first change `solver_mode` to "normal", then
+  reduce the step count; the "fast" solver mode uses a fixed 15 steps.
+  The required models are downloaded automatically from Hugging Face during execution (the full command these flags
+  belong to is shown at the end of this README).
+
+Typical runtime logs are as follows:
+
+```shell
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:211: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_fwd")
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:344: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_bwd")
+2024-08-20 13:25:17,553 - video_to_video - INFO - checkpoint_path: ./ckpts/venhancer_paper.pt
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`.
We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + load_dict = torch.load(cfg.model_path, map_location='cpu') +2024-08-20 13:25:55,391 - video_to_video - INFO - Load model path ./ckpts/venhancer_paper.pt, with local status +2024-08-20 13:25:55,392 - video_to_video - INFO - Build diffusion with GaussianDiffusion +2024-08-20 13:26:16,092 - video_to_video - INFO - input video path: inputs/000000.mp4 +2024-08-20 13:26:16,093 - video_to_video - INFO - text: Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere +2024-08-20 13:26:16,156 - video_to_video - INFO - input frames length: 49 +2024-08-20 13:26:16,156 - video_to_video - INFO - input fps: 8.0 +2024-08-20 13:26:16,156 - video_to_video - INFO - target_fps: 24.0 +2024-08-20 13:26:16,311 - video_to_video - INFO - input resolution: (480, 720) +2024-08-20 13:26:16,312 - video_to_video - INFO - target resolution: (1320, 1982) +2024-08-20 13:26:16,312 - video_to_video - INFO - noise augmentation: 250 +2024-08-20 13:26:16,312 - video_to_video - INFO - scale s is set to: 8 +2024-08-20 13:26:16,399 - video_to_video - INFO - video_data shape: torch.Size([145, 3, 1320, 1982]) +/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:108: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with amp.autocast(enabled=True): +2024-08-20 13:27:19,605 - video_to_video - INFO - step: 0 +2024-08-20 13:30:12,020 - video_to_video - INFO - step: 1 +2024-08-20 13:33:04,956 - video_to_video - INFO - step: 2 +2024-08-20 13:35:58,691 - video_to_video - INFO - step: 3 +2024-08-20 13:38:51,254 - video_to_video - INFO - step: 4 +2024-08-20 13:41:44,150 - video_to_video - INFO - step: 5 +2024-08-20 13:44:37,017 - video_to_video - INFO - step: 6 +2024-08-20 13:47:30,037 - video_to_video - INFO - step: 7 +2024-08-20 13:50:22,838 - video_to_video - INFO - step: 8 +2024-08-20 13:53:15,844 - video_to_video - INFO - step: 9 +2024-08-20 13:56:08,657 - video_to_video - INFO - step: 10 +2024-08-20 13:59:01,648 - video_to_video - INFO - step: 11 +2024-08-20 14:01:54,541 - video_to_video - INFO - step: 12 +2024-08-20 14:04:47,488 - video_to_video - INFO - step: 13 +2024-08-20 14:10:13,637 - video_to_video - INFO - sampling, finished. + +``` + +Running on a single A100 GPU, enhancing each 6-second CogVideoX generated video with default settings will consume 60GB +of VRAM and take 40-50 minutes. 
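+
+For reference, the enhancement command that the flags above belong to (and that produces logs like the excerpt shown)
+is the same one given in the localized versions of this README, `README_zh.md` and `README_ja.md`; adjust the input
+path and prompt to your own video:
+
+```shell
+python enhance_a_video.py \
+--up_scale 4 --target_fps 24 --noise_aug 250 \
+--solver_mode 'fast' --steps 15 \
+--input_path inputs/000000.mp4 \
+--prompt 'Wide-angle aerial shot at dawn, soft morning light casting long shadows, an elderly man walking his dog through a quiet, foggy park, trees and benches in the background, peaceful and serene atmosphere' \
+--save_dir 'results/'
+```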
\ No newline at end of file diff --git a/tools/venhancer/README_ja.md b/tools/venhancer/README_ja.md new file mode 100644 index 0000000000000000000000000000000000000000..70f2d74d04e25139bc9972abab51b82d783c4225 --- /dev/null +++ b/tools/venhancer/README_ja.md @@ -0,0 +1,92 @@ + +# VEnhancer で CogVideoX によって生成されたビデオを強化する + +このチュートリアルでは、VEnhancer ツールを使用して、CogVideoX で生成されたビデオを強化し、より高いフレームレートと高い解像度を実現する方法を説明します。 + +## モデルの紹介 + +VEnhancer は、空間超解像、時間超解像(フレーム補間)、およびビデオのリファインメントを統一されたフレームワークで実現します。空間または時間の超解像のために、さまざまなアップサンプリング係数(例:1x〜8x)に柔軟に対応できます。さらに、多様なビデオアーティファクトを処理するために、リファインメント強度を変更する柔軟な制御を提供します。 + +VEnhancer は ControlNet の設計に従い、事前訓練されたビデオ拡散モデルのマルチフレームエンコーダーとミドルブロックのアーキテクチャとウェイトをコピーして、トレーニング可能な条件ネットワークを構築します。このビデオ ControlNet は、低解像度のキーフレームとノイズを含む完全なフレームを入力として受け取ります。さらに、タイムステップ t とプロンプトに加えて、提案されたビデオ対応条件により、ノイズ増幅レベル σ およびダウンスケーリングファクター s が追加のネットワーク条件として使用されます。 + +## ハードウェア要件 + ++ オペレーティングシステム: Linux (xformers 依存関係が必要) ++ ハードウェア: 単一カードあたり少なくとも 60GB の VRAM を持つ NVIDIA GPU。H100、A100 などのマシンを推奨します。 + +## クイックスタート + +1. 公式の指示に従ってリポジトリをクローンし、依存関係をインストールします。 + +```shell +git clone https://github.com/Vchitect/VEnhancer.git +cd VEnhancer +## Torch などの依存関係は CogVideoX の依存関係を使用できます。新しい環境を作成する必要がある場合は、以下のコマンドを使用してください。 +pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 + +## 必須の依存関係をインストールします。 +pip install -r requirements.txt +``` + +2. コードを実行します。 + +```shell +python enhance_a_video.py --up_scale 4 --target_fps 24 --noise_aug 250 --solver_mode 'fast' --steps 15 --input_path inputs/000000.mp4 --prompt 'Wide-angle aerial shot at dawn, soft morning light casting long shadows, an elderly man walking his dog through a quiet, foggy park, trees and benches in the background, peaceful and serene atmosphere' --save_dir 'results/' +``` + +次の設定を行います: + +- `input_path` 是输入视频的路径 +- `prompt` 是视频内容的描述。此工具使用的提示词应更短,不超过77个字。您可能需要简化用于生成CogVideoX视频的提示词。 +- `target_fps` 是视频的目标帧率。通常,16 fps已经很流畅,默认值为24 fps。 +- `up_scale` 推荐设置为2、3或4。目标分辨率限制在2k左右及以下。 +- `noise_aug` 的值取决于输入视频的质量。质量较低的视频需要更高的噪声级别,这对应于更强的优化。250~300适用于非常低质量的视频。对于高质量视频,设置为≤200。 +- `steps` 如果想减少步数,请先将solver_mode改为“normal”,然后减少步数。“fast”模式的步数是固定的(15步)。 + 代码在执行过程中会自动从Hugging Face下载所需的模型。 + +コードの実行中に、必要なモデルは Hugging Face から自動的にダウンロードされます。 + +```shell +/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:211: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. + @torch.library.impl_abstract("xformers_flash::flash_fwd") +/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:344: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. + @torch.library.impl_abstract("xformers_flash::flash_bwd") +2024-08-20 13:25:17,553 - video_to_video - INFO - checkpoint_path: ./ckpts/venhancer_paper.pt +/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. 
This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + checkpoint = torch.load(checkpoint_path, map_location=map_location) +2024-08-20 13:25:37,486 - video_to_video - INFO - Build encoder with FrozenOpenCLIPEmbedder +/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:35: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + load_dict = torch.load(cfg.model_path, map_location='cpu') +2024-08-20 13:25:55,391 - video_to_video - INFO - Load model path ./ckpts/venhancer_paper.pt, with local status +2024-08-20 13:25:55,392 - video_to_video - INFO - Build diffusion with GaussianDiffusion +2024-08-20 13:26:16,092 - video_to_video - INFO - input video path: inputs/000000.mp4 +2024-08-20 13:26:16,093 - video_to_video - INFO - text: Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere +2024-08-20 13:26:16,156 - video_to_video - INFO - input frames length: 49 +2024-08-20 13:26:16,156 - video_to_video - INFO - input fps: 8.0 +2024-08-20 13:26:16,156 - video_to_video - INFO - target_fps: 24.0 +2024-08-20 13:26:16,311 - video_to_video - INFO - input resolution: (480, 720) +2024-08-20 13:26:16,312 - video_to_video - INFO - target resolution: (1320, 1982) +2024-08-20 13:26:16,312 - video_to_video - INFO - noise augmentation: 250 +2024-08-20 13:26:16,312 - video_to_video - INFO - scale s is set to: 8 +2024-08-20 13:26:16,399 - video_to_video - INFO - video_data shape: torch.Size([145, 3, 1320, 1982]) +/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:108: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. 
+ with amp.autocast(enabled=True): +2024-08-20 13:27:19,605 - video_to_video - INFO - step: 0 +2024-08-20 13:30:12,020 - video_to_video - INFO - step: 1 +2024-08-20 13:33:04,956 - video_to_video - INFO - step: 2 +2024-08-20 13:35:58,691 - video_to_video - INFO - step: 3 +2024-08-20 13:38:51,254 - video_to_video - INFO - step: 4 +2024-08-20 13:41:44,150 - video_to_video - INFO - step: 5 +2024-08-20 13:44:37,017 - video_to_video - INFO - step: 6 +2024-08-20 13:47:30,037 - video_to_video - INFO - step: 7 +2024-08-20 13:50:22,838 - video_to_video - INFO - step: 8 +2024-08-20 13:53:15,844 - video_to_video - INFO - step: 9 +2024-08-20 13:56:08,657 - video_to_video - INFO - step: 10 +2024-08-20 13:59:01,648 - video_to_video - INFO - step: 11 +2024-08-20 14:01:54,541 - video_to_video - INFO - step: 12 +2024-08-20 14:04:47,488 - video_to_video - INFO - step: 13 +2024-08-20 14:10:13,637 - video_to_video - INFO - sampling, finished. + +``` + +A100 GPU を単一で使用している場合、CogVideoX によって生成された 6 秒間のビデオを強化するには、デフォルト設定で 60GB の VRAM を消費し、40〜50 分かかります。 diff --git a/tools/venhancer/README_zh.md b/tools/venhancer/README_zh.md new file mode 100644 index 0000000000000000000000000000000000000000..a481cd179347c3ca446f8f0f572e6eaa10ecbf09 --- /dev/null +++ b/tools/venhancer/README_zh.md @@ -0,0 +1,101 @@ +# 使用 VEnhancer 对 CogVdieoX 生成视频进行增强 + +本教程将要使用 VEnhancer 工具 对 CogVdieoX 生成视频进行增强, 包括更高的帧率和更高的分辨率 + +## 模型介绍 + +VEnhancer 在一个统一的框架中实现了空间超分辨率、时间超分辨率(帧插值)和视频优化。它可以灵活地适应不同的上采样因子(例如,1x~ +8x)用于空间或时间超分辨率。此外,它提供了灵活的控制,以修改优化强度,从而处理多样化的视频伪影。 + +VEnhancer 遵循 ControlNet 的设计,复制了预训练的视频扩散模型的多帧编码器和中间块的架构和权重,构建了一个可训练的条件网络。这个视频 +ControlNet 接受低分辨率关键帧和包含噪声的完整帧作为输入。此外,除了时间步 t 和提示词外,我们提出的视频感知条件还将噪声增强的噪声级别 +σ 和降尺度因子 s 作为附加的网络条件输入。 + +## 硬件需求 + ++ 操作系统: Linux (需要依赖xformers) ++ 硬件: NVIDIA GPU 并至少保证单卡显存超过60G,推荐使用 H100,A100等机器。 + +## 快速上手 + +1. 按照官方指引克隆仓库并安装依赖 + +```shell +git clone https://github.com/Vchitect/VEnhancer.git +cd VEnhancer +## torch等依赖可以使用CogVideoX的依赖,如果你需要创建一个新的环境,可以使用以下命令 +pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 + +## 安装必须的依赖 +pip install -r requirements.txt +``` + +2. 运行代码 + +```shell +python enhance_a_video.py \ +--up_scale 4 --target_fps 24 --noise_aug 250 \ +--solver_mode 'fast' --steps 15 \ +--input_path inputs/000000.mp4 \ +--prompt 'Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere' \ +--save_dir 'results/' +``` + +其中: + +- `input_path` 是输入视频的路径 +- `prompt` 是视频内容的描述。此工具使用的提示词应更短,不超过77个字。您可能需要简化用于生成CogVideoX视频的提示词。 +- `target_fps` 是视频的目标帧率。通常,16 fps已经很流畅,默认值为24 fps。 +- `up_scale` 推荐设置为2、3或4。目标分辨率限制在2k左右及以下。 +- `noise_aug` 的值取决于输入视频的质量。质量较低的视频需要更高的噪声级别,这对应于更强的优化。250~300适用于非常低质量的视频。对于高质量视频,设置为≤200。 +- `steps` 如果想减少步数,请先将solver_mode改为“normal”,然后减少步数。“fast”模式的步数是固定的(15步)。 + 代码在执行过程中会自动从Hugging Face下载所需的模型。 + +代码运行过程中,会自动从Huggingface拉取需要的模型 + +运行日志通常如下: + +```shell +/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:211: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. + @torch.library.impl_abstract("xformers_flash::flash_fwd") +/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:344: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. 
Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch. + @torch.library.impl_abstract("xformers_flash::flash_bwd") +2024-08-20 13:25:17,553 - video_to_video - INFO - checkpoint_path: ./ckpts/venhancer_paper.pt +/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. + checkpoint = torch.load(checkpoint_path, map_location=map_location) +2024-08-20 13:25:37,486 - video_to_video - INFO - Build encoder with FrozenOpenCLIPEmbedder +/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:35: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature. 
+ load_dict = torch.load(cfg.model_path, map_location='cpu') +2024-08-20 13:25:55,391 - video_to_video - INFO - Load model path ./ckpts/venhancer_paper.pt, with local status +2024-08-20 13:25:55,392 - video_to_video - INFO - Build diffusion with GaussianDiffusion +2024-08-20 13:26:16,092 - video_to_video - INFO - input video path: inputs/000000.mp4 +2024-08-20 13:26:16,093 - video_to_video - INFO - text: Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere +2024-08-20 13:26:16,156 - video_to_video - INFO - input frames length: 49 +2024-08-20 13:26:16,156 - video_to_video - INFO - input fps: 8.0 +2024-08-20 13:26:16,156 - video_to_video - INFO - target_fps: 24.0 +2024-08-20 13:26:16,311 - video_to_video - INFO - input resolution: (480, 720) +2024-08-20 13:26:16,312 - video_to_video - INFO - target resolution: (1320, 1982) +2024-08-20 13:26:16,312 - video_to_video - INFO - noise augmentation: 250 +2024-08-20 13:26:16,312 - video_to_video - INFO - scale s is set to: 8 +2024-08-20 13:26:16,399 - video_to_video - INFO - video_data shape: torch.Size([145, 3, 1320, 1982]) +/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:108: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead. + with amp.autocast(enabled=True): +2024-08-20 13:27:19,605 - video_to_video - INFO - step: 0 +2024-08-20 13:30:12,020 - video_to_video - INFO - step: 1 +2024-08-20 13:33:04,956 - video_to_video - INFO - step: 2 +2024-08-20 13:35:58,691 - video_to_video - INFO - step: 3 +2024-08-20 13:38:51,254 - video_to_video - INFO - step: 4 +2024-08-20 13:41:44,150 - video_to_video - INFO - step: 5 +2024-08-20 13:44:37,017 - video_to_video - INFO - step: 6 +2024-08-20 13:47:30,037 - video_to_video - INFO - step: 7 +2024-08-20 13:50:22,838 - video_to_video - INFO - step: 8 +2024-08-20 13:53:15,844 - video_to_video - INFO - step: 9 +2024-08-20 13:56:08,657 - video_to_video - INFO - step: 10 +2024-08-20 13:59:01,648 - video_to_video - INFO - step: 11 +2024-08-20 14:01:54,541 - video_to_video - INFO - step: 12 +2024-08-20 14:04:47,488 - video_to_video - INFO - step: 13 +2024-08-20 14:10:13,637 - video_to_video - INFO - sampling, finished. 
+ +``` + +使用A100单卡运行,对于每个CogVideoX生产的6秒视频,按照默认配置,会消耗60G显存,并用时40-50分钟。 \ No newline at end of file diff --git a/tools/visualize_trajectory.py b/tools/visualize_trajectory.py new file mode 100644 index 0000000000000000000000000000000000000000..1a718c422c774ddfc718c4c168fc4ad8278520bf --- /dev/null +++ b/tools/visualize_trajectory.py @@ -0,0 +1,152 @@ +import os +import argparse +import numpy as np +import matplotlib as mpl +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d.art3d import Poly3DCollection + +import pdb + +class CameraPoseVisualizer: + def __init__(self, xlim, ylim, zlim): + self.fig = plt.figure(figsize=(18, 7)) + self.ax = self.fig.add_subplot(projection='3d') + self.plotly_data = None # plotly data traces + self.ax.set_aspect("auto") + self.ax.set_xlim(xlim) + self.ax.set_ylim(ylim) + self.ax.set_zlim(zlim) + self.ax.set_xlabel('x') + self.ax.set_ylabel('y') + self.ax.set_zlabel('z') + + # self.ax.view_init(elev=30, azim=-90) + print('initialize camera pose visualizer') + + def extrinsic2pyramid(self, extrinsic, color_map='red', hw_ratio=9/16, base_xval=1, zval=3): + vertex_std = np.array([[0, 0, 0, 1], + [base_xval, -base_xval * hw_ratio, zval, 1], + [base_xval, base_xval * hw_ratio, zval, 1], + [-base_xval, base_xval * hw_ratio, zval, 1], + [-base_xval, -base_xval * hw_ratio, zval, 1]]) + vertex_transformed = vertex_std @ extrinsic.T + meshes = [[vertex_transformed[0, :-1], vertex_transformed[1][:-1], vertex_transformed[2, :-1]], + [vertex_transformed[0, :-1], vertex_transformed[2, :-1], vertex_transformed[3, :-1]], + [vertex_transformed[0, :-1], vertex_transformed[3, :-1], vertex_transformed[4, :-1]], + [vertex_transformed[0, :-1], vertex_transformed[4, :-1], vertex_transformed[1, :-1]], + [vertex_transformed[1, :-1], vertex_transformed[2, :-1], vertex_transformed[3, :-1], vertex_transformed[4, :-1]]] + + color = color_map if isinstance(color_map, str) else plt.cm.rainbow(color_map) + + self.ax.add_collection3d( + Poly3DCollection(meshes, facecolors=color, linewidths=0.3, edgecolors=color, alpha=0.35)) + + def colorbar(self, max_frame_length): + cmap = mpl.cm.rainbow + norm = mpl.colors.Normalize(vmin=0, vmax=max_frame_length) + self.fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=cmap), ax=self.ax, orientation='vertical', label='Frame Indexes') + + def show(self, save_path=None): + plt.title('Camera Trajectory') + plt.show() + if save_path is not None: + plt.savefig(save_path) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--pose_file_path', required=True) + parser.add_argument('--hw_ratio', default=9/16, type=float) + parser.add_argument('--base_xval', type=float, default=1.0) + parser.add_argument('--zval', type=float, default=2.0) + parser.add_argument('--use_exact_fx', action='store_true') + parser.add_argument('--relative_c2w', action='store_true') + parser.add_argument('--x_min', type=float, default=-2) + parser.add_argument('--x_max', type=float, default=2) + parser.add_argument('--y_min', type=float, default=-2) + parser.add_argument('--y_max', type=float, default=2) + parser.add_argument('--z_min', type=float, default=-2) + parser.add_argument('--z_max', type=float, default=2) + parser.add_argument('--save_path', type=str, default='./assets/cam_trajectory/') + return parser.parse_args() + + +# def get_c2w(w2cs, transform_matrix, relative_c2w): +# if relative_c2w: +# target_cam_c2w = np.array([ +# [1, 0, 0, 0], +# [0, 1, 0, 0], +# [0, 0, 1, 0], +# [0, 0, 0, 1] +# ]) +# pdb.set_trace() +# abs2rel = target_cam_c2w @ 
w2cs[0] +# ret_poses = [target_cam_c2w, ] + [abs2rel @ np.linalg.inv(w2c) for w2c in w2cs[1:]] +# pdb.set_trace() +# camera_positions = np.asarray([c2w[:3, 3] for c2w in ret_poses]) # [n_frame, 3] +# position_distances = [camera_positions[i] - camera_positions[i - 1] for i in range(1, len(camera_positions))] +# xyz_max = np.max(camera_positions, axis=0) +# xyz_min = np.min(camera_positions, axis=0) +# xyz_ranges = xyz_max - xyz_min # [3, ] +# max_range = np.max(xyz_ranges) +# expected_xyz_ranges = 1 +# pdb.set_trace() +# scale_ratio = expected_xyz_ranges / max_range +# scaled_position_distances = [dis * scale_ratio for dis in position_distances] # [n_frame - 1] +# scaled_camera_positions = [camera_positions[0], ] +# scaled_camera_positions.extend([camera_positions[0] + np.sum(np.asarray(scaled_position_distances[:i]), axis=0) +# for i in range(1, len(camera_positions))]) +# pdb.set_trace() +# ret_poses = [np.concatenate((np.concatenate((ori_pose[:3, :3], cam_position[:, None]), axis=1), np.asarray([0, 0, 0, 1])[None]), axis=0) +# for ori_pose, cam_position in zip(ret_poses, scaled_camera_positions)] +# pdb.set_trace() +# else: +# ret_poses = [np.linalg.inv(w2c) for w2c in w2cs] +# ret_poses = [transform_matrix @ x for x in ret_poses] +# return np.array(ret_poses, dtype=np.float32) + + +def get_c2w(w2cs, transform_matrix, relative_c2w): + if relative_c2w: + target_cam_c2w = np.array([ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1] + ]) + abs2rel = target_cam_c2w @ w2cs[0] + ret_poses = [target_cam_c2w, ] + [abs2rel @ np.linalg.inv(w2c) for w2c in w2cs[1:]] + # camera_positions = np.asarray([c2w[:3, 3] for c2w in ret_poses]) # [n_frame, 3] + # ret_poses = [np.concatenate((np.concatenate((ori_pose[:3, :3], cam_position[:, None]), axis=1), np.asarray([0, 0, 0, 1])[None]), axis=0) + # for ori_pose, cam_position in zip(ret_poses, camera_positions)] + else: + ret_poses = [np.linalg.inv(w2c) for w2c in w2cs] + ret_poses = [transform_matrix @ x for x in ret_poses] + return np.array(ret_poses, dtype=np.float32) + + +if __name__ == '__main__': + args = get_args() + with open(args.pose_file_path, 'r') as f: + poses = f.readlines() + w2cs = [np.asarray([float(p) for p in pose.strip().split(' ')[6:]]).reshape(3, 4) for pose in poses] + fxs = [float(pose.strip().split(' ')[1]) for pose in poses[1:]] + num_frames = len(w2cs) + transform_matrix = np.asarray([[1, 0, 0, 0], [0, 0, 1, 0], [0, -1, 0, 0], [0, 0, 0, 1]]).reshape(4, 4) # For World Coordinates! + # transform_matrix = np.asarray([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]).reshape(4, 4) # For Camera Coordinates! + last_row = np.zeros((1, 4)) + last_row[0, -1] = 1.0 + w2cs = [np.concatenate((w2c, last_row), axis=0) for w2c in w2cs] + c2ws = get_c2w(w2cs, transform_matrix, args.relative_c2w) + + visualizer = CameraPoseVisualizer([args.x_min, args.x_max], [args.y_min, args.y_max], [args.z_min, args.z_max]) + for frame_idx, c2w in enumerate(c2ws): + visualizer.extrinsic2pyramid(c2w, frame_idx / num_frames, hw_ratio=args.hw_ratio, base_xval=args.base_xval, + zval=(fxs[frame_idx] if args.use_exact_fx else args.zval)) + + visualizer.colorbar(num_frames) + pose_file_name = args.pose_file_path.split('/')[-1].split('.')[0] + + os.makedirs(args.save_path, exist_ok=True) + save_path = os.path.join(args.save_path, pose_file_name+'.png') + visualizer.show(save_path) \ No newline at end of file
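
A minimal usage sketch for the trajectory visualizer above. The pose file path is illustrative; per the parsing code, each line of the pose file is expected to carry the focal length fx at field 1 and the twelve entries of a flattened 3x4 world-to-camera matrix from field 6 onward, and the rendered frusta are saved as `<pose_file_name>.png` under `--save_path`:

```shell
# Plot a camera trajectory as a chain of color-coded view frusta (one per frame),
# with poses expressed relative to the first camera (--relative_c2w).
# The pose file path below is an example; point it at your own trajectory file.
python tools/visualize_trajectory.py \
  --pose_file_path assets/cam_trajectory/Pan_Left_90.txt \
  --relative_c2w \
  --save_path ./assets/cam_trajectory/
```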