EthanZyh committed on Jan 20

Commit

01a383f

1 Parent(s): 3edb341

first commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +39 -0
.flake8 +25 -0
.github/workflows/lint.yml +35 -0
.gitignore +243 -0
.pre-commit-config.yaml +53 -0
ATTRIBUTIONS.md +1437 -0
CONTRIBUTING.md +59 -0
Dockerfile +43 -0
INSTALL.md +20 -0
LICENSE +201 -0
README.md +78 -0
RELEASE.md +7 -0
assets/cosmos-logo.png +0 -0
checkpoints/README.md +3 -0
cosmos1/models/POST_TRAINING.md +23 -0
cosmos1/models/autoregressive/README.md +427 -0
cosmos1/models/autoregressive/__init__.py +14 -0
cosmos1/models/autoregressive/assets/nemo/finetuned_result.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl +10 -0
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl +10 -0
cosmos1/models/autoregressive/assets/v1p0/input.jpg +0 -0
cosmos1/models/autoregressive/assets/v1p0/input.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_12b.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_13b.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_12b.mp4 +0 -0
cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_13b.mp4 +0 -0
cosmos1/models/autoregressive/configs/__init__.py +14 -0
cosmos1/models/autoregressive/configs/base/__init__.py +14 -0
cosmos1/models/autoregressive/configs/base/model.py +118 -0
cosmos1/models/autoregressive/configs/base/model_config.py +421 -0
cosmos1/models/autoregressive/configs/base/tokenizer.py +137 -0
cosmos1/models/autoregressive/configs/inference/inference_config.py +102 -0
cosmos1/models/autoregressive/diffusion_decoder/__init__.py +14 -0
cosmos1/models/autoregressive/diffusion_decoder/config/base/conditioner.py +61 -0
cosmos1/models/autoregressive/diffusion_decoder/config/config_latent_diffusion_decoder.py +61 -0
cosmos1/models/autoregressive/diffusion_decoder/config/inference/cosmos_diffusiondecoder_7b.py +85 -0
cosmos1/models/autoregressive/diffusion_decoder/config/registry.py +118 -0
cosmos1/models/autoregressive/diffusion_decoder/inference.py +120 -0
cosmos1/models/autoregressive/diffusion_decoder/model.py +231 -0
cosmos1/models/autoregressive/diffusion_decoder/network.py +163 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,39 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Please keep below sorted alphabetically
+__pycache__
+.cache
+.coverage
+.coverage.*
+.DS_Store
+.env
+.git
+.gitignore
+.pytest_cache
+.Python
+.tox
+.venv
+*.cover
+*.log
+*.pyc
+*.pyd
+*.pyo
+coverage.xml
+env
+nosetests.xml
+pip-delete-this-directory.txt
+pip-log.txt

.flake8 ADDED Viewed

	@@ -0,0 +1,25 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+[flake8]
+enable-extensions = G
+select = B,C,E,F,G,P,SIM1,T4,W,B9
+max-line-length = 120
+# C408 ignored because we like the dict keyword argument syntax
+# E501 is not flexible enough, we're using B950 instead
+ignore =
+    E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,E226,E265
+exclude =
+    third_party

.github/workflows/lint.yml ADDED Viewed

	@@ -0,0 +1,35 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# lint.yml : A workflow to trigger lint tests on GitHub
+name: 'Lint'
+on:
+  pull_request:
+  workflow_dispatch:
+jobs:
+  lint:
+    name: 'Linting'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@v4
+      - name: 'Setup Python'
+        uses: actions/setup-python@v5
+        with:
+          python-version: 'pypy3.10'
+      - name: 'Lint'
+        run: |
+          sudo apt-get update
+          bash ./cosmos1/scripts/format.sh

.gitignore ADDED Viewed

	@@ -0,0 +1,243 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Misc
+outputs/
+checkpoints/*
+!checkpoints/README.md
+# Data types
+*.jit
+*.pt
+*.hdr
+*.webp
+*.pgm
+*.tiff
+*.tif
+*.tar
+*.tar.gz
+*.gz
+*.pkl
+*.pt
+*.bin
+# Other uncheckable file types
+*.zip
+*.exe
+*.dll
+*.swp
+*.vscode
+*.ipynb
+*.DS_Store
+*.pyc
+*Thumbs.db
+*.patch
+# Credential information that should never be checked in
+credentials
+*.secret
+# ------------------------ BELOW IS AUTO-GENERATED FOR PYTHON REPOS ------------------------
+# Byte-compiled / optimized / DLL files
+**/__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+results/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.config
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Third party
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# ruff
+.ruff_cache
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+CLIP
+.devcontainer/devcontainer.json
+# Coverage
+.coverage
+coverage.xml
+# JUnit Reports
+report.xml
+# CI-CD
+temp/
+envs.txt
+manifest.json
+# locks and t5 temp files
+*.locks*
+*.no_exist*
+*models--t5*
+# OneLogger
+wandb/
+onelogger.err
+onelogger.log

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,53 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+default_language_version:
+  python: python3.10
+repos:
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+    - id: flake8
+      args: [--max-line-length=120]
+      exclude: ^dist/|^third_party/
+  - repo: https://github.com/psf/black
+    rev: 23.12.1
+    hooks:
+      - id: black
+        args: [--line-length=120]
+        exclude: ^dist/|^third_party/
+  - repo: https://github.com/timothycrosley/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: [--line-length=120]
+  - repo: https://github.com/MarcoGorelli/absolufy-imports
+    rev: v0.3.1
+    hooks:
+    -   id: absolufy-imports
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.0.1
+    hooks:
+      - id: trailing-whitespace
+        exclude: ^tests/.*/fixtures/.*
+        args: [--markdown-linebreak-ext=md]
+      - id: end-of-file-fixer
+        exclude: ^tests/.*/fixtures/.*
+      - id: check-added-large-files
+        args: ['--maxkb=2000']

ATTRIBUTIONS.md ADDED Viewed

	@@ -0,0 +1,1437 @@

+# Open Source License Attribution
+   Cosmos uses open-source components. The details of these open-source projects, along with their license information, are listed below in alphabetical order.
+   We are grateful to the developers of these projects for their contributions to open source and acknowledge them below.
+## Better-Profanity - [MIT License](https://github.com/snguyenthanh/better_profanity/blob/master/LICENSE)
+   ```
+   Copyright (c) 2018 The Python Packaging Authority
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   ```
+## FFmpeg - [FFMPEG License](https://github.com/FFmpeg/FFmpeg/blob/master/LICENSE.md)
+   ```
+   # License
+   Most files in FFmpeg are under the GNU Lesser General Public License version 2.1
+   or later (LGPL v2.1+). Read the file `COPYING.LGPLv2.1` for details. Some other
+   files have MIT/X11/BSD-style licenses. In combination the LGPL v2.1+ applies to
+   FFmpeg.
+   Some optional parts of FFmpeg are licensed under the GNU General Public License
+   version 2 or later (GPL v2+). See the file `COPYING.GPLv2` for details. None of
+   these parts are used by default, you have to explicitly pass `--enable-gpl` to
+   configure to activate them. In this case, FFmpeg's license changes to GPL v2+.
+   Specifically, the GPL parts of FFmpeg are:
+   - libpostproc
+   - optional x86 optimization in the files
+       - `libavcodec/x86/flac_dsp_gpl.asm`
+       - `libavcodec/x86/idct_mmx.c`
+       - `libavfilter/x86/vf_removegrain.asm`
+   - the following building and testing tools
+       - `compat/solaris/make_sunver.pl`
+       - `doc/t2h.pm`
+       - `doc/texi2pod.pl`
+       - `libswresample/tests/swresample.c`
+       - `tests/checkasm/*`
+       - `tests/tiny_ssim.c`
+   - the following filters in libavfilter:
+       - `signature_lookup.c`
+       - `vf_blackframe.c`
+       - `vf_boxblur.c`
+       - `vf_colormatrix.c`
+       - `vf_cover_rect.c`
+       - `vf_cropdetect.c`
+       - `vf_delogo.c`
+       - `vf_eq.c`
+       - `vf_find_rect.c`
+       - `vf_fspp.c`
+       - `vf_histeq.c`
+       - `vf_hqdn3d.c`
+       - `vf_kerndeint.c`
+       - `vf_lensfun.c` (GPL version 3 or later)
+       - `vf_mcdeint.c`
+       - `vf_mpdecimate.c`
+       - `vf_nnedi.c`
+       - `vf_owdenoise.c`
+       - `vf_perspective.c`
+       - `vf_phase.c`
+       - `vf_pp.c`
+       - `vf_pp7.c`
+       - `vf_pullup.c`
+       - `vf_repeatfields.c`
+       - `vf_sab.c`
+       - `vf_signature.c`
+       - `vf_smartblur.c`
+       - `vf_spp.c`
+       - `vf_stereo3d.c`
+       - `vf_super2xsai.c`
+       - `vf_tinterlace.c`
+       - `vf_uspp.c`
+       - `vf_vaguedenoiser.c`
+       - `vsrc_mptestsrc.c`
+   Should you, for whatever reason, prefer to use version 3 of the (L)GPL, then
+   the configure parameter `--enable-version3` will activate this licensing option
+   for you. Read the file `COPYING.LGPLv3` or, if you have enabled GPL parts,
+   `COPYING.GPLv3` to learn the exact legal terms that apply in this case.
+   There are a handful of files under other licensing terms, namely:
+   * The files `libavcodec/jfdctfst.c`, `libavcodec/jfdctint_template.c` and
+     `libavcodec/jrevdct.c` are taken from libjpeg, see the top of the files for
+     licensing details. Specifically note that you must credit the IJG in the
+     documentation accompanying your program if you only distribute executables.
+     You must also indicate any changes including additions and deletions to
+     those three files in the documentation.
+   * `tests/reference.pnm` is under the expat license.
+   ## External libraries
+   FFmpeg can be combined with a number of external libraries, which sometimes
+   affect the licensing of binaries resulting from the combination.
+   ### Compatible libraries
+   The following libraries are under GPL version 2:
+   - avisynth
+   - frei0r
+   - libcdio
+   - libdavs2
+   - librubberband
+   - libvidstab
+   - libx264
+   - libx265
+   - libxavs
+   - libxavs2
+   - libxvid
+   When combining them with FFmpeg, FFmpeg needs to be licensed as GPL as well by
+   passing `--enable-gpl` to configure.
+   The following libraries are under LGPL version 3:
+   - gmp
+   - libaribb24
+   - liblensfun
+   When combining them with FFmpeg, use the configure option `--enable-version3` to
+   upgrade FFmpeg to the LGPL v3.
+   The VMAF, mbedTLS, RK MPI, OpenCORE and VisualOn libraries are under the Apache License
+   2.0. That license is incompatible with the LGPL v2.1 and the GPL v2, but not with
+   version 3 of those licenses. So to combine these libraries with FFmpeg, the
+   license version needs to be upgraded by passing `--enable-version3` to configure.
+   The smbclient library is under the GPL v3, to combine it with FFmpeg,
+   the options `--enable-gpl` and `--enable-version3` have to be passed to
+   configure to upgrade FFmpeg to the GPL v3.
+   ### Incompatible libraries
+   There are certain libraries you can combine with FFmpeg whose licenses are not
+   compatible with the GPL and/or the LGPL. If you wish to enable these
+   libraries, even in circumstances that their license may be incompatible, pass
+   `--enable-nonfree` to configure. This will cause the resulting binary to be
+   unredistributable.
+   The Fraunhofer FDK AAC and OpenSSL libraries are under licenses which are
+   incompatible with the GPLv2 and v3. To the best of our knowledge, they are
+   compatible with the LGPL.
+   ```
+## Hydra-core - [MIT License](https://github.com/facebookresearch/hydra/blob/main/LICENSE)
+   ```
+   MIT License
+   Copyright (c) Facebook, Inc. and its affiliates.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   ```
+## ImageIo - [BSD 2-Clause "Simplified" License](https://github.com/imageio/imageio/blob/master/LICENSE)
+   ```
+   Copyright (c) 2014-2022, imageio developers
+   All rights reserved.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+   * Redistributions of source code must retain the above copyright notice, this
+     list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above copyright notice,
+     this list of conditions and the following disclaimer in the documentation
+     and/or other materials provided with the distribution.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+   FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+   ```
+## Iopath - [MIT License](https://github.com/facebookresearch/iopath/blob/main/LICENSE)
+   ```
+   MIT License
+   Copyright (c) Facebook, Inc. and its affiliates.
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   ```
+## Loguru - [MIT License](https://github.com/Delgan/loguru/blob/master/LICENSE)
+   ```
+   MIT License
+   Copyright (c) 2017
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   ```
+## Mediapy - [Apache License 2.0](https://github.com/google/mediapy/blob/main/LICENSE)
+   ```
+                                    Apache License
+                              Version 2.0, January 2004
+                           http://www.apache.org/licenses/
+      TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+      1. Definitions.
+         "License" shall mean the terms and conditions for use, reproduction,
+         and distribution as defined by Sections 1 through 9 of this document.
+         "Licensor" shall mean the copyright owner or entity authorized by
+         the copyright owner that is granting the License.
+         "Legal Entity" shall mean the union of the acting entity and all
+         other entities that control, are controlled by, or are under common
+         control with that entity. For the purposes of this definition,
+         "control" means (i) the power, direct or indirect, to cause the
+         direction or management of such entity, whether by contract or
+         otherwise, or (ii) ownership of fifty percent (50%) or more of the
+         outstanding shares, or (iii) beneficial ownership of such entity.
+         "You" (or "Your") shall mean an individual or Legal Entity
+         exercising permissions granted by this License.
+         "Source" form shall mean the preferred form for making modifications,
+         including but not limited to software source code, documentation
+         source, and configuration files.
+         "Object" form shall mean any form resulting from mechanical
+         transformation or translation of a Source form, including but
+         not limited to compiled object code, generated documentation,
+         and conversions to other media types.
+         "Work" shall mean the work of authorship, whether in Source or
+         Object form, made available under the License, as indicated by a
+         copyright notice that is included in or attached to the work
+         (an example is provided in the Appendix below).
+         "Derivative Works" shall mean any work, whether in Source or Object
+         form, that is based on (or derived from) the Work and for which the
+         editorial revisions, annotations, elaborations, or other modifications
+         represent, as a whole, an original work of authorship. For the purposes
+         of this License, Derivative Works shall not include works that remain
+         separable from, or merely link (or bind by name) to the interfaces of,
+         the Work and Derivative Works thereof.
+         "Contribution" shall mean any work of authorship, including
+         the original version of the Work and any modifications or additions
+         to that Work or Derivative Works thereof, that is intentionally
+         submitted to Licensor for inclusion in the Work by the copyright owner
+         or by an individual or Legal Entity authorized to submit on behalf of
+         the copyright owner. For the purposes of this definition, "submitted"
+         means any form of electronic, verbal, or written communication sent
+         to the Licensor or its representatives, including but not limited to
+         communication on electronic mailing lists, source code control systems,
+         and issue tracking systems that are managed by, or on behalf of, the
+         Licensor for the purpose of discussing and improving the Work, but
+         excluding communication that is conspicuously marked or otherwise
+         designated in writing by the copyright owner as "Not a Contribution."
+         "Contributor" shall mean Licensor and any individual or Legal Entity
+         on behalf of whom a Contribution has been received by Licensor and
+         subsequently incorporated within the Work.
+      2. Grant of Copyright License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         copyright license to reproduce, prepare Derivative Works of,
+         publicly display, publicly perform, sublicense, and distribute the
+         Work and such Derivative Works in Source or Object form.
+      3. Grant of Patent License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         (except as stated in this section) patent license to make, have made,
+         use, offer to sell, sell, import, and otherwise transfer the Work,
+         where such license applies only to those patent claims licensable
+         by such Contributor that are necessarily infringed by their
+         Contribution(s) alone or by combination of their Contribution(s)
+         with the Work to which such Contribution(s) was submitted. If You
+         institute patent litigation against any entity (including a
+         cross-claim or counterclaim in a lawsuit) alleging that the Work
+         or a Contribution incorporated within the Work constitutes direct
+         or contributory patent infringement, then any patent licenses
+         granted to You under this License for that Work shall terminate
+         as of the date such litigation is filed.
+      4. Redistribution. You may reproduce and distribute copies of the
+         Work or Derivative Works thereof in any medium, with or without
+         modifications, and in Source or Object form, provided that You
+         meet the following conditions:
+         (a) You must give any other recipients of the Work or
+             Derivative Works a copy of this License; and
+         (b) You must cause any modified files to carry prominent notices
+             stating that You changed the files; and
+         (c) You must retain, in the Source form of any Derivative Works
+             that You distribute, all copyright, patent, trademark, and
+             attribution notices from the Source form of the Work,
+             excluding those notices that do not pertain to any part of
+             the Derivative Works; and
+         (d) If the Work includes a "NOTICE" text file as part of its
+             distribution, then any Derivative Works that You distribute must
+             include a readable copy of the attribution notices contained
+             within such NOTICE file, excluding those notices that do not
+             pertain to any part of the Derivative Works, in at least one
+             of the following places: within a NOTICE text file distributed
+             as part of the Derivative Works; within the Source form or
+             documentation, if provided along with the Derivative Works; or,
+             within a display generated by the Derivative Works, if and
+             wherever such third-party notices normally appear. The contents
+             of the NOTICE file are for informational purposes only and
+             do not modify the License. You may add Your own attribution
+             notices within Derivative Works that You distribute, alongside
+             or as an addendum to the NOTICE text from the Work, provided
+             that such additional attribution notices cannot be construed
+             as modifying the License.
+         You may add Your own copyright statement to Your modifications and
+         may provide additional or different license terms and conditions
+         for use, reproduction, or distribution of Your modifications, or
+         for any such Derivative Works as a whole, provided Your use,
+         reproduction, and distribution of the Work otherwise complies with
+         the conditions stated in this License.
+      5. Submission of Contributions. Unless You explicitly state otherwise,
+         any Contribution intentionally submitted for inclusion in the Work
+         by You to the Licensor shall be under the terms and conditions of
+         this License, without any additional terms or conditions.
+         Notwithstanding the above, nothing herein shall supersede or modify
+         the terms of any separate license agreement you may have executed
+         with Licensor regarding such Contributions.
+      6. Trademarks. This License does not grant permission to use the trade
+         names, trademarks, service marks, or product names of the Licensor,
+         except as required for reasonable and customary use in describing the
+         origin of the Work and reproducing the content of the NOTICE file.
+      7. Disclaimer of Warranty. Unless required by applicable law or
+         agreed to in writing, Licensor provides the Work (and each
+         Contributor provides its Contributions) on an "AS IS" BASIS,
+         WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+         implied, including, without limitation, any warranties or conditions
+         of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+         PARTICULAR PURPOSE. You are solely responsible for determining the
+         appropriateness of using or redistributing the Work and assume any
+         risks associated with Your exercise of permissions under this License.
+      8. Limitation of Liability. In no event and under no legal theory,
+         whether in tort (including negligence), contract, or otherwise,
+         unless required by applicable law (such as deliberate and grossly
+         negligent acts) or agreed to in writing, shall any Contributor be
+         liable to You for damages, including any direct, indirect, special,
+         incidental, or consequential damages of any character arising as a
+         result of this License or out of the use or inability to use the
+         Work (including but not limited to damages for loss of goodwill,
+         work stoppage, computer failure or malfunction, or any and all
+         other commercial damages or losses), even if such Contributor
+         has been advised of the possibility of such damages.
+      9. Accepting Warranty or Additional Liability. While redistributing
+         the Work or Derivative Works thereof, You may choose to offer,
+         and charge a fee for, acceptance of support, warranty, indemnity,
+         or other liability obligations and/or rights consistent with this
+         License. However, in accepting such obligations, You may act only
+         on Your own behalf and on Your sole responsibility, not on behalf
+         of any other Contributor, and only if You agree to indemnify,
+         defend, and hold each Contributor harmless for any liability
+         incurred by, or claims asserted against, such Contributor by reason
+         of your accepting any such warranty or additional liability.
+      END OF TERMS AND CONDITIONS
+      APPENDIX: How to apply the Apache License to your work.
+         To apply the Apache License to your work, attach the following
+         boilerplate notice, with the fields enclosed by brackets "[]"
+         replaced with your own identifying information. (Don't include
+         the brackets!)  The text should be enclosed in the appropriate
+         comment syntax for the file format. We also recommend that a
+         file or class name and description of purpose be included on the
+         same "printed page" as the copyright notice for easier
+         identification within third-party archives.
+      Copyright [yyyy] [name of copyright owner]
+      Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License.
+      You may obtain a copy of the License at
+          http://www.apache.org/licenses/LICENSE-2.0
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+      See the License for the specific language governing permissions and
+      limitations under the License.
+   ```
+## NLTK - [Apache License 2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
+   ```
+                                    Apache License
+                              Version 2.0, January 2004
+                           http://www.apache.org/licenses/
+      TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+      1. Definitions.
+         "License" shall mean the terms and conditions for use, reproduction,
+         and distribution as defined by Sections 1 through 9 of this document.
+         "Licensor" shall mean the copyright owner or entity authorized by
+         the copyright owner that is granting the License.
+         "Legal Entity" shall mean the union of the acting entity and all
+         other entities that control, are controlled by, or are under common
+         control with that entity. For the purposes of this definition,
+         "control" means (i) the power, direct or indirect, to cause the
+         direction or management of such entity, whether by contract or
+         otherwise, or (ii) ownership of fifty percent (50%) or more of the
+         outstanding shares, or (iii) beneficial ownership of such entity.
+         "You" (or "Your") shall mean an individual or Legal Entity
+         exercising permissions granted by this License.
+         "Source" form shall mean the preferred form for making modifications,
+         including but not limited to software source code, documentation
+         source, and configuration files.
+         "Object" form shall mean any form resulting from mechanical
+         transformation or translation of a Source form, including but
+         not limited to compiled object code, generated documentation,
+         and conversions to other media types.
+         "Work" shall mean the work of authorship, whether in Source or
+         Object form, made available under the License, as indicated by a
+         copyright notice that is included in or attached to the work
+         (an example is provided in the Appendix below).
+         "Derivative Works" shall mean any work, whether in Source or Object
+         form, that is based on (or derived from) the Work and for which the
+         editorial revisions, annotations, elaborations, or other modifications
+         represent, as a whole, an original work of authorship. For the purposes
+         of this License, Derivative Works shall not include works that remain
+         separable from, or merely link (or bind by name) to the interfaces of,
+         the Work and Derivative Works thereof.
+         "Contribution" shall mean any work of authorship, including
+         the original version of the Work and any modifications or additions
+         to that Work or Derivative Works thereof, that is intentionally
+         submitted to Licensor for inclusion in the Work by the copyright owner
+         or by an individual or Legal Entity authorized to submit on behalf of
+         the copyright owner. For the purposes of this definition, "submitted"
+         means any form of electronic, verbal, or written communication sent
+         to the Licensor or its representatives, including but not limited to
+         communication on electronic mailing lists, source code control systems,
+         and issue tracking systems that are managed by, or on behalf of, the
+         Licensor for the purpose of discussing and improving the Work, but
+         excluding communication that is conspicuously marked or otherwise
+         designated in writing by the copyright owner as "Not a Contribution."
+         "Contributor" shall mean Licensor and any individual or Legal Entity
+         on behalf of whom a Contribution has been received by Licensor and
+         subsequently incorporated within the Work.
+      2. Grant of Copyright License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         copyright license to reproduce, prepare Derivative Works of,
+         publicly display, publicly perform, sublicense, and distribute the
+         Work and such Derivative Works in Source or Object form.
+      3. Grant of Patent License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         (except as stated in this section) patent license to make, have made,
+         use, offer to sell, sell, import, and otherwise transfer the Work,
+         where such license applies only to those patent claims licensable
+         by such Contributor that are necessarily infringed by their
+         Contribution(s) alone or by combination of their Contribution(s)
+         with the Work to which such Contribution(s) was submitted. If You
+         institute patent litigation against any entity (including a
+         cross-claim or counterclaim in a lawsuit) alleging that the Work
+         or a Contribution incorporated within the Work constitutes direct
+         or contributory patent infringement, then any patent licenses
+         granted to You under this License for that Work shall terminate
+         as of the date such litigation is filed.
+      4. Redistribution. You may reproduce and distribute copies of the
+         Work or Derivative Works thereof in any medium, with or without
+         modifications, and in Source or Object form, provided that You
+         meet the following conditions:
+         (a) You must give any other recipients of the Work or
+             Derivative Works a copy of this License; and
+         (b) You must cause any modified files to carry prominent notices
+             stating that You changed the files; and
+         (c) You must retain, in the Source form of any Derivative Works
+             that You distribute, all copyright, patent, trademark, and
+             attribution notices from the Source form of the Work,
+             excluding those notices that do not pertain to any part of
+             the Derivative Works; and
+         (d) If the Work includes a "NOTICE" text file as part of its
+             distribution, then any Derivative Works that You distribute must
+             include a readable copy of the attribution notices contained
+             within such NOTICE file, excluding those notices that do not
+             pertain to any part of the Derivative Works, in at least one
+             of the following places: within a NOTICE text file distributed
+             as part of the Derivative Works; within the Source form or
+             documentation, if provided along with the Derivative Works; or,
+             within a display generated by the Derivative Works, if and
+             wherever such third-party notices normally appear. The contents
+             of the NOTICE file are for informational purposes only and
+             do not modify the License. You may add Your own attribution
+             notices within Derivative Works that You distribute, alongside
+             or as an addendum to the NOTICE text from the Work, provided
+             that such additional attribution notices cannot be construed
+             as modifying the License.
+         You may add Your own copyright statement to Your modifications and
+         may provide additional or different license terms and conditions
+         for use, reproduction, or distribution of Your modifications, or
+         for any such Derivative Works as a whole, provided Your use,
+         reproduction, and distribution of the Work otherwise complies with
+         the conditions stated in this License.
+      5. Submission of Contributions. Unless You explicitly state otherwise,
+         any Contribution intentionally submitted for inclusion in the Work
+         by You to the Licensor shall be under the terms and conditions of
+         this License, without any additional terms or conditions.
+         Notwithstanding the above, nothing herein shall supersede or modify
+         the terms of any separate license agreement you may have executed
+         with Licensor regarding such Contributions.
+      6. Trademarks. This License does not grant permission to use the trade
+         names, trademarks, service marks, or product names of the Licensor,
+         except as required for reasonable and customary use in describing the
+         origin of the Work and reproducing the content of the NOTICE file.
+      7. Disclaimer of Warranty. Unless required by applicable law or
+         agreed to in writing, Licensor provides the Work (and each
+         Contributor provides its Contributions) on an "AS IS" BASIS,
+         WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+         implied, including, without limitation, any warranties or conditions
+         of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+         PARTICULAR PURPOSE. You are solely responsible for determining the
+         appropriateness of using or redistributing the Work and assume any
+         risks associated with Your exercise of permissions under this License.
+      8. Limitation of Liability. In no event and under no legal theory,
+         whether in tort (including negligence), contract, or otherwise,
+         unless required by applicable law (such as deliberate and grossly
+         negligent acts) or agreed to in writing, shall any Contributor be
+         liable to You for damages, including any direct, indirect, special,
+         incidental, or consequential damages of any character arising as a
+         result of this License or out of the use or inability to use the
+         Work (including but not limited to damages for loss of goodwill,
+         work stoppage, computer failure or malfunction, or any and all
+         other commercial damages or losses), even if such Contributor
+         has been advised of the possibility of such damages.
+      9. Accepting Warranty or Additional Liability. While redistributing
+         the Work or Derivative Works thereof, You may choose to offer,
+         and charge a fee for, acceptance of support, warranty, indemnity,
+         or other liability obligations and/or rights consistent with this
+         License. However, in accepting such obligations, You may act only
+         on Your own behalf and on Your sole responsibility, not on behalf
+         of any other Contributor, and only if You agree to indemnify,
+         defend, and hold each Contributor harmless for any liability
+         incurred by, or claims asserted against, such Contributor by reason
+         of your accepting any such warranty or additional liability.
+      END OF TERMS AND CONDITIONS
+      APPENDIX: How to apply the Apache License to your work.
+         To apply the Apache License to your work, attach the following
+         boilerplate notice, with the fields enclosed by brackets "[]"
+         replaced with your own identifying information. (Don't include
+         the brackets!)  The text should be enclosed in the appropriate
+         comment syntax for the file format. We also recommend that a
+         file or class name and description of purpose be included on the
+         same "printed page" as the copyright notice for easier
+         identification within third-party archives.
+      Copyright [yyyy] [name of copyright owner]
+      Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License.
+      You may obtain a copy of the License at
+          http://www.apache.org/licenses/LICENSE-2.0
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+      See the License for the specific language governing permissions and
+      limitations under the License.
+   ```
+## PEFT - [Apache License 2.0](https://github.com/huggingface/peft/blob/main/LICENSE)
+   ```
+                                    Apache License
+                              Version 2.0, January 2004
+                           http://www.apache.org/licenses/
+      TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+      1. Definitions.
+         "License" shall mean the terms and conditions for use, reproduction,
+         and distribution as defined by Sections 1 through 9 of this document.
+         "Licensor" shall mean the copyright owner or entity authorized by
+         the copyright owner that is granting the License.
+         "Legal Entity" shall mean the union of the acting entity and all
+         other entities that control, are controlled by, or are under common
+         control with that entity. For the purposes of this definition,
+         "control" means (i) the power, direct or indirect, to cause the
+         direction or management of such entity, whether by contract or
+         otherwise, or (ii) ownership of fifty percent (50%) or more of the
+         outstanding shares, or (iii) beneficial ownership of such entity.
+         "You" (or "Your") shall mean an individual or Legal Entity
+         exercising permissions granted by this License.
+         "Source" form shall mean the preferred form for making modifications,
+         including but not limited to software source code, documentation
+         source, and configuration files.
+         "Object" form shall mean any form resulting from mechanical
+         transformation or translation of a Source form, including but
+         not limited to compiled object code, generated documentation,
+         and conversions to other media types.
+         "Work" shall mean the work of authorship, whether in Source or
+         Object form, made available under the License, as indicated by a
+         copyright notice that is included in or attached to the work
+         (an example is provided in the Appendix below).
+         "Derivative Works" shall mean any work, whether in Source or Object
+         form, that is based on (or derived from) the Work and for which the
+         editorial revisions, annotations, elaborations, or other modifications
+         represent, as a whole, an original work of authorship. For the purposes
+         of this License, Derivative Works shall not include works that remain
+         separable from, or merely link (or bind by name) to the interfaces of,
+         the Work and Derivative Works thereof.
+         "Contribution" shall mean any work of authorship, including
+         the original version of the Work and any modifications or additions
+         to that Work or Derivative Works thereof, that is intentionally
+         submitted to Licensor for inclusion in the Work by the copyright owner
+         or by an individual or Legal Entity authorized to submit on behalf of
+         the copyright owner. For the purposes of this definition, "submitted"
+         means any form of electronic, verbal, or written communication sent
+         to the Licensor or its representatives, including but not limited to
+         communication on electronic mailing lists, source code control systems,
+         and issue tracking systems that are managed by, or on behalf of, the
+         Licensor for the purpose of discussing and improving the Work, but
+         excluding communication that is conspicuously marked or otherwise
+         designated in writing by the copyright owner as "Not a Contribution."
+         "Contributor" shall mean Licensor and any individual or Legal Entity
+         on behalf of whom a Contribution has been received by Licensor and
+         subsequently incorporated within the Work.
+      2. Grant of Copyright License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         copyright license to reproduce, prepare Derivative Works of,
+         publicly display, publicly perform, sublicense, and distribute the
+         Work and such Derivative Works in Source or Object form.
+      3. Grant of Patent License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         (except as stated in this section) patent license to make, have made,
+         use, offer to sell, sell, import, and otherwise transfer the Work,
+         where such license applies only to those patent claims licensable
+         by such Contributor that are necessarily infringed by their
+         Contribution(s) alone or by combination of their Contribution(s)
+         with the Work to which such Contribution(s) was submitted. If You
+         institute patent litigation against any entity (including a
+         cross-claim or counterclaim in a lawsuit) alleging that the Work
+         or a Contribution incorporated within the Work constitutes direct
+         or contributory patent infringement, then any patent licenses
+         granted to You under this License for that Work shall terminate
+         as of the date such litigation is filed.
+      4. Redistribution. You may reproduce and distribute copies of the
+         Work or Derivative Works thereof in any medium, with or without
+         modifications, and in Source or Object form, provided that You
+         meet the following conditions:
+         (a) You must give any other recipients of the Work or
+             Derivative Works a copy of this License; and
+         (b) You must cause any modified files to carry prominent notices
+             stating that You changed the files; and
+         (c) You must retain, in the Source form of any Derivative Works
+             that You distribute, all copyright, patent, trademark, and
+             attribution notices from the Source form of the Work,
+             excluding those notices that do not pertain to any part of
+             the Derivative Works; and
+         (d) If the Work includes a "NOTICE" text file as part of its
+             distribution, then any Derivative Works that You distribute must
+             include a readable copy of the attribution notices contained
+             within such NOTICE file, excluding those notices that do not
+             pertain to any part of the Derivative Works, in at least one
+             of the following places: within a NOTICE text file distributed
+             as part of the Derivative Works; within the Source form or
+             documentation, if provided along with the Derivative Works; or,
+             within a display generated by the Derivative Works, if and
+             wherever such third-party notices normally appear. The contents
+             of the NOTICE file are for informational purposes only and
+             do not modify the License. You may add Your own attribution
+             notices within Derivative Works that You distribute, alongside
+             or as an addendum to the NOTICE text from the Work, provided
+             that such additional attribution notices cannot be construed
+             as modifying the License.
+         You may add Your own copyright statement to Your modifications and
+         may provide additional or different license terms and conditions
+         for use, reproduction, or distribution of Your modifications, or
+         for any such Derivative Works as a whole, provided Your use,
+         reproduction, and distribution of the Work otherwise complies with
+         the conditions stated in this License.
+      5. Submission of Contributions. Unless You explicitly state otherwise,
+         any Contribution intentionally submitted for inclusion in the Work
+         by You to the Licensor shall be under the terms and conditions of
+         this License, without any additional terms or conditions.
+         Notwithstanding the above, nothing herein shall supersede or modify
+         the terms of any separate license agreement you may have executed
+         with Licensor regarding such Contributions.
+      6. Trademarks. This License does not grant permission to use the trade
+         names, trademarks, service marks, or product names of the Licensor,
+         except as required for reasonable and customary use in describing the
+         origin of the Work and reproducing the content of the NOTICE file.
+      7. Disclaimer of Warranty. Unless required by applicable law or
+         agreed to in writing, Licensor provides the Work (and each
+         Contributor provides its Contributions) on an "AS IS" BASIS,
+         WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+         implied, including, without limitation, any warranties or conditions
+         of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+         PARTICULAR PURPOSE. You are solely responsible for determining the
+         appropriateness of using or redistributing the Work and assume any
+         risks associated with Your exercise of permissions under this License.
+      8. Limitation of Liability. In no event and under no legal theory,
+         whether in tort (including negligence), contract, or otherwise,
+         unless required by applicable law (such as deliberate and grossly
+         negligent acts) or agreed to in writing, shall any Contributor be
+         liable to You for damages, including any direct, indirect, special,
+         incidental, or consequential damages of any character arising as a
+         result of this License or out of the use or inability to use the
+         Work (including but not limited to damages for loss of goodwill,
+         work stoppage, computer failure or malfunction, or any and all
+         other commercial damages or losses), even if such Contributor
+         has been advised of the possibility of such damages.
+      9. Accepting Warranty or Additional Liability. While redistributing
+         the Work or Derivative Works thereof, You may choose to offer,
+         and charge a fee for, acceptance of support, warranty, indemnity,
+         or other liability obligations and/or rights consistent with this
+         License. However, in accepting such obligations, You may act only
+         on Your own behalf and on Your sole responsibility, not on behalf
+         of any other Contributor, and only if You agree to indemnify,
+         defend, and hold each Contributor harmless for any liability
+         incurred by, or claims asserted against, such Contributor by reason
+         of your accepting any such warranty or additional liability.
+      END OF TERMS AND CONDITIONS
+      APPENDIX: How to apply the Apache License to your work.
+         To apply the Apache License to your work, attach the following
+         boilerplate notice, with the fields enclosed by brackets "[]"
+         replaced with your own identifying information. (Don't include
+         the brackets!)  The text should be enclosed in the appropriate
+         comment syntax for the file format. We also recommend that a
+         file or class name and description of purpose be included on the
+         same "printed page" as the copyright notice for easier
+         identification within third-party archives.
+      Copyright [yyyy] [name of copyright owner]
+      Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License.
+      You may obtain a copy of the License at
+          http://www.apache.org/licenses/LICENSE-2.0
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+      See the License for the specific language governing permissions and
+      limitations under the License.
+   ```
+## Pillow - [MIT-CMU License](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
+   ```
+   The Python Imaging Library (PIL) is
+       Copyright © 1997-2011 by Secret Labs AB
+       Copyright © 1995-2011 by Fredrik Lundh and contributors
+   Pillow is the friendly PIL fork. It is
+       Copyright © 2010 by Jeffrey A. Clark and contributors
+   Like PIL, Pillow is licensed under the open source MIT-CMU License:
+   By obtaining, using, and/or copying this software and/or its associated
+   documentation, you agree that you have read, understood, and will comply
+   with the following terms and conditions:
+   Permission to use, copy, modify and distribute this software and its
+   documentation for any purpose and without fee is hereby granted,
+   provided that the above copyright notice appears in all copies, and that
+   both that copyright notice and this permission notice appear in supporting
+   documentation, and that the name of Secret Labs AB or the author not be
+   used in advertising or publicity pertaining to distribution of the software
+   without specific, written prior permission.
+   SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+   SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
+   IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
+   INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+   LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+   OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+   PERFORMANCE OF THIS SOFTWARE.
+   ```
+## PyAV - [BSD 3-Clause "New" or "Revised" License](https://github.com/PyAV-Org/PyAV/blob/main/LICENSE.txt)
+   ```
+   Copyright retained by original committers. All rights reserved.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are met:
+       * Redistributions of source code must retain the above copyright
+         notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above copyright
+         notice, this list of conditions and the following disclaimer in the
+         documentation and/or other materials provided with the distribution.
+       * Neither the name of the project nor the names of its contributors may be
+         used to endorse or promote products derived from this software without
+         specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT,
+   INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+   EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+   ```
+## Pytorch_Retinaface - [MIT License](https://github.com/biubug6/Pytorch_Retinaface/blob/master/LICENSE.MIT)
+   ```
+   MIT License
+   Copyright (c) 2019
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in all
+   copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+   ```
+## Sentencepiece - [Apache License 2.0](https://github.com/google/sentencepiece/blob/master/LICENSE)
+   ```
+                                    Apache License
+                              Version 2.0, January 2004
+                           http://www.apache.org/licenses/
+      TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+      1. Definitions.
+         "License" shall mean the terms and conditions for use, reproduction,
+         and distribution as defined by Sections 1 through 9 of this document.
+         "Licensor" shall mean the copyright owner or entity authorized by
+         the copyright owner that is granting the License.
+         "Legal Entity" shall mean the union of the acting entity and all
+         other entities that control, are controlled by, or are under common
+         control with that entity. For the purposes of this definition,
+         "control" means (i) the power, direct or indirect, to cause the
+         direction or management of such entity, whether by contract or
+         otherwise, or (ii) ownership of fifty percent (50%) or more of the
+         outstanding shares, or (iii) beneficial ownership of such entity.
+         "You" (or "Your") shall mean an individual or Legal Entity
+         exercising permissions granted by this License.
+         "Source" form shall mean the preferred form for making modifications,
+         including but not limited to software source code, documentation
+         source, and configuration files.
+         "Object" form shall mean any form resulting from mechanical
+         transformation or translation of a Source form, including but
+         not limited to compiled object code, generated documentation,
+         and conversions to other media types.
+         "Work" shall mean the work of authorship, whether in Source or
+         Object form, made available under the License, as indicated by a
+         copyright notice that is included in or attached to the work
+         (an example is provided in the Appendix below).
+         "Derivative Works" shall mean any work, whether in Source or Object
+         form, that is based on (or derived from) the Work and for which the
+         editorial revisions, annotations, elaborations, or other modifications
+         represent, as a whole, an original work of authorship. For the purposes
+         of this License, Derivative Works shall not include works that remain
+         separable from, or merely link (or bind by name) to the interfaces of,
+         the Work and Derivative Works thereof.
+         "Contribution" shall mean any work of authorship, including
+         the original version of the Work and any modifications or additions
+         to that Work or Derivative Works thereof, that is intentionally
+         submitted to Licensor for inclusion in the Work by the copyright owner
+         or by an individual or Legal Entity authorized to submit on behalf of
+         the copyright owner. For the purposes of this definition, "submitted"
+         means any form of electronic, verbal, or written communication sent
+         to the Licensor or its representatives, including but not limited to
+         communication on electronic mailing lists, source code control systems,
+         and issue tracking systems that are managed by, or on behalf of, the
+         Licensor for the purpose of discussing and improving the Work, but
+         excluding communication that is conspicuously marked or otherwise
+         designated in writing by the copyright owner as "Not a Contribution."
+         "Contributor" shall mean Licensor and any individual or Legal Entity
+         on behalf of whom a Contribution has been received by Licensor and
+         subsequently incorporated within the Work.
+      2. Grant of Copyright License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         copyright license to reproduce, prepare Derivative Works of,
+         publicly display, publicly perform, sublicense, and distribute the
+         Work and such Derivative Works in Source or Object form.
+      3. Grant of Patent License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         (except as stated in this section) patent license to make, have made,
+         use, offer to sell, sell, import, and otherwise transfer the Work,
+         where such license applies only to those patent claims licensable
+         by such Contributor that are necessarily infringed by their
+         Contribution(s) alone or by combination of their Contribution(s)
+         with the Work to which such Contribution(s) was submitted. If You
+         institute patent litigation against any entity (including a
+         cross-claim or counterclaim in a lawsuit) alleging that the Work
+         or a Contribution incorporated within the Work constitutes direct
+         or contributory patent infringement, then any patent licenses
+         granted to You under this License for that Work shall terminate
+         as of the date such litigation is filed.
+      4. Redistribution. You may reproduce and distribute copies of the
+         Work or Derivative Works thereof in any medium, with or without
+         modifications, and in Source or Object form, provided that You
+         meet the following conditions:
+         (a) You must give any other recipients of the Work or
+             Derivative Works a copy of this License; and
+         (b) You must cause any modified files to carry prominent notices
+             stating that You changed the files; and
+         (c) You must retain, in the Source form of any Derivative Works
+             that You distribute, all copyright, patent, trademark, and
+             attribution notices from the Source form of the Work,
+             excluding those notices that do not pertain to any part of
+             the Derivative Works; and
+         (d) If the Work includes a "NOTICE" text file as part of its
+             distribution, then any Derivative Works that You distribute must
+             include a readable copy of the attribution notices contained
+             within such NOTICE file, excluding those notices that do not
+             pertain to any part of the Derivative Works, in at least one
+             of the following places: within a NOTICE text file distributed
+             as part of the Derivative Works; within the Source form or
+             documentation, if provided along with the Derivative Works; or,
+             within a display generated by the Derivative Works, if and
+             wherever such third-party notices normally appear. The contents
+             of the NOTICE file are for informational purposes only and
+             do not modify the License. You may add Your own attribution
+             notices within Derivative Works that You distribute, alongside
+             or as an addendum to the NOTICE text from the Work, provided
+             that such additional attribution notices cannot be construed
+             as modifying the License.
+         You may add Your own copyright statement to Your modifications and
+         may provide additional or different license terms and conditions
+         for use, reproduction, or distribution of Your modifications, or
+         for any such Derivative Works as a whole, provided Your use,
+         reproduction, and distribution of the Work otherwise complies with
+         the conditions stated in this License.
+      5. Submission of Contributions. Unless You explicitly state otherwise,
+         any Contribution intentionally submitted for inclusion in the Work
+         by You to the Licensor shall be under the terms and conditions of
+         this License, without any additional terms or conditions.
+         Notwithstanding the above, nothing herein shall supersede or modify
+         the terms of any separate license agreement you may have executed
+         with Licensor regarding such Contributions.
+      6. Trademarks. This License does not grant permission to use the trade
+         names, trademarks, service marks, or product names of the Licensor,
+         except as required for reasonable and customary use in describing the
+         origin of the Work and reproducing the content of the NOTICE file.
+      7. Disclaimer of Warranty. Unless required by applicable law or
+         agreed to in writing, Licensor provides the Work (and each
+         Contributor provides its Contributions) on an "AS IS" BASIS,
+         WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+         implied, including, without limitation, any warranties or conditions
+         of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+         PARTICULAR PURPOSE. You are solely responsible for determining the
+         appropriateness of using or redistributing the Work and assume any
+         risks associated with Your exercise of permissions under this License.
+      8. Limitation of Liability. In no event and under no legal theory,
+         whether in tort (including negligence), contract, or otherwise,
+         unless required by applicable law (such as deliberate and grossly
+         negligent acts) or agreed to in writing, shall any Contributor be
+         liable to You for damages, including any direct, indirect, special,
+         incidental, or consequential damages of any character arising as a
+         result of this License or out of the use or inability to use the
+         Work (including but not limited to damages for loss of goodwill,
+         work stoppage, computer failure or malfunction, or any and all
+         other commercial damages or losses), even if such Contributor
+         has been advised of the possibility of such damages.
+      9. Accepting Warranty or Additional Liability. While redistributing
+         the Work or Derivative Works thereof, You may choose to offer,
+         and charge a fee for, acceptance of support, warranty, indemnity,
+         or other liability obligations and/or rights consistent with this
+         License. However, in accepting such obligations, You may act only
+         on Your own behalf and on Your sole responsibility, not on behalf
+         of any other Contributor, and only if You agree to indemnify,
+         defend, and hold each Contributor harmless for any liability
+         incurred by, or claims asserted against, such Contributor by reason
+         of your accepting any such warranty or additional liability.
+      END OF TERMS AND CONDITIONS
+      APPENDIX: How to apply the Apache License to your work.
+         To apply the Apache License to your work, attach the following
+         boilerplate notice, with the fields enclosed by brackets "[]"
+         replaced with your own identifying information. (Don't include
+         the brackets!)  The text should be enclosed in the appropriate
+         comment syntax for the file format. We also recommend that a
+         file or class name and description of purpose be included on the
+         same "printed page" as the copyright notice for easier
+         identification within third-party archives.
+      Copyright [yyyy] [name of copyright owner]
+      Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License.
+      You may obtain a copy of the License at
+          http://www.apache.org/licenses/LICENSE-2.0
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+      See the License for the specific language governing permissions and
+      limitations under the License.
+   ```
+## Termcolor - [MIT License](https://github.com/termcolor/termcolor/blob/main/COPYING.txt)
+   ```
+   Copyright (c) 2008-2011 Volvox Development Team
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+   ```
+## Transformers - [Apache License 2.0](https://github.com/huggingface/transformers/blob/main/LICENSE)
+   ```
+   Copyright 2018- The Hugging Face team. All rights reserved.
+                                    Apache License
+                              Version 2.0, January 2004
+                           http://www.apache.org/licenses/
+      TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+      1. Definitions.
+         "License" shall mean the terms and conditions for use, reproduction,
+         and distribution as defined by Sections 1 through 9 of this document.
+         "Licensor" shall mean the copyright owner or entity authorized by
+         the copyright owner that is granting the License.
+         "Legal Entity" shall mean the union of the acting entity and all
+         other entities that control, are controlled by, or are under common
+         control with that entity. For the purposes of this definition,
+         "control" means (i) the power, direct or indirect, to cause the
+         direction or management of such entity, whether by contract or
+         otherwise, or (ii) ownership of fifty percent (50%) or more of the
+         outstanding shares, or (iii) beneficial ownership of such entity.
+         "You" (or "Your") shall mean an individual or Legal Entity
+         exercising permissions granted by this License.
+         "Source" form shall mean the preferred form for making modifications,
+         including but not limited to software source code, documentation
+         source, and configuration files.
+         "Object" form shall mean any form resulting from mechanical
+         transformation or translation of a Source form, including but
+         not limited to compiled object code, generated documentation,
+         and conversions to other media types.
+         "Work" shall mean the work of authorship, whether in Source or
+         Object form, made available under the License, as indicated by a
+         copyright notice that is included in or attached to the work
+         (an example is provided in the Appendix below).
+         "Derivative Works" shall mean any work, whether in Source or Object
+         form, that is based on (or derived from) the Work and for which the
+         editorial revisions, annotations, elaborations, or other modifications
+         represent, as a whole, an original work of authorship. For the purposes
+         of this License, Derivative Works shall not include works that remain
+         separable from, or merely link (or bind by name) to the interfaces of,
+         the Work and Derivative Works thereof.
+         "Contribution" shall mean any work of authorship, including
+         the original version of the Work and any modifications or additions
+         to that Work or Derivative Works thereof, that is intentionally
+         submitted to Licensor for inclusion in the Work by the copyright owner
+         or by an individual or Legal Entity authorized to submit on behalf of
+         the copyright owner. For the purposes of this definition, "submitted"
+         means any form of electronic, verbal, or written communication sent
+         to the Licensor or its representatives, including but not limited to
+         communication on electronic mailing lists, source code control systems,
+         and issue tracking systems that are managed by, or on behalf of, the
+         Licensor for the purpose of discussing and improving the Work, but
+         excluding communication that is conspicuously marked or otherwise
+         designated in writing by the copyright owner as "Not a Contribution."
+         "Contributor" shall mean Licensor and any individual or Legal Entity
+         on behalf of whom a Contribution has been received by Licensor and
+         subsequently incorporated within the Work.
+      2. Grant of Copyright License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         copyright license to reproduce, prepare Derivative Works of,
+         publicly display, publicly perform, sublicense, and distribute the
+         Work and such Derivative Works in Source or Object form.
+      3. Grant of Patent License. Subject to the terms and conditions of
+         this License, each Contributor hereby grants to You a perpetual,
+         worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+         (except as stated in this section) patent license to make, have made,
+         use, offer to sell, sell, import, and otherwise transfer the Work,
+         where such license applies only to those patent claims licensable
+         by such Contributor that are necessarily infringed by their
+         Contribution(s) alone or by combination of their Contribution(s)
+         with the Work to which such Contribution(s) was submitted. If You
+         institute patent litigation against any entity (including a
+         cross-claim or counterclaim in a lawsuit) alleging that the Work
+         or a Contribution incorporated within the Work constitutes direct
+         or contributory patent infringement, then any patent licenses
+         granted to You under this License for that Work shall terminate
+         as of the date such litigation is filed.
+      4. Redistribution. You may reproduce and distribute copies of the
+         Work or Derivative Works thereof in any medium, with or without
+         modifications, and in Source or Object form, provided that You
+         meet the following conditions:
+         (a) You must give any other recipients of the Work or
+             Derivative Works a copy of this License; and
+         (b) You must cause any modified files to carry prominent notices
+             stating that You changed the files; and
+         (c) You must retain, in the Source form of any Derivative Works
+             that You distribute, all copyright, patent, trademark, and
+             attribution notices from the Source form of the Work,
+             excluding those notices that do not pertain to any part of
+             the Derivative Works; and
+         (d) If the Work includes a "NOTICE" text file as part of its
+             distribution, then any Derivative Works that You distribute must
+             include a readable copy of the attribution notices contained
+             within such NOTICE file, excluding those notices that do not
+             pertain to any part of the Derivative Works, in at least one
+             of the following places: within a NOTICE text file distributed
+             as part of the Derivative Works; within the Source form or
+             documentation, if provided along with the Derivative Works; or,
+             within a display generated by the Derivative Works, if and
+             wherever such third-party notices normally appear. The contents
+             of the NOTICE file are for informational purposes only and
+             do not modify the License. You may add Your own attribution
+             notices within Derivative Works that You distribute, alongside
+             or as an addendum to the NOTICE text from the Work, provided
+             that such additional attribution notices cannot be construed
+             as modifying the License.
+         You may add Your own copyright statement to Your modifications and
+         may provide additional or different license terms and conditions
+         for use, reproduction, or distribution of Your modifications, or
+         for any such Derivative Works as a whole, provided Your use,
+         reproduction, and distribution of the Work otherwise complies with
+         the conditions stated in this License.
+      5. Submission of Contributions. Unless You explicitly state otherwise,
+         any Contribution intentionally submitted for inclusion in the Work
+         by You to the Licensor shall be under the terms and conditions of
+         this License, without any additional terms or conditions.
+         Notwithstanding the above, nothing herein shall supersede or modify
+         the terms of any separate license agreement you may have executed
+         with Licensor regarding such Contributions.
+      6. Trademarks. This License does not grant permission to use the trade
+         names, trademarks, service marks, or product names of the Licensor,
+         except as required for reasonable and customary use in describing the
+         origin of the Work and reproducing the content of the NOTICE file.
+      7. Disclaimer of Warranty. Unless required by applicable law or
+         agreed to in writing, Licensor provides the Work (and each
+         Contributor provides its Contributions) on an "AS IS" BASIS,
+         WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+         implied, including, without limitation, any warranties or conditions
+         of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+         PARTICULAR PURPOSE. You are solely responsible for determining the
+         appropriateness of using or redistributing the Work and assume any
+         risks associated with Your exercise of permissions under this License.
+      8. Limitation of Liability. In no event and under no legal theory,
+         whether in tort (including negligence), contract, or otherwise,
+         unless required by applicable law (such as deliberate and grossly
+         negligent acts) or agreed to in writing, shall any Contributor be
+         liable to You for damages, including any direct, indirect, special,
+         incidental, or consequential damages of any character arising as a
+         result of this License or out of the use or inability to use the
+         Work (including but not limited to damages for loss of goodwill,
+         work stoppage, computer failure or malfunction, or any and all
+         other commercial damages or losses), even if such Contributor
+         has been advised of the possibility of such damages.
+      9. Accepting Warranty or Additional Liability. While redistributing
+         the Work or Derivative Works thereof, You may choose to offer,
+         and charge a fee for, acceptance of support, warranty, indemnity,
+         or other liability obligations and/or rights consistent with this
+         License. However, in accepting such obligations, You may act only
+         on Your own behalf and on Your sole responsibility, not on behalf
+         of any other Contributor, and only if You agree to indemnify,
+         defend, and hold each Contributor harmless for any liability
+         incurred by, or claims asserted against, such Contributor by reason
+         of your accepting any such warranty or additional liability.
+      END OF TERMS AND CONDITIONS
+      APPENDIX: How to apply the Apache License to your work.
+         To apply the Apache License to your work, attach the following
+         boilerplate notice, with the fields enclosed by brackets "[]"
+         replaced with your own identifying information. (Don't include
+         the brackets!)  The text should be enclosed in the appropriate
+         comment syntax for the file format. We also recommend that a
+         file or class name and description of purpose be included on the
+         same "printed page" as the copyright notice for easier
+         identification within third-party archives.
+      Copyright [yyyy] [name of copyright owner]
+      Licensed under the Apache License, Version 2.0 (the "License");
+      you may not use this file except in compliance with the License.
+      You may obtain a copy of the License at
+          http://www.apache.org/licenses/LICENSE-2.0
+      Unless required by applicable law or agreed to in writing, software
+      distributed under the License is distributed on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+      See the License for the specific language governing permissions and
+      limitations under the License.
+   ```

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,59 @@

+# How to Contribute
+We'd love to receive your patches and contributions. Please keep your PRs as draft until such time that you would like us to review them.
+## Code Reviews
+All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests.
+## Pipeline
+Run the linter before submitting your pull request, and make sure the CI/CD pipeline is green before removing the draft designation.
+```bash
+./cosmos1/scripts/format.sh
+```
+## Signing Your Work
+* We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
+  * Any contribution which contains commits that are not Signed-Off will not be accepted.
+* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
+  ```bash
+  $ git commit -s -m "Add cool feature."
+  ```
+  This will append the following to your commit message:
+  ```
+  Signed-off-by: Your Name <your@email.com>
+  ```
+* Full text of the DCO:
+  ```
+    Developer Certificate of Origin
+    Version 1.1
+    Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+    1 Letterman Drive
+    Suite D4700
+    San Francisco, CA, 94129
+    Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
+  ```
+  ```
+    Developer's Certificate of Origin 1.1
+    By making a contribution to this project, I certify that:
+    (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
+    (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
+    (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
+    (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
+  ```

Dockerfile ADDED Viewed

	@@ -0,0 +1,43 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Use NVIDIA PyTorch container as base image
+FROM nvcr.io/nvidia/pytorch:24.10-py3
+# Install system dependencies (ffmpeg) and drop the apt package lists in the
+# same layer so they do not end up in the image
+RUN apt-get update && apt-get install -y \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+# Set working directory
+WORKDIR /workspace
+# Copy requirements file first so the dependency layer below stays cached
+# when only the source tree changes
+COPY requirements.txt /workspace/
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy source code
+COPY cosmos1 /workspace/cosmos1
+# Copy main README
+COPY README.md /workspace/
+# Copy third-party licenses
+COPY ATTRIBUTIONS.md /workspace/
+# Default command
+CMD ["/bin/bash"]

INSTALL.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# Cosmos Installation
+We have only tested the installation with Ubuntu 24.04, 22.04, and 20.04.
+1. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
+2. Clone the repository.
+```bash
+git clone git@github.com:NVIDIA/Cosmos.git
+cd Cosmos
+```
+3. Build a Docker image using `Dockerfile` and run the Docker container.
+```bash
+docker build -t cosmos .
+docker run -d --name cosmos_container --gpus all --ipc=host -it -v $(pwd):/workspace cosmos
+docker attach cosmos_container
+```

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,78 @@

+![Cosmos Logo](assets/cosmos-logo.png)
+--------------------------------------------------------------------------------
+### [Website](https://www.nvidia.com/en-us/ai/cosmos/) | [HuggingFace](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6) | [GPU-free Preview](https://build.nvidia.com/explore/discover) | [Paper](https://arxiv.org/abs/2501.03575) | [Paper Website](https://research.nvidia.com/labs/dir/cosmos1/)
+[NVIDIA Cosmos](https://www.nvidia.com/cosmos/) is a developer-first world foundation model platform designed to help Physical AI developers build their Physical AI systems better and faster. Cosmos contains
+1. pre-trained models, available via [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6) under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) that allows commercial use of the models for free
+2. training scripts under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0), offered through [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo) for post-training the models for various downstream Physical AI applications
+Details of the platform are described in the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai). Preview access is available at [build.nvidia.com](https://build.nvidia.com).
+## Key Features
+- [Pre-trained Diffusion-based world foundation models](cosmos1/models/diffusion/README.md) for Text2World and Video2World generation where a user can generate visual simulation based on text prompts and video prompts.
+- [Pre-trained Autoregressive-based world foundation models](cosmos1/models/autoregressive/README.md) for Video2World generation where a user can generate visual simulation based on video prompts and optional text prompts.
+- [Video tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer) for tokenizing videos into continuous tokens (latent vectors) and discrete tokens (integers) efficiently and effectively.
+- Video curation pipeline for building your own video dataset. [Coming soon]
+- [Post-training scripts](cosmos1/models/POST_TRAINING.md) via NeMo Framework to post-train the pre-trained world foundation models for various Physical AI setups.
+- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion/vae)].
+## Model Family
+| Model name | Description | Try it out |
+|------------|----------|----------|
+| [Cosmos-1.0-Diffusion-7B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Text2World) | Text to visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Diffusion-14B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Text2World) | Text to visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Diffusion-7B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Video2World) | Video + Text based future visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Diffusion-14B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Video2World) | Video + Text based future visual world generation  | [Inference](cosmos1/models/diffusion/README.md)   |
+| [Cosmos-1.0-Autoregressive-4B](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-4B) | Future visual world generation  | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Autoregressive-12B](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-12B) | Future visual world generation  | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Autoregressive-5B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-5B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Autoregressive-13B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-13B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/autoregressive/README.md)   |
+| [Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) | Guardrail contains pre-Guard and post-Guard for safe use | Embedded in model inference scripts |
+## Example Usage
+### Inference
+Follow the [Cosmos Installation Guide](INSTALL.md) to set up the Docker environment. For inference with the pretrained models, please refer to [Cosmos Diffusion Inference](cosmos1/models/diffusion/README.md) and [Cosmos Autoregressive Inference](cosmos1/models/autoregressive/README.md).
+The code snippet below provides a gist of the inference usage.
+```bash
+PROMPT="A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. \
+The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. \
+A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, \
+suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. \
+The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of \
+field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
+# Example using 7B model
+PYTHONPATH=$(pwd) python cosmos1/models/diffusion/inference/text2world.py \
+    --checkpoint_dir checkpoints \
+    --diffusion_transformer_dir Cosmos-1.0-Diffusion-7B-Text2World \
+    --prompt "$PROMPT" \
+    --offload_prompt_upsampler \
+    --video_save_name Cosmos-1.0-Diffusion-7B-Text2World
+```
+<video src="https://github.com/user-attachments/assets/db7bebfe-5314-40a6-b045-4f6ce0a87f2a">
+  Your browser does not support the video tag.
+</video>
+We also offer [multi-GPU inference](cosmos1/models/diffusion/nemo/inference/README.md) support for Diffusion Text2World WFM models through NeMo Framework.
+### Post-training
+NeMo Framework provides GPU-accelerated general post-training for both [diffusion](cosmos1/models/diffusion/nemo/post_training/README.md) and [autoregressive](cosmos1/models/autoregressive/nemo/post_training/README.md) models, with other types of post-training coming soon.
+## License and Contact
+This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
+NVIDIA Cosmos source code is released under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0).
+NVIDIA Cosmos models are released under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). For a custom license, please contact [cosmos-license@nvidia.com](mailto:cosmos-license@nvidia.com).

RELEASE.md ADDED Viewed

	@@ -0,0 +1,7 @@

+# Release Cadence
+| Version | Description | Date |
+|------------|----------|----------|
+| [v1.0](release_notes/v1p0.md) | Initial diffusion and autoregressive WFMs release | 2025-01-06 |
+| [v0.1](release_notes/v0p1.md) | Initial tokenizer release | 2024-11-06 |

assets/cosmos-logo.png ADDED Viewed

checkpoints/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ # Checkpoint directory
2	+
3	+ Follow our instructions for downloading checkpoints in [Cosmos Diffusion Inference](../cosmos1/models/diffusion/README.md#download-checkpoints) and [Cosmos Autoregressive Inference](../cosmos1/models/autoregressive/README.md). Cosmos checkpoints will be downloaded to this directory.

cosmos1/models/POST_TRAINING.md ADDED Viewed

	@@ -0,0 +1,23 @@

+# Cosmos Post-training
+In the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai), we discuss several post-training examples of Cosmos pre-trained World Foundation Models (WFMs) for various Physical AI tasks, including
+- General Post-Training: Fine-tune the WFM to generate a target distribution of videos based on the custom dataset. The target distribution could include a specific camera spec or a specific domain such as a factory.
+- Instruction Control: Post-trains models for robotic manipulation to predict videos based on textual instructions, enabling robots to visually simulate tasks like folding clothes or picking up objects.
+- Action Control: Post-trains models for robotic manipulation to predict the next visual frame based on action vectors, simulating robotic tasks like object handling or movement planning.
+- Camera Control: Adds camera pose conditioning to generate 3D-consistent video simulations from single images, enabling joystick-like navigation in virtual environments.
+- Multi-View Generation: Post-trains models for autonomous vehicles to generate synchronized multi-view videos from text prompts, simulating driving scenarios with multiple camera perspectives.
+- Multi-View Generation with Vehicle Trajectory Control: Extends multi-view generation by incorporating trajectory inputs, enabling precise simulation of driving environments for autonomous vehicles, adhering to specified paths.
+Except for the instruction control where the WFM is post-trained on a dataset of instruction-video pairs, all other cases require minor modifications of the network architectures. Post-training tasks will be supported by NeMo Framework. In this initial release, we provide post-training scripts for the general post-training of both diffusion and autoregressive WFMs. Scripts of the other post-training tasks will be provided in a future release.
+## Post-training Support Matrix
+| Post-training Task  | Diffusion WFM | Autoregressive WFM |
+|---------------------|---------------|--------------------|
+| General post-training | [Supported](../models/diffusion/nemo/post_training/README.md) | [Supported](../models/autoregressive/nemo/post_training/README.md) |
+| Instruction control | Coming soon | Coming soon |
+| Action control | Coming soon | Coming soon |
+| Camera control | Coming soon | Coming soon |
+| Multi-view generation | Coming soon | Coming soon |
+| Multi-view generation with vehicle trajectory control | Coming soon | Coming soon |

cosmos1/models/autoregressive/README.md ADDED Viewed

	@@ -0,0 +1,427 @@

+# Cosmos Autoregressive-based World Foundation Models
+## Table of Contents
+- [Getting Started](#getting-started)
+  - [Set Up Docker Environment](#set-up-docker-environment)
+  - [Download Checkpoints](#download-checkpoints)
+- [Usage](#usage)
+  - [Model Types](#model-types)
+  - [Single and Batch Generation](#single-and-batch-generation)
+  - [Sample Commands](#sample-commands)
+    - [Base Models (4B/12B)](#base-basepy-4b-and-12b)
+    - [Video2World Models (5B/13B)](#video2world-video2worldpy-5b-and-13b)
+  - [Arguments](#arguments)
+    - [Common Parameters](#common-parameters)
+    - [Base Specific Parameters](#base-specific-parameters)
+    - [Video2World Specific Parameters](#video2world-specific-parameters)
+  - [Safety Features](#safety-features)
+This page details the steps for using the Cosmos autoregressive-based world foundation models.
+## Getting Started
+### Set Up Docker Environment
+Follow our [Installation Guide](../../../INSTALL.md) to set up the Docker environment. All commands on this page should be run inside Docker.
+### Download Checkpoints
+1. Generate a [Hugging Face](https://huggingface.co/settings/tokens) access token. Set the access token to 'Read' permission (default is 'Fine-grained').
+2. Log in to Hugging Face with the access token:
+```bash
+huggingface-cli login
+```
+3. Download the Cosmos model weights from [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6):
+```bash
+PYTHONPATH=$(pwd) python cosmos1/scripts/download_autoregressive.py --model_sizes 4B 5B 12B 13B
+```
+4. The downloaded files should be in the following structure:
+```
+checkpoints/
+├── Cosmos-1.0-Autoregressive-4B
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Autoregressive-5B-Video2World
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Autoregressive-12B
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Autoregressive-13B-Video2World
+│   ├── model.pt
+│   └── config.json
+├── Cosmos-1.0-Tokenizer-CV8x8x8
+│   ├── decoder.jit
+│   ├── encoder.jit
+│   └── mean_std.pt
+├── Cosmos-1.0-Tokenizer-DV8x16x16
+│   ├── decoder.jit
+│   └── encoder.jit
+├── Cosmos-1.0-Diffusion-7B-Decoder-DV8x16x16ToCV8x8x8
+│   ├── aux_vars.pt
+│   └── model.pt
+└── Cosmos-1.0-Guardrail
+    ├── aegis/
+    ├── blocklist/
+    ├── face_blur_filter/
+    └── video_content_safety_filter/
+```
+## Usage
+### Model Types
+There are two model types available for autoregressive world generation:
+1. **Base**: Supports world generation from image/video input
+* Models: `Cosmos-1.0-Autoregressive-4B` and `Cosmos-1.0-Autoregressive-12B`
+* Inference script: [base.py](/cosmos1/models/autoregressive/inference/base.py)
+2. **Video2World**: Supports world generation from image/video input and text input
+* Models: `Cosmos-1.0-Autoregressive-5B-Video2World` and `Cosmos-1.0-Autoregressive-13B-Video2World`
+* Inference script: [video2world.py](/cosmos1/models/autoregressive/inference/video2world.py)
+Our models now support video extension up to 33 frames. Starting from either a single image or a 9-frame video input, they can generate the remaining frames to reach the 33-frame length (generating 32 or 24 frames, respectively).
+We have evaluated all eight possible configurations (4 models × 2 vision input types: image or video) using 100 test videos on physical AI topics. Below are the failure rates for each configuration:
+| Model                                      | Image input | Video input (9 frames) |
+|:------------------------------------------|:--------------:|:-------------------------:|
+| Cosmos-1.0-Autoregressive-4B              | 15%           | 1%                       |
+| Cosmos-1.0-Autoregressive-5B-Video2World  | 7%            | 2%                       |
+| Cosmos-1.0-Autoregressive-12B             | 2%            | 1%                       |
+| Cosmos-1.0-Autoregressive-13B-Video2World | 3%            | 0%                       |
+We define failure cases as videos with severe distortions, such as:
+* Sudden appearance of large unexpected objects
+* Video degrading to a single solid color
+Note that the following are not considered failures in our analysis:
+* Static video frames
+* Minor object distortions or artifacts
+### Single and Batch Generation
+We support both single and batch video generation.
+For generating a single video, `base` mode requires the input argument `--input_image_or_video_path` (image/video input), while `video2world` mode requires both `--input_image_or_video_path` (image/video input) and `--prompt` (text input).
+Note that our model only works with 1024x640 resolution videos. If the input image/video is not in this resolution, it will be resized and cropped.
+For generating a batch of videos, both `base` and `video2world` require `--batch_input_path` (path to a JSONL file). For `base`, the JSONL file should contain one visual input per line in the following format, where each line must contain a "visual_input" field:
+```json
+{"visual_input": "path/to/video1.mp4"}
+{"visual_input": "path/to/video2.mp4"}
+```
+For `video2world`, each line in the JSONL file must contain both "prompt" and "visual_input" fields:
+```json
+{"prompt": "prompt1", "visual_input": "path/to/video1.mp4"}
+{"prompt": "prompt2", "visual_input": "path/to/video2.mp4"}
+```
+### Sample Commands
+There are two main demo scripts for autoregressive world generation: `base.py` and `video2world.py`. Below you will find sample commands for single and batch generation, as well as commands for running with low-memory GPUs using model offloading. We also provide a memory usage table comparing different offloading strategies to help with configuration.
+#### Base (base.py): 4B and 12B
+Generates world from image/video input.
+The `input_type` argument can be either `video` or `image`. We have tuned the sampling parameters `top_p` and `temperature` to achieve the best performance. Please use the provided values in the command examples.
+Note that the command examples below all use video input. If you want to use image input, please change the `input_type` to `image`.
+##### Single Generation
+```bash
+# Example using 4B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-4B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
+    --top_p=0.8 \
+    --temperature=1.0
+# Example for low-memory GPUs using 4B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-4B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer
+# Example using 12B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-12B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
+    --top_p=0.9 \
+    --temperature=1.0
+# Example for low-memory GPUs using 12B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --video_save_name=Cosmos-1.0-Autoregressive-12B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
+    --top_p=0.9 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer
+```
+##### Batch Generation
+```bash
+# Example using 4B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-4B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
+    --top_p=0.8 \
+    --temperature=1.0
+# Example using 12B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
+    --input_type=video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-12B \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
+    --top_p=0.9 \
+    --temperature=1.0
+```
+##### Example Output
+Here is an example output video generated using base.py with image input, using `Cosmos-1.0-Autoregressive-12B`:
+<video src="https://github.com/user-attachments/assets/634403a5-1873-42d7-8dd0-eb7fb4ac8cf4">
+  Your browser does not support the video tag.
+</video>
+The input image used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.jpg`. The image is from [BDD dataset](http://bdd-data.berkeley.edu/).
+Here is an example output video generated using base.py with 9-frame video input, using `Cosmos-1.0-Autoregressive-12B`:
+<video src="https://github.com/user-attachments/assets/1a3ff099-87d7-41e8-b149-a25cfcd4f40b">
+  Your browser does not support the video tag.
+</video>
+The input video used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.mp4`.
+##### Inference Time and GPU Memory Usage
+These numbers may vary based on system specifications and are provided for reference only.
+| Offloading Strategy | Cosmos-1.0-Autoregressive-4B | Cosmos-1.0-Autoregressive-12B |
+|-------------|---------|---------|
+| No offloading | 31.3 GB | 47.5 GB |
+| Guardrails | 28.9 GB | 45.2 GB |
+| Guardrails & Diffusion decoder | 28.5 GB | 43.1 GB |
+| Guardrails & Diffusion decoder & Tokenizer | 27.3 GB | 42.9 GB |
+| Guardrails & Diffusion decoder & Tokenizer & AR model | 18.7 GB | 27.4 GB |
+End-to-end inference runtime on one H100 without offloading and after model initialization:
+| Cosmos-1.0-Autoregressive-4B | Cosmos-1.0-Autoregressive-12B |
+|---------|---------|
+| ~62 seconds | ~119 seconds |
+#### Video2World (video2world.py): 5B and 13B
+Generates world from image/video and text input.
+The `input_type` argument can be either `text_and_video` or `text_and_image`. We have tuned the sampling parameters `top_p` and `temperature` to achieve the best performance. Please use the provided values in the command examples.
+Note that the command examples below all use video input. If you want to use image input, please change the `input_type` to `text_and_image`.
+##### Single Generation
+```bash
+# Example using 5B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --top_p=0.7 \
+    --temperature=1.0
+# Example for low-memory GPUs using 5B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --top_p=0.7 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer \
+    --offload_text_encoder_model
+# Example using 13B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models
+# Example for low-memory GPUs using 13B model with model offloading
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
+    --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
+    --video_save_name=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models \
+    --offload_diffusion_decoder \
+    --offload_ar_model \
+    --offload_tokenizer \
+    --offload_text_encoder_model
+```
+##### Batch Generation
+```bash
+# Example using 5B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-5B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
+    --top_p=0.7 \
+    --temperature=1.0
+# Example using 13B model
+CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
+    --input_type=text_and_video \
+    --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl \
+    --video_save_folder=outputs/Cosmos-1.0-Autoregressive-13B-Video2World \
+    --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
+    --top_p=0.8 \
+    --temperature=1.0 \
+    --offload_guardrail_models
+```
+##### Example Output
+Here is an example output video generated using video2world.py with image input, using `Cosmos-1.0-Autoregressive-13B-Video2World`:
+<video src="https://github.com/user-attachments/assets/869f3b81-fabd-462e-a545-c04cdd9c1d22">
+  Your browser does not support the video tag.
+</video>
+The input image used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.jpg`. The prompt for generating the video is:
+```
+A driving video captures a serene urban street scene on a sunny day. The camera is mounted on the dashboard of a moving vehicle, providing a first-person perspective as it travels down a two-lane road. The street is lined with parked cars on both sides, predominantly black and silver sedans and SUVs. The road is flanked by a mix of residential and commercial buildings, with a prominent red-brick building on the left side, featuring multiple windows and a flat roof. The sky is clear with a few scattered clouds, casting soft shadows on the street. Trees with lush green foliage line the right side of the road, providing a natural contrast to the urban environment. The camera remains steady, maintaining a consistent forward motion, suggesting a leisurely drive. Traffic is light, with a few vehicles moving in the opposite direction, including a black sedan and a yellow taxi. Street signs are visible, including a no-parking sign on the right. The overall atmosphere is calm and peaceful, with no pedestrians visible, emphasizing the focus on the drive and the surrounding urban landscape.
+```
+Here is an example output video generated using video2world.py with 9-frame video input, using `Cosmos-1.0-Autoregressive-13B-Video2World`:
+<video src="https://github.com/user-attachments/assets/81840e1c-624b-4b01-9240-ab7db3722e58">
+  Your browser does not support the video tag.
+</video>
+The input video used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.mp4`. The prompt for generating the video is:
+```
+A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.
+```
+##### Inference Time and GPU Memory Usage
+These numbers may vary based on system specifications and are provided for reference only.
+| Offloading Strategy | Cosmos-1.0-Autoregressive-5B-Video2World | Cosmos-1.0-Autoregressive-13B-Video2World |
+|-------------|---------|---------|
+| No offloading | 66.2 GB | > 80 GB |
+| Guardrails | 58.7 GB | 76.6 GB |
+| Guardrails & T5 encoder | 41.3 GB | 58.0 GB |
+| Guardrails & T5 encoder & Diffusion decoder | 29.0 GB | 46.9 GB |
+| Guardrails & T5 encoder & Diffusion decoder & Tokenizer | 28.8 GB | 46.7 GB |
+| Guardrails & T5 encoder & Diffusion decoder & Tokenizer & AR model | 21.1 GB | 30.9 GB |
+End-to-end inference runtime on one H100 with no offloading for 5B model and guardrail offloading for 13B, after model initialization:
+| Cosmos-1.0-Autoregressive-5B-Video2World | Cosmos-1.0-Autoregressive-13B-Video2World |
+|---------|---------|
+| ~73 seconds | ~150 seconds |
+### Arguments
+#### Common Parameters
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--checkpoint_dir` | Directory containing model weights | "checkpoints" |
+| `--video_save_name` | Output video filename for single video generation | "output" |
+| `--video_save_folder` | Folder where all output videos are stored | "outputs/" |
+| `--input_image_or_video_path` | Input image or video path. Required for single video generation | None |
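+| `--batch_input_path` | Path to a JSONL file listing the input images or videos, one JSON object per line (see `cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl` for an example). Required for batch video generation | None |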
+| `--num_input_frames` | Number of input frames to use for Video2World prediction | 9 |
+| `--temperature` | Temperature used while sampling | 1.0 (recommend using values in sample commands provided) |
+| `--top_p` | Top-p value for top-p sampling | 0.8 (recommend using values in sample commands provided) |
+| `--seed` | Random seed | 0 |
+| `--disable_diffusion_decoder` | When set to True, use discrete tokenizer to decode discrete tokens to video. Otherwise, use diffusion decoder to decode video | False |
+| `--offload_guardrail_models` | Offload guardrail models after inference, used for low-memory GPUs | False |
+| `--offload_diffusion_decoder` | Offload diffusion decoder after inference, used for low-memory GPUs | False |
+| `--offload_ar_model` | Offload AR model after inference, used for low-memory GPUs | False |
+| `--offload_prompt_upsampler` | Offload prompt upsampler after inference, used for low-memory GPUs | False |
+#### Base Specific Parameters
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--ar_model_dir` | Directory containing AR model weight | "Cosmos-1.0-Autoregressive-4B" |
+| `--input_type` | Input type, either `video` or `image` | "video" |
+#### Video2World Specific Parameters
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--ar_model_dir` | Directory containing AR model weight | "Cosmos-1.0-Autoregressive-4B" |
+| `--input_type` | Input type, either `text_and_video` or `text_and_image` | "text_and_video" |
+| `--prompt` | Text prompt for single video generation. Required for single video generation | None |
+| `--input_prompts_path` | Path to JSONL file for batch video generation. Required for batch video generation | None |
+| `--offload_text_encoder_model` | Offload text encoder after inference, used for low-memory GPUs | False |
+### Safety Features
+The model uses a built-in safety guardrail system that cannot be disabled. Generating human faces is not allowed; any human faces in the output are blurred by the guardrail.
+For more information, check out the [Cosmos Guardrail Documentation](../guardrail/README.md).

cosmos1/models/autoregressive/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos1/models/autoregressive/assets/nemo/finetuned_result.mp4 ADDED Viewed

Binary file (193 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4 ADDED Viewed

Binary file (299 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4 ADDED Viewed

Binary file (222 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4 ADDED Viewed

Binary file (511 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4 ADDED Viewed

Binary file (461 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4 ADDED Viewed

Binary file (331 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4 ADDED Viewed

Binary file (282 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4 ADDED Viewed

Binary file (289 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4 ADDED Viewed

Binary file (170 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4 ADDED Viewed

Binary file (188 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4 ADDED Viewed

Binary file (174 kB). View file

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl ADDED Viewed

	@@ -0,0 +1,10 @@

+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4"}
+{"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4"}

cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl ADDED Viewed

	@@ -0,0 +1,10 @@

+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4"}
+{"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4"}

cosmos1/models/autoregressive/assets/v1p0/input.jpg ADDED Viewed

cosmos1/models/autoregressive/assets/v1p0/input.mp4 ADDED Viewed

Binary file (282 kB). View file

cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_12b.mp4 ADDED Viewed

Binary file (390 kB). View file

cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_13b.mp4 ADDED Viewed

Binary file (430 kB). View file

cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_12b.mp4 ADDED Viewed

Binary file (195 kB). View file

cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_13b.mp4 ADDED Viewed

Binary file (193 kB). View file

cosmos1/models/autoregressive/configs/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos1/models/autoregressive/configs/base/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos1/models/autoregressive/configs/base/model.py ADDED Viewed

	@@ -0,0 +1,118 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+import attrs
+from cosmos1.models.autoregressive.configs.base.tokenizer import TokenizerConfig
+@attrs.define
+class ModelConfig:
+    """
+    A class to hold model configuration arguments.
+    Args:
+        dim (int): The dimensionality of the input and output of each transformer block.
+        n_layers (int): Number of layers in the transformer.
+        n_heads (int): Number of attention heads.
+        n_kv_heads (Optional[int]): Number of key-value heads. If None, defaults to n_heads. Note: this is equivalent to
+            `num_gqa_groups` in TransformerEngine, where GQA means Grouped Query Attention.
+        head_dim (Optional[int]): Dimensionality of each head. If None, defaults to dim // n_heads.
+        vocab_size (int): Vocabulary size.
+        ffn_hidden_size (int): Hidden size for feedforward network.
+        norm_eps (float): Epsilon value for normalization.
+        rope_theta (float): Theta value for rotary positional embeddings.
+        apply_abs_pos_emb (bool): Whether to apply absolute position embeddings.
+        max_batch_size (int): Maximum batch size for inference.
+        max_seq_len (int): Maximum sequence length for input text.
+        fuse_qkv (bool): Whether to fuse QKV in attention. Defaults to True.
+        causal_mask (bool): Whether to use causal mask. Defaults to True.
+        norm_type (str): Type of normalization layer. Choices: "rmsnorm", "fused_rmsnorm", "layernorm", "np_layernorm".
+        precision (str): Data type for the model.
+        use_qk_normalization (bool): Whether to enable QK normalization.
+        ckpt_dir (str): Checkpoint directory.
+        ckpt_path (str): Checkpoint path.
+        apply_yarn (Optional[bool]): Whether to apply YaRN (long-context extension).
+        yarn_scale (Optional[float]): Scale factor for YaRN.
+        yarn_beta_fast (Optional[int]): Beta fast variable for YaRN (i.e., low_freq_factor in Llama 3.1 RoPE scaling code)
+        yarn_beta_slow (Optional[int]): Beta slow variable for YaRN (i.e., high_freq_factor in Llama 3.1 RoPE scaling code)
+        original_seq_len (Optional[int]): Original sequence length.
+        vision_encoder (Optional[str]): Vision encoder name.
+        mm_projector (Optional[str]): Multi-modal projector name.
+        vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Default is 3, you can specify to int larger than 3. E.g. if you have 4-channel images with the last channel as the alpha channel, set this to 4.
+        rope_dim (Optional[str]): Dimensionality of the RoPE. Choices: "1D", "3D".
+        pytorch_rope_version (Optional[str]): Version of the PyTorch RoPE implementation. Choices: "v1", "v2".
+        original_latent_shape (Optional[list]): Original shape of the latent tensor needed for rope extension.
+        pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value.
+        vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Default is 3.
+        insert_cross_attn (bool): Whether to insert the cross-attention layers after each multi-head self-attention (MSA) layer.
+        insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k TransformerLayers.
+        context_dim (Optional[int]): The dimensionality of cross-attention embedding, e.g., T5 embed feature dim.
+        num_video_frames (Optional[int]): Number of video frames.
+        video_height (Optional[int]): Raw video pixel height dimension.
+        video_width (Optional[int]): Raw video pixel width dimension.
+        video_latent_shape (Optional[list]): Video tokenizer output dimension, in (T,H,W).
+    """
+    dim: int = attrs.field(default=4096)
+    n_layers: int = attrs.field(default=32)
+    n_heads: int = attrs.field(default=32)
+    n_kv_heads: Optional[int] = attrs.field(default=8)
+    head_dim: Optional[int] = attrs.field(default=None)
+    vocab_size: int = attrs.field(default=128256)
+    ffn_hidden_size: int = attrs.field(default=14336)
+    norm_eps: float = attrs.field(default=1e-5)
+    rope_theta: float = attrs.field(default=500000)
+    apply_abs_pos_emb: bool = attrs.field(default=False)
+    max_batch_size: int = attrs.field(default=1)
+    max_seq_len: int = attrs.field(default=8192)
+    fuse_qkv: bool = attrs.field(default=False)
+    causal_mask: bool = attrs.field(default=True)
+    norm_type: str = attrs.field(default="rmsnorm")
+    precision: str = attrs.field(default="bfloat16")
+    use_qk_normalization: bool = False
+    tokenizer: Optional[TokenizerConfig] = None
+    ckpt_dir: Optional[str] = attrs.field(default=None)
+    ckpt_path: Optional[str] = attrs.field(
+        default=None
+    )  # If not None, load the model from this path instead of ckpt_dir
+    apply_yarn: Optional[bool] = attrs.field(default=False)
+    yarn_scale: Optional[float] = attrs.field(default=None)
+    yarn_beta_fast: Optional[int] = attrs.field(default=None)
+    yarn_beta_slow: Optional[int] = attrs.field(default=None)
+    original_seq_len: Optional[int] = attrs.field(default=None)
+    vision_encoder: Optional[str] = attrs.field(default=None)
+    vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
+    mm_projector: Optional[str] = attrs.field(default=None)
+    rope_dim: Optional[str] = attrs.field(default="1D")
+    pytorch_rope_version: Optional[str] = attrs.field(default="v2")
+    original_latent_shape: Optional[list] = None
+    pad_to_multiple_of: Optional[int] = None
+    vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
+    insert_cross_attn: bool = False
+    insert_cross_attn_every_k_layers: int = 1
+    context_dim: Optional[int] = attrs.field(default=1024)
+    # For video training
+    num_video_frames: Optional[int] = None
+    # Raw video pixel dimension
+    video_height: Optional[int] = None
+    video_width: Optional[int] = None
+    # Video tokenizer output dimension, in (T,H,W), it's computed by num_video_frames/temporal_compress_factor, video_height/spatial_compression_fact, video_width/spatial_compression_fact
+    video_latent_shape: Optional[list] = None
+    def __getitem__(self, item):
+        return getattr(self, item)

cosmos1/models/autoregressive/configs/base/model_config.py ADDED Viewed

	@@ -0,0 +1,421 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+from typing import Callable, List, Optional
+from cosmos1.models.autoregressive.configs.base.model import ModelConfig
+from cosmos1.models.autoregressive.configs.base.tokenizer import (
+    TextTokenizerConfig,
+    TokenizerConfig,
+    VideoTokenizerConfig,
+    create_discrete_video_fsq_tokenizer_state_dict_config,
+)
+from cosmos1.models.autoregressive.tokenizer.image_text_tokenizer import ImageTextTokenizer
+from cosmos1.models.autoregressive.tokenizer.text_tokenizer import TextTokenizer
+from cosmos1.utils import log
+from cosmos1.utils.lazy_config import LazyCall as L
+# Common architecture specifications
+BASE_CONFIG = {"n_kv_heads": 8, "norm_type": "rmsnorm", "norm_eps": 1e-5, "ffn_hidden_size": 14336}
+COSMOS_ARCHITECTURES = {
+    "4b": {
+        "n_layers": 16,
+        "dim": 4096,
+        "n_heads": 32,
+    },
+    "12b": {
+        "n_layers": 40,
+        "dim": 5120,
+        "n_heads": 32,
+        "head_dim": 128,
+    },
+}
+COSMOS_YARN_CONFIG = {
+    "original_latent_shape": [3, 40, 64],
+    "apply_yarn": True,
+    "yarn_beta_fast": 4,
+    "yarn_beta_slow": 1,
+    "yarn_scale": 2,
+}
+# Llama3 architecture specifications for different model sizes
+LLAMA3_ARCHITECTURES = {
+    "8b": {
+        "n_layers": 32,
+        "dim": 4096,
+        "n_heads": 32,
+        "ffn_hidden_size": 14336,
+    },
+}
+# Llama3.1 uses YaRN for long context support (context of 128k tokens)
+LLAMA_YARN_CONFIG = {
+    "apply_yarn": True,
+    "yarn_scale": 8,
+    "yarn_beta_fast": 4,
+    "yarn_beta_slow": 1,
+}
+# Mistral architecture specifications for different model sizes
+MISTRAL_ARCHITECTURES = {
+    "12b": {
+        "n_layers": 40,
+        "dim": 5120,
+        "n_heads": 32,
+        "ffn_hidden_size": 14336,
+        "head_dim": 128,
+    },
+}
+PIXTRAL_VISION_ARCHITECTURES = {
+    "12b": {"vision_encoder": "pixtral-12b-vit", "mm_projector": "mlp"},
+}
+def get_model_arch_specs(model_size: str, model_family: str = "mistral", pretrained: bool = False) -> dict:
+    """
+    Get the model architecture specifications for the given model size, model family and pretrained status.
+    Args:
+        model_size (str): Model size. Choices: "1b", "3b", "4b", "7b", etc.
+        model_family (str): Model family. Choices: "llama", "llama3", "llama3.1", "mistral"
+        pretrained (bool): Whether to load pretrained weights.
+    Returns:
+        dict: A dictionary containing the model architecture specifications.
+    """
+    arch_specs = copy.deepcopy(BASE_CONFIG)
+    model_size = model_size.lower()
+    if model_family.startswith("cosmos"):
+        arch_specs.update(COSMOS_ARCHITECTURES[model_size])
+    elif model_family.startswith("llama"):
+        arch_specs.update(LLAMA3_ARCHITECTURES[model_size])
+    elif model_family in ["mistral", "pixtral"]:
+        arch_specs.update(MISTRAL_ARCHITECTURES[model_size])
+        if model_family == "pixtral":
+            arch_specs.update(PIXTRAL_VISION_ARCHITECTURES[model_size])
+    else:
+        raise ValueError(f"Model family {model_family} is not supported.")
+    if pretrained:
+        if model_family == "cosmos":
+            if model_size == "12b":
+                arch_specs.update(COSMOS_YARN_CONFIG)
+                log.debug(f"Using YaRN for RoPE extension with config: {COSMOS_YARN_CONFIG}")
+            else:
+                pass
+        elif model_family in ["llama", "llama3"]:
+            pretrained_specs = {
+                "rope_theta": 500000,
+                "max_seq_len": 8192,
+                "vocab_size": 128256,
+            }
+            arch_specs.update(pretrained_specs)
+        elif model_family == "llama3.1":
+            pretrained_specs = {
+                "rope_theta": 500000,
+                "max_seq_len": 131072,
+                "original_seq_len": 8192,
+                "vocab_size": 128256,
+                **LLAMA_YARN_CONFIG,
+            }
+            arch_specs.update(pretrained_specs)
+        elif model_family == "mistral":
+            assert model_size == "12b", "We only support Mistral-Nemo-12B model."
+            pretrained_specs = {
+                "rope_theta": 1000000,
+                "max_seq_len": 128000,
+                "vocab_size": 131072,
+            }
+            arch_specs.update(pretrained_specs)
+        elif model_family == "pixtral":
+            assert model_size == "12b", "We only support Pixtral 12B model."
+            pretrained_specs = {"rope_theta": 1000000000, "max_seq_len": 128000, "vocab_size": 131072}
+            arch_specs.update(pretrained_specs)
+        else:
+            raise ValueError(f"Model family {model_family} doesn't have a pretrained config.")
+    return arch_specs
+def create_text_model_config(
+    model_ckpt_path: str,
+    tokenizer_path: str,
+    model_family: str = "mistral",
+    model_size: str = "12b",
+    is_instruct_model: bool = True,
+    max_seq_len: int = None,
+    max_batch_size: int = 1,
+    rope_dim: str = "1D",
+    add_special_tokens: bool = True,
+    pytorch_rope_version: str = None,
+) -> dict:
+    """Create a text model for training or inference.
+    Args:
+        model_ckpt_path (str): Path to the model checkpoint.
+        tokenizer_path (str): Path to the tokenizer folder.
+        model_family (str): Model family. Choices: "llama", "llama3", "llama3.1", "mistral".
+        model_size (str): Model size. Choices: "1b", "3b", "4b", "7b", "8b", "72b", etc.
+        is_instruct_model (bool): Whether the model is an instruct model.
+        inference (bool): Whether to create the model for inference.
+        max_seq_len (int): Maximum sequence length.
+        max_batch_size (int): Maximum batch size.
+        rope_dim (str): RoPE dimension. Choices: "1D", "3D".
+        add_special_tokens (bool): Whether to add special tokens.
+    Returns:
+        dict: A dictionary containing the model configuration, which can be used to instantiate the model object.
+    """
+    # Model size specific parameters
+    model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
+    if max_seq_len is not None:
+        # Override the max_seq_len if provided
+        model_arch_specs["max_seq_len"] = max_seq_len
+    if pytorch_rope_version is not None:
+        model_arch_specs["pytorch_rope_version"] = pytorch_rope_version
+    model_config = ModelConfig(
+        max_batch_size=max_batch_size,
+        precision="bfloat16",
+        ckpt_path=model_ckpt_path,
+        use_qk_normalization=False,
+        rope_dim=rope_dim,
+        **model_arch_specs,
+    )
+    tokenizer_config = TokenizerConfig(
+        text_tokenizer=TextTokenizerConfig(
+            config=L(TextTokenizer)(
+                model_family=model_family,
+                is_instruct_model=is_instruct_model,
+                local_path=tokenizer_path,
+            ),
+            data_key="text",
+            tokenizer_offset=model_config.vocab_size,
+            tokenize_here=False,
+            vocab_size=model_config.vocab_size,
+        ),
+        seq_len=model_config.max_seq_len,
+        training_type="text_only",
+        add_special_tokens=add_special_tokens,
+    )
+    return model_config, tokenizer_config
+def create_vision_language_model_config(
+    model_ckpt_path: str,
+    tokenizer_ckpt_path: str,
+    model_family: str = "pixtral",
+    model_size: str = "12b",
+    is_instruct_model: bool = True,
+    max_batch_size: int = 1,
+    rope_dim: str = "1D",
+    add_special_tokens: bool = True,
+    max_seq_len: int = None,
+    vision_encoder_in_channels: int = 3,
+    fuse_qkv: bool = False,
+    pytorch_rope_version: str = None,
+) -> dict:
+    """Create a vision-language model for training or inference.
+    Args:
+        model_ckpt_path (str): Path to the model checkpoint.
+        tokenizer_ckpt_path (str): Path to the tokenizer checkpoint.
+        model_family (str): Model family. Choices: "pixtral".
+        model_size (str): Model size. Choices: "12b".
+        is_instruct_model (bool): Whether the model is an instruct model.
+        rope_dim (str): RoPE dimension. Choices: "1D".
+        add_special_tokens (bool): Whether to add special tokens.
+        max_seq_len (int): Maximum sequence length.
+        vision_encoder_in_channels (int): Number of channels in the input image for the vision encoder. Default is 3, you can specify to int larger than 3. E.g. if you have 4 channel images where last channel is binary mask, set this to 4.
+        fuse_qkv (bool): Whether to fuse the QKV linear layers.
+    Returns:
+        dict: A dictionary containing the model configuration, which can be used to instantiate the model object.
+    """
+    # Model size specific parameters
+    model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
+    if max_seq_len is not None:
+        # Override the max_seq_len if provided
+        model_arch_specs["max_seq_len"] = max_seq_len
+    if pytorch_rope_version is not None:
+        model_arch_specs["pytorch_rope_version"] = pytorch_rope_version
+    model_config = ModelConfig(
+        max_batch_size=max_batch_size,
+        precision="bfloat16",
+        ckpt_path=model_ckpt_path,
+        use_qk_normalization=False,
+        rope_dim=rope_dim,
+        vision_encoder_in_channels=vision_encoder_in_channels,
+        fuse_qkv=fuse_qkv,
+        **model_arch_specs,
+    )
+    # Vision-language tokenizer
+    tokenizer_config = TokenizerConfig(
+        text_tokenizer=TextTokenizerConfig(
+            config=L(ImageTextTokenizer)(
+                model_family=model_family,
+                is_instruct_model=is_instruct_model,
+                image_processor_path=tokenizer_ckpt_path,
+                tokenizer_path=tokenizer_ckpt_path,
+            ),
+            data_key="image_text_interleaved",
+            tokenizer_offset=model_config.vocab_size,
+            tokenize_here=False,
+            vocab_size=model_config.vocab_size,
+        ),
+        seq_len=model_config.max_seq_len,
+        training_type="image_text_interleaved",
+        add_special_tokens=add_special_tokens,
+    )
+    return model_config, tokenizer_config
+def create_video2world_model_config(
+    model_ckpt_path: str,
+    tokenizer_ckpt_path: str,
+    model_family: str = "cosmos",
+    model_size: str = "4b",
+    pixel_chunk_duration: int = 9,
+    num_video_frames: int = 36,
+    compression_ratio: List[int] = [8, 16, 16],
+    original_seq_len: int = 8192,
+    num_condition_latents_t: int = 1,
+    num_tokens_to_ignore: int = -1,
+    batch_size: int = 2,
+    video_tokenizer_config_creator: Callable = create_discrete_video_fsq_tokenizer_state_dict_config,
+    rope_dim: str = "3D",
+    add_special_tokens: bool = True,
+    video_height: int = 384,
+    video_width: int = 640,
+    use_qk_normalization: bool = True,
+    insert_cross_attn: bool = False,
+    insert_cross_attn_every_k_layers: int = 1,
+    context_dim: int = 1024,
+    training_type: str = "video_to_video",
+    pad_to_multiple_of: Optional[int] = 64,
+    vocab_size: int = 64000,
+    apply_abs_pos_emb: bool = False,
+) -> dict:
+    """Create a video-to-world model config.
+    Args:
+        model_family (str): Model family. Choices: "llama", "llama3", "llama3.1", "mistral".
+        model_size (str): Model size. Choices: "1b", "8b", "3b".
+        pixel_chunk_duration (int): Number of frames in each chunk.
+        num_video_frames (int): Number of video frames.
+        compression_ratio (List[int]): Compression ratio for the video frames. Choices: [8, 16, 16] or [4, 8, 8].
+        original_seq_len (int): Original sequence length.
+        apply_yarn (bool): Whether to apply YaRN for long context scaling.
+        yarn_beta_fast (Optional[int]): Fast beta for YaRN.
+        yarn_beta_slow (Optional[int]): Slow beta for YaRN.
+        yarn_scale (Optional[int]): Scale factor for ctx extension.
+        use_qk_normalization (bool): Whether to use Query-Key normalization.
+        training_type (str): Type of training task.
+        batch_size (int): Batch size.
+        video_tokenizer_config_creator (Callable): Method that takes "pixel_chunk_duration: int" and "version: str" as arguments and returns video tokenizer config
+        video_tokenizer_version (str): Version of the video tokenizer.
+        num_condition_latents_t (int): Number of conditioning latent channels
+        num_tokens_to_ignore (int) = Number of tokens to ignore. This takes the precedence
+        video_height (int): Height of the video frame. Defaults to 384.
+        video_width (int): Width of the video frame. Defaults to 640.
+        rope_dim (str): RoPE dimension. Choices: "1D", "3D".
+        add_special_tokens (bool): Whether to add special tokens, use False for 2D/3D RoPE.
+        pad_to_multiple_of (int): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
+        vocab_size (int): Vocabulary size.
+        apply_abs_pos_emb (bool): Whether to apply absolute positional embeddings.
+    Returns:
+        dict: A dictionary containing the model configuration representing the model object, can be instantiated.
+    """
+    assert (
+        pixel_chunk_duration % compression_ratio[0] == 1
+    ), f"pixel_chunk_duration({pixel_chunk_duration}) should be k*n + 1 (k={compression_ratio[0]})"
+    latent_chunk_duration = (pixel_chunk_duration - 1) // compression_ratio[0] + 1
+    latent_height = video_height // compression_ratio[1]
+    latent_width = video_width // compression_ratio[2]
+    # Do some math to compute the video latent shape and sequence length
+    assert (
+        num_video_frames % pixel_chunk_duration == 0
+    ), f"num_video_frames {num_video_frames} should be divisible by pixel_chunk_duration {pixel_chunk_duration}"
+    video_latent_shape = [
+        num_video_frames // pixel_chunk_duration * latent_chunk_duration,
+        latent_height,
+        latent_width,
+    ]
+    # product of video_latent_shape
+    num_token_video_latent = video_latent_shape[0] * video_latent_shape[1] * video_latent_shape[2]
+    if add_special_tokens:
+        seq_len = num_token_video_latent + 3  # Sequence length per batch, max_seq_len + 3
+        seq_len = (seq_len + 63) // 64 * 64  # Round up to multiple of 64
+    # for text to video, we need to add <bov> token to indicate the start of the video
+    elif training_type == "text_to_video":
+        seq_len = num_token_video_latent + 1
+    else:
+        seq_len = num_token_video_latent
+    if seq_len % pad_to_multiple_of != 0:
+        # Round up to the nearest multiple of pad_to_multiple_of
+        seq_len = ((seq_len + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
+    # Model size specific parameters
+    model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
+    # Whether skip the loss for first chunk or not, note the first token is already skipped when computing the loss
+    # If num_tokens_to_ignore is specified, use it.
+    # Else compute it from num_condition_latents_t
+    if num_tokens_to_ignore < 0:
+        num_tokens_to_ignore = latent_height * latent_width * num_condition_latents_t
+        if not add_special_tokens and num_condition_latents_t > 0:
+            # If there are no special tokens (bov), do a -1 so that you can compute the loss
+            # from the first token of the next chunk
+            num_tokens_to_ignore -= 1
+    model_config = ModelConfig(
+        video_height=video_height,
+        video_width=video_width,
+        max_seq_len=seq_len,
+        max_batch_size=batch_size,
+        precision="bfloat16",
+        ckpt_path=model_ckpt_path,
+        use_qk_normalization=use_qk_normalization,
+        vocab_size=64000,
+        original_seq_len=original_seq_len,
+        video_latent_shape=video_latent_shape,
+        num_video_frames=num_video_frames,
+        rope_dim=rope_dim,
+        pad_to_multiple_of=pad_to_multiple_of,
+        insert_cross_attn=insert_cross_attn,
+        insert_cross_attn_every_k_layers=insert_cross_attn_every_k_layers,
+        context_dim=context_dim,
+        apply_abs_pos_emb=apply_abs_pos_emb,
+        **model_arch_specs,
+    )
+    video_tokenizer_config = video_tokenizer_config_creator(
+        tokenizer_ckpt_path, pixel_chunk_duration, compression_ratio
+    )
+    tokenizer_config = TokenizerConfig(
+        text_tokenizer=None,
+        video_tokenizer=VideoTokenizerConfig(
+            config=video_tokenizer_config,
+            data_key="video",
+            tokenizer_offset=0,  # Since there is no text embeddings in the model. Note this only apply when the model is trained from scratch. If we use text pretrained model, the offset will be vocab_size of text token.
+            tokenize_here=True,
+            max_seq_len=num_token_video_latent,
+            vocab_size=vocab_size,
+        ),
+        seq_len=seq_len,
+        training_type=training_type,
+        add_special_tokens=add_special_tokens,
+        pad_to_multiple_of=pad_to_multiple_of,
+    )
+    return model_config, tokenizer_config

cosmos1/models/autoregressive/configs/base/tokenizer.py ADDED Viewed

	@@ -0,0 +1,137 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+import attrs
+from cosmos1.models.autoregressive.tokenizer.discrete_video import DiscreteVideoFSQStateDictTokenizer
+from cosmos1.models.autoregressive.tokenizer.networks import CausalDiscreteVideoTokenizer
+from cosmos1.utils.lazy_config import LazyCall as L
+from cosmos1.utils.lazy_config import LazyDict
+def create_discrete_video_fsq_tokenizer_state_dict_config(
+    ckpt_path, pixel_chunk_duration=33, compression_ratio=[8, 16, 16]
+) -> LazyDict:
+    CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
+        # The new causal discrete tokenizer, that is at least 2x more efficient in memory and runtime.
+        # - It relies on fully 3D discrete wavelet transform
+        # - Uses a layer norm instead of a group norm
+        # - Factorizes full convolutions into spatial and temporal convolutions
+        # - Factorizes full attention into spatial and temporal attention
+        # - Strictly causal, with flexible temporal length at inference.
+        attn_resolutions=[32],
+        channels=128,
+        channels_mult=[2, 4, 4],
+        dropout=0.0,
+        in_channels=3,
+        num_res_blocks=2,
+        out_channels=3,
+        resolution=1024,
+        patch_size=4,
+        patch_method="haar",
+        z_channels=16,
+        z_factor=1,
+        num_groups=1,
+        legacy_mode=False,
+        spatial_compression=16,
+        temporal_compression=8,
+        embedding_dim=6,
+        levels=[8, 8, 8, 5, 5, 5],
+        name="CausalDiscreteFactorizedVideoTokenizer",
+    )
+    return L(DiscreteVideoFSQStateDictTokenizer)(
+        enc_fp=ckpt_path.replace("ema.jit", "encoder.jit"),
+        dec_fp=ckpt_path.replace("ema.jit", "decoder.jit"),
+        tokenizer_module=CausalDiscreteFactorizedVideoTokenizerConfig,
+        name="discrete_video_fsq",
+        latent_ch=6,
+        is_bf16=True,
+        pixel_chunk_duration=pixel_chunk_duration,
+        latent_chunk_duration=1 + (pixel_chunk_duration - 1) // compression_ratio[0],
+        max_enc_batch_size=8,
+        max_dec_batch_size=4,
+        levels=[8, 8, 8, 5, 5, 5],
+        compression_ratio=compression_ratio,
+    )
+@attrs.define(slots=False)
+class TextTokenizerConfig:
+    """
+    Text tokenizer config
+    Args:
+        config (LazyDict): Lazy config that defines the text tokenizer class. Required (no default).
+        data_key (str): The input key from data_dict that will be passed to the text tokenizer. Defaults to "".
+        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization. Defaults to False.
+        tokenizer_offset (int): Offset that is added to the tokens. Defaults to 0.
+        vocab_size (int): Vocabulary size of the tokenizer. Defaults to 0.
+    """
+    config: LazyDict
+    data_key: str = ""
+    tokenize_here: bool = False
+    tokenizer_offset: int = 0
+    vocab_size: int = 0
+@attrs.define(slots=False)
+class VideoTokenizerConfig:
+    """
+    Video tokenizer config
+    Args:
+        config (LazyDict): Lazy config that defines the video tokenizer class. Required (no default).
+        data_key (str): The input key from data_dict that will be passed to the video tokenizer. Defaults to "".
+        tokenize_here (bool): Whether to use the tokenizer to perform online tokenization. Defaults to True.
+        tokenizer_offset (int): Offset that is added to the tokens. In case of joint text-video tokenizers, we
+            add an offset to make sure that video tokens and text tokens don't overlap. Defaults to 0.
+        vocab_size (int): Vocabulary size of the tokenizer. Defaults to 0.
+        max_seq_len (int): Maximum token length for an input video. Defaults to -1.
+    """
+    config: LazyDict
+    data_key: str = ""
+    tokenize_here: bool = True
+    tokenizer_offset: int = 0
+    vocab_size: int = 0
+    max_seq_len: int = -1
+@attrs.define(slots=False)
+class TokenizerConfig:
+    """
+    Joint tokenizer config
+    Args:
+        text_tokenizer (Optional[TextTokenizerConfig]): Text tokenizer config, or None when no text tokenizer is used.
+        video_tokenizer (Optional[VideoTokenizerConfig]): Video tokenizer config, or None when no video tokenizer is used.
+        seq_len (int): Final token sequence length. Defaults to 4096.
+        training_type (str): Type of training we use. Supports ["text_only", "text_to_video", "class_to_image", "image_text_interleaved"]. Defaults to None.
+        add_special_tokens (bool): Whether to add special tokens to the output tokens. Defaults to True.
+        pad_to_multiple_of (Optional[int]): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
+    """
+    # NOTE(review): earlier docs also listed class_tokenizer / image_tokenizer, but this class
+    # declares no such fields -- only the two tokenizers below.
+    text_tokenizer: Optional[TextTokenizerConfig] = None
+    video_tokenizer: Optional[VideoTokenizerConfig] = None
+    seq_len: int = 4096
+    training_type: str = None
+    add_special_tokens: bool = True
+    pad_to_multiple_of: Optional[int] = 64

cosmos1/models/autoregressive/configs/inference/inference_config.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, List, Union
+import attrs
+from cosmos1.models.autoregressive.configs.base.model import ModelConfig, TokenizerConfig
+@attrs.define(slots=False)
+class DataShapeConfig:
+    latent_shape: list = []
+    num_video_frames: Union[None, int] = None
+    height: Union[None, int] = None
+    width: Union[None, int] = None
+@attrs.define(slots=False)
+class SamplingConfig:
+    """
+    Sampling config
+    Args:
+        temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
+        top_k (int): Top-k value for sampling. Defaults to None.
+        top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
+        compile_prefill (bool): Defaults to False. Presumably toggles compilation of the prefill step -- confirm against the consumer of this config.
+        compile_sampling (bool): Defaults to True. Presumably toggles compilation of the sampling step -- confirm against the consumer of this config.
+        logprobs (bool): Flag indicating whether to compute token log probabilities. Defaults to False.
+        echo (bool): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
+    """
+    temperature: float = 0.6
+    top_k: int = None
+    top_p: float = 0.9
+    compile_prefill: bool = False
+    compile_sampling: bool = True
+    logprobs: bool = False
+    echo: bool = False
+@attrs.define(slots=False)
+class DiffusionDecoderSamplingConfig:
+    """
+    Diffusion decoder sampling config
+    Args:
+        guidance (float): Guidance scale for the diffusion process. Controls how much the model follows the conditioning. Defaults to 1.8.
+        sigma_min (float): Minimum noise level for the diffusion process. Defaults to 0.02.
+        sigma (float): Initial noise level for the diffusion process. Defaults to 8.
+        num_steps (int): Number of denoising steps to perform. Defaults to 15.
+        overlap (int): Number of overlapping frames between video chunks during processing. Defaults to 2.
+        continuous_tokenizer_channel (int): Number of channels in the continuous tokenizer of diffusion decoder. Defaults to 16.
+        continuous_tokenizer_spatial_compression_ratio (int): Spatial compression ratio for the continuous tokenizer of diffusion decoder. Defaults to 8.
+        dd_train_num_video_frames (int): Number of video frames used during training for diffusion decoder. Defaults to 57.
+        max_iter (int): Defaults to 99. Presumably an upper bound on iterations -- confirm against the consumer of this config.
+        fps (int): Defaults to 24. Presumably frames per second -- confirm against the consumer of this config.
+    """
+    guidance: float = 1.8
+    sigma_min: float = 0.02
+    sigma: float = 8
+    num_steps: int = 15
+    overlap: int = 2
+    # NOTE(review): the next two names carry no type annotation, so attrs does not appear to
+    # collect them as fields -- they would be plain class attributes that cannot be set through
+    # the constructor. Confirm whether that is intended before annotating them.
+    continuous_tokenizer_channel = 16
+    continuous_tokenizer_spatial_compression_ratio = 8
+    dd_train_num_video_frames: int = 57
+    max_iter: int = 99
+    fps: int = 24
+@attrs.define(slots=False)
+class InferenceConfig:
+    """
+    Inference config
+    Args:
+        model_config (ModelConfig): Model config. Defaults to None.
+        tokenizer_config (TokenizerConfig): Tokenizer config. Defaults to None.
+        ckpt_path (str): Path to the checkpoint. Defaults to "".
+        data_shape_config (DataShapeConfig): Data shape config (latent shape, frame count, height, width). Defaults to None.
+        defaults (List[Any]): Default config-group selections; a fresh list is created for every instance.
+    """
+    model_config: ModelConfig = None
+    tokenizer_config: TokenizerConfig = None
+    ckpt_path: str = ""
+    data_shape_config: DataShapeConfig = None
+    # Built through a factory so that instances never share (and mutate) one defaults list.
+    defaults: List[Any] = attrs.field(
+        factory=lambda: [
+            "_self_",
+            {"data_val": None},
+            {"data_shape_config": "video_shape_as_model_config"},
+            {"eval_job": None},
+        ]
+    )

cosmos1/models/autoregressive/diffusion_decoder/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos1/models/autoregressive/diffusion_decoder/config/base/conditioner.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Dict, Optional
+import torch
+from cosmos1.models.diffusion.conditioner import BaseVideoCondition, GeneralConditioner
+from cosmos1.models.diffusion.config.base.conditioner import (
+    FPSConfig,
+    ImageSizeConfig,
+    LatentConditionConfig,
+    LatentConditionSigmaConfig,
+    NumFramesConfig,
+    PaddingMaskConfig,
+    TextConfig,
+)
+from cosmos1.utils.lazy_config import LazyCall as L
+from cosmos1.utils.lazy_config import LazyDict
+@dataclass
+class VideoLatentDiffusionDecoderCondition(BaseVideoCondition):
+    """Condition container for the diffusion decoder; extends BaseVideoCondition with two optional tensors."""
+    # latent_condition will concat to the input of network, along channel dim;
+    # cfg will make latent_condition all zero padding.
+    latent_condition: Optional[torch.Tensor] = None
+    # Optional tensor paired with latent_condition; presumably its noise level (sigma) -- confirm against the model code.
+    latent_condition_sigma: Optional[torch.Tensor] = None
+class VideoDiffusionDecoderConditioner(GeneralConditioner):
+    def forward(
+        self,
+        batch: Dict,
+        override_dropout_rate: Optional[Dict[str, float]] = None,
+    ) -> VideoLatentDiffusionDecoderCondition:
+        output = super()._forward(batch, override_dropout_rate)
+        return VideoLatentDiffusionDecoderCondition(**output)
+# Lazy config for VideoDiffusionDecoderConditioner: one sub-config per conditioning input,
+# including the two latent-condition entries that match the fields of
+# VideoLatentDiffusionDecoderCondition.
+VideoLatentDiffusionDecoderConditionerConfig: LazyDict = L(VideoDiffusionDecoderConditioner)(
+    text=TextConfig(),
+    fps=FPSConfig(),
+    num_frames=NumFramesConfig(),
+    image_size=ImageSizeConfig(),
+    padding_mask=PaddingMaskConfig(),
+    latent_condition=LatentConditionConfig(),
+    latent_condition_sigma=LatentConditionSigmaConfig(),
+)

cosmos1/models/autoregressive/diffusion_decoder/config/config_latent_diffusion_decoder.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, List
+import attrs
+from cosmos1.models.autoregressive.diffusion_decoder.config.registry import register_configs as register_dd_configs
+from cosmos1.models.diffusion.config.base.model import LatentDiffusionDecoderModelConfig
+from cosmos1.models.diffusion.config.registry import register_configs
+from cosmos1.utils import config
+from cosmos1.utils.config_helper import import_all_modules_from_package
+@attrs.define(slots=False)
+class Config(config.Config):
+    """Top-level config for the latent diffusion decoder; extends config.Config with default config groups."""
+    # default config groups that will be used unless overwritten
+    # see config groups in registry.py
+    # A factory is used so that every Config instance gets its own defaults list.
+    defaults: List[Any] = attrs.field(
+        factory=lambda: [
+            "_self_",
+            {"net": None},
+            {"conditioner": "basic"},
+            {"tokenizer": "tokenizer"},
+            {"tokenizer_corruptor": None},
+            {"latent_corruptor": None},
+            {"pixel_corruptor": None},
+            {"experiment": None},
+        ]
+    )
+def make_config():
+    c = Config(model=LatentDiffusionDecoderModelConfig())
+    # Specifying values through instances of attrs
+    c.job.project = "cosmos_video4"
+    c.job.group = "debug"
+    c.job.name = "delete_${now:%Y-%m-%d}_${now:%H-%M-%S}"
+    # Call this function to register config groups for advanced overriding.
+    register_configs()
+    register_dd_configs()
+    # experiment config are defined in the experiment folder
+    # call import_all_modules_from_package to register them
+    import_all_modules_from_package("cosmos1.models.diffusion.config.inference", reload=True)
+    import_all_modules_from_package("cosmos1.models.autoregressive.diffusion_decoder.config.inference", reload=True)
+    return c

cosmos1/models/autoregressive/diffusion_decoder/config/inference/cosmos_diffusiondecoder_7b.py ADDED Viewed

	@@ -0,0 +1,85 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from hydra.core.config_store import ConfigStore
+from cosmos1.models.autoregressive.diffusion_decoder.network import DiffusionDecoderGeneralDIT
+from cosmos1.utils.lazy_config import LazyCall as L
+from cosmos1.utils.lazy_config import LazyDict
+# Number of pixel frames per chunk; reused below for latent_shape, the tokenizer corruptor
+# and the video tokenizer so that all three stay consistent.
+num_frames = 57
+# Inference-only experiment config for the 7B diffusion decoder.
+Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY: LazyDict = LazyDict(
+    dict(
+        # Config-group overrides; "_self_" comes last in this list.
+        defaults=[
+            {"override /net": "faditv2_7b"},
+            {"override /tokenizer": "cosmos_video_tokenizer_res720_comp8x8x8_t121_ver092624"},
+            {"override /conditioner": "video_latent_diffusion_decoder_cond"},
+            {"override /tokenizer_corruptor": "cosmos_video_discrete_tokenizer_res720_comp8x16x16_t49_ver110224"},
+            "_self_",
+        ],
+        job=dict(
+            group="diffusion_deocder_FT_7Bv1_001",
+            name="DD_FT_7Bv1_003_002_tokenizer888_spatch2_discrete_cond_on_token",
+        ),
+        model=dict(
+            diffusion_decoder_cond_sigma_low=0.0,
+            diffusion_decoder_cond_sigma_high=0.0,
+            diffusion_decoder_corrupt_prob=0.0,
+            condition_on_tokenizer_corruptor_token=True,
+            # presumably [channels, frames, height, width] -- TODO confirm axis order against the model code
+            latent_shape=[
+                16,
+                num_frames,
+                88,
+                160,
+            ],
+            tokenizer_corruptor=dict(
+                pixel_chunk_duration=num_frames,
+                # 1 + (57 - 1) // 8 == 8 latent frames per chunk
+                latent_chunk_duration=1 + (num_frames - 1) // 8,
+            ),
+            net=L(DiffusionDecoderGeneralDIT)(
+                diffusion_decoder_condition_on_sigma=False,
+                max_img_h=240,
+                max_img_w=240,
+                rope_h_extrapolation_ratio=1.5,
+                rope_w_extrapolation_ratio=1.5,
+                rope_t_extrapolation_ratio=1,
+                block_x_format="THWBD",
+                is_diffusion_decoder=True,
+                patch_spatial=2,
+                diffusion_decoder_condition_on_token=True,
+                diffusion_decoder_token_condition_voc_size=64000,
+                diffusion_decoder_token_condition_dim=32,
+            ),
+            tokenizer=dict(
+                video_vae=dict(
+                    pixel_chunk_duration=num_frames,
+                )
+            ),
+            conditioner=dict(
+                latent_condition=dict(
+                    dropout_rate=0.2,
+                )
+            ),
+        ),
+    )
+)
+# Register the experiment in the "experiment" group, under the job name defined above.
+cs = ConfigStore.instance()
+cs.store(
+    group="experiment",
+    package="_global_",
+    name=Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY["job"]["name"],
+    node=Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY,
+)

cosmos1/models/autoregressive/diffusion_decoder/config/registry.py ADDED Viewed

	@@ -0,0 +1,118 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from hydra.core.config_store import ConfigStore
+from cosmos1.models.autoregressive.diffusion_decoder.config.base.conditioner import (
+    VideoLatentDiffusionDecoderConditionerConfig,
+)
+from cosmos1.models.autoregressive.tokenizer.discrete_video import DiscreteVideoFSQJITTokenizer
+from cosmos1.models.diffusion.module.pretrained_vae import JITVAE, JointImageVideoSharedJITTokenizer, VideoJITTokenizer
+from cosmos1.utils.lazy_config import LazyCall as L
+def get_cosmos_video_discrete_tokenizer_comp8x16x16(
+    resolution: str,
+    chunk_duration: int,
+    checkpoint_path: str,
+):
+    assert resolution in ["720"]
+    pixel_chunk_duration = chunk_duration
+    temporal_compression_factor = 8
+    spatial_compression_factor = 16
+    return L(DiscreteVideoFSQJITTokenizer)(
+        enc_fp=checkpoint_path.replace(".jit", "encoder.jit"),
+        dec_fp=checkpoint_path.replace(".jit", "decoder.jit"),
+        name="discrete_video_fsq",
+        latent_ch=6,
+        is_bf16=True,
+        pixel_chunk_duration=pixel_chunk_duration,
+        latent_chunk_duration=1 + (pixel_chunk_duration - 1) // temporal_compression_factor,
+        max_enc_batch_size=8,
+        max_dec_batch_size=4,
+        levels=[8, 8, 8, 5, 5, 5],
+        compression_ratio=[temporal_compression_factor, spatial_compression_factor, spatial_compression_factor],
+    )
+def get_cosmos_video_tokenizer_comp8x8x8(resolution: str, chunk_duration: int, checkpoint_path=None):
+    pixel_chunk_duration = chunk_duration
+    temporal_compression_factor = 8
+    spatial_compression_factor = 8
+    return L(JointImageVideoSharedJITTokenizer)(
+        video_vae=L(VideoJITTokenizer)(
+            name="cosmos_1_0_diffusion_tokenizer",
+            latent_ch=16,
+            is_bf16=True,
+            pixel_chunk_duration=pixel_chunk_duration,
+            temporal_compression_factor=temporal_compression_factor,
+            spatial_compression_factor=spatial_compression_factor,
+            spatial_resolution=resolution,
+        ),
+        image_vae=L(JITVAE)(
+            name="cosmos_1_0_diffusion_tokenizer",
+            latent_ch=16,
+            is_image=False,
+            is_bf16=True,
+        ),
+        name="cosmos_diffusion_tokenizer_res720_comp8x8x8_t121_ver092624",
+        latent_ch=16,
+    )
+def register_tokenizer(cs):
+    cs.store(
+        group="tokenizer",
+        package="model.tokenizer",
+        name="cosmos_video_tokenizer_res720_comp8x8x8_t121_ver092624",
+        node=get_cosmos_video_tokenizer_comp8x8x8(
+            resolution="720",
+            chunk_duration=121,
+            checkpoint_path="checkpoints/Cosmos-1.0-Tokenizer-CV8x8x8/.jit",
+        ),
+    )
+def register_corruptor(cs):
+    cs.store(
+        group="tokenizer_corruptor",
+        package="model.tokenizer_corruptor",
+        name="cosmos_video_discrete_tokenizer_res720_comp8x16x16_t49_ver110224",
+        node=get_cosmos_video_discrete_tokenizer_comp8x16x16(
+            resolution="720",
+            chunk_duration=49,
+            checkpoint_path="checkpoints/Cosmos-1.0-Tokenizer-DV8x16x16/.jit",
+        ),
+    )
+def register_conditioner(cs):
+    cs.store(
+        group="conditioner",
+        package="model.conditioner",
+        name="video_latent_diffusion_decoder_cond",
+        node=VideoLatentDiffusionDecoderConditionerConfig,
+    )
+def register_configs():
+    cs = ConfigStore.instance()
+    register_conditioner(cs)
+    register_corruptor(cs)
+    register_tokenizer(cs)

cosmos1/models/autoregressive/diffusion_decoder/inference.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import gc
+from typing import List
+import torch
+from cosmos1.models.autoregressive.configs.inference.inference_config import DiffusionDecoderSamplingConfig
+from cosmos1.models.autoregressive.diffusion_decoder.model import LatentDiffusionDecoderModel
+from cosmos1.models.autoregressive.diffusion_decoder.utils import linear_blend_video_list, split_with_overlap
+from cosmos1.utils import log
+def diffusion_decoder_process_tokens(
+    model: LatentDiffusionDecoderModel,
+    indices_tensor: List[torch.Tensor],
+    dd_sampling_config: DiffusionDecoderSamplingConfig = None,
+    original_video_example: torch.Tensor = None,
+    t5_emb_batch: List[torch.Tensor] = None,
+):
+    _, T, H, W = original_video_example.shape
+    if dd_sampling_config is None:
+        dd_sampling_config = DiffusionDecoderSamplingConfig()
+    # indices_tensor is assumed to be a list of tensors with shape 1LHW
+    data_batch_list = []
+    for sample_num, token_CTHW in enumerate(indices_tensor):
+        token_BCTHW = token_CTHW.unsqueeze(0).unsqueeze(1)
+        token_BCTHW = split_with_overlap(
+            token_BCTHW,
+            (dd_sampling_config.dd_train_num_video_frames - 1) // 8 + 1,
+            overlap=dd_sampling_config.overlap,
+            tobf16=False,
+        )
+        data_batch_list.append(
+            {
+                "token_chunks": token_BCTHW,
+                "t5_text_embeddings": t5_emb_batch[sample_num].to(torch.bfloat16),
+                "t5_text_mask": torch.ones(1, 512, dtype=torch.bfloat16).cuda(),
+                # other conditions
+                "image_size": torch.tensor([[H, W, H, W]] * 1, dtype=torch.bfloat16).cuda(),
+                "fps": torch.tensor([dd_sampling_config.fps] * 1, dtype=torch.bfloat16).cuda(),
+                "num_frames": torch.tensor(
+                    [dd_sampling_config.dd_train_num_video_frames] * 1, dtype=torch.bfloat16
+                ).cuda(),
+                "padding_mask": torch.zeros((1, 1, H, W), dtype=torch.bfloat16).cuda(),
+            }
+        )
+    out_videos_batch = []
+    for idx, data_batch_template in enumerate(data_batch_list):
+        full_length_sample = []
+        iterations = min(len(data_batch_template["token_chunks"]), dd_sampling_config.max_iter)
+        for iter in range(iterations):
+            gc.collect()
+            torch.cuda.empty_cache()
+            data_batch = copy.deepcopy(data_batch_template)
+            data_batch["video"] = data_batch_template["token_chunks"][iter].cuda().to("cuda")
+            log.debug(f"Run iter {iter} for video # {idx} at length {data_batch['video'].shape[2]}")
+            # org_video,
+            with torch.no_grad():
+                samples_latent = model.generate_samples_from_batch(
+                    data_batch,
+                    guidance=dd_sampling_config.guidance,
+                    sigma_min=dd_sampling_config.sigma_min,
+                    state_shape=[
+                        dd_sampling_config.continuous_tokenizer_channel,
+                        dd_sampling_config.continuous_tokenizer_spatial_compression_ratio,
+                        H // 8,
+                        W // 8,
+                    ],
+                    apply_corruptor=False,
+                    return_recon_x=False,
+                    # corrupt_sigma=dd_sampling_config.sigma,
+                    preencode_condition=True,  # We are using discrete model, so the input is already pre-encoded
+                    num_steps=dd_sampling_config.num_steps,
+                )
+                log.debug(f"Current sample shape {samples_latent.shape} for video # {idx} ")
+            full_length_sample.append(samples_latent.detach())
+            # Turn off because we remove CP
+            # distributed.barrier()
+            del data_batch
+            torch.cuda.empty_cache()
+        gc.collect()
+        torch.cuda.empty_cache()
+        # Decode full-length samples and free GPU memory
+        full_length_sample_pixs = [model.decode(item).clamp(-1, 1).cpu() for item in full_length_sample]
+        torch.cuda.empty_cache()
+        # Blend pixel samples
+        if len(full_length_sample_pixs) > 1:
+            full_length_sample_pixel_blend = linear_blend_video_list(
+                full_length_sample_pixs, dd_sampling_config.overlap
+            )[:, :, :T]
+        else:
+            full_length_sample_pixel_blend = full_length_sample_pixs[0][:, :, :T]
+        # Batch size of full_length_sample_pixel_blend is always 1
+        out_videos_batch.append((1 + full_length_sample_pixel_blend[0].cpu()) / 2)
+    return out_videos_batch

cosmos1/models/autoregressive/diffusion_decoder/model.py ADDED Viewed

	@@ -0,0 +1,231 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Callable, Dict, Optional, Tuple
+import torch
+from torch import Tensor
+from cosmos1.models.diffusion.conditioner import BaseVideoCondition
+from cosmos1.models.diffusion.diffusion.functional.batch_ops import batch_mul
+from cosmos1.models.diffusion.diffusion.modules.res_sampler import COMMON_SOLVER_OPTIONS
+from cosmos1.models.diffusion.model.model_t2w import DiffusionT2WModel as VideoDiffusionModel
+from cosmos1.utils.lazy_config import instantiate as lazy_instantiate
+@dataclass
+class VideoLatentDiffusionDecoderCondition(BaseVideoCondition):
+    # latent_condition will concat to the input of network, along channel dim;
+    # cfg will make latent_condition all zero padding.
+    latent_condition: Optional[torch.Tensor] = None
+    latent_condition_sigma: Optional[torch.Tensor] = None
+class LatentDiffusionDecoderModel(VideoDiffusionModel):
+    def __init__(self, config):
+        super().__init__(config)
+        """
+        latent_corruptor: the corruption module is used to corrupt the latents. It add gaussian noise to the latents.
+        pixel_corruptor: the corruption module is used to corrupt the pixels. It apply gaussian blur kernel to pixels in a temporal consistent way.
+        tokenizer_corruptor: the corruption module is used to simulate tokenizer reconstruction errors.
+        diffusion decoder noise augmentation pipeline for continuous token condition model:
+        condition: GT_video [T, H, W]
+                        -> tokenizer_corruptor~(8x8x8) encode -> latent_corruptor -> tokenizer_corruptor~(8x8x8) decode
+                        -> pixel corruptor
+                        -> tokenizer~(1x8x8) encode -> condition [T, H/8, W/8]
+        GT: GT_video [T, H, W] -> tokenizer~(1x8x8) -> x_t [T, H/8, W/8].
+        diffusion decoder noise augmentation pipeline for discrete token condition model:
+        condition: GT_video [T, H, W]
+                -> pixel corruptor
+                -> discrete tokenizer encode -> condition [T, T/8, H/16, W/16]
+        GT: GT_video [T, H, W] -> tokenizer~(8x8x8) -> x_t [T, T/8, H/8, W/8].
+        """
+        self.latent_corruptor = lazy_instantiate(config.latent_corruptor)
+        self.pixel_corruptor = lazy_instantiate(config.pixel_corruptor)
+        self.tokenizer_corruptor = lazy_instantiate(config.tokenizer_corruptor)
+        if self.latent_corruptor:
+            self.latent_corruptor.to(**self.tensor_kwargs)
+        if self.pixel_corruptor:
+            self.pixel_corruptor.to(**self.tensor_kwargs)
+        if self.tokenizer_corruptor:
+            if hasattr(self.tokenizer_corruptor, "reset_dtype"):
+                self.tokenizer_corruptor.reset_dtype()
+        else:
+            assert self.pixel_corruptor is not None
+        self.diffusion_decoder_cond_sigma_low = config.diffusion_decoder_cond_sigma_low
+        self.diffusion_decoder_cond_sigma_high = config.diffusion_decoder_cond_sigma_high
+        self.diffusion_decoder_corrupt_prob = config.diffusion_decoder_corrupt_prob
+        if hasattr(config, "condition_on_tokenizer_corruptor_token"):
+            self.condition_on_tokenizer_corruptor_token = config.condition_on_tokenizer_corruptor_token
+        else:
+            self.condition_on_tokenizer_corruptor_token = False
+    def is_image_batch(self, data_batch: dict[str, Tensor]) -> bool:
+        """We hanlde two types of data_batch. One comes from a joint_dataloader where "dataset_name" can be used to differenciate image_batch and video_batch.
+        Another comes from a dataloader which we by default assumes as video_data for video model training.
+        """
+        is_image = self.input_image_key in data_batch
+        is_video = self.input_data_key in data_batch
+        assert (
+            is_image != is_video
+        ), "Only one of the input_image_key or input_data_key should be present in the data_batch."
+        return is_image
+    def get_x0_fn_from_batch(
+        self,
+        data_batch: Dict,
+        guidance: float = 1.5,
+        is_negative_prompt: bool = False,
+        apply_corruptor: bool = True,
+        corrupt_sigma: float = 1.5,
+        preencode_condition: bool = False,
+    ) -> Callable:
+        """
+        Generates a callable function `x0_fn` based on the provided data batch and guidance factor.
+        This function first processes the input data batch through a conditioning workflow (`conditioner`) to obtain conditioned and unconditioned states. It then defines a nested function `x0_fn` which applies a denoising operation on an input `noise_x` at a given noise level `sigma` using both the conditioned and unconditioned states.
+        Args:
+        - data_batch (Dict): A batch of data used for conditioning. The format and content of this dictionary should align with the expectations of the `self.conditioner`
+        - guidance (float, optional): A scalar value that modulates the influence of the conditioned state relative to the unconditioned state in the output. Defaults to 1.5.
+        - is_negative_prompt (bool): use negative prompt t5 in uncondition if true
+        Returns:
+        - Callable: A function `x0_fn(noise_x, sigma)` that takes two arguments, `noise_x` and `sigma`, and return x0 predictoin
+        The returned function is suitable for use in scenarios where a denoised state is required based on both conditioned and unconditioned inputs, with an adjustable level of guidance influence.
+        """
+        input_key = self.input_data_key  # by default it is video key
+        # Latent state
+        raw_state = data_batch[input_key]
+        if self.condition_on_tokenizer_corruptor_token:
+            if preencode_condition:
+                latent_condition = raw_state.to(torch.int32).contiguous()
+                corrupted_pixel = self.tokenizer_corruptor.decode(latent_condition[:, 0])
+            else:
+                corrupted_pixel = (
+                    self.pixel_corruptor(raw_state) if apply_corruptor and self.pixel_corruptor else raw_state
+                )
+                latent_condition = self.tokenizer_corruptor.encode(corrupted_pixel)
+                latent_condition = latent_condition[1] if isinstance(latent_condition, tuple) else latent_condition
+                corrupted_pixel = self.tokenizer_corruptor.decode(latent_condition)
+                latent_condition = latent_condition.unsqueeze(1)
+        else:
+            if preencode_condition:
+                latent_condition = raw_state
+                corrupted_pixel = self.decode(latent_condition)
+            else:
+                corrupted_pixel = (
+                    self.pixel_corruptor(raw_state) if apply_corruptor and self.pixel_corruptor else raw_state
+                )
+                latent_condition = self.encode(corrupted_pixel).contiguous()
+        sigma = (
+            torch.rand((latent_condition.shape[0],)).to(**self.tensor_kwargs) * corrupt_sigma
+        )  # small value to indicate clean video
+        _, _, _, c_noise_cond = self.scaling(sigma=sigma)
+        if corrupt_sigma != self.diffusion_decoder_cond_sigma_low and self.diffusion_decoder_corrupt_prob > 0:
+            noise = batch_mul(sigma, torch.randn_like(latent_condition))
+            latent_condition = latent_condition + noise
+        data_batch["latent_condition_sigma"] = batch_mul(torch.ones_like(latent_condition[:, 0:1, ::]), c_noise_cond)
+        data_batch["latent_condition"] = latent_condition
+        if is_negative_prompt:
+            condition, uncondition = self.conditioner.get_condition_with_negative_prompt(data_batch)
+        else:
+            condition, uncondition = self.conditioner.get_condition_uncondition(data_batch)
+        def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
+            cond_x0 = self.denoise(noise_x, sigma, condition).x0
+            uncond_x0 = self.denoise(noise_x, sigma, uncondition).x0
+            return cond_x0 + guidance * (cond_x0 - uncond_x0)
+        return x0_fn, corrupted_pixel
+    def generate_samples_from_batch(
+        self,
+        data_batch: Dict,
+        guidance: float = 1.5,
+        seed: int = 1,
+        state_shape: Tuple | None = None,
+        n_sample: int | None = None,
+        is_negative_prompt: bool = False,
+        num_steps: int = 35,
+        solver_option: COMMON_SOLVER_OPTIONS = "2ab",
+        sigma_min: float = 0.02,
+        apply_corruptor: bool = False,
+        return_recon_x: bool = False,
+        corrupt_sigma: float = 0.01,
+        preencode_condition: bool = False,
+    ) -> Tensor:
+        """
+        Generate samples from the batch. Based on given batch, it will automatically determine whether to generate image or video samples.
+        Args:
+            data_batch (dict): raw data batch draw from the training data loader.
+            iteration (int): Current iteration number.
+            guidance (float): guidance weights
+            seed (int): random seed
+            state_shape (tuple): shape of the state, default to self.state_shape if not provided
+            n_sample (int): number of samples to generate
+            is_negative_prompt (bool): use negative prompt t5 in uncondition if true
+            num_steps (int): number of steps for the diffusion process
+            solver_option (str): differential equation solver option, default to "2ab"~(mulitstep solver)
+            preencode_condition (bool): use pre-computed condition if true, save tokenizer's inference time memory/
+        """
+        if not preencode_condition:
+            self._normalize_video_databatch_inplace(data_batch)
+            self._augment_image_dim_inplace(data_batch)
+        is_image_batch = False
+        if n_sample is None:
+            input_key = self.input_image_key if is_image_batch else self.input_data_key
+            n_sample = data_batch[input_key].shape[0]
+        if state_shape is None:
+            if is_image_batch:
+                state_shape = (self.state_shape[0], 1, *self.state_shape[2:])  # C,T,H,W
+        x0_fn, recon_x = self.get_x0_fn_from_batch(
+            data_batch,
+            guidance,
+            is_negative_prompt=is_negative_prompt,
+            apply_corruptor=apply_corruptor,
+            corrupt_sigma=corrupt_sigma,
+            preencode_condition=preencode_condition,
+        )
+        generator = torch.Generator(device=self.tensor_kwargs["device"])
+        generator.manual_seed(seed)
+        x_sigma_max = (
+            torch.randn(n_sample, *state_shape, **self.tensor_kwargs, generator=generator) * self.sde.sigma_max
+        )
+        samples = self.sampler(
+            x0_fn,
+            x_sigma_max,
+            num_steps=num_steps,
+            sigma_min=sigma_min,
+            sigma_max=self.sde.sigma_max,
+            solver_option=solver_option,
+        )
+        if return_recon_x:
+            return samples, recon_x
+        else:
+            return samples

cosmos1/models/autoregressive/diffusion_decoder/network.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple
+import torch
+from einops import rearrange
+from torch import nn
+from torchvision import transforms
+from cosmos1.models.diffusion.module.blocks import PatchEmbed
+from cosmos1.models.diffusion.networks.general_dit import GeneralDIT
+class DiffusionDecoderGeneralDIT(GeneralDIT):
+    def __init__(
+        self,
+        *args,
+        is_diffusion_decoder: bool = True,
+        diffusion_decoder_condition_on_sigma: bool = False,
+        diffusion_decoder_condition_on_token: bool = False,
+        diffusion_decoder_token_condition_voc_size: int = 64000,
+        diffusion_decoder_token_condition_dim: int = 32,
+        **kwargs,
+    ):
+        # diffusion decoder setting
+        self.is_diffusion_decoder = is_diffusion_decoder
+        self.diffusion_decoder_condition_on_sigma = diffusion_decoder_condition_on_sigma
+        self.diffusion_decoder_condition_on_token = diffusion_decoder_condition_on_token
+        self.diffusion_decoder_token_condition_voc_size = diffusion_decoder_token_condition_voc_size
+        self.diffusion_decoder_token_condition_dim = diffusion_decoder_token_condition_dim
+        super().__init__(*args, **kwargs)
+    def initialize_weights(self):
+        # Initialize transformer layers:
+        super().initialize_weights()
+        if self.diffusion_decoder_condition_on_token:
+            nn.init.constant_(self.token_embedder.weight, 0)
+    def build_patch_embed(self):
+        (
+            concat_padding_mask,
+            in_channels,
+            patch_spatial,
+            patch_temporal,
+            model_channels,
+            is_diffusion_decoder,
+            diffusion_decoder_token_condition_dim,
+            diffusion_decoder_condition_on_sigma,
+        ) = (
+            self.concat_padding_mask,
+            self.in_channels,
+            self.patch_spatial,
+            self.patch_temporal,
+            self.model_channels,
+            self.is_diffusion_decoder,
+            self.diffusion_decoder_token_condition_dim,
+            self.diffusion_decoder_condition_on_sigma,
+        )
+        in_channels = (
+            in_channels + in_channels
+            if (is_diffusion_decoder and not self.diffusion_decoder_condition_on_token)
+            else in_channels
+        )
+        in_channels = in_channels + 1 if diffusion_decoder_condition_on_sigma else in_channels
+        in_channels = (
+            in_channels + self.diffusion_decoder_token_condition_dim
+            if self.diffusion_decoder_condition_on_token
+            else in_channels
+        )
+        in_channels = in_channels + 1 if concat_padding_mask else in_channels
+        self.x_embedder = PatchEmbed(
+            spatial_patch_size=patch_spatial,
+            temporal_patch_size=patch_temporal,
+            in_channels=in_channels,
+            out_channels=model_channels,
+            bias=False,
+        )
+        if self.diffusion_decoder_condition_on_token:
+            self.token_embedder = nn.Embedding(
+                self.diffusion_decoder_token_condition_voc_size, self.diffusion_decoder_token_condition_dim
+            )
+    def prepare_embedded_sequence(
+        self,
+        x_B_C_T_H_W: torch.Tensor,
+        fps: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        latent_condition: Optional[torch.Tensor] = None,
+        latent_condition_sigma: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Prepares an embedded sequence tensor by applying positional embeddings and handling padding masks.
+        Args:
+            x_B_C_T_H_W (torch.Tensor): video
+            fps (Optional[torch.Tensor]): Frames per second tensor to be used for positional embedding when required.
+                                    If None, a default value (`self.base_fps`) will be used.
+            padding_mask (Optional[torch.Tensor]): current it is not used
+        Returns:
+            Tuple[torch.Tensor, Optional[torch.Tensor]]:
+                - A tensor of shape (B, T, H, W, D) with the embedded sequence.
+                - An optional positional embedding tensor, returned only if the positional embedding class
+                (`self.pos_emb_cls`) includes 'rope'. Otherwise, None.
+        Notes:
+            - If `self.concat_padding_mask` is True, a padding mask channel is concatenated to the input tensor.
+            - The method of applying positional embeddings depends on the value of `self.pos_emb_cls`.
+            - If 'rope' is in `self.pos_emb_cls` (case insensitive), the positional embeddings are generated using
+                the `self.pos_embedder` with the shape [T, H, W].
+            - If "fps_aware" is in `self.pos_emb_cls`, the positional embeddings are generated using the `self.pos_embedder`
+                with the fps tensor.
+            - Otherwise, the positional embeddings are generated without considering fps.
+        """
+        if self.diffusion_decoder_condition_on_token:
+            latent_condition = self.token_embedder(latent_condition)
+            B, _, T, H, W, _ = latent_condition.shape
+            latent_condition = rearrange(latent_condition, "B 1 T H W D -> (B T) (1 D) H W")
+            latent_condition = transforms.functional.resize(
+                latent_condition, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.BILINEAR
+            )
+            latent_condition = rearrange(latent_condition, "(B T) D H W -> B D T H W ", B=B, T=T)
+        x_B_C_T_H_W = torch.cat([x_B_C_T_H_W, latent_condition], dim=1)
+        if self.diffusion_decoder_condition_on_sigma:
+            x_B_C_T_H_W = torch.cat([x_B_C_T_H_W, latent_condition_sigma], dim=1)
+        if self.concat_padding_mask:
+            padding_mask = transforms.functional.resize(
+                padding_mask, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
+            )
+            x_B_C_T_H_W = torch.cat(
+                [x_B_C_T_H_W, padding_mask.unsqueeze(1).repeat(1, 1, x_B_C_T_H_W.shape[2], 1, 1)], dim=1
+            )
+        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)
+        if self.extra_per_block_abs_pos_emb:
+            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps)
+        else:
+            extra_pos_emb = None
+        if "rope" in self.pos_emb_cls.lower():
+            return x_B_T_H_W_D, self.pos_embedder(x_B_T_H_W_D, fps=fps), extra_pos_emb
+        if "fps_aware" in self.pos_emb_cls:
+            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, fps=fps)  # [B, T, H, W, D]
+        else:
+            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D)  # [B, T, H, W, D]
+        return x_B_T_H_W_D, None, extra_pos_emb