EthanZyh committed on
Commit
01a383f
·
1 Parent(s): 3edb341

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +39 -0
  2. .flake8 +25 -0
  3. .github/workflows/lint.yml +35 -0
  4. .gitignore +243 -0
  5. .pre-commit-config.yaml +53 -0
  6. ATTRIBUTIONS.md +1437 -0
  7. CONTRIBUTING.md +59 -0
  8. Dockerfile +43 -0
  9. INSTALL.md +20 -0
  10. LICENSE +201 -0
  11. README.md +78 -0
  12. RELEASE.md +7 -0
  13. assets/cosmos-logo.png +0 -0
  14. checkpoints/README.md +3 -0
  15. cosmos1/models/POST_TRAINING.md +23 -0
  16. cosmos1/models/autoregressive/README.md +427 -0
  17. cosmos1/models/autoregressive/__init__.py +14 -0
  18. cosmos1/models/autoregressive/assets/nemo/finetuned_result.mp4 +0 -0
  19. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4 +0 -0
  20. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4 +0 -0
  21. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4 +0 -0
  22. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4 +0 -0
  23. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4 +0 -0
  24. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4 +0 -0
  25. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4 +0 -0
  26. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4 +0 -0
  27. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4 +0 -0
  28. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4 +0 -0
  29. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl +10 -0
  30. cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl +10 -0
  31. cosmos1/models/autoregressive/assets/v1p0/input.jpg +0 -0
  32. cosmos1/models/autoregressive/assets/v1p0/input.mp4 +0 -0
  33. cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_12b.mp4 +0 -0
  34. cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_13b.mp4 +0 -0
  35. cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_12b.mp4 +0 -0
  36. cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_13b.mp4 +0 -0
  37. cosmos1/models/autoregressive/configs/__init__.py +14 -0
  38. cosmos1/models/autoregressive/configs/base/__init__.py +14 -0
  39. cosmos1/models/autoregressive/configs/base/model.py +118 -0
  40. cosmos1/models/autoregressive/configs/base/model_config.py +421 -0
  41. cosmos1/models/autoregressive/configs/base/tokenizer.py +137 -0
  42. cosmos1/models/autoregressive/configs/inference/inference_config.py +102 -0
  43. cosmos1/models/autoregressive/diffusion_decoder/__init__.py +14 -0
  44. cosmos1/models/autoregressive/diffusion_decoder/config/base/conditioner.py +61 -0
  45. cosmos1/models/autoregressive/diffusion_decoder/config/config_latent_diffusion_decoder.py +61 -0
  46. cosmos1/models/autoregressive/diffusion_decoder/config/inference/cosmos_diffusiondecoder_7b.py +85 -0
  47. cosmos1/models/autoregressive/diffusion_decoder/config/registry.py +118 -0
  48. cosmos1/models/autoregressive/diffusion_decoder/inference.py +120 -0
  49. cosmos1/models/autoregressive/diffusion_decoder/model.py +231 -0
  50. cosmos1/models/autoregressive/diffusion_decoder/network.py +163 -0
.dockerignore ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Please keep below sorted alphabetically
17
+
18
+ __pycache__
19
+ .cache
20
+ .coverage
21
+ .coverage.*
22
+ .DS_Store
23
+ .env
24
+ .git
25
+ .gitignore
26
+ .pytest_cache
27
+ .Python
28
+ .tox
29
+ .venv
30
+ *.cover
31
+ *.log
32
+ *.pyc
33
+ *.pyd
34
+ *.pyo
35
+ coverage.xml
36
+ env
37
+ nosetests.xml
38
+ pip-delete-this-directory.txt
39
+ pip-log.txt
.flake8 ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ [flake8]
17
+ enable-extensions = G
18
+ select = B,C,E,F,G,P,SIM1,T4,W,B9
19
+ max-line-length = 120
20
+ # C408 ignored because we like the dict keyword argument syntax
21
+ # E501 is not flexible enough, we're using B950 instead
22
+ ignore =
23
+ E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,E226,E265
24
+ exclude =
25
+ third_party
.github/workflows/lint.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # lint.yml : A workflow to trigger lint tests on GitHub
17
+ name: 'Lint'
18
+ on:
19
+ pull_request:
20
+ workflow_dispatch:
21
+ jobs:
22
+ lint:
23
+ name: 'Linting'
24
+ runs-on: ubuntu-latest
25
+ steps:
26
+ - name: 'Checkout'
27
+ uses: actions/checkout@v4
28
+ - name: 'Setup Python'
29
+ uses: actions/setup-python@v5
30
+ with:
31
+ python-version: 'pypy3.10'
32
+ - name: 'Lint'
33
+ run: |
34
+ sudo apt-get update
35
+ bash ./cosmos1/scripts/format.sh
.gitignore ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Misc
17
+ outputs/
18
+ checkpoints/*
19
+ !checkpoints/README.md
20
+
21
+ # Data types
22
+ *.jit
23
+ *.pt
24
+ *.hdr
25
+ *.webp
26
+ *.pgm
27
+ *.tiff
28
+ *.tif
29
+ *.tar
30
+ *.tar.gz
31
+ *.gz
32
+ *.pkl
33
+ *.pt
34
+ *.bin
35
+
36
+ # Other uncheckable file types
37
+ *.zip
38
+ *.exe
39
+ *.dll
40
+ *.swp
41
+ *.vscode
42
+ *.ipynb
43
+ *.DS_Store
44
+ *.pyc
45
+ *Thumbs.db
46
+ *.patch
47
+
48
+ # Credential information that should never be checked in
49
+ credentials
50
+ *.secret
51
+
52
+ # ------------------------ BELOW IS AUTO-GENERATED FOR PYTHON REPOS ------------------------
53
+
54
+ # Byte-compiled / optimized / DLL files
55
+ **/__pycache__/
56
+ *.py[cod]
57
+ *$py.class
58
+
59
+ # C extensions
60
+ *.so
61
+
62
+ # Distribution / packaging
63
+ .Python
64
+ build/
65
+ develop-eggs/
66
+ dist/
67
+ downloads/
68
+ eggs/
69
+ .eggs/
70
+ lib/
71
+ lib64/
72
+ parts/
73
+ results/
74
+ sdist/
75
+ var/
76
+ wheels/
77
+ share/python-wheels/
78
+ *.egg-info/
79
+ .installed.config
80
+ *.egg
81
+ MANIFEST
82
+
83
+ # PyInstaller
84
+ # Usually these files are written by a python script from a template
85
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
86
+ *.manifest
87
+ *.spec
88
+
89
+ # Installer logs
90
+ pip-log.txt
91
+ pip-delete-this-directory.txt
92
+
93
+ # Unit test / coverage reports
94
+ htmlcov/
95
+ .tox/
96
+ .nox/
97
+ .coverage
98
+ .coverage.*
99
+ .cache
100
+ nosetests.xml
101
+ coverage.xml
102
+ *.cover
103
+ *.py,cover
104
+ .hypothesis/
105
+ .pytest_cache/
106
+ cover/
107
+
108
+ # Translations
109
+ *.mo
110
+ *.pot
111
+
112
+ # Django stuff:
113
+ *.log
114
+ local_settings.py
115
+ db.sqlite3
116
+ db.sqlite3-journal
117
+
118
+ # Flask stuff:
119
+ instance/
120
+ .webassets-cache
121
+
122
+ # Scrapy stuff:
123
+ .scrapy
124
+
125
+ # Sphinx documentation
126
+ docs/_build/
127
+
128
+ # PyBuilder
129
+ .pybuilder/
130
+ target/
131
+
132
+ # Third party
133
+ # Jupyter Notebook
134
+ .ipynb_checkpoints
135
+
136
+ # IPython
137
+ profile_default/
138
+ ipython_config.py
139
+
140
+ # pyenv
141
+ # For a library or package, you might want to ignore these files since the code is
142
+ # intended to run in multiple environments; otherwise, check them in:
143
+ # .python-version
144
+
145
+ # pipenv
146
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
147
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
148
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
149
+ # install all needed dependencies.
150
+ #Pipfile.lock
151
+
152
+ # poetry
153
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
154
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
155
+ # commonly ignored for libraries.
156
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
157
+ #poetry.lock
158
+
159
+ # pdm
160
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
161
+ #pdm.lock
162
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
163
+ # in version control.
164
+ # https://pdm.fming.dev/#use-with-ide
165
+ .pdm.toml
166
+
167
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
168
+ __pypackages__/
169
+
170
+ # Celery stuff
171
+ celerybeat-schedule
172
+ celerybeat.pid
173
+
174
+ # SageMath parsed files
175
+ *.sage.py
176
+
177
+ # Environments
178
+ .env
179
+ .venv
180
+ env/
181
+ venv/
182
+ ENV/
183
+ env.bak/
184
+ venv.bak/
185
+
186
+ # Spyder project settings
187
+ .spyderproject
188
+ .spyproject
189
+
190
+ # Rope project settings
191
+ .ropeproject
192
+
193
+ # mkdocs documentation
194
+ /site
195
+
196
+ # mypy
197
+ .mypy_cache/
198
+ .dmypy.json
199
+ dmypy.json
200
+
201
+ # Pyre type checker
202
+ .pyre/
203
+
204
+ # pytype static type analyzer
205
+ .pytype/
206
+
207
+ # Cython debug symbols
208
+ cython_debug/
209
+
210
+ # ruff
211
+ .ruff_cache
212
+
213
+ # PyCharm
214
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
215
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
216
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
217
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
218
+ #.idea/
219
+ CLIP
220
+ .devcontainer/devcontainer.json
221
+
222
+ # Coverage
223
+ .coverage
224
+ coverage.xml
225
+
226
+ # JUnit Reports
227
+ report.xml
228
+
229
+ # CI-CD
230
+ temp/
231
+ envs.txt
232
+ manifest.json
233
+
234
+
235
+ # locks and t5 temp files
236
+ *.locks*
237
+ *.no_exist*
238
+ *models--t5*
239
+
240
+ # OneLogger
241
+ wandb/
242
+ onelogger.err
243
+ onelogger.log
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ default_language_version:
17
+ python: python3.10
18
+ repos:
19
+ - repo: https://github.com/pycqa/flake8
20
+ rev: 6.0.0
21
+ hooks:
22
+ - id: flake8
23
+ args: [--max-line-length=120]
24
+ exclude: ^dist/|^third_party/
25
+
26
+ - repo: https://github.com/psf/black
27
+ rev: 23.12.1
28
+ hooks:
29
+ - id: black
30
+ args: [--line-length=120]
31
+ exclude: ^dist/|^third_party/
32
+
33
+ - repo: https://github.com/timothycrosley/isort
34
+ rev: 5.12.0
35
+ hooks:
36
+ - id: isort
37
+ args: [--line-length=120]
38
+
39
+ - repo: https://github.com/MarcoGorelli/absolufy-imports
40
+ rev: v0.3.1
41
+ hooks:
42
+ - id: absolufy-imports
43
+
44
+ - repo: https://github.com/pre-commit/pre-commit-hooks
45
+ rev: v4.0.1
46
+ hooks:
47
+ - id: trailing-whitespace
48
+ exclude: ^tests/.*/fixtures/.*
49
+ args: [--markdown-linebreak-ext=md]
50
+ - id: end-of-file-fixer
51
+ exclude: ^tests/.*/fixtures/.*
52
+ - id: check-added-large-files
53
+ args: ['--maxkb=2000']
ATTRIBUTIONS.md ADDED
@@ -0,0 +1,1437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Source License Attribution
2
+
3
+ Cosmos uses open-source components. The details of these open-source projects, along with their license information, are listed below in alphabetical order.
4
+ We are grateful to the developers for their contributions to open source and acknowledge these below.
5
+
6
+ ## Better-Profanity - [MIT License](https://github.com/snguyenthanh/better_profanity/blob/master/LICENSE)
7
+
8
+ ```
9
+
10
+ Copyright (c) 2018 The Python Packaging Authority
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+
30
+ ```
31
+
32
+ ## FFmpeg - [FFMPEG License](https://github.com/FFmpeg/FFmpeg/blob/master/LICENSE.md)
33
+
34
+ ```
35
+ # License
36
+
37
+ Most files in FFmpeg are under the GNU Lesser General Public License version 2.1
38
+ or later (LGPL v2.1+). Read the file `COPYING.LGPLv2.1` for details. Some other
39
+ files have MIT/X11/BSD-style licenses. In combination the LGPL v2.1+ applies to
40
+ FFmpeg.
41
+
42
+ Some optional parts of FFmpeg are licensed under the GNU General Public License
43
+ version 2 or later (GPL v2+). See the file `COPYING.GPLv2` for details. None of
44
+ these parts are used by default, you have to explicitly pass `--enable-gpl` to
45
+ configure to activate them. In this case, FFmpeg's license changes to GPL v2+.
46
+
47
+ Specifically, the GPL parts of FFmpeg are:
48
+
49
+ - libpostproc
50
+ - optional x86 optimization in the files
51
+ - `libavcodec/x86/flac_dsp_gpl.asm`
52
+ - `libavcodec/x86/idct_mmx.c`
53
+ - `libavfilter/x86/vf_removegrain.asm`
54
+ - the following building and testing tools
55
+ - `compat/solaris/make_sunver.pl`
56
+ - `doc/t2h.pm`
57
+ - `doc/texi2pod.pl`
58
+ - `libswresample/tests/swresample.c`
59
+ - `tests/checkasm/*`
60
+ - `tests/tiny_ssim.c`
61
+ - the following filters in libavfilter:
62
+ - `signature_lookup.c`
63
+ - `vf_blackframe.c`
64
+ - `vf_boxblur.c`
65
+ - `vf_colormatrix.c`
66
+ - `vf_cover_rect.c`
67
+ - `vf_cropdetect.c`
68
+ - `vf_delogo.c`
69
+ - `vf_eq.c`
70
+ - `vf_find_rect.c`
71
+ - `vf_fspp.c`
72
+ - `vf_histeq.c`
73
+ - `vf_hqdn3d.c`
74
+ - `vf_kerndeint.c`
75
+ - `vf_lensfun.c` (GPL version 3 or later)
76
+ - `vf_mcdeint.c`
77
+ - `vf_mpdecimate.c`
78
+ - `vf_nnedi.c`
79
+ - `vf_owdenoise.c`
80
+ - `vf_perspective.c`
81
+ - `vf_phase.c`
82
+ - `vf_pp.c`
83
+ - `vf_pp7.c`
84
+ - `vf_pullup.c`
85
+ - `vf_repeatfields.c`
86
+ - `vf_sab.c`
87
+ - `vf_signature.c`
88
+ - `vf_smartblur.c`
89
+ - `vf_spp.c`
90
+ - `vf_stereo3d.c`
91
+ - `vf_super2xsai.c`
92
+ - `vf_tinterlace.c`
93
+ - `vf_uspp.c`
94
+ - `vf_vaguedenoiser.c`
95
+ - `vsrc_mptestsrc.c`
96
+
97
+ Should you, for whatever reason, prefer to use version 3 of the (L)GPL, then
98
+ the configure parameter `--enable-version3` will activate this licensing option
99
+ for you. Read the file `COPYING.LGPLv3` or, if you have enabled GPL parts,
100
+ `COPYING.GPLv3` to learn the exact legal terms that apply in this case.
101
+
102
+ There are a handful of files under other licensing terms, namely:
103
+
104
+ * The files `libavcodec/jfdctfst.c`, `libavcodec/jfdctint_template.c` and
105
+ `libavcodec/jrevdct.c` are taken from libjpeg, see the top of the files for
106
+ licensing details. Specifically note that you must credit the IJG in the
107
+ documentation accompanying your program if you only distribute executables.
108
+ You must also indicate any changes including additions and deletions to
109
+ those three files in the documentation.
110
+ * `tests/reference.pnm` is under the expat license.
111
+
112
+
113
+ ## External libraries
114
+
115
+ FFmpeg can be combined with a number of external libraries, which sometimes
116
+ affect the licensing of binaries resulting from the combination.
117
+
118
+ ### Compatible libraries
119
+
120
+ The following libraries are under GPL version 2:
121
+ - avisynth
122
+ - frei0r
123
+ - libcdio
124
+ - libdavs2
125
+ - librubberband
126
+ - libvidstab
127
+ - libx264
128
+ - libx265
129
+ - libxavs
130
+ - libxavs2
131
+ - libxvid
132
+
133
+ When combining them with FFmpeg, FFmpeg needs to be licensed as GPL as well by
134
+ passing `--enable-gpl` to configure.
135
+
136
+ The following libraries are under LGPL version 3:
137
+ - gmp
138
+ - libaribb24
139
+ - liblensfun
140
+
141
+ When combining them with FFmpeg, use the configure option `--enable-version3` to
142
+ upgrade FFmpeg to the LGPL v3.
143
+
144
+ The VMAF, mbedTLS, RK MPI, OpenCORE and VisualOn libraries are under the Apache License
145
+ 2.0. That license is incompatible with the LGPL v2.1 and the GPL v2, but not with
146
+ version 3 of those licenses. So to combine these libraries with FFmpeg, the
147
+ license version needs to be upgraded by passing `--enable-version3` to configure.
148
+
149
+ The smbclient library is under the GPL v3, to combine it with FFmpeg,
150
+ the options `--enable-gpl` and `--enable-version3` have to be passed to
151
+ configure to upgrade FFmpeg to the GPL v3.
152
+
153
+ ### Incompatible libraries
154
+
155
+ There are certain libraries you can combine with FFmpeg whose licenses are not
156
+ compatible with the GPL and/or the LGPL. If you wish to enable these
157
+ libraries, even in circumstances that their license may be incompatible, pass
158
+ `--enable-nonfree` to configure. This will cause the resulting binary to be
159
+ unredistributable.
160
+
161
+ The Fraunhofer FDK AAC and OpenSSL libraries are under licenses which are
162
+ incompatible with the GPLv2 and v3. To the best of our knowledge, they are
163
+ compatible with the LGPL.
164
+
165
+ ```
166
+
167
+ ## Hydra-core - [MIT License](https://github.com/facebookresearch/hydra/blob/main/LICENSE)
168
+
169
+ ```
170
+
171
+ MIT License
172
+
173
+ Copyright (c) Facebook, Inc. and its affiliates.
174
+
175
+ Permission is hereby granted, free of charge, to any person obtaining a copy
176
+ of this software and associated documentation files (the "Software"), to deal
177
+ in the Software without restriction, including without limitation the rights
178
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
179
+ copies of the Software, and to permit persons to whom the Software is
180
+ furnished to do so, subject to the following conditions:
181
+
182
+ The above copyright notice and this permission notice shall be included in all
183
+ copies or substantial portions of the Software.
184
+
185
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
186
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
187
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
188
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
189
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
190
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
191
+ SOFTWARE.
192
+
193
+ ```
194
+
195
+ ## ImageIo - [BSD 2-Clause "Simplified" License](https://github.com/imageio/imageio/blob/master/LICENSE)
196
+
197
+ ```
198
+
199
+ Copyright (c) 2014-2022, imageio developers
200
+ All rights reserved.
201
+
202
+ Redistribution and use in source and binary forms, with or without
203
+ modification, are permitted provided that the following conditions are met:
204
+
205
+ * Redistributions of source code must retain the above copyright notice, this
206
+ list of conditions and the following disclaimer.
207
+
208
+ * Redistributions in binary form must reproduce the above copyright notice,
209
+ this list of conditions and the following disclaimer in the documentation
210
+ and/or other materials provided with the distribution.
211
+
212
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
213
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
214
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
215
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
216
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
217
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
218
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
219
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
220
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
221
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
222
+
223
+ ```
224
+
225
+ ## Iopath - [MIT License](https://github.com/facebookresearch/iopath/blob/main/LICENSE)
226
+
227
+ ```
228
+ MIT License
229
+
230
+ Copyright (c) Facebook, Inc. and its affiliates.
231
+
232
+ Permission is hereby granted, free of charge, to any person obtaining a copy
233
+ of this software and associated documentation files (the "Software"), to deal
234
+ in the Software without restriction, including without limitation the rights
235
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
236
+ copies of the Software, and to permit persons to whom the Software is
237
+ furnished to do so, subject to the following conditions:
238
+
239
+ The above copyright notice and this permission notice shall be included in all
240
+ copies or substantial portions of the Software.
241
+
242
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
243
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
244
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
245
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
246
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
247
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
248
+ SOFTWARE.
249
+
250
+ ```
251
+
252
+ ## Loguru - [MIT License](https://github.com/Delgan/loguru/blob/master/LICENSE)
253
+
254
+ ```
255
+
256
+ MIT License
257
+
258
+ Copyright (c) 2017
259
+
260
+ Permission is hereby granted, free of charge, to any person obtaining a copy
261
+ of this software and associated documentation files (the "Software"), to deal
262
+ in the Software without restriction, including without limitation the rights
263
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
264
+ copies of the Software, and to permit persons to whom the Software is
265
+ furnished to do so, subject to the following conditions:
266
+
267
+ The above copyright notice and this permission notice shall be included in all
268
+ copies or substantial portions of the Software.
269
+
270
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
271
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
272
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
273
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
274
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
275
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
276
+ SOFTWARE.
277
+
278
+ ```
279
+
280
+ ## Mediapy - [Apache License 2.0](https://github.com/google/mediapy/blob/main/LICENSE)
281
+
282
+ ```
283
+
284
+ Apache License
285
+ Version 2.0, January 2004
286
+ http://www.apache.org/licenses/
287
+
288
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
289
+
290
+ 1. Definitions.
291
+
292
+ "License" shall mean the terms and conditions for use, reproduction,
293
+ and distribution as defined by Sections 1 through 9 of this document.
294
+
295
+ "Licensor" shall mean the copyright owner or entity authorized by
296
+ the copyright owner that is granting the License.
297
+
298
+ "Legal Entity" shall mean the union of the acting entity and all
299
+ other entities that control, are controlled by, or are under common
300
+ control with that entity. For the purposes of this definition,
301
+ "control" means (i) the power, direct or indirect, to cause the
302
+ direction or management of such entity, whether by contract or
303
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
304
+ outstanding shares, or (iii) beneficial ownership of such entity.
305
+
306
+ "You" (or "Your") shall mean an individual or Legal Entity
307
+ exercising permissions granted by this License.
308
+
309
+ "Source" form shall mean the preferred form for making modifications,
310
+ including but not limited to software source code, documentation
311
+ source, and configuration files.
312
+
313
+ "Object" form shall mean any form resulting from mechanical
314
+ transformation or translation of a Source form, including but
315
+ not limited to compiled object code, generated documentation,
316
+ and conversions to other media types.
317
+
318
+ "Work" shall mean the work of authorship, whether in Source or
319
+ Object form, made available under the License, as indicated by a
320
+ copyright notice that is included in or attached to the work
321
+ (an example is provided in the Appendix below).
322
+
323
+ "Derivative Works" shall mean any work, whether in Source or Object
324
+ form, that is based on (or derived from) the Work and for which the
325
+ editorial revisions, annotations, elaborations, or other modifications
326
+ represent, as a whole, an original work of authorship. For the purposes
327
+ of this License, Derivative Works shall not include works that remain
328
+ separable from, or merely link (or bind by name) to the interfaces of,
329
+ the Work and Derivative Works thereof.
330
+
331
+ "Contribution" shall mean any work of authorship, including
332
+ the original version of the Work and any modifications or additions
333
+ to that Work or Derivative Works thereof, that is intentionally
334
+ submitted to Licensor for inclusion in the Work by the copyright owner
335
+ or by an individual or Legal Entity authorized to submit on behalf of
336
+ the copyright owner. For the purposes of this definition, "submitted"
337
+ means any form of electronic, verbal, or written communication sent
338
+ to the Licensor or its representatives, including but not limited to
339
+ communication on electronic mailing lists, source code control systems,
340
+ and issue tracking systems that are managed by, or on behalf of, the
341
+ Licensor for the purpose of discussing and improving the Work, but
342
+ excluding communication that is conspicuously marked or otherwise
343
+ designated in writing by the copyright owner as "Not a Contribution."
344
+
345
+ "Contributor" shall mean Licensor and any individual or Legal Entity
346
+ on behalf of whom a Contribution has been received by Licensor and
347
+ subsequently incorporated within the Work.
348
+
349
+ 2. Grant of Copyright License. Subject to the terms and conditions of
350
+ this License, each Contributor hereby grants to You a perpetual,
351
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
352
+ copyright license to reproduce, prepare Derivative Works of,
353
+ publicly display, publicly perform, sublicense, and distribute the
354
+ Work and such Derivative Works in Source or Object form.
355
+
356
+ 3. Grant of Patent License. Subject to the terms and conditions of
357
+ this License, each Contributor hereby grants to You a perpetual,
358
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
359
+ (except as stated in this section) patent license to make, have made,
360
+ use, offer to sell, sell, import, and otherwise transfer the Work,
361
+ where such license applies only to those patent claims licensable
362
+ by such Contributor that are necessarily infringed by their
363
+ Contribution(s) alone or by combination of their Contribution(s)
364
+ with the Work to which such Contribution(s) was submitted. If You
365
+ institute patent litigation against any entity (including a
366
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
367
+ or a Contribution incorporated within the Work constitutes direct
368
+ or contributory patent infringement, then any patent licenses
369
+ granted to You under this License for that Work shall terminate
370
+ as of the date such litigation is filed.
371
+
372
+ 4. Redistribution. You may reproduce and distribute copies of the
373
+ Work or Derivative Works thereof in any medium, with or without
374
+ modifications, and in Source or Object form, provided that You
375
+ meet the following conditions:
376
+
377
+ (a) You must give any other recipients of the Work or
378
+ Derivative Works a copy of this License; and
379
+
380
+ (b) You must cause any modified files to carry prominent notices
381
+ stating that You changed the files; and
382
+
383
+ (c) You must retain, in the Source form of any Derivative Works
384
+ that You distribute, all copyright, patent, trademark, and
385
+ attribution notices from the Source form of the Work,
386
+ excluding those notices that do not pertain to any part of
387
+ the Derivative Works; and
388
+
389
+ (d) If the Work includes a "NOTICE" text file as part of its
390
+ distribution, then any Derivative Works that You distribute must
391
+ include a readable copy of the attribution notices contained
392
+ within such NOTICE file, excluding those notices that do not
393
+ pertain to any part of the Derivative Works, in at least one
394
+ of the following places: within a NOTICE text file distributed
395
+ as part of the Derivative Works; within the Source form or
396
+ documentation, if provided along with the Derivative Works; or,
397
+ within a display generated by the Derivative Works, if and
398
+ wherever such third-party notices normally appear. The contents
399
+ of the NOTICE file are for informational purposes only and
400
+ do not modify the License. You may add Your own attribution
401
+ notices within Derivative Works that You distribute, alongside
402
+ or as an addendum to the NOTICE text from the Work, provided
403
+ that such additional attribution notices cannot be construed
404
+ as modifying the License.
405
+
406
+ You may add Your own copyright statement to Your modifications and
407
+ may provide additional or different license terms and conditions
408
+ for use, reproduction, or distribution of Your modifications, or
409
+ for any such Derivative Works as a whole, provided Your use,
410
+ reproduction, and distribution of the Work otherwise complies with
411
+ the conditions stated in this License.
412
+
413
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
414
+ any Contribution intentionally submitted for inclusion in the Work
415
+ by You to the Licensor shall be under the terms and conditions of
416
+ this License, without any additional terms or conditions.
417
+ Notwithstanding the above, nothing herein shall supersede or modify
418
+ the terms of any separate license agreement you may have executed
419
+ with Licensor regarding such Contributions.
420
+
421
+ 6. Trademarks. This License does not grant permission to use the trade
422
+ names, trademarks, service marks, or product names of the Licensor,
423
+ except as required for reasonable and customary use in describing the
424
+ origin of the Work and reproducing the content of the NOTICE file.
425
+
426
+ 7. Disclaimer of Warranty. Unless required by applicable law or
427
+ agreed to in writing, Licensor provides the Work (and each
428
+ Contributor provides its Contributions) on an "AS IS" BASIS,
429
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
430
+ implied, including, without limitation, any warranties or conditions
431
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
432
+ PARTICULAR PURPOSE. You are solely responsible for determining the
433
+ appropriateness of using or redistributing the Work and assume any
434
+ risks associated with Your exercise of permissions under this License.
435
+
436
+ 8. Limitation of Liability. In no event and under no legal theory,
437
+ whether in tort (including negligence), contract, or otherwise,
438
+ unless required by applicable law (such as deliberate and grossly
439
+ negligent acts) or agreed to in writing, shall any Contributor be
440
+ liable to You for damages, including any direct, indirect, special,
441
+ incidental, or consequential damages of any character arising as a
442
+ result of this License or out of the use or inability to use the
443
+ Work (including but not limited to damages for loss of goodwill,
444
+ work stoppage, computer failure or malfunction, or any and all
445
+ other commercial damages or losses), even if such Contributor
446
+ has been advised of the possibility of such damages.
447
+
448
+ 9. Accepting Warranty or Additional Liability. While redistributing
449
+ the Work or Derivative Works thereof, You may choose to offer,
450
+ and charge a fee for, acceptance of support, warranty, indemnity,
451
+ or other liability obligations and/or rights consistent with this
452
+ License. However, in accepting such obligations, You may act only
453
+ on Your own behalf and on Your sole responsibility, not on behalf
454
+ of any other Contributor, and only if You agree to indemnify,
455
+ defend, and hold each Contributor harmless for any liability
456
+ incurred by, or claims asserted against, such Contributor by reason
457
+ of your accepting any such warranty or additional liability.
458
+
459
+ END OF TERMS AND CONDITIONS
460
+
461
+ APPENDIX: How to apply the Apache License to your work.
462
+
463
+ To apply the Apache License to your work, attach the following
464
+ boilerplate notice, with the fields enclosed by brackets "[]"
465
+ replaced with your own identifying information. (Don't include
466
+ the brackets!) The text should be enclosed in the appropriate
467
+ comment syntax for the file format. We also recommend that a
468
+ file or class name and description of purpose be included on the
469
+ same "printed page" as the copyright notice for easier
470
+ identification within third-party archives.
471
+
472
+ Copyright [yyyy] [name of copyright owner]
473
+
474
+ Licensed under the Apache License, Version 2.0 (the "License");
475
+ you may not use this file except in compliance with the License.
476
+ You may obtain a copy of the License at
477
+
478
+ http://www.apache.org/licenses/LICENSE-2.0
479
+
480
+ Unless required by applicable law or agreed to in writing, software
481
+ distributed under the License is distributed on an "AS IS" BASIS,
482
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
483
+ See the License for the specific language governing permissions and
484
+ limitations under the License.
485
+
486
+ ```
487
+
488
+ ## NLTK - [Apache License 2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
489
+
490
+ ```
491
+
492
+ Apache License
493
+ Version 2.0, January 2004
494
+ http://www.apache.org/licenses/
495
+
496
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
497
+
498
+ 1. Definitions.
499
+
500
+ "License" shall mean the terms and conditions for use, reproduction,
501
+ and distribution as defined by Sections 1 through 9 of this document.
502
+
503
+ "Licensor" shall mean the copyright owner or entity authorized by
504
+ the copyright owner that is granting the License.
505
+
506
+ "Legal Entity" shall mean the union of the acting entity and all
507
+ other entities that control, are controlled by, or are under common
508
+ control with that entity. For the purposes of this definition,
509
+ "control" means (i) the power, direct or indirect, to cause the
510
+ direction or management of such entity, whether by contract or
511
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
512
+ outstanding shares, or (iii) beneficial ownership of such entity.
513
+
514
+ "You" (or "Your") shall mean an individual or Legal Entity
515
+ exercising permissions granted by this License.
516
+
517
+ "Source" form shall mean the preferred form for making modifications,
518
+ including but not limited to software source code, documentation
519
+ source, and configuration files.
520
+
521
+ "Object" form shall mean any form resulting from mechanical
522
+ transformation or translation of a Source form, including but
523
+ not limited to compiled object code, generated documentation,
524
+ and conversions to other media types.
525
+
526
+ "Work" shall mean the work of authorship, whether in Source or
527
+ Object form, made available under the License, as indicated by a
528
+ copyright notice that is included in or attached to the work
529
+ (an example is provided in the Appendix below).
530
+
531
+ "Derivative Works" shall mean any work, whether in Source or Object
532
+ form, that is based on (or derived from) the Work and for which the
533
+ editorial revisions, annotations, elaborations, or other modifications
534
+ represent, as a whole, an original work of authorship. For the purposes
535
+ of this License, Derivative Works shall not include works that remain
536
+ separable from, or merely link (or bind by name) to the interfaces of,
537
+ the Work and Derivative Works thereof.
538
+
539
+ "Contribution" shall mean any work of authorship, including
540
+ the original version of the Work and any modifications or additions
541
+ to that Work or Derivative Works thereof, that is intentionally
542
+ submitted to Licensor for inclusion in the Work by the copyright owner
543
+ or by an individual or Legal Entity authorized to submit on behalf of
544
+ the copyright owner. For the purposes of this definition, "submitted"
545
+ means any form of electronic, verbal, or written communication sent
546
+ to the Licensor or its representatives, including but not limited to
547
+ communication on electronic mailing lists, source code control systems,
548
+ and issue tracking systems that are managed by, or on behalf of, the
549
+ Licensor for the purpose of discussing and improving the Work, but
550
+ excluding communication that is conspicuously marked or otherwise
551
+ designated in writing by the copyright owner as "Not a Contribution."
552
+
553
+ "Contributor" shall mean Licensor and any individual or Legal Entity
554
+ on behalf of whom a Contribution has been received by Licensor and
555
+ subsequently incorporated within the Work.
556
+
557
+ 2. Grant of Copyright License. Subject to the terms and conditions of
558
+ this License, each Contributor hereby grants to You a perpetual,
559
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
560
+ copyright license to reproduce, prepare Derivative Works of,
561
+ publicly display, publicly perform, sublicense, and distribute the
562
+ Work and such Derivative Works in Source or Object form.
563
+
564
+ 3. Grant of Patent License. Subject to the terms and conditions of
565
+ this License, each Contributor hereby grants to You a perpetual,
566
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
567
+ (except as stated in this section) patent license to make, have made,
568
+ use, offer to sell, sell, import, and otherwise transfer the Work,
569
+ where such license applies only to those patent claims licensable
570
+ by such Contributor that are necessarily infringed by their
571
+ Contribution(s) alone or by combination of their Contribution(s)
572
+ with the Work to which such Contribution(s) was submitted. If You
573
+ institute patent litigation against any entity (including a
574
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
575
+ or a Contribution incorporated within the Work constitutes direct
576
+ or contributory patent infringement, then any patent licenses
577
+ granted to You under this License for that Work shall terminate
578
+ as of the date such litigation is filed.
579
+
580
+ 4. Redistribution. You may reproduce and distribute copies of the
581
+ Work or Derivative Works thereof in any medium, with or without
582
+ modifications, and in Source or Object form, provided that You
583
+ meet the following conditions:
584
+
585
+ (a) You must give any other recipients of the Work or
586
+ Derivative Works a copy of this License; and
587
+
588
+ (b) You must cause any modified files to carry prominent notices
589
+ stating that You changed the files; and
590
+
591
+ (c) You must retain, in the Source form of any Derivative Works
592
+ that You distribute, all copyright, patent, trademark, and
593
+ attribution notices from the Source form of the Work,
594
+ excluding those notices that do not pertain to any part of
595
+ the Derivative Works; and
596
+
597
+ (d) If the Work includes a "NOTICE" text file as part of its
598
+ distribution, then any Derivative Works that You distribute must
599
+ include a readable copy of the attribution notices contained
600
+ within such NOTICE file, excluding those notices that do not
601
+ pertain to any part of the Derivative Works, in at least one
602
+ of the following places: within a NOTICE text file distributed
603
+ as part of the Derivative Works; within the Source form or
604
+ documentation, if provided along with the Derivative Works; or,
605
+ within a display generated by the Derivative Works, if and
606
+ wherever such third-party notices normally appear. The contents
607
+ of the NOTICE file are for informational purposes only and
608
+ do not modify the License. You may add Your own attribution
609
+ notices within Derivative Works that You distribute, alongside
610
+ or as an addendum to the NOTICE text from the Work, provided
611
+ that such additional attribution notices cannot be construed
612
+ as modifying the License.
613
+
614
+ You may add Your own copyright statement to Your modifications and
615
+ may provide additional or different license terms and conditions
616
+ for use, reproduction, or distribution of Your modifications, or
617
+ for any such Derivative Works as a whole, provided Your use,
618
+ reproduction, and distribution of the Work otherwise complies with
619
+ the conditions stated in this License.
620
+
621
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
622
+ any Contribution intentionally submitted for inclusion in the Work
623
+ by You to the Licensor shall be under the terms and conditions of
624
+ this License, without any additional terms or conditions.
625
+ Notwithstanding the above, nothing herein shall supersede or modify
626
+ the terms of any separate license agreement you may have executed
627
+ with Licensor regarding such Contributions.
628
+
629
+ 6. Trademarks. This License does not grant permission to use the trade
630
+ names, trademarks, service marks, or product names of the Licensor,
631
+ except as required for reasonable and customary use in describing the
632
+ origin of the Work and reproducing the content of the NOTICE file.
633
+
634
+ 7. Disclaimer of Warranty. Unless required by applicable law or
635
+ agreed to in writing, Licensor provides the Work (and each
636
+ Contributor provides its Contributions) on an "AS IS" BASIS,
637
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
638
+ implied, including, without limitation, any warranties or conditions
639
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
640
+ PARTICULAR PURPOSE. You are solely responsible for determining the
641
+ appropriateness of using or redistributing the Work and assume any
642
+ risks associated with Your exercise of permissions under this License.
643
+
644
+ 8. Limitation of Liability. In no event and under no legal theory,
645
+ whether in tort (including negligence), contract, or otherwise,
646
+ unless required by applicable law (such as deliberate and grossly
647
+ negligent acts) or agreed to in writing, shall any Contributor be
648
+ liable to You for damages, including any direct, indirect, special,
649
+ incidental, or consequential damages of any character arising as a
650
+ result of this License or out of the use or inability to use the
651
+ Work (including but not limited to damages for loss of goodwill,
652
+ work stoppage, computer failure or malfunction, or any and all
653
+ other commercial damages or losses), even if such Contributor
654
+ has been advised of the possibility of such damages.
655
+
656
+ 9. Accepting Warranty or Additional Liability. While redistributing
657
+ the Work or Derivative Works thereof, You may choose to offer,
658
+ and charge a fee for, acceptance of support, warranty, indemnity,
659
+ or other liability obligations and/or rights consistent with this
660
+ License. However, in accepting such obligations, You may act only
661
+ on Your own behalf and on Your sole responsibility, not on behalf
662
+ of any other Contributor, and only if You agree to indemnify,
663
+ defend, and hold each Contributor harmless for any liability
664
+ incurred by, or claims asserted against, such Contributor by reason
665
+ of your accepting any such warranty or additional liability.
666
+
667
+ END OF TERMS AND CONDITIONS
668
+
669
+ APPENDIX: How to apply the Apache License to your work.
670
+
671
+ To apply the Apache License to your work, attach the following
672
+ boilerplate notice, with the fields enclosed by brackets "[]"
673
+ replaced with your own identifying information. (Don't include
674
+ the brackets!) The text should be enclosed in the appropriate
675
+ comment syntax for the file format. We also recommend that a
676
+ file or class name and description of purpose be included on the
677
+ same "printed page" as the copyright notice for easier
678
+ identification within third-party archives.
679
+
680
+ Copyright [yyyy] [name of copyright owner]
681
+
682
+ Licensed under the Apache License, Version 2.0 (the "License");
683
+ you may not use this file except in compliance with the License.
684
+ You may obtain a copy of the License at
685
+
686
+ http://www.apache.org/licenses/LICENSE-2.0
687
+
688
+ Unless required by applicable law or agreed to in writing, software
689
+ distributed under the License is distributed on an "AS IS" BASIS,
690
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
691
+ See the License for the specific language governing permissions and
692
+ limitations under the License.
693
+
694
+ ```
695
+
696
+ ## PEFT - [Apache License 2.0](https://github.com/huggingface/peft/blob/main/LICENSE)
697
+
698
+ ```
699
+
700
+ Apache License
701
+ Version 2.0, January 2004
702
+ http://www.apache.org/licenses/
703
+
704
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
705
+
706
+ 1. Definitions.
707
+
708
+ "License" shall mean the terms and conditions for use, reproduction,
709
+ and distribution as defined by Sections 1 through 9 of this document.
710
+
711
+ "Licensor" shall mean the copyright owner or entity authorized by
712
+ the copyright owner that is granting the License.
713
+
714
+ "Legal Entity" shall mean the union of the acting entity and all
715
+ other entities that control, are controlled by, or are under common
716
+ control with that entity. For the purposes of this definition,
717
+ "control" means (i) the power, direct or indirect, to cause the
718
+ direction or management of such entity, whether by contract or
719
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
720
+ outstanding shares, or (iii) beneficial ownership of such entity.
721
+
722
+ "You" (or "Your") shall mean an individual or Legal Entity
723
+ exercising permissions granted by this License.
724
+
725
+ "Source" form shall mean the preferred form for making modifications,
726
+ including but not limited to software source code, documentation
727
+ source, and configuration files.
728
+
729
+ "Object" form shall mean any form resulting from mechanical
730
+ transformation or translation of a Source form, including but
731
+ not limited to compiled object code, generated documentation,
732
+ and conversions to other media types.
733
+
734
+ "Work" shall mean the work of authorship, whether in Source or
735
+ Object form, made available under the License, as indicated by a
736
+ copyright notice that is included in or attached to the work
737
+ (an example is provided in the Appendix below).
738
+
739
+ "Derivative Works" shall mean any work, whether in Source or Object
740
+ form, that is based on (or derived from) the Work and for which the
741
+ editorial revisions, annotations, elaborations, or other modifications
742
+ represent, as a whole, an original work of authorship. For the purposes
743
+ of this License, Derivative Works shall not include works that remain
744
+ separable from, or merely link (or bind by name) to the interfaces of,
745
+ the Work and Derivative Works thereof.
746
+
747
+ "Contribution" shall mean any work of authorship, including
748
+ the original version of the Work and any modifications or additions
749
+ to that Work or Derivative Works thereof, that is intentionally
750
+ submitted to Licensor for inclusion in the Work by the copyright owner
751
+ or by an individual or Legal Entity authorized to submit on behalf of
752
+ the copyright owner. For the purposes of this definition, "submitted"
753
+ means any form of electronic, verbal, or written communication sent
754
+ to the Licensor or its representatives, including but not limited to
755
+ communication on electronic mailing lists, source code control systems,
756
+ and issue tracking systems that are managed by, or on behalf of, the
757
+ Licensor for the purpose of discussing and improving the Work, but
758
+ excluding communication that is conspicuously marked or otherwise
759
+ designated in writing by the copyright owner as "Not a Contribution."
760
+
761
+ "Contributor" shall mean Licensor and any individual or Legal Entity
762
+ on behalf of whom a Contribution has been received by Licensor and
763
+ subsequently incorporated within the Work.
764
+
765
+ 2. Grant of Copyright License. Subject to the terms and conditions of
766
+ this License, each Contributor hereby grants to You a perpetual,
767
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
768
+ copyright license to reproduce, prepare Derivative Works of,
769
+ publicly display, publicly perform, sublicense, and distribute the
770
+ Work and such Derivative Works in Source or Object form.
771
+
772
+ 3. Grant of Patent License. Subject to the terms and conditions of
773
+ this License, each Contributor hereby grants to You a perpetual,
774
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
775
+ (except as stated in this section) patent license to make, have made,
776
+ use, offer to sell, sell, import, and otherwise transfer the Work,
777
+ where such license applies only to those patent claims licensable
778
+ by such Contributor that are necessarily infringed by their
779
+ Contribution(s) alone or by combination of their Contribution(s)
780
+ with the Work to which such Contribution(s) was submitted. If You
781
+ institute patent litigation against any entity (including a
782
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
783
+ or a Contribution incorporated within the Work constitutes direct
784
+ or contributory patent infringement, then any patent licenses
785
+ granted to You under this License for that Work shall terminate
786
+ as of the date such litigation is filed.
787
+
788
+ 4. Redistribution. You may reproduce and distribute copies of the
789
+ Work or Derivative Works thereof in any medium, with or without
790
+ modifications, and in Source or Object form, provided that You
791
+ meet the following conditions:
792
+
793
+ (a) You must give any other recipients of the Work or
794
+ Derivative Works a copy of this License; and
795
+
796
+ (b) You must cause any modified files to carry prominent notices
797
+ stating that You changed the files; and
798
+
799
+ (c) You must retain, in the Source form of any Derivative Works
800
+ that You distribute, all copyright, patent, trademark, and
801
+ attribution notices from the Source form of the Work,
802
+ excluding those notices that do not pertain to any part of
803
+ the Derivative Works; and
804
+
805
+ (d) If the Work includes a "NOTICE" text file as part of its
806
+ distribution, then any Derivative Works that You distribute must
807
+ include a readable copy of the attribution notices contained
808
+ within such NOTICE file, excluding those notices that do not
809
+ pertain to any part of the Derivative Works, in at least one
810
+ of the following places: within a NOTICE text file distributed
811
+ as part of the Derivative Works; within the Source form or
812
+ documentation, if provided along with the Derivative Works; or,
813
+ within a display generated by the Derivative Works, if and
814
+ wherever such third-party notices normally appear. The contents
815
+ of the NOTICE file are for informational purposes only and
816
+ do not modify the License. You may add Your own attribution
817
+ notices within Derivative Works that You distribute, alongside
818
+ or as an addendum to the NOTICE text from the Work, provided
819
+ that such additional attribution notices cannot be construed
820
+ as modifying the License.
821
+
822
+ You may add Your own copyright statement to Your modifications and
823
+ may provide additional or different license terms and conditions
824
+ for use, reproduction, or distribution of Your modifications, or
825
+ for any such Derivative Works as a whole, provided Your use,
826
+ reproduction, and distribution of the Work otherwise complies with
827
+ the conditions stated in this License.
828
+
829
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
830
+ any Contribution intentionally submitted for inclusion in the Work
831
+ by You to the Licensor shall be under the terms and conditions of
832
+ this License, without any additional terms or conditions.
833
+ Notwithstanding the above, nothing herein shall supersede or modify
834
+ the terms of any separate license agreement you may have executed
835
+ with Licensor regarding such Contributions.
836
+
837
+ 6. Trademarks. This License does not grant permission to use the trade
838
+ names, trademarks, service marks, or product names of the Licensor,
839
+ except as required for reasonable and customary use in describing the
840
+ origin of the Work and reproducing the content of the NOTICE file.
841
+
842
+ 7. Disclaimer of Warranty. Unless required by applicable law or
843
+ agreed to in writing, Licensor provides the Work (and each
844
+ Contributor provides its Contributions) on an "AS IS" BASIS,
845
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
846
+ implied, including, without limitation, any warranties or conditions
847
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
848
+ PARTICULAR PURPOSE. You are solely responsible for determining the
849
+ appropriateness of using or redistributing the Work and assume any
850
+ risks associated with Your exercise of permissions under this License.
851
+
852
+ 8. Limitation of Liability. In no event and under no legal theory,
853
+ whether in tort (including negligence), contract, or otherwise,
854
+ unless required by applicable law (such as deliberate and grossly
855
+ negligent acts) or agreed to in writing, shall any Contributor be
856
+ liable to You for damages, including any direct, indirect, special,
857
+ incidental, or consequential damages of any character arising as a
858
+ result of this License or out of the use or inability to use the
859
+ Work (including but not limited to damages for loss of goodwill,
860
+ work stoppage, computer failure or malfunction, or any and all
861
+ other commercial damages or losses), even if such Contributor
862
+ has been advised of the possibility of such damages.
863
+
864
+ 9. Accepting Warranty or Additional Liability. While redistributing
865
+ the Work or Derivative Works thereof, You may choose to offer,
866
+ and charge a fee for, acceptance of support, warranty, indemnity,
867
+ or other liability obligations and/or rights consistent with this
868
+ License. However, in accepting such obligations, You may act only
869
+ on Your own behalf and on Your sole responsibility, not on behalf
870
+ of any other Contributor, and only if You agree to indemnify,
871
+ defend, and hold each Contributor harmless for any liability
872
+ incurred by, or claims asserted against, such Contributor by reason
873
+ of your accepting any such warranty or additional liability.
874
+
875
+ END OF TERMS AND CONDITIONS
876
+
877
+ APPENDIX: How to apply the Apache License to your work.
878
+
879
+ To apply the Apache License to your work, attach the following
880
+ boilerplate notice, with the fields enclosed by brackets "[]"
881
+ replaced with your own identifying information. (Don't include
882
+ the brackets!) The text should be enclosed in the appropriate
883
+ comment syntax for the file format. We also recommend that a
884
+ file or class name and description of purpose be included on the
885
+ same "printed page" as the copyright notice for easier
886
+ identification within third-party archives.
887
+
888
+ Copyright [yyyy] [name of copyright owner]
889
+
890
+ Licensed under the Apache License, Version 2.0 (the "License");
891
+ you may not use this file except in compliance with the License.
892
+ You may obtain a copy of the License at
893
+
894
+ http://www.apache.org/licenses/LICENSE-2.0
895
+
896
+ Unless required by applicable law or agreed to in writing, software
897
+ distributed under the License is distributed on an "AS IS" BASIS,
898
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
899
+ See the License for the specific language governing permissions and
900
+ limitations under the License.
901
+
902
+ ```
903
+
904
+ ## Pillow - [MIT License](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
905
+
906
+ ```
907
+
908
+ The Python Imaging Library (PIL) is
909
+
910
+ Copyright © 1997-2011 by Secret Labs AB
911
+ Copyright © 1995-2011 by Fredrik Lundh and contributors
912
+
913
+ Pillow is the friendly PIL fork. It is
914
+
915
+ Copyright © 2010 by Jeffrey A. Clark and contributors
916
+
917
+ Like PIL, Pillow is licensed under the open source MIT-CMU License:
918
+
919
+ By obtaining, using, and/or copying this software and/or its associated
920
+ documentation, you agree that you have read, understood, and will comply
921
+ with the following terms and conditions:
922
+
923
+ Permission to use, copy, modify and distribute this software and its
924
+ documentation for any purpose and without fee is hereby granted,
925
+ provided that the above copyright notice appears in all copies, and that
926
+ both that copyright notice and this permission notice appear in supporting
927
+ documentation, and that the name of Secret Labs AB or the author not be
928
+ used in advertising or publicity pertaining to distribution of the software
929
+ without specific, written prior permission.
930
+
931
+ SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
932
+ SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
933
+ IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
934
+ INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
935
+ LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
936
+ OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
937
+ PERFORMANCE OF THIS SOFTWARE.
938
+
939
+ ```
940
+
941
+ ## PyAV - [BSD 3-Clause "New" or "Revised" License](https://github.com/PyAV-Org/PyAV/blob/main/LICENSE.txt)
942
+
943
+ ```
944
+
945
+ Copyright retained by original committers. All rights reserved.
946
+
947
+ Redistribution and use in source and binary forms, with or without
948
+ modification, are permitted provided that the following conditions are met:
949
+ * Redistributions of source code must retain the above copyright
950
+ notice, this list of conditions and the following disclaimer.
951
+ * Redistributions in binary form must reproduce the above copyright
952
+ notice, this list of conditions and the following disclaimer in the
953
+ documentation and/or other materials provided with the distribution.
954
+ * Neither the name of the project nor the names of its contributors may be
955
+ used to endorse or promote products derived from this software without
956
+ specific prior written permission.
957
+
958
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
959
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
960
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
961
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT,
962
+ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
963
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
964
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
965
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
966
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
967
+ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
968
+
969
+ ```
970
+
971
+ ## Pytorch_Retinaface - [MIT License](https://github.com/biubug6/Pytorch_Retinaface/blob/master/LICENSE.MIT)
972
+
973
+ ```
974
+ MIT License
975
+
976
+ Copyright (c) 2019
977
+
978
+ Permission is hereby granted, free of charge, to any person obtaining a copy
979
+ of this software and associated documentation files (the "Software"), to deal
980
+ in the Software without restriction, including without limitation the rights
981
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
982
+ copies of the Software, and to permit persons to whom the Software is
983
+ furnished to do so, subject to the following conditions:
984
+
985
+ The above copyright notice and this permission notice shall be included in all
986
+ copies or substantial portions of the Software.
987
+
988
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
989
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
990
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
991
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
992
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
993
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
994
+ SOFTWARE.
995
+ ```
996
+
997
+ ## Sentencepiece - [Apache License 2.0](https://github.com/google/sentencepiece/blob/master/LICENSE)
998
+
999
+ ```
1000
+
1001
+ Apache License
1002
+ Version 2.0, January 2004
1003
+ http://www.apache.org/licenses/
1004
+
1005
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1006
+
1007
+ 1. Definitions.
1008
+
1009
+ "License" shall mean the terms and conditions for use, reproduction,
1010
+ and distribution as defined by Sections 1 through 9 of this document.
1011
+
1012
+ "Licensor" shall mean the copyright owner or entity authorized by
1013
+ the copyright owner that is granting the License.
1014
+
1015
+ "Legal Entity" shall mean the union of the acting entity and all
1016
+ other entities that control, are controlled by, or are under common
1017
+ control with that entity. For the purposes of this definition,
1018
+ "control" means (i) the power, direct or indirect, to cause the
1019
+ direction or management of such entity, whether by contract or
1020
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
1021
+ outstanding shares, or (iii) beneficial ownership of such entity.
1022
+
1023
+ "You" (or "Your") shall mean an individual or Legal Entity
1024
+ exercising permissions granted by this License.
1025
+
1026
+ "Source" form shall mean the preferred form for making modifications,
1027
+ including but not limited to software source code, documentation
1028
+ source, and configuration files.
1029
+
1030
+ "Object" form shall mean any form resulting from mechanical
1031
+ transformation or translation of a Source form, including but
1032
+ not limited to compiled object code, generated documentation,
1033
+ and conversions to other media types.
1034
+
1035
+ "Work" shall mean the work of authorship, whether in Source or
1036
+ Object form, made available under the License, as indicated by a
1037
+ copyright notice that is included in or attached to the work
1038
+ (an example is provided in the Appendix below).
1039
+
1040
+ "Derivative Works" shall mean any work, whether in Source or Object
1041
+ form, that is based on (or derived from) the Work and for which the
1042
+ editorial revisions, annotations, elaborations, or other modifications
1043
+ represent, as a whole, an original work of authorship. For the purposes
1044
+ of this License, Derivative Works shall not include works that remain
1045
+ separable from, or merely link (or bind by name) to the interfaces of,
1046
+ the Work and Derivative Works thereof.
1047
+
1048
+ "Contribution" shall mean any work of authorship, including
1049
+ the original version of the Work and any modifications or additions
1050
+ to that Work or Derivative Works thereof, that is intentionally
1051
+ submitted to Licensor for inclusion in the Work by the copyright owner
1052
+ or by an individual or Legal Entity authorized to submit on behalf of
1053
+ the copyright owner. For the purposes of this definition, "submitted"
1054
+ means any form of electronic, verbal, or written communication sent
1055
+ to the Licensor or its representatives, including but not limited to
1056
+ communication on electronic mailing lists, source code control systems,
1057
+ and issue tracking systems that are managed by, or on behalf of, the
1058
+ Licensor for the purpose of discussing and improving the Work, but
1059
+ excluding communication that is conspicuously marked or otherwise
1060
+ designated in writing by the copyright owner as "Not a Contribution."
1061
+
1062
+ "Contributor" shall mean Licensor and any individual or Legal Entity
1063
+ on behalf of whom a Contribution has been received by Licensor and
1064
+ subsequently incorporated within the Work.
1065
+
1066
+ 2. Grant of Copyright License. Subject to the terms and conditions of
1067
+ this License, each Contributor hereby grants to You a perpetual,
1068
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
1069
+ copyright license to reproduce, prepare Derivative Works of,
1070
+ publicly display, publicly perform, sublicense, and distribute the
1071
+ Work and such Derivative Works in Source or Object form.
1072
+
1073
+ 3. Grant of Patent License. Subject to the terms and conditions of
1074
+ this License, each Contributor hereby grants to You a perpetual,
1075
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
1076
+ (except as stated in this section) patent license to make, have made,
1077
+ use, offer to sell, sell, import, and otherwise transfer the Work,
1078
+ where such license applies only to those patent claims licensable
1079
+ by such Contributor that are necessarily infringed by their
1080
+ Contribution(s) alone or by combination of their Contribution(s)
1081
+ with the Work to which such Contribution(s) was submitted. If You
1082
+ institute patent litigation against any entity (including a
1083
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
1084
+ or a Contribution incorporated within the Work constitutes direct
1085
+ or contributory patent infringement, then any patent licenses
1086
+ granted to You under this License for that Work shall terminate
1087
+ as of the date such litigation is filed.
1088
+
1089
+ 4. Redistribution. You may reproduce and distribute copies of the
1090
+ Work or Derivative Works thereof in any medium, with or without
1091
+ modifications, and in Source or Object form, provided that You
1092
+ meet the following conditions:
1093
+
1094
+ (a) You must give any other recipients of the Work or
1095
+ Derivative Works a copy of this License; and
1096
+
1097
+ (b) You must cause any modified files to carry prominent notices
1098
+ stating that You changed the files; and
1099
+
1100
+ (c) You must retain, in the Source form of any Derivative Works
1101
+ that You distribute, all copyright, patent, trademark, and
1102
+ attribution notices from the Source form of the Work,
1103
+ excluding those notices that do not pertain to any part of
1104
+ the Derivative Works; and
1105
+
1106
+ (d) If the Work includes a "NOTICE" text file as part of its
1107
+ distribution, then any Derivative Works that You distribute must
1108
+ include a readable copy of the attribution notices contained
1109
+ within such NOTICE file, excluding those notices that do not
1110
+ pertain to any part of the Derivative Works, in at least one
1111
+ of the following places: within a NOTICE text file distributed
1112
+ as part of the Derivative Works; within the Source form or
1113
+ documentation, if provided along with the Derivative Works; or,
1114
+ within a display generated by the Derivative Works, if and
1115
+ wherever such third-party notices normally appear. The contents
1116
+ of the NOTICE file are for informational purposes only and
1117
+ do not modify the License. You may add Your own attribution
1118
+ notices within Derivative Works that You distribute, alongside
1119
+ or as an addendum to the NOTICE text from the Work, provided
1120
+ that such additional attribution notices cannot be construed
1121
+ as modifying the License.
1122
+
1123
+ You may add Your own copyright statement to Your modifications and
1124
+ may provide additional or different license terms and conditions
1125
+ for use, reproduction, or distribution of Your modifications, or
1126
+ for any such Derivative Works as a whole, provided Your use,
1127
+ reproduction, and distribution of the Work otherwise complies with
1128
+ the conditions stated in this License.
1129
+
1130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
1131
+ any Contribution intentionally submitted for inclusion in the Work
1132
+ by You to the Licensor shall be under the terms and conditions of
1133
+ this License, without any additional terms or conditions.
1134
+ Notwithstanding the above, nothing herein shall supersede or modify
1135
+ the terms of any separate license agreement you may have executed
1136
+ with Licensor regarding such Contributions.
1137
+
1138
+ 6. Trademarks. This License does not grant permission to use the trade
1139
+ names, trademarks, service marks, or product names of the Licensor,
1140
+ except as required for reasonable and customary use in describing the
1141
+ origin of the Work and reproducing the content of the NOTICE file.
1142
+
1143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
1144
+ agreed to in writing, Licensor provides the Work (and each
1145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
1146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
1147
+ implied, including, without limitation, any warranties or conditions
1148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
1149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
1150
+ appropriateness of using or redistributing the Work and assume any
1151
+ risks associated with Your exercise of permissions under this License.
1152
+
1153
+ 8. Limitation of Liability. In no event and under no legal theory,
1154
+ whether in tort (including negligence), contract, or otherwise,
1155
+ unless required by applicable law (such as deliberate and grossly
1156
+ negligent acts) or agreed to in writing, shall any Contributor be
1157
+ liable to You for damages, including any direct, indirect, special,
1158
+ incidental, or consequential damages of any character arising as a
1159
+ result of this License or out of the use or inability to use the
1160
+ Work (including but not limited to damages for loss of goodwill,
1161
+ work stoppage, computer failure or malfunction, or any and all
1162
+ other commercial damages or losses), even if such Contributor
1163
+ has been advised of the possibility of such damages.
1164
+
1165
+ 9. Accepting Warranty or Additional Liability. While redistributing
1166
+ the Work or Derivative Works thereof, You may choose to offer,
1167
+ and charge a fee for, acceptance of support, warranty, indemnity,
1168
+ or other liability obligations and/or rights consistent with this
1169
+ License. However, in accepting such obligations, You may act only
1170
+ on Your own behalf and on Your sole responsibility, not on behalf
1171
+ of any other Contributor, and only if You agree to indemnify,
1172
+ defend, and hold each Contributor harmless for any liability
1173
+ incurred by, or claims asserted against, such Contributor by reason
1174
+ of your accepting any such warranty or additional liability.
1175
+
1176
+ END OF TERMS AND CONDITIONS
1177
+
1178
+ APPENDIX: How to apply the Apache License to your work.
1179
+
1180
+ To apply the Apache License to your work, attach the following
1181
+ boilerplate notice, with the fields enclosed by brackets "[]"
1182
+ replaced with your own identifying information. (Don't include
1183
+ the brackets!) The text should be enclosed in the appropriate
1184
+ comment syntax for the file format. We also recommend that a
1185
+ file or class name and description of purpose be included on the
1186
+ same "printed page" as the copyright notice for easier
1187
+ identification within third-party archives.
1188
+
1189
+ Copyright [yyyy] [name of copyright owner]
1190
+
1191
+ Licensed under the Apache License, Version 2.0 (the "License");
1192
+ you may not use this file except in compliance with the License.
1193
+ You may obtain a copy of the License at
1194
+
1195
+ http://www.apache.org/licenses/LICENSE-2.0
1196
+
1197
+ Unless required by applicable law or agreed to in writing, software
1198
+ distributed under the License is distributed on an "AS IS" BASIS,
1199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1200
+ See the License for the specific language governing permissions and
1201
+ limitations under the License.
1202
+
1203
+ ```
1204
+
1205
+ ## Termcolor - [MIT License](https://github.com/termcolor/termcolor/blob/main/COPYING.txt)
1206
+
1207
+ ```
1208
+ Copyright (c) 2008-2011 Volvox Development Team
1209
+
1210
+ Permission is hereby granted, free of charge, to any person obtaining a copy
1211
+ of this software and associated documentation files (the "Software"), to deal
1212
+ in the Software without restriction, including without limitation the rights
1213
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1214
+ copies of the Software, and to permit persons to whom the Software is
1215
+ furnished to do so, subject to the following conditions:
1216
+
1217
+ The above copyright notice and this permission notice shall be included in
1218
+ all copies or substantial portions of the Software.
1219
+
1220
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1221
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1222
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1223
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1224
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1225
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1226
+ THE SOFTWARE.
1227
+ ```
1228
+
1229
+ ## Transformers - [Apache License 2.0](https://github.com/huggingface/transformers/blob/main/LICENSE)
1230
+
1231
+ ```
1232
+
1233
+ Copyright 2018- The Hugging Face team. All rights reserved.
1234
+
1235
+ Apache License
1236
+ Version 2.0, January 2004
1237
+ http://www.apache.org/licenses/
1238
+
1239
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1240
+
1241
+ 1. Definitions.
1242
+
1243
+ "License" shall mean the terms and conditions for use, reproduction,
1244
+ and distribution as defined by Sections 1 through 9 of this document.
1245
+
1246
+ "Licensor" shall mean the copyright owner or entity authorized by
1247
+ the copyright owner that is granting the License.
1248
+
1249
+ "Legal Entity" shall mean the union of the acting entity and all
1250
+ other entities that control, are controlled by, or are under common
1251
+ control with that entity. For the purposes of this definition,
1252
+ "control" means (i) the power, direct or indirect, to cause the
1253
+ direction or management of such entity, whether by contract or
1254
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
1255
+ outstanding shares, or (iii) beneficial ownership of such entity.
1256
+
1257
+ "You" (or "Your") shall mean an individual or Legal Entity
1258
+ exercising permissions granted by this License.
1259
+
1260
+ "Source" form shall mean the preferred form for making modifications,
1261
+ including but not limited to software source code, documentation
1262
+ source, and configuration files.
1263
+
1264
+ "Object" form shall mean any form resulting from mechanical
1265
+ transformation or translation of a Source form, including but
1266
+ not limited to compiled object code, generated documentation,
1267
+ and conversions to other media types.
1268
+
1269
+ "Work" shall mean the work of authorship, whether in Source or
1270
+ Object form, made available under the License, as indicated by a
1271
+ copyright notice that is included in or attached to the work
1272
+ (an example is provided in the Appendix below).
1273
+
1274
+ "Derivative Works" shall mean any work, whether in Source or Object
1275
+ form, that is based on (or derived from) the Work and for which the
1276
+ editorial revisions, annotations, elaborations, or other modifications
1277
+ represent, as a whole, an original work of authorship. For the purposes
1278
+ of this License, Derivative Works shall not include works that remain
1279
+ separable from, or merely link (or bind by name) to the interfaces of,
1280
+ the Work and Derivative Works thereof.
1281
+
1282
+ "Contribution" shall mean any work of authorship, including
1283
+ the original version of the Work and any modifications or additions
1284
+ to that Work or Derivative Works thereof, that is intentionally
1285
+ submitted to Licensor for inclusion in the Work by the copyright owner
1286
+ or by an individual or Legal Entity authorized to submit on behalf of
1287
+ the copyright owner. For the purposes of this definition, "submitted"
1288
+ means any form of electronic, verbal, or written communication sent
1289
+ to the Licensor or its representatives, including but not limited to
1290
+ communication on electronic mailing lists, source code control systems,
1291
+ and issue tracking systems that are managed by, or on behalf of, the
1292
+ Licensor for the purpose of discussing and improving the Work, but
1293
+ excluding communication that is conspicuously marked or otherwise
1294
+ designated in writing by the copyright owner as "Not a Contribution."
1295
+
1296
+ "Contributor" shall mean Licensor and any individual or Legal Entity
1297
+ on behalf of whom a Contribution has been received by Licensor and
1298
+ subsequently incorporated within the Work.
1299
+
1300
+ 2. Grant of Copyright License. Subject to the terms and conditions of
1301
+ this License, each Contributor hereby grants to You a perpetual,
1302
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
1303
+ copyright license to reproduce, prepare Derivative Works of,
1304
+ publicly display, publicly perform, sublicense, and distribute the
1305
+ Work and such Derivative Works in Source or Object form.
1306
+
1307
+ 3. Grant of Patent License. Subject to the terms and conditions of
1308
+ this License, each Contributor hereby grants to You a perpetual,
1309
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
1310
+ (except as stated in this section) patent license to make, have made,
1311
+ use, offer to sell, sell, import, and otherwise transfer the Work,
1312
+ where such license applies only to those patent claims licensable
1313
+ by such Contributor that are necessarily infringed by their
1314
+ Contribution(s) alone or by combination of their Contribution(s)
1315
+ with the Work to which such Contribution(s) was submitted. If You
1316
+ institute patent litigation against any entity (including a
1317
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
1318
+ or a Contribution incorporated within the Work constitutes direct
1319
+ or contributory patent infringement, then any patent licenses
1320
+ granted to You under this License for that Work shall terminate
1321
+ as of the date such litigation is filed.
1322
+
1323
+ 4. Redistribution. You may reproduce and distribute copies of the
1324
+ Work or Derivative Works thereof in any medium, with or without
1325
+ modifications, and in Source or Object form, provided that You
1326
+ meet the following conditions:
1327
+
1328
+ (a) You must give any other recipients of the Work or
1329
+ Derivative Works a copy of this License; and
1330
+
1331
+ (b) You must cause any modified files to carry prominent notices
1332
+ stating that You changed the files; and
1333
+
1334
+ (c) You must retain, in the Source form of any Derivative Works
1335
+ that You distribute, all copyright, patent, trademark, and
1336
+ attribution notices from the Source form of the Work,
1337
+ excluding those notices that do not pertain to any part of
1338
+ the Derivative Works; and
1339
+
1340
+ (d) If the Work includes a "NOTICE" text file as part of its
1341
+ distribution, then any Derivative Works that You distribute must
1342
+ include a readable copy of the attribution notices contained
1343
+ within such NOTICE file, excluding those notices that do not
1344
+ pertain to any part of the Derivative Works, in at least one
1345
+ of the following places: within a NOTICE text file distributed
1346
+ as part of the Derivative Works; within the Source form or
1347
+ documentation, if provided along with the Derivative Works; or,
1348
+ within a display generated by the Derivative Works, if and
1349
+ wherever such third-party notices normally appear. The contents
1350
+ of the NOTICE file are for informational purposes only and
1351
+ do not modify the License. You may add Your own attribution
1352
+ notices within Derivative Works that You distribute, alongside
1353
+ or as an addendum to the NOTICE text from the Work, provided
1354
+ that such additional attribution notices cannot be construed
1355
+ as modifying the License.
1356
+
1357
+ You may add Your own copyright statement to Your modifications and
1358
+ may provide additional or different license terms and conditions
1359
+ for use, reproduction, or distribution of Your modifications, or
1360
+ for any such Derivative Works as a whole, provided Your use,
1361
+ reproduction, and distribution of the Work otherwise complies with
1362
+ the conditions stated in this License.
1363
+
1364
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
1365
+ any Contribution intentionally submitted for inclusion in the Work
1366
+ by You to the Licensor shall be under the terms and conditions of
1367
+ this License, without any additional terms or conditions.
1368
+ Notwithstanding the above, nothing herein shall supersede or modify
1369
+ the terms of any separate license agreement you may have executed
1370
+ with Licensor regarding such Contributions.
1371
+
1372
+ 6. Trademarks. This License does not grant permission to use the trade
1373
+ names, trademarks, service marks, or product names of the Licensor,
1374
+ except as required for reasonable and customary use in describing the
1375
+ origin of the Work and reproducing the content of the NOTICE file.
1376
+
1377
+ 7. Disclaimer of Warranty. Unless required by applicable law or
1378
+ agreed to in writing, Licensor provides the Work (and each
1379
+ Contributor provides its Contributions) on an "AS IS" BASIS,
1380
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
1381
+ implied, including, without limitation, any warranties or conditions
1382
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
1383
+ PARTICULAR PURPOSE. You are solely responsible for determining the
1384
+ appropriateness of using or redistributing the Work and assume any
1385
+ risks associated with Your exercise of permissions under this License.
1386
+
1387
+ 8. Limitation of Liability. In no event and under no legal theory,
1388
+ whether in tort (including negligence), contract, or otherwise,
1389
+ unless required by applicable law (such as deliberate and grossly
1390
+ negligent acts) or agreed to in writing, shall any Contributor be
1391
+ liable to You for damages, including any direct, indirect, special,
1392
+ incidental, or consequential damages of any character arising as a
1393
+ result of this License or out of the use or inability to use the
1394
+ Work (including but not limited to damages for loss of goodwill,
1395
+ work stoppage, computer failure or malfunction, or any and all
1396
+ other commercial damages or losses), even if such Contributor
1397
+ has been advised of the possibility of such damages.
1398
+
1399
+ 9. Accepting Warranty or Additional Liability. While redistributing
1400
+ the Work or Derivative Works thereof, You may choose to offer,
1401
+ and charge a fee for, acceptance of support, warranty, indemnity,
1402
+ or other liability obligations and/or rights consistent with this
1403
+ License. However, in accepting such obligations, You may act only
1404
+ on Your own behalf and on Your sole responsibility, not on behalf
1405
+ of any other Contributor, and only if You agree to indemnify,
1406
+ defend, and hold each Contributor harmless for any liability
1407
+ incurred by, or claims asserted against, such Contributor by reason
1408
+ of your accepting any such warranty or additional liability.
1409
+
1410
+ END OF TERMS AND CONDITIONS
1411
+
1412
+ APPENDIX: How to apply the Apache License to your work.
1413
+
1414
+ To apply the Apache License to your work, attach the following
1415
+ boilerplate notice, with the fields enclosed by brackets "[]"
1416
+ replaced with your own identifying information. (Don't include
1417
+ the brackets!) The text should be enclosed in the appropriate
1418
+ comment syntax for the file format. We also recommend that a
1419
+ file or class name and description of purpose be included on the
1420
+ same "printed page" as the copyright notice for easier
1421
+ identification within third-party archives.
1422
+
1423
+ Copyright [yyyy] [name of copyright owner]
1424
+
1425
+ Licensed under the Apache License, Version 2.0 (the "License");
1426
+ you may not use this file except in compliance with the License.
1427
+ You may obtain a copy of the License at
1428
+
1429
+ http://www.apache.org/licenses/LICENSE-2.0
1430
+
1431
+ Unless required by applicable law or agreed to in writing, software
1432
+ distributed under the License is distributed on an "AS IS" BASIS,
1433
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1434
+ See the License for the specific language governing permissions and
1435
+ limitations under the License.
1436
+
1437
+ ```
CONTRIBUTING.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How to Contribute
2
+
3
+ We'd love to receive your patches and contributions. Please keep your PRs as drafts until you are ready for us to review them.
4
+
5
+ ## Code Reviews
6
+
7
+ All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult
8
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests.
9
+
10
+ ## Pipeline
11
+
12
+ Ensure that you run the linter before submitting your pull request, and that the CI/CD pipeline is green before you remove the draft designation.
13
+
14
+ ```bash
15
+ ./cosmos1/scripts/format.sh
16
+ ```
17
+
18
+ ## Signing Your Work
19
+
20
+ * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
21
+
22
+ * Any contribution which contains commits that are not Signed-Off will not be accepted.
23
+
24
+ * To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
25
+ ```bash
26
+ $ git commit -s -m "Add cool feature."
27
+ ```
28
+ This will append the following to your commit message:
29
+ ```
30
+ Signed-off-by: Your Name <your@email.com>
31
+ ```
32
+
33
+ * Full text of the DCO:
34
+
35
+ ```
36
+ Developer Certificate of Origin
37
+ Version 1.1
38
+
39
+ Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
40
+ 1 Letterman Drive
41
+ Suite D4700
42
+ San Francisco, CA, 94129
43
+
44
+ Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
45
+ ```
46
+
47
+ ```
48
+ Developer's Certificate of Origin 1.1
49
+
50
+ By making a contribution to this project, I certify that:
51
+
52
+ (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
53
+
54
+ (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
55
+
56
+ (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
57
+
58
+ (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
59
+ ```
Dockerfile ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Use NVIDIA PyTorch container as base image
17
+ FROM nvcr.io/nvidia/pytorch:24.10-py3
18
+
19
+ # Install system dependencies
20
+ RUN apt-get update && apt-get install -y \
21
+ ffmpeg \
22
+ && rm -rf /var/lib/apt/lists/*
23
+
24
+ # Set working directory
25
+ WORKDIR /workspace
26
+
27
+ # Copy source code
28
+ COPY cosmos1 /workspace/cosmos1
29
+
30
+ # Copy main README
31
+ COPY README.md /workspace/
32
+
33
+ # Copy third-party licenses
34
+ COPY ATTRIBUTIONS.md /workspace/
35
+
36
+ # Copy requirements file
37
+ COPY requirements.txt /workspace/
38
+
39
+ # Install Python dependencies
40
+ RUN pip install --no-cache-dir -r requirements.txt
41
+
42
+ # Default command
43
+ CMD ["/bin/bash"]
INSTALL.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cosmos Installation
2
+
3
+ We have only tested the installation with Ubuntu 24.04, 22.04, and 20.04.
4
+
5
+ 1. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
6
+
7
+ 2. Clone the repository.
8
+
9
+ ```bash
10
+ git clone git@github.com:NVIDIA/Cosmos.git
11
+ cd Cosmos
12
+ ```
13
+
14
+ 3. Build a Docker image using `Dockerfile` and run the Docker container.
15
+
16
+ ```bash
17
+ docker build -t cosmos .
18
+ docker run -d --name cosmos_container --gpus all --ipc=host -it -v $(pwd):/workspace cosmos
19
+ docker attach cosmos_container
20
+ ```
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ![Cosmos Logo](assets/cosmos-logo.png)
3
+
4
+ --------------------------------------------------------------------------------
5
+ ### [Website](https://www.nvidia.com/en-us/ai/cosmos/) | [HuggingFace](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6) | [GPU-free Preview](https://build.nvidia.com/explore/discover) | [Paper](https://arxiv.org/abs/2501.03575) | [Paper Website](https://research.nvidia.com/labs/dir/cosmos1/)
6
+
7
+ [NVIDIA Cosmos](https://www.nvidia.com/cosmos/) is a developer-first world foundation model platform designed to help Physical AI developers build their Physical AI systems better and faster. Cosmos contains
8
+
9
+ 1. pre-trained models, available via [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6) under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/) that allows commercial use of the models for free
10
+ 2. training scripts under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0), offered through [NVIDIA Nemo Framework](https://github.com/NVIDIA/NeMo) for post-training the models for various downstream Physical AI applications
11
+
12
+ Details of the platform are described in the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai). Preview access is available at [build.nvidia.com](https://build.nvidia.com).
13
+
14
+ ## Key Features
15
+
16
+ - [Pre-trained Diffusion-based world foundation models](cosmos1/models/diffusion/README.md) for Text2World and Video2World generation where a user can generate visual simulation based on text prompts and video prompts.
17
+ - [Pre-trained Autoregressive-based world foundation models](cosmos1/models/autoregressive/README.md) for Video2World generation where a user can generate visual simulation based on video prompts and optional text prompts.
18
+ - [Video tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer) for tokenizing videos into continuous tokens (latent vectors) and discrete tokens (integers) efficiently and effectively.
19
+ - Video curation pipeline for building your own video dataset. [Coming soon]
20
+ - [Post-training scripts](cosmos1/models/POST_TRAINING.md) via NeMo Framework to post-train the pre-trained world foundation models for various Physical AI setups.
21
+ - Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion/vae)].
22
+
23
+ ## Model Family
24
+
25
+ | Model name | Description | Try it out |
26
+ |------------|----------|----------|
27
+ | [Cosmos-1.0-Diffusion-7B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Text2World) | Text to visual world generation | [Inference](cosmos1/models/diffusion/README.md) |
28
+ | [Cosmos-1.0-Diffusion-14B-Text2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Text2World) | Text to visual world generation | [Inference](cosmos1/models/diffusion/README.md) |
29
+ | [Cosmos-1.0-Diffusion-7B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-7B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/diffusion/README.md) |
30
+ | [Cosmos-1.0-Diffusion-14B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Diffusion-14B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/diffusion/README.md) |
31
+ | [Cosmos-1.0-Autoregressive-4B](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-4B) | Future visual world generation | [Inference](cosmos1/models/autoregressive/README.md) |
32
+ | [Cosmos-1.0-Autoregressive-12B](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-12B) | Future visual world generation | [Inference](cosmos1/models/autoregressive/README.md) |
33
+ | [Cosmos-1.0-Autoregressive-5B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-5B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/autoregressive/README.md) |
34
+ | [Cosmos-1.0-Autoregressive-13B-Video2World](https://huggingface.co/nvidia/Cosmos-1.0-Autoregressive-13B-Video2World) | Video + Text based future visual world generation | [Inference](cosmos1/models/autoregressive/README.md) |
35
+ | [Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) | Guardrail contains pre-Guard and post-Guard for safe use | Embedded in model inference scripts |
36
+
37
+ ## Example Usage
38
+
39
+ ### Inference
40
+
41
+ Follow the [Cosmos Installation Guide](INSTALL.md) to set up the Docker environment. For inference with the pretrained models, please refer to [Cosmos Diffusion Inference](cosmos1/models/diffusion/README.md) and [Cosmos Autoregressive Inference](cosmos1/models/autoregressive/README.md).
42
+
43
+ The code snippet below provides a gist of the inference usage.
44
+
45
+ ```bash
46
+ PROMPT="A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. \
47
+ The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. \
48
+ A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, \
49
+ suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. \
50
+ The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of \
51
+ field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
52
+
53
+ # Example using 7B model
54
+ PYTHONPATH=$(pwd) python cosmos1/models/diffusion/inference/text2world.py \
55
+ --checkpoint_dir checkpoints \
56
+ --diffusion_transformer_dir Cosmos-1.0-Diffusion-7B-Text2World \
57
+ --prompt "$PROMPT" \
58
+ --offload_prompt_upsampler \
59
+ --video_save_name Cosmos-1.0-Diffusion-7B-Text2World
60
+ ```
61
+
62
+ <video src="https://github.com/user-attachments/assets/db7bebfe-5314-40a6-b045-4f6ce0a87f2a">
63
+ Your browser does not support the video tag.
64
+ </video>
65
+
66
+ We also offer [multi-GPU inference](cosmos1/models/diffusion/nemo/inference/README.md) support for Diffusion Text2World world foundation models (WFMs) through NeMo Framework.
67
+
68
+ ### Post-training
69
+
70
+ NeMo Framework provides GPU-accelerated general post-training for both [diffusion](cosmos1/models/diffusion/nemo/post_training/README.md) and [autoregressive](cosmos1/models/autoregressive/nemo/post_training/README.md) models, with other types of post-training coming soon.
71
+
72
+ ## License and Contact
73
+
74
+ This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
75
+
76
+ NVIDIA Cosmos source code is released under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0).
77
+
78
+ NVIDIA Cosmos models are released under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). For a custom license, please contact [cosmos-license@nvidia.com](mailto:cosmos-license@nvidia.com).
RELEASE.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Release Cadence
2
+
3
+
4
+ | Version | Description | Date |
5
+ |------------|----------|----------|
6
+ | [v1.0](release_notes/v0p1.md) | Initial diffusion and autoregressive WFMs release | 2025-01-06 |
7
+ | [v0.1](release_notes/v0p1.md) | Initial tokenizer release | 2024-11-06 |
assets/cosmos-logo.png ADDED
checkpoints/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Checkpoint directory
2
+
3
+ Follow our instructions for downloading checkpoints in [Cosmos Diffusion Inference](../cosmos1/models/diffusion/README.md#download-checkpoints) and [Cosmos Autoregressive Inference](../cosmos1/models/autoregressive/README.md). Cosmos checkpoints will be downloaded to this directory.
cosmos1/models/POST_TRAINING.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cosmos Post-training
2
+
3
+ In the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai), we discuss several post-training examples of Cosmos pre-trained World Foundation Models (WFMs) for various Physical AI tasks, including
4
+
5
+ - General Post-Training: Fine-tune the WFM to generate a target distribution of videos based on the custom dataset. The target distribution could include a specific camera spec or a specific domain such as a factory.
6
+ - Instruction Control: Post-trains models for robotic manipulation to predict videos based on textual instructions, enabling robots to visually simulate tasks like folding clothes or picking up objects.
7
+ - Action Control: Post-trains models for robotic manipulation to predict the next visual frame based on action vectors, simulating robotic tasks like object handling or movement planning.
8
+ - Camera Control: Adds camera pose conditioning to generate 3D-consistent video simulations from single images, enabling joystick-like navigation in virtual environments.
9
+ - Multi-View Generation: Post-trains models for autonomous vehicles to generate synchronized multi-view videos from text prompts, simulating driving scenarios with multiple camera perspectives.
10
+ - Multi-View Generation with Vehicle Trajectory Control: Extends multi-view generation by incorporating trajectory inputs, enabling precise simulation of driving environments for autonomous vehicles, adhering to specified paths.
11
+
12
+ Except for the instruction control where the WFM is post-trained on a dataset of instruction-video pairs, all other cases require minor modifications of the network architectures. Post-training tasks will be supported by NeMo Framework. In this initial release, we provide post-training scripts for the general post-training of both diffusion and autoregressive WFMs. Scripts for the other post-training tasks will be provided in a future release.
13
+
14
+ ## Post-training Support Matrix
15
+
16
+ | Post-training Task | Diffusion WFM | Autoregressive WFM |
17
+ |---------------------|---------------|--------------------|
18
+ | General post-training | [Supported](../models/diffusion/nemo/post_training/README.md) | [Supported](../models/autoregressive/nemo/post_training/README.md) |
19
+ | Instruction control | Coming soon | Coming soon |
20
+ | Action control | Coming soon | Coming soon |
21
+ | Camera control | Coming soon | Coming soon |
22
+ | Multi-view generation | Coming soon | Coming soon |
23
+ | Multi-view generation with vehicle trajectory control | Coming soon | Coming soon |
cosmos1/models/autoregressive/README.md ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Cosmos Autoregressive-based World Foundation Models
2
+
3
+ ## Table of Contents
4
+ - [Getting Started](#getting-started)
5
+ - [Set Up Docker Environment](#set-up-docker-environment)
6
+ - [Download Checkpoints](#download-checkpoints)
7
+ - [Usage](#usage)
8
+ - [Model Types](#model-types)
9
+ - [Single and Batch Generation](#single-and-batch-generation)
10
+ - [Sample Commands](#sample-commands)
11
+ - [Base Models (4B/12B)](#base-basepy-4b-and-12b)
12
+ - [Video2World Models (5B/13B)](#video2world-video2worldpy-5b-and-13b)
13
+ - [Arguments](#arguments)
14
+ - [Common Parameters](#common-parameters)
15
+ - [Base Specific Parameters](#base-specific-parameters)
16
+ - [Video2World Specific Parameters](#video2world-specific-parameters)
17
+ - [Safety Features](#safety-features)
18
+
19
+ This page details the steps for using the Cosmos autoregressive-based world foundation models.
20
+
21
+ ## Getting Started
22
+
23
+ ### Set Up Docker Environment
24
+
25
+ Follow our [Installation Guide](../../../INSTALL.md) to set up the Docker environment. All commands on this page should be run inside Docker.
26
+
27
+ ### Download Checkpoints
28
+
29
+ 1. Generate a [Hugging Face](https://huggingface.co/settings/tokens) access token. Set the access token to 'Read' permission (default is 'Fine-grained').
30
+
31
+ 2. Log in to Hugging Face with the access token:
32
+
33
+ ```bash
34
+ huggingface-cli login
35
+ ```
36
+
37
+ 3. Download the Cosmos model weights from [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6):
38
+
39
+ ```bash
40
+ PYTHONPATH=$(pwd) python cosmos1/scripts/download_autoregressive.py --model_sizes 4B 5B 12B 13B
41
+ ```
42
+
43
+ 4. The downloaded files should be in the following structure:
44
+
45
+ ```
46
+ checkpoints/
47
+ ├── Cosmos-1.0-Autoregressive-4B
48
+ │ ├── model.pt
49
+ │ └── config.json
50
+ ├── Cosmos-1.0-Autoregressive-5B-Video2World
51
+ │ ├── model.pt
52
+ │ └── config.json
53
+ ├── Cosmos-1.0-Autoregressive-12B
54
+ │ ├── model.pt
55
+ │ └── config.json
56
+ ├── Cosmos-1.0-Autoregressive-13B-Video2World
57
+ │ ├── model.pt
58
+ │ └── config.json
59
+ ├── Cosmos-1.0-Tokenizer-CV8x8x8
60
+ │ ├── decoder.jit
61
+ │ ├── encoder.jit
62
+ │ └── mean_std.pt
63
+ ├── Cosmos-1.0-Tokenizer-DV8x16x16
64
+ │ ├── decoder.jit
65
+ │ └── encoder.jit
66
+ ├── Cosmos-1.0-Diffusion-7B-Decoder-DV8x16x16ToCV8x8x8
67
+ │ ├── aux_vars.pt
68
+ │ └── model.pt
69
+ └── Cosmos-1.0-Guardrail
70
+ ├── aegis/
71
+ ├── blocklist/
72
+ ├── face_blur_filter/
73
+ └── video_content_safety_filter/
74
+ ```
75
+
76
+ ## Usage
77
+
78
+
79
+ ### Model Types
80
+
81
+ There are two model types available for autoregressive world generation:
82
+
83
+ 1. **Base**: Supports world generation from image/video input
84
+
85
+ * Models: `Cosmos-1.0-Autoregressive-4B` and `Cosmos-1.0-Autoregressive-12B`
86
+ * Inference script: [base.py](/cosmos1/models/autoregressive/inference/base.py)
87
+
88
+ 2. **Video2World**: Supports world generation from image/video input and text input
89
+
90
+ * Models: `Cosmos-1.0-Autoregressive-5B-Video2World` and `Cosmos-1.0-Autoregressive-13B-Video2World`
91
+ * Inference script: [video2world.py](/cosmos1/models/autoregressive/inference/video2world.py)
92
+
93
+ Our models now support video extension up to 33 frames. Starting from either a single image or a 9-frame video input, they can generate the remaining frames to reach the 33-frame length (generating 32 or 24 frames, respectively).
94
+
95
+ We have evaluated all eight possible configurations (4 models × 2 vision input types: image or video) using 100 test videos on physical AI topics. Below are the failure rates for each configuration:
96
+
97
+ | Model | Image input | Video input (9 frames) |
98
+ |:------------------------------------------|:--------------:|:-------------------------:|
99
+ | Cosmos-1.0-Autoregressive-4B | 15% | 1% |
100
+ | Cosmos-1.0-Autoregressive-5B-Video2World | 7% | 2% |
101
+ | Cosmos-1.0-Autoregressive-12B | 2% | 1% |
102
+ | Cosmos-1.0-Autoregressive-13B-Video2World | 3% | 0% |
103
+
104
+ We define failure cases as videos with severe distortions, such as:
105
+
106
+ * Sudden appearance of large unexpected objects
107
+ * Video degrading to a single solid color
108
+
109
+ Note that the following are not considered failures in our analysis:
110
+
111
+ * Static video frames
112
+ * Minor object distortions or artifacts
113
+
114
+ ### Single and Batch Generation
115
+
116
+ We support both single and batch video generation.
117
+
118
+ For generating a single video, `base` mode requires the input argument `--input_image_or_video_path` (image/video input), while `video2world` mode requires both `--input_image_or_video_path` (image/video input) and `--prompt` (text input).
119
+
120
+ Note that our model only works with 1024x640 resolution videos. If the input image/video is not in this resolution, it will be resized and cropped.
121
+
122
+ For generating a batch of videos, both `base` and `video2world` require `--batch_input_path` (path to a JSONL file). For `base`, the JSONL file should contain one visual input per line in the following format, where each line must contain a "visual_input" field:
123
+
124
+ ```json
125
+ {"visual_input": "path/to/video1.mp4"}
126
+ {"visual_input": "path/to/video2.mp4"}
127
+ ```
128
+
129
+ For `video2world`, each line in the JSONL file must contain both "prompt" and "visual_input" fields:
130
+
131
+ ```json
132
+ {"prompt": "prompt1", "visual_input": "path/to/video1.mp4"}
133
+ {"prompt": "prompt2", "visual_input": "path/to/video2.mp4"}
134
+ ```
135
+
136
+ ### Sample Commands
137
+
138
+ There are two main demo scripts for autoregressive world generation: `base.py` and `video2world.py`. Below you will find sample commands for single and batch generation, as well as commands for running with low-memory GPUs using model offloading. We also provide a memory usage table comparing different offloading strategies to help with configuration.
139
+
140
+ #### Base (base.py): 4B and 12B
141
+
142
+ Generates world from image/video input.
143
+
144
+ The `input_type` argument can be either `video` or `image`. We have tuned the sampling parameters `top_p` and `temperature` to achieve the best performance. Please use the provided values in the command examples.
145
+
146
+ Note that the command examples below all use video input. If you want to use image input, please change the `input_type` to `image`.
147
+
148
+ ##### Single Generation
149
+
150
+ ```bash
151
+ # Example using 4B model
152
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
153
+ --input_type=video \
154
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
155
+ --video_save_name=Cosmos-1.0-Autoregressive-4B \
156
+ --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
157
+ --top_p=0.8 \
158
+ --temperature=1.0
159
+
160
+ # Example for low-memory GPUs using 4B model with model offloading
161
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
162
+ --input_type=video \
163
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
164
+ --video_save_name=Cosmos-1.0-Autoregressive-4B \
165
+ --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
166
+ --top_p=0.8 \
167
+ --temperature=1.0 \
168
+ --offload_guardrail_models \
169
+ --offload_diffusion_decoder \
170
+ --offload_ar_model \
171
+ --offload_tokenizer
172
+
173
+ # Example using 12B model
174
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
175
+ --input_type=video \
176
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
177
+ --video_save_name=Cosmos-1.0-Autoregressive-12B \
178
+ --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
179
+ --top_p=0.9 \
180
+ --temperature=1.0
181
+
182
+ # Example for low-memory GPUs using 12B model with model offloading
183
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
184
+ --input_type=video \
185
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
186
+ --video_save_name=Cosmos-1.0-Autoregressive-12B \
187
+ --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
188
+ --top_p=0.9 \
189
+ --temperature=1.0 \
190
+ --offload_guardrail_models \
191
+ --offload_diffusion_decoder \
192
+ --offload_ar_model \
193
+ --offload_tokenizer
194
+ ```
195
+
196
+ ##### Batch Generation
197
+
198
+ ```bash
199
+ # Example using 4B model
200
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
201
+ --input_type=video \
202
+ --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl \
203
+ --video_save_folder=outputs/Cosmos-1.0-Autoregressive-4B \
204
+ --ar_model_dir=Cosmos-1.0-Autoregressive-4B \
205
+ --top_p=0.8 \
206
+ --temperature=1.0
207
+
208
+ # Example using 12B model
209
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/base.py \
210
+ --input_type=video \
211
+ --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl \
212
+ --video_save_folder=outputs/Cosmos-1.0-Autoregressive-12B \
213
+ --ar_model_dir=Cosmos-1.0-Autoregressive-12B \
214
+ --top_p=0.9 \
215
+ --temperature=1.0
216
+ ```
217
+
218
+ ##### Example Output
219
+
220
+ Here is an example output video generated using base.py with image input, using `Cosmos-1.0-Autoregressive-12B`:
221
+
222
+ <video src="https://github.com/user-attachments/assets/634403a5-1873-42d7-8dd0-eb7fb4ac8cf4">
223
+ Your browser does not support the video tag.
224
+ </video>
225
+
226
+ The input image used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.jpg`. The image is from the [BDD dataset](http://bdd-data.berkeley.edu/).
227
+
228
+ Here is an example output video generated using base.py with 9-frame video input, using `Cosmos-1.0-Autoregressive-12B`:
229
+
230
+ <video src="https://github.com/user-attachments/assets/1a3ff099-87d7-41e8-b149-a25cfcd4f40b">
231
+ Your browser does not support the video tag.
232
+ </video>
233
+
234
+ The input video used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.mp4`.
235
+
236
+ ##### Inference Time and GPU Memory Usage
237
+
238
+ These numbers may vary based on system specifications and are provided for reference only.
239
+
240
+ | Offloading Strategy | Cosmos-1.0-Autoregressive-4B | Cosmos-1.0-Autoregressive-12B |
241
+ |-------------|---------|---------|
242
+ | No offloading | 31.3 GB | 47.5 GB |
243
+ | Guardrails | 28.9 GB | 45.2 GB |
244
+ | Guardrails & Diffusion decoder | 28.5 GB | 43.1 GB |
245
+ | Guardrails & Diffusion decoder & Tokenizer | 27.3 GB | 42.9 GB |
246
+ | Guardrails & Diffusion decoder & Tokenizer & AR model | 18.7 GB | 27.4 GB |
247
+
248
+ End-to-end inference runtime on one H100 without offloading and after model initialization:
249
+
250
+ | Cosmos-1.0-Autoregressive-4B | Cosmos-1.0-Autoregressive-12B |
251
+ |---------|---------|
252
+ | ~62 seconds | ~119 seconds |
253
+
254
+ #### Video2World (video2world.py): 5B and 13B
255
+
256
+ Generates world from image/video and text input.
257
+
258
+ The `input_type` argument can be either `text_and_video` or `text_and_image`. We have tuned the sampling parameters `top_p` and `temperature` to achieve the best performance. Please use the provided values in the command examples.
259
+
260
+ Note that the command examples below all use video input. If you want to use image input, please change the `input_type` to `text_and_image`.
261
+
262
+ ##### Single Generation
263
+
264
+ ```bash
265
+ # Example using 5B model
266
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
267
+ --input_type=text_and_video \
268
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
269
+ --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
270
+ --video_save_name=Cosmos-1.0-Autoregressive-5B-Video2World \
271
+ --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
272
+ --top_p=0.7 \
273
+ --temperature=1.0
274
+
275
+ # Example for low-memory GPUs using 5B model with model offloading
276
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
277
+ --input_type=text_and_video \
278
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
279
+ --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
280
+ --video_save_name=Cosmos-1.0-Autoregressive-5B-Video2World \
281
+ --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
282
+ --top_p=0.7 \
283
+ --temperature=1.0 \
284
+ --offload_guardrail_models \
285
+ --offload_diffusion_decoder \
286
+ --offload_ar_model \
287
+ --offload_tokenizer \
288
+ --offload_text_encoder_model
289
+
290
+ # Example using 13B model
291
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
292
+ --input_type=text_and_video \
293
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
294
+ --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
295
+ --video_save_name=Cosmos-1.0-Autoregressive-13B-Video2World \
296
+ --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
297
+ --top_p=0.8 \
298
+ --temperature=1.0 \
299
+ --offload_guardrail_models
300
+
301
+ # Example for low-memory GPUs using 13B model with model offloading
302
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
303
+ --input_type=text_and_video \
304
+ --input_image_or_video_path=cosmos1/models/autoregressive/assets/v1p0/input.mp4 \
305
+ --prompt="A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
306
+ --video_save_name=Cosmos-1.0-Autoregressive-13B-Video2World \
307
+ --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
308
+ --top_p=0.8 \
309
+ --temperature=1.0 \
310
+ --offload_guardrail_models \
311
+ --offload_diffusion_decoder \
312
+ --offload_ar_model \
313
+ --offload_tokenizer \
314
+ --offload_text_encoder_model
315
+ ```
316
+
317
+ ##### Batch Generation
318
+
319
+ ```bash
320
+ # Example using 5B model
321
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
322
+ --input_type=text_and_video \
323
+ --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl \
324
+ --video_save_folder=outputs/Cosmos-1.0-Autoregressive-5B-Video2World \
325
+ --ar_model_dir=Cosmos-1.0-Autoregressive-5B-Video2World \
326
+ --top_p=0.7 \
327
+ --temperature=1.0
328
+
329
+ # Example using 13B model
330
+ CUDA_VISIBLE_DEVICES=0 PYTHONPATH=$(pwd) python cosmos1/models/autoregressive/inference/video2world.py \
331
+ --input_type=text_and_video \
332
+ --batch_input_path=cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl \
333
+ --video_save_folder=outputs/Cosmos-1.0-Autoregressive-13B-Video2World \
334
+ --ar_model_dir=Cosmos-1.0-Autoregressive-13B-Video2World \
335
+ --top_p=0.8 \
336
+ --temperature=1.0 \
337
+ --offload_guardrail_models
338
+ ```
339
+
340
+ ##### Example Output
341
+
342
+ Here is an example output video generated using video2world.py with image input, using `Cosmos-1.0-Autoregressive-13B-Video2World`:
343
+
344
+ <video src="https://github.com/user-attachments/assets/869f3b81-fabd-462e-a545-c04cdd9c1d22">
345
+ Your browser does not support the video tag.
346
+ </video>
347
+
348
+ The input image used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.jpg`. The prompt for generating the video is:
349
+
350
+ ```
351
+ A driving video captures a serene urban street scene on a sunny day. The camera is mounted on the dashboard of a moving vehicle, providing a first-person perspective as it travels down a two-lane road. The street is lined with parked cars on both sides, predominantly black and silver sedans and SUVs. The road is flanked by a mix of residential and commercial buildings, with a prominent red-brick building on the left side, featuring multiple windows and a flat roof. The sky is clear with a few scattered clouds, casting soft shadows on the street. Trees with lush green foliage line the right side of the road, providing a natural contrast to the urban environment. The camera remains steady, maintaining a consistent forward motion, suggesting a leisurely drive. Traffic is light, with a few vehicles moving in the opposite direction, including a black sedan and a yellow taxi. Street signs are visible, including a no-parking sign on the right. The overall atmosphere is calm and peaceful, with no pedestrians visible, emphasizing the focus on the drive and the surrounding urban landscape.
352
+ ```
353
+
354
+ Here is an example output video generated using video2world.py with 9-frame video input, using `Cosmos-1.0-Autoregressive-13B-Video2World`:
355
+
356
+ <video src="https://github.com/user-attachments/assets/81840e1c-624b-4b01-9240-ab7db3722e58">
357
+ Your browser does not support the video tag.
358
+ </video>
359
+
360
+ The input video used to generate this video can be found in `cosmos1/models/autoregressive/assets/v1p0/input.mp4`. The prompt for generating the video is:
361
+
362
+ ```
363
+ A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.
364
+ ```
365
+
366
+ ##### Inference Time and GPU Memory Usage
367
+
368
+ These numbers may vary based on system specifications and are provided for reference only.
369
+
370
+ | Offloading Strategy | Cosmos-1.0-Autoregressive-5B-Video2World | Cosmos-1.0-Autoregressive-13B-Video2World |
371
+ |-------------|---------|---------|
372
+ | No offloading | 66.2 GB | > 80 GB |
373
+ | Guardrails | 58.7 GB | 76.6 GB |
374
+ | Guardrails & T5 encoder | 41.3 GB | 58.0 GB |
375
+ | Guardrails & T5 encoder & Diffusion decoder | 29.0 GB | 46.9 GB |
376
+ | Guardrails & T5 encoder & Diffusion decoder & Tokenizer | 28.8 GB | 46.7 GB |
377
+ | Guardrails & T5 encoder & Diffusion decoder & Tokenizer & AR model | 21.1 GB | 30.9 GB |
378
+
379
+ End-to-end inference runtime on one H100 with no offloading for 5B model and guardrail offloading for 13B, after model initialization:
380
+
381
+ | Cosmos-1.0-Autoregressive-5B-Video2World | Cosmos-1.0-Autoregressive-13B-Video2World |
382
+ |---------|---------|
383
+ | ~73 seconds | ~150 seconds |
384
+
385
+ ### Arguments
386
+
387
+ #### Common Parameters
388
+
389
+ | Parameter | Description | Default |
390
+ |-----------|-------------|---------|
391
+ | `--checkpoint_dir` | Directory containing model weights | "checkpoints" |
392
+ | `--video_save_name` | Output video filename for single video generation | "output" |
393
+ | `--video_save_folder` | Folder where all output videos are stored | "outputs/" |
394
+ | `--input_image_or_video_path` | Input image or video path. Required for single video generation | None |
395
+ | `--batch_input_path` | Path to a JSONL file listing the inputs (one JSON object per line). Required for batch video generation | None |
396
+ | `--num_input_frames` | Number of input frames to use for Video2World prediction | 9 |
397
+ | `--temperature` | Temperature used while sampling | 1.0 (recommend using values in sample commands provided) |
398
+ | `--top_p` | Top-p value for top-p sampling | 0.8 (recommend using values in sample commands provided) |
399
+ | `--seed` | Random seed | 0 |
400
+ | `--disable_diffusion_decoder` | When set to True, use discrete tokenizer to decode discrete tokens to video. Otherwise, use diffusion decoder to decode video | False |
401
+ | `--offload_guardrail_models` | Offload guardrail models after inference, used for low-memory GPUs | False |
402
+ | `--offload_diffusion_decoder` | Offload diffusion decoder after inference, used for low-memory GPUs | False |
403
+ | `--offload_ar_model` | Offload AR model after inference, used for low-memory GPUs | False |
404
+ | `--offload_prompt_upsampler` | Offload prompt upsampler after inference, used for low-memory GPUs | False |
405
+
406
+ #### Base Specific Parameters
407
+
408
+ | Parameter | Description | Default |
409
+ |-----------|-------------|---------|
410
+ | `--ar_model_dir` | Directory containing AR model weights | "Cosmos-1.0-Autoregressive-4B" |
411
+ | `--input_type` | Input type, either `video` or `image` | "video" |
412
+
413
+ #### Video2World Specific Parameters
414
+
415
+ | Parameter | Description | Default |
416
+ |-----------|-------------|---------|
417
+ | `--ar_model_dir` | Directory containing AR model weights | "Cosmos-1.0-Autoregressive-4B" |
418
+ | `--input_type` | Input type, either `text_and_video` or `text_and_image` | "text_and_video" |
419
+ | `--prompt` | Text prompt for single video generation. Required for single video generation | None |
420
+ | `--input_prompts_path` | Path to JSONL file for batch video generation. Required for batch video generation | None |
421
+ | `--offload_text_encoder_model` | Offload text encoder after inference, used for low-memory GPUs | False |
422
+
423
+ ### Safety Features
424
+
425
+ The model uses a built-in safety guardrail system that cannot be disabled. Generating human faces is not allowed; any faces that appear in the output will be blurred by the guardrail.
426
+
427
+ For more information, check out the [Cosmos Guardrail Documentation](../guardrail/README.md).
cosmos1/models/autoregressive/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
cosmos1/models/autoregressive/assets/nemo/finetuned_result.mp4 ADDED
Binary file (193 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4 ADDED
Binary file (299 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4 ADDED
Binary file (222 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4 ADDED
Binary file (511 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4 ADDED
Binary file (461 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4 ADDED
Binary file (331 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4 ADDED
Binary file (282 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4 ADDED
Binary file (289 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4 ADDED
Binary file (170 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4 ADDED
Binary file (188 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4 ADDED
Binary file (174 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/base.jsonl ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4"}
2
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4"}
3
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4"}
4
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4"}
5
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4"}
6
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4"}
7
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4"}
8
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4"}
9
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4"}
10
+ {"visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4"}
cosmos1/models/autoregressive/assets/v1p0/batch_inputs/video2world.jsonl ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/0.mp4"}
2
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/1.mp4"}
3
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/2.mp4"}
4
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/3.mp4"}
5
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/4.mp4"}
6
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/5.mp4"}
7
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/6.mp4"}
8
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/7.mp4"}
9
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/8.mp4"}
10
+ {"prompt": "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions.", "visual_input": "cosmos1/models/autoregressive/assets/v1p0/batch_inputs/9.mp4"}
cosmos1/models/autoregressive/assets/v1p0/input.jpg ADDED
cosmos1/models/autoregressive/assets/v1p0/input.mp4 ADDED
Binary file (282 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_12b.mp4 ADDED
Binary file (390 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/output_from_image_input_13b.mp4 ADDED
Binary file (430 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_12b.mp4 ADDED
Binary file (195 kB). View file
 
cosmos1/models/autoregressive/assets/v1p0/output_from_video_input_13b.mp4 ADDED
Binary file (193 kB). View file
 
cosmos1/models/autoregressive/configs/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
cosmos1/models/autoregressive/configs/base/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
cosmos1/models/autoregressive/configs/base/model.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional
17
+
18
+ import attrs
19
+
20
+ from cosmos1.models.autoregressive.configs.base.tokenizer import TokenizerConfig
21
+
22
+
23
@attrs.define
class ModelConfig:
    """
    A class to hold model configuration arguments.

    Fields can also be read with subscript syntax (``config["dim"]``); see ``__getitem__``.

    Args:
        dim (int): The dimensionality of the input and output of each transformer block.
        n_layers (int): Number of layers in the transformer.
        n_heads (int): Number of attention heads.
        n_kv_heads (Optional[int]): Number of key-value heads. If None, defaults to n_heads. Note: this is equivalent to
            `num_gqa_groups` in TransformerEngine, where GQA means Grouped Query Attention.
        head_dim (Optional[int]): Dimensionality of each head. If None, defaults to dim // n_heads.
        vocab_size (int): Vocabulary size.
        ffn_hidden_size (int): Hidden size for feedforward network.
        norm_eps (float): Epsilon value for normalization.
        rope_theta (float): Theta value for rotary positional embeddings.
        apply_abs_pos_emb (bool): Whether to apply absolute position embeddings.
        max_batch_size (int): Maximum batch size for inference.
        max_seq_len (int): Maximum sequence length for input text.
        fuse_qkv (bool): Whether to fuse QKV in attention. Defaults to False.
        causal_mask (bool): Whether to use causal mask. Defaults to True.
        norm_type (str): Type of normalization layer. Choices: "rmsnorm", "fused_rmsnorm", "layernorm", "np_layernorm".
        precision (str): Data type for the model.
        use_qk_normalization (bool): Whether to enable QK normalization.
        tokenizer (Optional[TokenizerConfig]): Tokenizer configuration.
        ckpt_dir (Optional[str]): Checkpoint directory.
        ckpt_path (Optional[str]): Checkpoint path. If not None, the model is loaded from this path instead of ckpt_dir.
        apply_yarn (Optional[bool]): Whether to apply YaRN (long-context extension).
        yarn_scale (Optional[float]): Scale factor for YaRN.
        yarn_beta_fast (Optional[int]): Beta fast variable for YaRN (i.e., low_freq_factor in Llama 3.1 RoPE scaling code)
        yarn_beta_slow (Optional[int]): Beta slow variable for YaRN (i.e., high_freq_factor in Llama 3.1 RoPE scaling code)
        original_seq_len (Optional[int]): Original sequence length.
        vision_encoder (Optional[str]): Vision encoder name.
        vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder.
            Default is 3, you can specify to int larger than 3. E.g. if you have 4-channel images with the last channel
            as the alpha channel, set this to 4.
        mm_projector (Optional[str]): Multi-modal projector name.
        rope_dim (Optional[str]): Dimensionality of the RoPE. Choices: "1D", "3D".
        pytorch_rope_version (Optional[str]): Version of the PyTorch RoPE implementation. Choices: "v1", "v2".
        original_latent_shape (Optional[list]): Original shape of the latent tensor needed for rope extension.
        pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value.
        insert_cross_attn (bool): Whether to insert the cross-attention layers after each multi-head self-attention (MSA) layer.
        insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k TransformerLayers.
        context_dim (Optional[int]): The dimensionality of cross-attention embedding, e.g., T5 embed feature dim.
        num_video_frames (Optional[int]): Number of video frames.
        video_height (Optional[int]): Raw video pixel height dimension.
        video_width (Optional[int]): Raw video pixel width dimension.
        video_latent_shape (Optional[list]): Video tokenizer output dimension, in (T,H,W).
    """

    dim: int = attrs.field(default=4096)
    n_layers: int = attrs.field(default=32)
    n_heads: int = attrs.field(default=32)
    n_kv_heads: Optional[int] = attrs.field(default=8)
    head_dim: Optional[int] = attrs.field(default=None)
    vocab_size: int = attrs.field(default=128256)
    ffn_hidden_size: int = attrs.field(default=14336)
    norm_eps: float = attrs.field(default=1e-5)
    rope_theta: float = attrs.field(default=500000)
    apply_abs_pos_emb: bool = attrs.field(default=False)
    max_batch_size: int = attrs.field(default=1)
    max_seq_len: int = attrs.field(default=8192)
    fuse_qkv: bool = attrs.field(default=False)
    causal_mask: bool = attrs.field(default=True)
    norm_type: str = attrs.field(default="rmsnorm")
    precision: str = attrs.field(default="bfloat16")
    use_qk_normalization: bool = False
    tokenizer: Optional[TokenizerConfig] = None
    ckpt_dir: Optional[str] = attrs.field(default=None)
    ckpt_path: Optional[str] = attrs.field(
        default=None
    )  # If not None, load the model from this path instead of ckpt_dir
    apply_yarn: Optional[bool] = attrs.field(default=False)
    yarn_scale: Optional[float] = attrs.field(default=None)
    yarn_beta_fast: Optional[int] = attrs.field(default=None)
    yarn_beta_slow: Optional[int] = attrs.field(default=None)
    original_seq_len: Optional[int] = attrs.field(default=None)
    vision_encoder: Optional[str] = attrs.field(default=None)
    # Declared exactly once: a second, identical declaration further down was redundant.
    vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
    mm_projector: Optional[str] = attrs.field(default=None)
    rope_dim: Optional[str] = attrs.field(default="1D")
    pytorch_rope_version: Optional[str] = attrs.field(default="v2")
    original_latent_shape: Optional[list] = None
    pad_to_multiple_of: Optional[int] = None
    insert_cross_attn: bool = False
    insert_cross_attn_every_k_layers: int = 1
    context_dim: Optional[int] = attrs.field(default=1024)
    # For video training
    num_video_frames: Optional[int] = None
    # Raw video pixel dimension
    video_height: Optional[int] = None
    video_width: Optional[int] = None
    # Video tokenizer output dimension, in (T,H,W): num_video_frames / temporal compression factor,
    # video_height / spatial compression factor, video_width / spatial compression factor
    video_latent_shape: Optional[list] = None

    def __getitem__(self, item):
        """
        Return the value of the configuration field named ``item``.

        Enables dict-style reads such as ``config["dim"]``. The lookup is a plain
        ``getattr``, so an unknown name raises ``AttributeError`` rather than ``KeyError``.
        """
        return getattr(self, item)
cosmos1/models/autoregressive/configs/base/model_config.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import copy
17
+ from typing import Callable, List, Optional
18
+
19
+ from cosmos1.models.autoregressive.configs.base.model import ModelConfig
20
+ from cosmos1.models.autoregressive.configs.base.tokenizer import (
21
+ TextTokenizerConfig,
22
+ TokenizerConfig,
23
+ VideoTokenizerConfig,
24
+ create_discrete_video_fsq_tokenizer_state_dict_config,
25
+ )
26
+ from cosmos1.models.autoregressive.tokenizer.image_text_tokenizer import ImageTextTokenizer
27
+ from cosmos1.models.autoregressive.tokenizer.text_tokenizer import TextTokenizer
28
+ from cosmos1.utils import log
29
+ from cosmos1.utils.lazy_config import LazyCall as L
30
+
31
# Settings applied to every architecture before the per-family tables below
# are merged on top of them.
BASE_CONFIG = {
    "n_kv_heads": 8,
    "norm_type": "rmsnorm",
    "norm_eps": 1e-5,
    "ffn_hidden_size": 14336,
}

# Cosmos architecture specifications, keyed by model size.
COSMOS_ARCHITECTURES = {
    "4b": {"n_layers": 16, "dim": 4096, "n_heads": 32},
    "12b": {"n_layers": 40, "dim": 5120, "n_heads": 32, "head_dim": 128},
}

# YaRN settings merged into the pretrained Cosmos 12B specification.
COSMOS_YARN_CONFIG = {
    "original_latent_shape": [3, 40, 64],
    "apply_yarn": True,
    "yarn_beta_fast": 4,
    "yarn_beta_slow": 1,
    "yarn_scale": 2,
}

# Llama3 architecture specifications for different model sizes
LLAMA3_ARCHITECTURES = {
    "8b": {"n_layers": 32, "dim": 4096, "n_heads": 32, "ffn_hidden_size": 14336},
}

# Llama3.1 uses YaRN for long context support (context of 128k tokens)
LLAMA_YARN_CONFIG = {
    "apply_yarn": True,
    "yarn_scale": 8,
    "yarn_beta_fast": 4,
    "yarn_beta_slow": 1,
}

# Mistral architecture specifications for different model sizes
MISTRAL_ARCHITECTURES = {
    "12b": {
        "n_layers": 40,
        "dim": 5120,
        "n_heads": 32,
        "ffn_hidden_size": 14336,
        "head_dim": 128,
    },
}

# Vision components added on top of the Mistral specification for Pixtral.
PIXTRAL_VISION_ARCHITECTURES = {
    "12b": {
        "vision_encoder": "pixtral-12b-vit",
        "mm_projector": "mlp",
    },
}
86
+
87
+
88
def get_model_arch_specs(model_size: str, model_family: str = "mistral", pretrained: bool = False) -> dict:
    """
    Get the model architecture specifications for the given model size, model family and pretrained status.

    Args:
        model_size (str): Model size (case-insensitive). Must be a key of the table selected by
            ``model_family``: "4b" or "12b" for cosmos, "8b" for llama, "12b" for mistral and pixtral.
        model_family (str): Model family. Any name starting with "cosmos" or "llama", or exactly
            "mistral" or "pixtral". With ``pretrained=True`` only "cosmos", "llama", "llama3",
            "llama3.1", "mistral" and "pixtral" have a pretrained configuration.
        pretrained (bool): Whether to also merge the settings of the pretrained checkpoint
            (RoPE/YaRN settings, sequence length, vocabulary size) into the result.

    Returns:
        dict: A dictionary containing the model architecture specifications.

    Raises:
        ValueError: If the model family is unsupported, or has no pretrained config when
            ``pretrained`` is True.
        KeyError: If ``model_size`` is not available for the selected family.
        AssertionError: If ``pretrained`` is True for mistral or pixtral with a size other than "12b".
    """
    arch_specs = copy.deepcopy(BASE_CONFIG)
    model_size = model_size.lower()

    # Select the per-family table first so an unknown size can be reported clearly.
    if model_family.startswith("cosmos"):
        family_architectures = COSMOS_ARCHITECTURES
    elif model_family.startswith("llama"):
        family_architectures = LLAMA3_ARCHITECTURES
    elif model_family in ["mistral", "pixtral"]:
        family_architectures = MISTRAL_ARCHITECTURES
    else:
        raise ValueError(f"Model family {model_family} is not supported.")

    if model_size not in family_architectures:
        # Still a KeyError (as the plain dict lookup raised before), but with an actionable message.
        raise KeyError(
            f"Model size {model_size} is not supported for model family {model_family}. "
            f"Choices: {sorted(family_architectures)}"
        )
    arch_specs.update(family_architectures[model_size])
    if model_family == "pixtral":
        arch_specs.update(PIXTRAL_VISION_ARCHITECTURES[model_size])

    if pretrained:
        if model_family == "cosmos":
            # Only the 12B Cosmos model carries extra (YaRN) settings; other sizes need nothing more.
            if model_size == "12b":
                arch_specs.update(COSMOS_YARN_CONFIG)
                log.debug(f"Using YaRN for RoPE extension with config: {COSMOS_YARN_CONFIG}")
        elif model_family in ["llama", "llama3"]:
            pretrained_specs = {
                "rope_theta": 500000,
                "max_seq_len": 8192,
                "vocab_size": 128256,
            }
            arch_specs.update(pretrained_specs)
        elif model_family == "llama3.1":
            pretrained_specs = {
                "rope_theta": 500000,
                "max_seq_len": 131072,
                "original_seq_len": 8192,
                "vocab_size": 128256,
                **LLAMA_YARN_CONFIG,
            }
            arch_specs.update(pretrained_specs)
        elif model_family == "mistral":
            assert model_size == "12b", "We only support Mistral-Nemo-12B model."
            pretrained_specs = {
                "rope_theta": 1000000,
                "max_seq_len": 128000,
                "vocab_size": 131072,
            }
            arch_specs.update(pretrained_specs)
        elif model_family == "pixtral":
            assert model_size == "12b", "We only support Pixtral 12B model."
            pretrained_specs = {"rope_theta": 1000000000, "max_seq_len": 128000, "vocab_size": 131072}
            arch_specs.update(pretrained_specs)
        else:
            raise ValueError(f"Model family {model_family} doesn't have a pretrained config.")

    return arch_specs
152
+
153
+
154
def create_text_model_config(
    model_ckpt_path: str,
    tokenizer_path: str,
    model_family: str = "mistral",
    model_size: str = "12b",
    is_instruct_model: bool = True,
    max_seq_len: Optional[int] = None,
    max_batch_size: int = 1,
    rope_dim: str = "1D",
    add_special_tokens: bool = True,
    pytorch_rope_version: Optional[str] = None,
) -> tuple:
    """Create the model and tokenizer configs of a text model for training or inference.

    Args:
        model_ckpt_path (str): Path to the model checkpoint.
        tokenizer_path (str): Path to the tokenizer folder.
        model_family (str): Model family, forwarded to ``get_model_arch_specs`` and to the text tokenizer.
        model_size (str): Model size, forwarded to ``get_model_arch_specs``.
        is_instruct_model (bool): Whether the model is an instruct model.
        max_seq_len (Optional[int]): If given, overrides the ``max_seq_len`` of the pretrained specification.
        max_batch_size (int): Maximum batch size.
        rope_dim (str): RoPE dimension. Choices: "1D", "3D".
        add_special_tokens (bool): Whether to add special tokens.
        pytorch_rope_version (Optional[str]): If given, passed to ``ModelConfig`` as ``pytorch_rope_version``;
            otherwise the ``ModelConfig`` default is used.

    Returns:
        tuple: ``(model_config, tokenizer_config)``, a ``ModelConfig`` and a ``TokenizerConfig``.
    """
    # Model size specific parameters
    model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
    if max_seq_len is not None:
        # Override the max_seq_len if provided
        model_arch_specs["max_seq_len"] = max_seq_len
    if pytorch_rope_version is not None:
        model_arch_specs["pytorch_rope_version"] = pytorch_rope_version
    model_config = ModelConfig(
        max_batch_size=max_batch_size,
        precision="bfloat16",
        ckpt_path=model_ckpt_path,
        use_qk_normalization=False,
        rope_dim=rope_dim,
        **model_arch_specs,
    )

    # The tokenizer's vocabulary size and sequence length are read back from the model config
    # so the two configs cannot drift apart.
    tokenizer_config = TokenizerConfig(
        text_tokenizer=TextTokenizerConfig(
            config=L(TextTokenizer)(
                model_family=model_family,
                is_instruct_model=is_instruct_model,
                local_path=tokenizer_path,
            ),
            data_key="text",
            tokenizer_offset=model_config.vocab_size,
            tokenize_here=False,
            vocab_size=model_config.vocab_size,
        ),
        seq_len=model_config.max_seq_len,
        training_type="text_only",
        add_special_tokens=add_special_tokens,
    )
    return model_config, tokenizer_config
214
+
215
+
216
def create_vision_language_model_config(
    model_ckpt_path: str,
    tokenizer_ckpt_path: str,
    model_family: str = "pixtral",
    model_size: str = "12b",
    is_instruct_model: bool = True,
    max_batch_size: int = 1,
    rope_dim: str = "1D",
    add_special_tokens: bool = True,
    max_seq_len: Optional[int] = None,
    vision_encoder_in_channels: int = 3,
    fuse_qkv: bool = False,
    pytorch_rope_version: Optional[str] = None,
) -> tuple:
    """Create the model and tokenizer configs of a vision-language model for training or inference.

    Args:
        model_ckpt_path (str): Path to the model checkpoint.
        tokenizer_ckpt_path (str): Path to the tokenizer checkpoint. Used both as the image processor
            path and as the tokenizer path of the ``ImageTextTokenizer``.
        model_family (str): Model family. Choices: "pixtral".
        model_size (str): Model size. Choices: "12b".
        is_instruct_model (bool): Whether the model is an instruct model.
        max_batch_size (int): Maximum batch size.
        rope_dim (str): RoPE dimension. Choices: "1D".
        add_special_tokens (bool): Whether to add special tokens.
        max_seq_len (Optional[int]): If given, overrides the ``max_seq_len`` of the pretrained specification.
        vision_encoder_in_channels (int): Number of channels in the input image for the vision encoder.
            Default is 3; it can be set to an int larger than 3, e.g. 4 for 4-channel images whose last
            channel is a binary mask.
        fuse_qkv (bool): Whether to fuse the QKV linear layers.
        pytorch_rope_version (Optional[str]): If given, passed to ``ModelConfig`` as ``pytorch_rope_version``;
            otherwise the ``ModelConfig`` default is used.

    Returns:
        tuple: ``(model_config, tokenizer_config)``, a ``ModelConfig`` and a ``TokenizerConfig``.
    """
    # Model size specific parameters
    model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
    if max_seq_len is not None:
        # Override the max_seq_len if provided
        model_arch_specs["max_seq_len"] = max_seq_len
    if pytorch_rope_version is not None:
        model_arch_specs["pytorch_rope_version"] = pytorch_rope_version

    model_config = ModelConfig(
        max_batch_size=max_batch_size,
        precision="bfloat16",
        ckpt_path=model_ckpt_path,
        use_qk_normalization=False,
        rope_dim=rope_dim,
        vision_encoder_in_channels=vision_encoder_in_channels,
        fuse_qkv=fuse_qkv,
        **model_arch_specs,
    )
    # Vision-language tokenizer; vocabulary size and sequence length are read back from the
    # model config so the two configs cannot drift apart.
    tokenizer_config = TokenizerConfig(
        text_tokenizer=TextTokenizerConfig(
            config=L(ImageTextTokenizer)(
                model_family=model_family,
                is_instruct_model=is_instruct_model,
                image_processor_path=tokenizer_ckpt_path,
                tokenizer_path=tokenizer_ckpt_path,
            ),
            data_key="image_text_interleaved",
            tokenizer_offset=model_config.vocab_size,
            tokenize_here=False,
            vocab_size=model_config.vocab_size,
        ),
        seq_len=model_config.max_seq_len,
        training_type="image_text_interleaved",
        add_special_tokens=add_special_tokens,
    )
    return model_config, tokenizer_config
282
+
283
+
284
def create_video2world_model_config(
    model_ckpt_path: str,
    tokenizer_ckpt_path: str,
    model_family: str = "cosmos",
    model_size: str = "4b",
    pixel_chunk_duration: int = 9,
    num_video_frames: int = 36,
    compression_ratio: List[int] = [8, 16, 16],
    original_seq_len: int = 8192,
    num_condition_latents_t: int = 1,
    num_tokens_to_ignore: int = -1,
    batch_size: int = 2,
    video_tokenizer_config_creator: Callable = create_discrete_video_fsq_tokenizer_state_dict_config,
    rope_dim: str = "3D",
    add_special_tokens: bool = True,
    video_height: int = 384,
    video_width: int = 640,
    use_qk_normalization: bool = True,
    insert_cross_attn: bool = False,
    insert_cross_attn_every_k_layers: int = 1,
    context_dim: int = 1024,
    training_type: str = "video_to_video",
    pad_to_multiple_of: Optional[int] = 64,
    vocab_size: int = 64000,
    apply_abs_pos_emb: bool = False,
) -> tuple:
    """Create a video-to-world model config.

    Args:
        model_ckpt_path (str): Path to the model checkpoint.
        tokenizer_ckpt_path (str): Path to the video tokenizer checkpoint, passed to
            ``video_tokenizer_config_creator``.
        model_family (str): Model family, forwarded to ``get_model_arch_specs``. Defaults to "cosmos".
        model_size (str): Model size, forwarded to ``get_model_arch_specs``. Defaults to "4b".
        pixel_chunk_duration (int): Number of frames in each chunk. Must be ``k * compression_ratio[0] + 1``.
        num_video_frames (int): Number of video frames. Must be divisible by ``pixel_chunk_duration``.
        compression_ratio (List[int]): Compression ratio for the video frames, as (temporal, height, width).
            Choices: [8, 16, 16] or [4, 8, 8].
        original_seq_len (int): Original sequence length.
        num_condition_latents_t (int): Number of conditioning latent frames; multiplied by
            ``latent_height * latent_width`` to derive ``num_tokens_to_ignore`` when that is negative.
        num_tokens_to_ignore (int): Number of tokens to ignore. A non-negative value takes precedence
            over ``num_condition_latents_t``.
        batch_size (int): Batch size, passed to ``ModelConfig`` as ``max_batch_size``.
        video_tokenizer_config_creator (Callable): Called as
            ``creator(tokenizer_ckpt_path, pixel_chunk_duration, compression_ratio)``; returns the
            video tokenizer config.
        rope_dim (str): RoPE dimension. Choices: "1D", "3D".
        add_special_tokens (bool): Whether to add special tokens, use False for 2D/3D RoPE.
        video_height (int): Height of the video frame. Defaults to 384.
        video_width (int): Width of the video frame. Defaults to 640.
        use_qk_normalization (bool): Whether to use Query-Key normalization.
        insert_cross_attn (bool): Passed to ``ModelConfig`` as ``insert_cross_attn``.
        insert_cross_attn_every_k_layers (int): Passed to ``ModelConfig`` under the same name.
        context_dim (int): Passed to ``ModelConfig`` as ``context_dim``.
        training_type (str): Type of training task.
        pad_to_multiple_of (Optional[int]): Pad the token sequence length to the nearest multiple of this
            number. Defaults to 64. ``None`` disables this padding step.
        vocab_size (int): Vocabulary size of the video tokenizer config.
        apply_abs_pos_emb (bool): Whether to apply absolute positional embeddings.

    Returns:
        tuple: ``(model_config, tokenizer_config)``, a ``ModelConfig`` and a ``TokenizerConfig``.
    """
    # Work on a copy so neither the shared default list nor a caller's list is aliased into
    # the configs returned below.
    compression_ratio = list(compression_ratio)

    assert (
        pixel_chunk_duration % compression_ratio[0] == 1
    ), f"pixel_chunk_duration({pixel_chunk_duration}) should be k*n + 1 (k={compression_ratio[0]})"
    latent_chunk_duration = (pixel_chunk_duration - 1) // compression_ratio[0] + 1
    latent_height = video_height // compression_ratio[1]
    latent_width = video_width // compression_ratio[2]
    # Do some math to compute the video latent shape and sequence length
    assert (
        num_video_frames % pixel_chunk_duration == 0
    ), f"num_video_frames {num_video_frames} should be divisible by pixel_chunk_duration {pixel_chunk_duration}"
    video_latent_shape = [
        num_video_frames // pixel_chunk_duration * latent_chunk_duration,
        latent_height,
        latent_width,
    ]
    # product of video_latent_shape
    num_token_video_latent = video_latent_shape[0] * video_latent_shape[1] * video_latent_shape[2]
    if add_special_tokens:
        seq_len = num_token_video_latent + 3  # Sequence length per batch, max_seq_len + 3
        seq_len = (seq_len + 63) // 64 * 64  # Round up to multiple of 64
    # for text to video, we need to add <bov> token to indicate the start of the video
    elif training_type == "text_to_video":
        seq_len = num_token_video_latent + 1
    else:
        seq_len = num_token_video_latent

    # pad_to_multiple_of is Optional: skip the padding step instead of failing on `% None`.
    if pad_to_multiple_of is not None and seq_len % pad_to_multiple_of != 0:
        # Round up to the nearest multiple of pad_to_multiple_of
        seq_len = ((seq_len + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of

    # Model size specific parameters
    model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)

    # Whether skip the loss for first chunk or not, note the first token is already skipped when computing the loss
    # If num_tokens_to_ignore is specified, use it.
    # Else compute it from num_condition_latents_t
    # NOTE(review): the resulting value is not passed to either config in this function - confirm
    # whether it should be forwarded or whether this computation is obsolete.
    if num_tokens_to_ignore < 0:
        num_tokens_to_ignore = latent_height * latent_width * num_condition_latents_t
        if not add_special_tokens and num_condition_latents_t > 0:
            # If there are no special tokens (bov), do a -1 so that you can compute the loss
            # from the first token of the next chunk
            num_tokens_to_ignore -= 1

    # NOTE(review): the model vocabulary size is fixed at 64000 here while the `vocab_size`
    # argument only reaches the video tokenizer config below - confirm this is intended.
    model_config = ModelConfig(
        video_height=video_height,
        video_width=video_width,
        max_seq_len=seq_len,
        max_batch_size=batch_size,
        precision="bfloat16",
        ckpt_path=model_ckpt_path,
        use_qk_normalization=use_qk_normalization,
        vocab_size=64000,
        original_seq_len=original_seq_len,
        video_latent_shape=video_latent_shape,
        num_video_frames=num_video_frames,
        rope_dim=rope_dim,
        pad_to_multiple_of=pad_to_multiple_of,
        insert_cross_attn=insert_cross_attn,
        insert_cross_attn_every_k_layers=insert_cross_attn_every_k_layers,
        context_dim=context_dim,
        apply_abs_pos_emb=apply_abs_pos_emb,
        **model_arch_specs,
    )

    video_tokenizer_config = video_tokenizer_config_creator(
        tokenizer_ckpt_path, pixel_chunk_duration, compression_ratio
    )
    tokenizer_config = TokenizerConfig(
        text_tokenizer=None,
        video_tokenizer=VideoTokenizerConfig(
            config=video_tokenizer_config,
            data_key="video",
            tokenizer_offset=0,  # Since there is no text embeddings in the model. Note this only apply when the model is trained from scratch. If we use text pretrained model, the offset will be vocab_size of text token.
            tokenize_here=True,
            max_seq_len=num_token_video_latent,
            vocab_size=vocab_size,
        ),
        seq_len=seq_len,
        training_type=training_type,
        add_special_tokens=add_special_tokens,
        pad_to_multiple_of=pad_to_multiple_of,
    )
    return model_config, tokenizer_config
cosmos1/models/autoregressive/configs/base/tokenizer.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional
17
+
18
+ import attrs
19
+
20
+ from cosmos1.models.autoregressive.tokenizer.discrete_video import DiscreteVideoFSQStateDictTokenizer
21
+ from cosmos1.models.autoregressive.tokenizer.networks import CausalDiscreteVideoTokenizer
22
+ from cosmos1.utils.lazy_config import LazyCall as L
23
+ from cosmos1.utils.lazy_config import LazyDict
24
+
25
+
26
def create_discrete_video_fsq_tokenizer_state_dict_config(
    ckpt_path, pixel_chunk_duration=33, compression_ratio=[8, 16, 16]
) -> LazyDict:
    """Build the lazy config of the discrete video FSQ tokenizer.

    Args:
        ckpt_path (str): Tokenizer checkpoint path. The encoder and decoder paths are derived from it
            by replacing "ema.jit" with "encoder.jit" and "decoder.jit".
        pixel_chunk_duration (int): Number of pixel frames per chunk. Defaults to 33.
        compression_ratio (list): Compression ratio; its first entry is used as the temporal factor
            when computing ``latent_chunk_duration``. Defaults to [8, 16, 16].

    Returns:
        LazyDict: Lazy config for ``DiscreteVideoFSQStateDictTokenizer``.
    """
    # Copy so the shared default list (or the caller's list) is not stored in the returned config.
    compression_ratio = list(compression_ratio)

    # NOTE(review): spatial_compression/temporal_compression below are fixed at 16/8 and do not
    # follow `compression_ratio` - confirm this is intended for non-default ratios.
    CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
        # The new causal discrete tokenizer, that is at least 2x more efficient in memory and runtime.
        # - It relies on fully 3D discrete wavelet transform
        # - Uses a layer norm instead of a group norm
        # - Factorizes full convolutions into spatial and temporal convolutions
        # - Factorizes full attention into spatial and temporal attention
        # - Strictly causal, with flexible temporal length at inference.
        attn_resolutions=[32],
        channels=128,
        channels_mult=[2, 4, 4],
        dropout=0.0,
        in_channels=3,
        num_res_blocks=2,
        out_channels=3,
        resolution=1024,
        patch_size=4,
        patch_method="haar",
        z_channels=16,
        z_factor=1,
        num_groups=1,
        legacy_mode=False,
        spatial_compression=16,
        temporal_compression=8,
        embedding_dim=6,
        levels=[8, 8, 8, 5, 5, 5],
        name="CausalDiscreteFactorizedVideoTokenizer",
    )

    return L(DiscreteVideoFSQStateDictTokenizer)(
        enc_fp=ckpt_path.replace("ema.jit", "encoder.jit"),
        dec_fp=ckpt_path.replace("ema.jit", "decoder.jit"),
        tokenizer_module=CausalDiscreteFactorizedVideoTokenizerConfig,
        name="discrete_video_fsq",
        latent_ch=6,
        is_bf16=True,
        pixel_chunk_duration=pixel_chunk_duration,
        latent_chunk_duration=1 + (pixel_chunk_duration - 1) // compression_ratio[0],
        max_enc_batch_size=8,
        max_dec_batch_size=4,
        levels=[8, 8, 8, 5, 5, 5],
        compression_ratio=compression_ratio,
    )
71
+
72
+
73
@attrs.define(slots=False)
class TextTokenizerConfig:
    """
    Configuration of the text tokenizer.

    Args:
        config: Lazy config that defines the text tokenizer class.
        data_key (str): Key of data_dict whose value is passed to the text tokenizer.
        tokenize_here (bool): Whether the tokenizer performs online tokenization.
        tokenizer_offset (int): Offset that is added to the tokens.
        vocab_size (int): Vocabulary size of the tokenizer.
    """

    config: LazyDict
    data_key: str = attrs.field(default="")
    tokenize_here: bool = attrs.field(default=False)
    tokenizer_offset: int = attrs.field(default=0)
    vocab_size: int = attrs.field(default=0)
91
+
92
+
93
@attrs.define(slots=False)
class VideoTokenizerConfig:
    """
    Configuration of the video tokenizer.

    Args:
        config: Lazy config that defines the video tokenizer class.
        data_key (str): Key of data_dict whose value is passed to the video tokenizer.
        tokenize_here (bool): Whether the tokenizer performs online tokenization.
        tokenizer_offset (int): Offset that is added to the tokens. With joint text-video tokenizers
            the offset keeps video tokens and text tokens from overlapping.
        vocab_size (int): Vocabulary size of the tokenizer.
        max_seq_len (int): Maximum token length for an input video.
    """

    config: LazyDict
    data_key: str = attrs.field(default="")
    tokenize_here: bool = attrs.field(default=True)
    tokenizer_offset: int = attrs.field(default=0)
    vocab_size: int = attrs.field(default=0)
    max_seq_len: int = attrs.field(default=-1)
114
+
115
+
116
@attrs.define(slots=False)
class TokenizerConfig:
    """
    Joint tokenizer config

    Args:
        text_tokenizer (Optional[TextTokenizerConfig]): Text tokenizer config, or None when no text tokenizer is used.
        video_tokenizer (Optional[VideoTokenizerConfig]): Video tokenizer config, or None when no video tokenizer is used.
        seq_len (int): Final token sequence length. Defaults to 4096.
        training_type (Optional[str]): Type of training we use. Supports ["text_only", "text_to_video", "class_to_image", "image_text_interleaved"]. Defaults to None.
        add_special_tokens (bool): Whether to add special tokens to the output tokens
        pad_to_multiple_of (Optional[int]): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
    """

    text_tokenizer: Optional[TextTokenizerConfig] = None
    video_tokenizer: Optional[VideoTokenizerConfig] = None
    seq_len: int = 4096
    # Defaults to None, so the annotation has to admit it.
    training_type: Optional[str] = None
    add_special_tokens: bool = True
    pad_to_multiple_of: Optional[int] = 64
cosmos1/models/autoregressive/configs/inference/inference_config.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, List, Union
17
+
18
+ import attrs
19
+
20
+ from cosmos1.models.autoregressive.configs.base.model import ModelConfig, TokenizerConfig
21
+
22
+
23
@attrs.define(slots=False)
class DataShapeConfig:
    """
    Shape description of the data.

    Args:
        latent_shape (list): Shape of the latent. Defaults to a new empty list per instance.
        num_video_frames (Optional[int]): Number of video frames. Defaults to None.
        height (Optional[int]): Height. Defaults to None.
        width (Optional[int]): Width. Defaults to None.
    """

    # A factory gives every instance its own list; a literal `[]` default would be one list
    # object shared (and mutated) across all instances.
    latent_shape: list = attrs.field(factory=list)
    num_video_frames: Union[None, int] = None
    height: Union[None, int] = None
    width: Union[None, int] = None
29
+
30
+
31
@attrs.define(slots=False)
class SamplingConfig:
    """
    Sampling config
    Args:
        temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
        top_k (Optional[int]): Top-k value for sampling. Defaults to None
            (presumably meaning no top-k filtering - confirm against the sampling code).
        top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
        compile_prefill (bool): Defaults to False. Presumably toggles compilation of the prefill
            step - confirm against the generation code.
        compile_sampling (bool): Defaults to True. Presumably toggles compilation of the sampling
            step - confirm against the generation code.
        logprobs (bool): Flag indicating whether to compute token log probabilities. Defaults to False.
        echo (bool): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.

    """

    temperature: float = 0.6
    # Defaults to None, so the annotation has to admit it.
    top_k: Union[None, int] = None
    top_p: float = 0.9
    compile_prefill: bool = False
    compile_sampling: bool = True
    logprobs: bool = False
    echo: bool = False
50
+
51
+
52
@attrs.define(slots=False)
class DiffusionDecoderSamplingConfig:
    """
    Diffusion decoder sampling config
    Args:
        guidance (float): Guidance scale for the diffusion process. Controls how much the model follows the conditioning. Defaults to 1.8.
        sigma_min (float): Minimum noise level for the diffusion process. Defaults to 0.02.
        sigma (float): Initial noise level for the diffusion process. Defaults to 8.
        num_steps (int): Number of denoising steps to perform. Defaults to 15.
        overlap (int): Number of overlapping frames between video chunks during processing. Defaults to 2.
        continuous_tokenizer_channel (int): Number of channels in the continuous tokenizer of diffusion decoder. Defaults to 16. Keyword-only.
        continuous_tokenizer_spatial_compression_ratio (int): Spatial compression ratio for the continuous tokenizer of diffusion decoder. Defaults to 8. Keyword-only.
        dd_train_num_video_frames (int): Number of video frames used during training for diffusion decoder. Defaults to 57.
        max_iter (int): Defaults to 99. NOTE(review): purpose is not visible in this file - confirm and document.
        fps (int): Defaults to 24. Presumably frames per second - confirm against the caller.
    """

    guidance: float = 1.8
    sigma_min: float = 0.02
    sigma: float = 8
    num_steps: int = 15
    overlap: int = 2
    # Without an annotation attrs does not treat an attribute as a field, so these two could not be
    # set through the constructor. They are keyword-only so the positional order of the
    # pre-existing fields is unchanged.
    continuous_tokenizer_channel: int = attrs.field(default=16, kw_only=True)
    continuous_tokenizer_spatial_compression_ratio: int = attrs.field(default=8, kw_only=True)
    dd_train_num_video_frames: int = 57
    max_iter: int = 99
    fps: int = 24
77
+
78
+
79
@attrs.define(slots=False)
class InferenceConfig:
    """
    Inference config
    Args:
        model_config (ModelConfig): Model config. Defaults to None.
        tokenizer_config (TokenizerConfig): Tokenizer config. Defaults to None.
        ckpt_path (str): Path to the checkpoint. Defaults to "".
        data_shape_config (DataShapeConfig): Shape description of the data. Defaults to None.
        defaults (List[Any]): Default config-group selections; a new list is built for every instance.
    """

    model_config: ModelConfig = None
    tokenizer_config: TokenizerConfig = None
    ckpt_path: str = ""
    data_shape_config: DataShapeConfig = None

    # Built through a factory so instances never share (and mutate) one list.
    defaults: List[Any] = attrs.field(
        factory=lambda: [
            "_self_",
            {"data_val": None},
            {"data_shape_config": "video_shape_as_model_config"},
            {"eval_job": None},
        ]
    )
cosmos1/models/autoregressive/diffusion_decoder/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
cosmos1/models/autoregressive/diffusion_decoder/config/base/conditioner.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from dataclasses import dataclass
17
+ from typing import Dict, Optional
18
+
19
+ import torch
20
+
21
+ from cosmos1.models.diffusion.conditioner import BaseVideoCondition, GeneralConditioner
22
+ from cosmos1.models.diffusion.config.base.conditioner import (
23
+ FPSConfig,
24
+ ImageSizeConfig,
25
+ LatentConditionConfig,
26
+ LatentConditionSigmaConfig,
27
+ NumFramesConfig,
28
+ PaddingMaskConfig,
29
+ TextConfig,
30
+ )
31
+ from cosmos1.utils.lazy_config import LazyCall as L
32
+ from cosmos1.utils.lazy_config import LazyDict
33
+
34
+
35
@dataclass
class VideoLatentDiffusionDecoderCondition(BaseVideoCondition):
    """Condition container for the video latent diffusion decoder.

    Extends ``BaseVideoCondition`` with two optional tensor fields, both defaulting to None.
    """

    # latent_condition will concat to the input of network, along channel dim;
    # cfg will make latent_condition all zero padding.
    latent_condition: Optional[torch.Tensor] = None
    latent_condition_sigma: Optional[torch.Tensor] = None
41
+
42
+
43
class VideoDiffusionDecoderConditioner(GeneralConditioner):
    """Conditioner that returns its output as a ``VideoLatentDiffusionDecoderCondition``."""

    def forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> VideoLatentDiffusionDecoderCondition:
        """Run the parent ``_forward`` and wrap its result.

        Args:
            batch (Dict): Input batch, forwarded unchanged to ``GeneralConditioner._forward``.
            override_dropout_rate (Optional[Dict[str, float]]): Forwarded unchanged to ``_forward``.

        Returns:
            VideoLatentDiffusionDecoderCondition: Built from the keyword mapping returned by ``_forward``.
        """
        return VideoLatentDiffusionDecoderCondition(**super()._forward(batch, override_dropout_rate))
51
+
52
+
53
# Lazy config for VideoDiffusionDecoderConditioner: one sub-config per condition input
# (text, fps, frame count, image size, padding mask, latent condition and its sigma).
VideoLatentDiffusionDecoderConditionerConfig: LazyDict = L(VideoDiffusionDecoderConditioner)(
    text=TextConfig(),
    fps=FPSConfig(),
    num_frames=NumFramesConfig(),
    image_size=ImageSizeConfig(),
    padding_mask=PaddingMaskConfig(),
    latent_condition=LatentConditionConfig(),
    latent_condition_sigma=LatentConditionSigmaConfig(),
)
cosmos1/models/autoregressive/diffusion_decoder/config/config_latent_diffusion_decoder.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, List
17
+
18
+ import attrs
19
+
20
+ from cosmos1.models.autoregressive.diffusion_decoder.config.registry import register_configs as register_dd_configs
21
+ from cosmos1.models.diffusion.config.base.model import LatentDiffusionDecoderModelConfig
22
+ from cosmos1.models.diffusion.config.registry import register_configs
23
+ from cosmos1.utils import config
24
+ from cosmos1.utils.config_helper import import_all_modules_from_package
25
+
26
+
27
@attrs.define(slots=False)
class Config(config.Config):
    """Top-level config of the latent diffusion decoder; extends ``config.Config`` with a defaults list."""

    # default config groups that will be used unless overwritten
    # see config groups in registry.py
    # The list is built through a factory so each instance gets its own copy.
    defaults: List[Any] = attrs.field(
        factory=lambda: [
            "_self_",
            {"net": None},
            {"conditioner": "basic"},
            {"tokenizer": "tokenizer"},
            {"tokenizer_corruptor": None},
            {"latent_corruptor": None},
            {"pixel_corruptor": None},
            {"experiment": None},
        ]
    )
43
+
44
+
45
def make_config():
    """Build the diffusion decoder ``Config`` and register its config groups and experiments.

    Besides constructing the config, this registers the diffusion and diffusion-decoder config
    groups and imports both inference config packages.

    Returns:
        Config: Config whose model is a ``LatentDiffusionDecoderModelConfig`` and whose job
        project, group and name are set to placeholder values.
    """
    c = Config(model=LatentDiffusionDecoderModelConfig())

    # Specifying values through instances of attrs
    c.job.project = "cosmos_video4"
    c.job.group = "debug"
    c.job.name = "delete_${now:%Y-%m-%d}_${now:%H-%M-%S}"

    # Call this function to register config groups for advanced overriding.
    register_configs()
    register_dd_configs()

    # experiment config are defined in the experiment folder
    # call import_all_modules_from_package to register them
    import_all_modules_from_package("cosmos1.models.diffusion.config.inference", reload=True)
    import_all_modules_from_package("cosmos1.models.autoregressive.diffusion_decoder.config.inference", reload=True)
    return c
cosmos1/models/autoregressive/diffusion_decoder/config/inference/cosmos_diffusiondecoder_7b.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from hydra.core.config_store import ConfigStore
17
+
18
+ from cosmos1.models.autoregressive.diffusion_decoder.network import DiffusionDecoderGeneralDIT
19
+ from cosmos1.utils.lazy_config import LazyCall as L
20
+ from cosmos1.utils.lazy_config import LazyDict
21
+
22
# Number of pixel frames per chunk; reused below for the latent shape and both tokenizers.
num_frames = 57

# Inference-only experiment config for the 7B diffusion decoder.
Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY: LazyDict = LazyDict(
    {
        # Config-group overrides; "_self_" last so the values below win over the group defaults.
        "defaults": [
            {"override /net": "faditv2_7b"},
            {"override /tokenizer": "cosmos_video_tokenizer_res720_comp8x8x8_t121_ver092624"},
            {"override /conditioner": "video_latent_diffusion_decoder_cond"},
            {"override /tokenizer_corruptor": "cosmos_video_discrete_tokenizer_res720_comp8x16x16_t49_ver110224"},
            "_self_",
        ],
        "job": {
            "group": "diffusion_deocder_FT_7Bv1_001",
            "name": "DD_FT_7Bv1_003_002_tokenizer888_spatch2_discrete_cond_on_token",
        },
        "model": {
            "diffusion_decoder_cond_sigma_low": 0.0,
            "diffusion_decoder_cond_sigma_high": 0.0,
            "diffusion_decoder_corrupt_prob": 0.0,
            "condition_on_tokenizer_corruptor_token": True,
            "latent_shape": [16, num_frames, 88, 160],
            "tokenizer_corruptor": {
                "pixel_chunk_duration": num_frames,
                "latent_chunk_duration": 1 + (num_frames - 1) // 8,
            },
            "net": L(DiffusionDecoderGeneralDIT)(
                diffusion_decoder_condition_on_sigma=False,
                max_img_h=240,
                max_img_w=240,
                rope_h_extrapolation_ratio=1.5,
                rope_w_extrapolation_ratio=1.5,
                rope_t_extrapolation_ratio=1,
                block_x_format="THWBD",
                is_diffusion_decoder=True,
                patch_spatial=2,
                diffusion_decoder_condition_on_token=True,
                diffusion_decoder_token_condition_voc_size=64000,
                diffusion_decoder_token_condition_dim=32,
            ),
            "tokenizer": {
                "video_vae": {
                    "pixel_chunk_duration": num_frames,
                },
            },
            "conditioner": {
                "latent_condition": {
                    "dropout_rate": 0.2,
                },
            },
        },
    }
)

# Register the experiment under its job name so it can be selected as an "experiment" option.
cs = ConfigStore.instance()
cs.store(
    group="experiment",
    package="_global_",
    name=Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY["job"]["name"],
    node=Cosmos_DiffusionDecoder_7B_INFERENCE_ONLY,
)
cosmos1/models/autoregressive/diffusion_decoder/config/registry.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from hydra.core.config_store import ConfigStore
17
+
18
+ from cosmos1.models.autoregressive.diffusion_decoder.config.base.conditioner import (
19
+ VideoLatentDiffusionDecoderConditionerConfig,
20
+ )
21
+ from cosmos1.models.autoregressive.tokenizer.discrete_video import DiscreteVideoFSQJITTokenizer
22
+ from cosmos1.models.diffusion.module.pretrained_vae import JITVAE, JointImageVideoSharedJITTokenizer, VideoJITTokenizer
23
+ from cosmos1.utils.lazy_config import LazyCall as L
24
+
25
+
26
def get_cosmos_video_discrete_tokenizer_comp8x16x16(
    resolution: str,
    chunk_duration: int,
    checkpoint_path: str,
):
    """Build the lazy config for the discrete FSQ video tokenizer (8x temporal, 16x16 spatial).

    Args:
        resolution: Must be ``"720"``; any other value fails the assertion.
        chunk_duration: Number of pixel frames per chunk.
        checkpoint_path: Path containing ``".jit"``; every ``".jit"`` occurrence is replaced by
            ``"encoder.jit"`` / ``"decoder.jit"`` to obtain the encoder and decoder file paths.

    Returns:
        A ``LazyCall`` config for ``DiscreteVideoFSQJITTokenizer``.
    """
    assert resolution in ["720"]

    temporal_factor, spatial_factor = 8, 16
    encoder_path = checkpoint_path.replace(".jit", "encoder.jit")
    decoder_path = checkpoint_path.replace(".jit", "decoder.jit")
    # The first frame maps to one latent frame; each further group of `temporal_factor` frames adds one.
    latent_frames = 1 + (chunk_duration - 1) // temporal_factor

    tokenizer_factory = L(DiscreteVideoFSQJITTokenizer)
    return tokenizer_factory(
        enc_fp=encoder_path,
        dec_fp=decoder_path,
        name="discrete_video_fsq",
        latent_ch=6,
        is_bf16=True,
        pixel_chunk_duration=chunk_duration,
        latent_chunk_duration=latent_frames,
        max_enc_batch_size=8,
        max_dec_batch_size=4,
        levels=[8, 8, 8, 5, 5, 5],
        compression_ratio=[temporal_factor, spatial_factor, spatial_factor],
    )
50
+
51
+
52
def get_cosmos_video_tokenizer_comp8x8x8(resolution: str, chunk_duration: int, checkpoint_path=None):
    """Build the lazy config for the joint image/video continuous tokenizer (8x8x8 compression).

    Args:
        resolution: Forwarded to the video tokenizer as ``spatial_resolution``.
        chunk_duration: Number of pixel frames per chunk for the video tokenizer.
        checkpoint_path: Accepted for interface compatibility; not used by this function.

    Returns:
        A ``LazyCall`` config for ``JointImageVideoSharedJITTokenizer``.
    """
    compression = 8  # the same factor is used for the temporal and the spatial axes
    shared_name = "cosmos_1_0_diffusion_tokenizer"
    latent_channels = 16

    video_vae = L(VideoJITTokenizer)(
        name=shared_name,
        latent_ch=latent_channels,
        is_bf16=True,
        pixel_chunk_duration=chunk_duration,
        temporal_compression_factor=compression,
        spatial_compression_factor=compression,
        spatial_resolution=resolution,
    )
    image_vae = L(JITVAE)(
        name=shared_name,
        latent_ch=latent_channels,
        is_image=False,
        is_bf16=True,
    )
    return L(JointImageVideoSharedJITTokenizer)(
        video_vae=video_vae,
        image_vae=image_vae,
        name="cosmos_diffusion_tokenizer_res720_comp8x8x8_t121_ver092624",
        latent_ch=latent_channels,
    )
76
+
77
+
78
def register_tokenizer(cs):
    """Store the continuous 8x8x8 video tokenizer config in the ``tokenizer`` group of ``cs``."""
    tokenizer_node = get_cosmos_video_tokenizer_comp8x8x8(
        resolution="720",
        chunk_duration=121,
        checkpoint_path="checkpoints/Cosmos-1.0-Tokenizer-CV8x8x8/.jit",
    )
    cs.store(
        group="tokenizer",
        package="model.tokenizer",
        name="cosmos_video_tokenizer_res720_comp8x8x8_t121_ver092624",
        node=tokenizer_node,
    )
89
+
90
+
91
def register_corruptor(cs):
    """Store the discrete 8x16x16 video tokenizer config in the ``tokenizer_corruptor`` group of ``cs``."""
    corruptor_node = get_cosmos_video_discrete_tokenizer_comp8x16x16(
        resolution="720",
        chunk_duration=49,
        checkpoint_path="checkpoints/Cosmos-1.0-Tokenizer-DV8x16x16/.jit",
    )
    cs.store(
        group="tokenizer_corruptor",
        package="model.tokenizer_corruptor",
        name="cosmos_video_discrete_tokenizer_res720_comp8x16x16_t49_ver110224",
        node=corruptor_node,
    )
102
+
103
+
104
def register_conditioner(cs):
    """Store the diffusion decoder conditioner config in the ``conditioner`` group of ``cs``."""
    store_kwargs = {
        "group": "conditioner",
        "package": "model.conditioner",
        "name": "video_latent_diffusion_decoder_cond",
        "node": VideoLatentDiffusionDecoderConditionerConfig,
    }
    cs.store(**store_kwargs)
111
+
112
+
113
def register_configs():
    """Register the conditioner, tokenizer corruptor and tokenizer groups on the global ``ConfigStore``."""
    config_store = ConfigStore.instance()

    # Same order as before: conditioner, corruptor, tokenizer.
    for register in (register_conditioner, register_corruptor, register_tokenizer):
        register(config_store)
cosmos1/models/autoregressive/diffusion_decoder/inference.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import copy
17
+ import gc
18
+ from typing import List
19
+
20
+ import torch
21
+
22
+ from cosmos1.models.autoregressive.configs.inference.inference_config import DiffusionDecoderSamplingConfig
23
+ from cosmos1.models.autoregressive.diffusion_decoder.model import LatentDiffusionDecoderModel
24
+ from cosmos1.models.autoregressive.diffusion_decoder.utils import linear_blend_video_list, split_with_overlap
25
+ from cosmos1.utils import log
26
+
27
+
28
def diffusion_decoder_process_tokens(
    model: LatentDiffusionDecoderModel,
    indices_tensor: List[torch.Tensor],
    dd_sampling_config: DiffusionDecoderSamplingConfig = None,
    original_video_example: torch.Tensor = None,
    t5_emb_batch: List[torch.Tensor] = None,
):
    """Decode discrete video tokens into pixel videos with the diffusion decoder.

    Each token tensor is split into overlapping chunks, every chunk is sampled by
    ``model.generate_samples_from_batch`` and decoded to pixels, and the decoded chunks
    are linearly blended back into one video.

    Args:
        model: Diffusion decoder model used for sampling and decoding.
        indices_tensor: One token tensor per video. Two leading singleton dimensions are
            added before chunking (presumably each entry is a (T, H, W) grid; confirm
            against the caller).
        dd_sampling_config: Sampling options; a default ``DiffusionDecoderSamplingConfig``
            is created when ``None``.
        original_video_example: Required. A 4-D tensor whose last three dimensions give the
            output frame count, height and width.
        t5_emb_batch: Required. One text embedding per entry of ``indices_tensor``.

    Returns:
        A list with one CPU tensor per input video: the decoded pixels are clamped to
        [-1, 1], cropped to the first ``T`` frames and rescaled to [0, 1].

    Raises:
        ValueError: If ``original_video_example`` or ``t5_emb_batch`` is missing, or if
            there are fewer text embeddings than token tensors.
    """
    # These two arguments default to None only for signature compatibility; fail early
    # with a clear message instead of an AttributeError/TypeError deeper in the function.
    if original_video_example is None:
        raise ValueError("original_video_example is required to determine the output T, H and W.")
    if t5_emb_batch is None:
        raise ValueError("t5_emb_batch is required: one text embedding per entry of indices_tensor.")
    if len(t5_emb_batch) < len(indices_tensor):
        raise ValueError(
            f"t5_emb_batch has {len(t5_emb_batch)} entries but indices_tensor has {len(indices_tensor)}."
        )

    _, T, H, W = original_video_example.shape
    if dd_sampling_config is None:
        dd_sampling_config = DiffusionDecoderSamplingConfig()

    # Build one template batch per video; the per-chunk "video" entry is filled in later.
    data_batch_list = []
    for sample_num, token_CTHW in enumerate(indices_tensor):
        token_BCTHW = token_CTHW.unsqueeze(0).unsqueeze(1)
        token_chunks = split_with_overlap(
            token_BCTHW,
            # Latent frames per chunk for the configured number of pixel frames.
            (dd_sampling_config.dd_train_num_video_frames - 1) // 8 + 1,
            overlap=dd_sampling_config.overlap,
            tobf16=False,
        )
        data_batch_list.append(
            {
                "token_chunks": token_chunks,
                "t5_text_embeddings": t5_emb_batch[sample_num].to(torch.bfloat16),
                "t5_text_mask": torch.ones(1, 512, dtype=torch.bfloat16).cuda(),
                # other conditions
                "image_size": torch.tensor([[H, W, H, W]] * 1, dtype=torch.bfloat16).cuda(),
                "fps": torch.tensor([dd_sampling_config.fps] * 1, dtype=torch.bfloat16).cuda(),
                "num_frames": torch.tensor(
                    [dd_sampling_config.dd_train_num_video_frames] * 1, dtype=torch.bfloat16
                ).cuda(),
                "padding_mask": torch.zeros((1, 1, H, W), dtype=torch.bfloat16).cuda(),
            }
        )

    out_videos_batch = []

    for idx, data_batch_template in enumerate(data_batch_list):
        full_length_sample = []
        token_chunks = data_batch_template["token_chunks"]
        num_chunks = min(len(token_chunks), dd_sampling_config.max_iter)
        for chunk_idx in range(num_chunks):
            # Release memory held by the previous chunk before sampling the next one.
            gc.collect()
            torch.cuda.empty_cache()

            # Work on a copy so anything the model writes into the batch does not leak
            # into the next chunk.
            data_batch = copy.deepcopy(data_batch_template)
            data_batch["video"] = token_chunks[chunk_idx].cuda()

            log.debug(f"Run iter {chunk_idx} for video # {idx} at length {data_batch['video'].shape[2]}")
            with torch.no_grad():
                samples_latent = model.generate_samples_from_batch(
                    data_batch,
                    guidance=dd_sampling_config.guidance,
                    sigma_min=dd_sampling_config.sigma_min,
                    state_shape=[
                        dd_sampling_config.continuous_tokenizer_channel,
                        dd_sampling_config.continuous_tokenizer_spatial_compression_ratio,
                        H // 8,
                        W // 8,
                    ],
                    apply_corruptor=False,
                    return_recon_x=False,
                    # corrupt_sigma=dd_sampling_config.sigma,
                    preencode_condition=True,  # We are using discrete model, so the input is already pre-encoded
                    num_steps=dd_sampling_config.num_steps,
                )
            log.debug(f"Current sample shape {samples_latent.shape} for video # {idx} ")
            full_length_sample.append(samples_latent.detach())

            # Turn off because we remove CP
            # distributed.barrier()
            del data_batch

            torch.cuda.empty_cache()

        gc.collect()
        torch.cuda.empty_cache()

        # Decode full-length samples and free GPU memory
        full_length_sample_pixs = [model.decode(item).clamp(-1, 1).cpu() for item in full_length_sample]
        torch.cuda.empty_cache()

        # Blend pixel samples
        if len(full_length_sample_pixs) > 1:
            full_length_sample_pixel_blend = linear_blend_video_list(
                full_length_sample_pixs, dd_sampling_config.overlap
            )[:, :, :T]
        else:
            full_length_sample_pixel_blend = full_length_sample_pixs[0][:, :, :T]

        # Batch size of full_length_sample_pixel_blend is always 1; map [-1, 1] to [0, 1].
        out_videos_batch.append((1 + full_length_sample_pixel_blend[0].cpu()) / 2)
    return out_videos_batch
cosmos1/models/autoregressive/diffusion_decoder/model.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from dataclasses import dataclass
17
+ from typing import Callable, Dict, Optional, Tuple
18
+
19
+ import torch
20
+ from torch import Tensor
21
+
22
+ from cosmos1.models.diffusion.conditioner import BaseVideoCondition
23
+ from cosmos1.models.diffusion.diffusion.functional.batch_ops import batch_mul
24
+ from cosmos1.models.diffusion.diffusion.modules.res_sampler import COMMON_SOLVER_OPTIONS
25
+ from cosmos1.models.diffusion.model.model_t2w import DiffusionT2WModel as VideoDiffusionModel
26
+ from cosmos1.utils.lazy_config import instantiate as lazy_instantiate
27
+
28
+
29
@dataclass
class VideoLatentDiffusionDecoderCondition(BaseVideoCondition):
    """Video condition extended with the diffusion decoder's latent conditioning inputs."""

    # Concatenated to the network input along the channel dimension;
    # classifier-free guidance replaces it with all-zero padding.
    latent_condition: Optional[torch.Tensor] = None
    # Noise-level conditioning that accompanies `latent_condition`.
    latent_condition_sigma: Optional[torch.Tensor] = None
35
+
36
+
37
class LatentDiffusionDecoderModel(VideoDiffusionModel):
    """Video diffusion model that decodes (possibly corrupted) latent or token conditions.

    Corruption modules:
        latent_corruptor: corrupts the latents by adding Gaussian noise to them.
        pixel_corruptor: corrupts the pixels by applying a Gaussian blur kernel in a
            temporally consistent way.
        tokenizer_corruptor: simulates tokenizer reconstruction errors.

    Noise augmentation pipeline for the continuous token condition model:
        condition: GT_video [T, H, W]
            -> tokenizer_corruptor~(8x8x8) encode -> latent_corruptor -> tokenizer_corruptor~(8x8x8) decode
            -> pixel corruptor
            -> tokenizer~(1x8x8) encode -> condition [T, H/8, W/8]
        GT: GT_video [T, H, W] -> tokenizer~(1x8x8) -> x_t [T, H/8, W/8].

    Noise augmentation pipeline for the discrete token condition model:
        condition: GT_video [T, H, W]
            -> pixel corruptor
            -> discrete tokenizer encode -> condition [T, T/8, H/16, W/16]
        GT: GT_video [T, H, W] -> tokenizer~(8x8x8) -> x_t [T, T/8, H/8, W/8].
    """

    def __init__(self, config):
        """Instantiate the corruptors from ``config`` and cache the corruption settings."""
        super().__init__(config)
        self.latent_corruptor = lazy_instantiate(config.latent_corruptor)
        self.pixel_corruptor = lazy_instantiate(config.pixel_corruptor)
        self.tokenizer_corruptor = lazy_instantiate(config.tokenizer_corruptor)

        if self.latent_corruptor:
            self.latent_corruptor.to(**self.tensor_kwargs)
        if self.pixel_corruptor:
            self.pixel_corruptor.to(**self.tensor_kwargs)

        if self.tokenizer_corruptor:
            if hasattr(self.tokenizer_corruptor, "reset_dtype"):
                self.tokenizer_corruptor.reset_dtype()
        else:
            # Without a tokenizer corruptor the pixel corruptor is the only corruption source.
            assert self.pixel_corruptor is not None

        self.diffusion_decoder_cond_sigma_low = config.diffusion_decoder_cond_sigma_low
        self.diffusion_decoder_cond_sigma_high = config.diffusion_decoder_cond_sigma_high
        self.diffusion_decoder_corrupt_prob = config.diffusion_decoder_corrupt_prob
        # Older configs may not define this flag; treat it as disabled in that case.
        self.condition_on_tokenizer_corruptor_token = getattr(
            config, "condition_on_tokenizer_corruptor_token", False
        )

    def is_image_batch(self, data_batch: dict[str, Tensor]) -> bool:
        """Return True when ``data_batch`` holds images rather than video.

        We handle two types of data_batch. One comes from a joint dataloader where
        "dataset_name" can be used to differentiate image batches from video batches.
        The other comes from a dataloader which we by default assume to be video data
        for video model training.

        Exactly one of ``self.input_image_key`` and ``self.input_data_key`` must be present.
        """
        is_image = self.input_image_key in data_batch
        is_video = self.input_data_key in data_batch
        assert (
            is_image != is_video
        ), "Only one of the input_image_key or input_data_key should be present in the data_batch."
        return is_image

    def get_x0_fn_from_batch(
        self,
        data_batch: Dict,
        guidance: float = 1.5,
        is_negative_prompt: bool = False,
        apply_corruptor: bool = True,
        corrupt_sigma: float = 1.5,
        preencode_condition: bool = False,
    ) -> Tuple[Callable, torch.Tensor]:
        """Build the guided denoising function ``x0_fn`` for a data batch.

        The batch is first turned into a latent condition (either the pre-encoded input or
        the encoding of the optionally corrupted input), which is written back into
        ``data_batch`` under ``"latent_condition"`` and ``"latent_condition_sigma"``. The
        conditioner then produces the conditioned and unconditioned states used by ``x0_fn``.

        Args:
            data_batch (Dict): Batch used for conditioning; must contain ``self.input_data_key``.
                It is modified in place. Its format must match what ``self.conditioner`` expects.
            guidance (float): Weight of the (conditioned - unconditioned) term added to the
                conditioned prediction. Defaults to 1.5.
            is_negative_prompt (bool): Use the negative-prompt T5 embedding for the
                unconditioned state if true.
            apply_corruptor (bool): Apply ``self.pixel_corruptor`` (when present) before
                encoding; ignored when ``preencode_condition`` is true.
            corrupt_sigma (float): Upper bound of the uniformly sampled condition noise level.
            preencode_condition (bool): Treat the input as an already-encoded condition.

        Returns:
            A tuple ``(x0_fn, corrupted_pixel)`` where ``x0_fn(noise_x, sigma)`` returns the
            guided x0 prediction and ``corrupted_pixel`` is the pixel-space condition.
        """
        input_key = self.input_data_key  # by default it is video key
        # Latent state
        raw_state = data_batch[input_key]

        if self.condition_on_tokenizer_corruptor_token:
            if preencode_condition:
                latent_condition = raw_state.to(torch.int32).contiguous()
                corrupted_pixel = self.tokenizer_corruptor.decode(latent_condition[:, 0])
            else:
                corrupted_pixel = (
                    self.pixel_corruptor(raw_state) if apply_corruptor and self.pixel_corruptor else raw_state
                )
                latent_condition = self.tokenizer_corruptor.encode(corrupted_pixel)
                # Some tokenizers return a tuple; the second element is used as the condition.
                latent_condition = latent_condition[1] if isinstance(latent_condition, tuple) else latent_condition
                corrupted_pixel = self.tokenizer_corruptor.decode(latent_condition)
                latent_condition = latent_condition.unsqueeze(1)
        else:
            if preencode_condition:
                latent_condition = raw_state
                corrupted_pixel = self.decode(latent_condition)
            else:
                corrupted_pixel = (
                    self.pixel_corruptor(raw_state) if apply_corruptor and self.pixel_corruptor else raw_state
                )
                latent_condition = self.encode(corrupted_pixel).contiguous()

        sigma = (
            torch.rand((latent_condition.shape[0],)).to(**self.tensor_kwargs) * corrupt_sigma
        )  # small value to indicate clean video
        _, _, _, c_noise_cond = self.scaling(sigma=sigma)
        if corrupt_sigma != self.diffusion_decoder_cond_sigma_low and self.diffusion_decoder_corrupt_prob > 0:
            noise = batch_mul(sigma, torch.randn_like(latent_condition))
            latent_condition = latent_condition + noise
        data_batch["latent_condition_sigma"] = batch_mul(torch.ones_like(latent_condition[:, 0:1, ::]), c_noise_cond)
        data_batch["latent_condition"] = latent_condition
        if is_negative_prompt:
            condition, uncondition = self.conditioner.get_condition_with_negative_prompt(data_batch)
        else:
            condition, uncondition = self.conditioner.get_condition_uncondition(data_batch)

        def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
            cond_x0 = self.denoise(noise_x, sigma, condition).x0
            uncond_x0 = self.denoise(noise_x, sigma, uncondition).x0
            return cond_x0 + guidance * (cond_x0 - uncond_x0)

        return x0_fn, corrupted_pixel

    def generate_samples_from_batch(
        self,
        data_batch: Dict,
        guidance: float = 1.5,
        seed: int = 1,
        state_shape: Tuple | None = None,
        n_sample: int | None = None,
        is_negative_prompt: bool = False,
        num_steps: int = 35,
        solver_option: COMMON_SOLVER_OPTIONS = "2ab",
        sigma_min: float = 0.02,
        apply_corruptor: bool = False,
        return_recon_x: bool = False,
        corrupt_sigma: float = 0.01,
        preencode_condition: bool = False,
    ) -> Tensor | Tuple[Tensor, Tensor]:
        """Generate video samples conditioned on ``data_batch``.

        Args:
            data_batch (dict): Raw data batch; modified in place while building the condition.
            guidance (float): Guidance weight.
            seed (int): Seed of the generator that draws the initial noise.
            state_shape (tuple): Shape of one sample's state; defaults to ``self.state_shape``.
            n_sample (int): Number of samples; defaults to the batch size of the input tensor.
            is_negative_prompt (bool): Use the negative-prompt T5 embedding in the uncondition if true.
            num_steps (int): Number of steps of the diffusion process.
            solver_option (str): Differential equation solver option, default "2ab"~(multistep solver).
            sigma_min (float): Minimum noise level passed to the sampler.
            apply_corruptor (bool): Forwarded to ``get_x0_fn_from_batch``.
            return_recon_x (bool): Also return the pixel-space condition if true.
            corrupt_sigma (float): Forwarded to ``get_x0_fn_from_batch``.
            preencode_condition (bool): Use the pre-computed condition if true, which saves the
                tokenizer's inference time and memory; batch normalization is skipped.

        Returns:
            The sampled states, or ``(samples, recon_x)`` when ``return_recon_x`` is true.
        """
        if not preencode_condition:
            self._normalize_video_databatch_inplace(data_batch)
            self._augment_image_dim_inplace(data_batch)
        # Only video batches are supported here; the image branches are kept for parity.
        is_image_batch = False
        if n_sample is None:
            input_key = self.input_image_key if is_image_batch else self.input_data_key
            n_sample = data_batch[input_key].shape[0]
        if state_shape is None:
            if is_image_batch:
                state_shape = (self.state_shape[0], 1, *self.state_shape[2:])  # C,T,H,W
            else:
                # Documented default; previously state_shape stayed None and
                # torch.randn(n_sample, *state_shape, ...) raised a TypeError.
                state_shape = self.state_shape

        x0_fn, recon_x = self.get_x0_fn_from_batch(
            data_batch,
            guidance,
            is_negative_prompt=is_negative_prompt,
            apply_corruptor=apply_corruptor,
            corrupt_sigma=corrupt_sigma,
            preencode_condition=preencode_condition,
        )
        generator = torch.Generator(device=self.tensor_kwargs["device"])
        generator.manual_seed(seed)
        x_sigma_max = (
            torch.randn(n_sample, *state_shape, **self.tensor_kwargs, generator=generator) * self.sde.sigma_max
        )

        samples = self.sampler(
            x0_fn,
            x_sigma_max,
            num_steps=num_steps,
            sigma_min=sigma_min,
            sigma_max=self.sde.sigma_max,
            solver_option=solver_option,
        )

        if return_recon_x:
            return samples, recon_x
        return samples
cosmos1/models/autoregressive/diffusion_decoder/network.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional, Tuple
17
+
18
+ import torch
19
+ from einops import rearrange
20
+ from torch import nn
21
+ from torchvision import transforms
22
+
23
+ from cosmos1.models.diffusion.module.blocks import PatchEmbed
24
+ from cosmos1.models.diffusion.networks.general_dit import GeneralDIT
25
+
26
+
27
class DiffusionDecoderGeneralDIT(GeneralDIT):
    """``GeneralDIT`` variant whose input is augmented with a diffusion decoder condition.

    The condition is either a continuous latent concatenated along the channel dimension,
    or discrete tokens that are embedded, resized to the input resolution and concatenated.
    """

    def __init__(
        self,
        *args,
        is_diffusion_decoder: bool = True,
        diffusion_decoder_condition_on_sigma: bool = False,
        diffusion_decoder_condition_on_token: bool = False,
        diffusion_decoder_token_condition_voc_size: int = 64000,
        diffusion_decoder_token_condition_dim: int = 32,
        **kwargs,
    ):
        """Store the diffusion decoder settings, then build the base network.

        Args:
            is_diffusion_decoder: Expect a latent condition concatenated to the input.
            diffusion_decoder_condition_on_sigma: Concatenate one extra sigma channel.
            diffusion_decoder_condition_on_token: Condition on embedded discrete tokens
                instead of a continuous latent.
            diffusion_decoder_token_condition_voc_size: Vocabulary size of the token embedding.
            diffusion_decoder_token_condition_dim: Embedding dimension of each token.
        """
        # These are set before super().__init__ on purpose: presumably the parent constructor
        # calls build_patch_embed / initialize_weights, which read them. Confirm in GeneralDIT.
        self.is_diffusion_decoder = is_diffusion_decoder
        self.diffusion_decoder_condition_on_sigma = diffusion_decoder_condition_on_sigma
        self.diffusion_decoder_condition_on_token = diffusion_decoder_condition_on_token
        self.diffusion_decoder_token_condition_voc_size = diffusion_decoder_token_condition_voc_size
        self.diffusion_decoder_token_condition_dim = diffusion_decoder_token_condition_dim
        super().__init__(*args, **kwargs)

    def initialize_weights(self):
        """Initialize the base weights and zero the token embedding table, if present."""
        # Initialize transformer layers:
        super().initialize_weights()
        if self.diffusion_decoder_condition_on_token:
            nn.init.constant_(self.token_embedder.weight, 0)

    def build_patch_embed(self):
        """Create ``self.x_embedder`` (and ``self.token_embedder``) sized for the extra input channels."""
        in_channels = self.in_channels
        if self.diffusion_decoder_condition_on_token:
            # Embedded tokens are concatenated to the input.
            in_channels += self.diffusion_decoder_token_condition_dim
        elif self.is_diffusion_decoder:
            # A continuous latent condition with as many channels as the input is concatenated.
            in_channels += self.in_channels
        if self.diffusion_decoder_condition_on_sigma:
            in_channels += 1
        if self.concat_padding_mask:
            in_channels += 1

        self.x_embedder = PatchEmbed(
            spatial_patch_size=self.patch_spatial,
            temporal_patch_size=self.patch_temporal,
            in_channels=in_channels,
            out_channels=self.model_channels,
            bias=False,
        )

        if self.diffusion_decoder_condition_on_token:
            self.token_embedder = nn.Embedding(
                self.diffusion_decoder_token_condition_voc_size, self.diffusion_decoder_token_condition_dim
            )

    def prepare_embedded_sequence(
        self,
        x_B_C_T_H_W: torch.Tensor,
        fps: Optional[torch.Tensor] = None,
        padding_mask: Optional[torch.Tensor] = None,
        latent_condition: Optional[torch.Tensor] = None,
        latent_condition_sigma: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        """Concatenate the conditions to the input, patch-embed it and add positional embeddings.

        Args:
            x_B_C_T_H_W (torch.Tensor): Input video latent.
            fps (Optional[torch.Tensor]): Frames per second, used by fps-aware positional embeddings.
            padding_mask (Optional[torch.Tensor]): Resized to the input resolution and concatenated
                as one channel when ``self.concat_padding_mask`` is set.
            latent_condition (Optional[torch.Tensor]): Token indices when conditioning on tokens,
                otherwise the continuous latent condition.
            latent_condition_sigma (Optional[torch.Tensor]): Concatenated when
                ``self.diffusion_decoder_condition_on_sigma`` is set.

        Returns:
            A 3-tuple ``(embedded, rope_emb, extra_pos_emb)``:
                - the embedded sequence of shape (B, T, H, W, D);
                - the output of ``self.pos_embedder`` if 'rope' is in ``self.pos_emb_cls``
                  (case insensitive), otherwise None (the positional embedding is then added
                  to the sequence instead);
                - the output of ``self.extra_pos_embedder`` if
                  ``self.extra_per_block_abs_pos_emb`` is set, otherwise None.
        """
        if self.diffusion_decoder_condition_on_token:
            latent_condition = self.token_embedder(latent_condition)
            B, _, T, _, _, _ = latent_condition.shape
            latent_condition = rearrange(latent_condition, "B 1 T H W D -> (B T) (1 D) H W")

            # Tokens live on a coarser spatial grid than the input; upsample them to match.
            latent_condition = transforms.functional.resize(
                latent_condition, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.BILINEAR
            )
            latent_condition = rearrange(latent_condition, "(B T) D H W -> B D T H W ", B=B, T=T)
        # Mirror build_patch_embed: condition channels are only reserved in these two modes.
        if self.is_diffusion_decoder or self.diffusion_decoder_condition_on_token:
            x_B_C_T_H_W = torch.cat([x_B_C_T_H_W, latent_condition], dim=1)
        if self.diffusion_decoder_condition_on_sigma:
            x_B_C_T_H_W = torch.cat([x_B_C_T_H_W, latent_condition_sigma], dim=1)
        if self.concat_padding_mask:
            padding_mask = transforms.functional.resize(
                padding_mask, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
            )
            # Repeat the mask over the time axis so it can be concatenated as one channel.
            x_B_C_T_H_W = torch.cat(
                [x_B_C_T_H_W, padding_mask.unsqueeze(1).repeat(1, 1, x_B_C_T_H_W.shape[2], 1, 1)], dim=1
            )
        x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)

        if self.extra_per_block_abs_pos_emb:
            extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps)
        else:
            extra_pos_emb = None

        if "rope" in self.pos_emb_cls.lower():
            return x_B_T_H_W_D, self.pos_embedder(x_B_T_H_W_D, fps=fps), extra_pos_emb

        if "fps_aware" in self.pos_emb_cls:
            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, fps=fps)  # [B, T, H, W, D]
        else:
            x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D)  # [B, T, H, W, D]
        return x_B_T_H_W_D, None, extra_pos_emb