File size: 3,723 Bytes
01a383f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84490df
01a383f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, List, Union

import attrs

from .ar_configs_base_model import ModelConfig, TokenizerConfig


@attrs.define(slots=False)
class DataShapeConfig:
    """
    Data shape config
    Args:
        latent_shape (list): Shape of the latent. Defaults to a new empty list for each instance.
        num_video_frames (Union[None, int]): Number of video frames. Defaults to None.
        height (Union[None, int]): Height. Defaults to None.
        width (Union[None, int]): Width. Defaults to None.
    """

    # A factory is used instead of a literal ``[]``: attrs stores a literal default
    # object as-is, so every instance would otherwise share (and mutate) one list.
    latent_shape: list = attrs.field(factory=list)
    num_video_frames: Union[None, int] = None
    height: Union[None, int] = None
    width: Union[None, int] = None


@attrs.define(slots=False)
class SamplingConfig:
    """
    Sampling config
    Args:
        temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
        top_k (Union[None, int]): Defaults to None. NOTE(review): presumably the top-k sampling cutoff,
            with None meaning no top-k filtering — confirm against the sampler.
        top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
        compile_prefill (bool): Defaults to False. NOTE(review): presumably toggles compilation of the
            prefill step — confirm against the consumer.
        compile_sampling (bool): Defaults to True. NOTE(review): presumably toggles compilation of the
            sampling step — confirm against the consumer.
        logprobs (bool): Flag indicating whether to compute token log probabilities. Defaults to False.
        echo (bool): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.

    """

    temperature: float = 0.6
    # The default is None, so the annotation allows None (same spelling as DataShapeConfig).
    top_k: Union[None, int] = None
    top_p: float = 0.9
    compile_prefill: bool = False
    compile_sampling: bool = True
    logprobs: bool = False
    echo: bool = False


@attrs.define(slots=False)
class DiffusionDecoderSamplingConfig:
    """
    Diffusion decoder sampling config
    Args:
        guidance (float): Guidance scale for the diffusion process. Controls how much the model follows the conditioning. Defaults to 1.8.
        sigma_min (float): Minimum noise level for the diffusion process. Defaults to 0.02.
        sigma (float): Initial noise level for the diffusion process. Defaults to 8.
        num_steps (int): Number of denoising steps to perform. Defaults to 15.
        overlap (int): Number of overlapping frames between video chunks during processing. Defaults to 2.
        continuous_tokenizer_channel (int): Number of channels in the continuous tokenizer of diffusion decoder. Keyword-only. Defaults to 16.
        continuous_tokenizer_spatial_compression_ratio (int): Spatial compression ratio for the continuous tokenizer of diffusion decoder. Keyword-only. Defaults to 8.
        dd_train_num_video_frames (int): Number of video frames used during training for diffusion decoder. Defaults to 57.
        max_iter (int): Defaults to 99. NOTE(review): meaning is not evident from this file; presumably an
            upper bound on some iteration count — confirm against the consumer.
        fps (int): Defaults to 24. NOTE(review): presumably frames per second of the video — confirm against
            the consumer.
    """

    guidance: float = 1.8
    sigma_min: float = 0.02
    sigma: float = 8
    num_steps: int = 15
    overlap: int = 2
    # These two were previously unannotated, so attrs left them as plain class
    # attributes that could not be set through the constructor even though they
    # are documented as arguments. They are now real fields, declared keyword-only
    # so the positional order of the surrounding fields in __init__ is unchanged.
    continuous_tokenizer_channel: int = attrs.field(default=16, kw_only=True)
    continuous_tokenizer_spatial_compression_ratio: int = attrs.field(default=8, kw_only=True)
    dd_train_num_video_frames: int = 57
    max_iter: int = 99
    fps: int = 24


@attrs.define(slots=False)
class InferenceConfig:
    """
    Inference config
    Args:
        model_config (ModelConfig): Model config. Defaults to None.
        tokenizer_config (TokenizerConfig): Tokenizer config. Defaults to None.
        ckpt_path (str): Path to the checkpoint. Defaults to "".
        data_shape_config (DataShapeConfig): Data shape config (the class that carries the latent shape). Defaults to None.
        defaults (List[Any]): List created anew for each instance. NOTE(review): the "_self_" entry and the
            single-key dicts look like a Hydra-style defaults list — confirm against the config loader.
    """

    model_config: ModelConfig = None
    tokenizer_config: TokenizerConfig = None
    ckpt_path: str = ""
    data_shape_config: DataShapeConfig = None

    # Built through a factory so each instance gets its own list instead of
    # sharing one mutable default object.
    defaults: List[Any] = attrs.field(
        factory=lambda: [
            "_self_",
            {"data_val": None},
            {"data_shape_config": "video_shape_as_model_config"},
            {"eval_job": None},
        ]
    )