|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Any, List, Union |
|
|
|
import attrs |
|
|
|
from .ar_configs_base_model import ModelConfig, TokenizerConfig |
|
|
|
|
|
@attrs.define(slots=False)
class DataShapeConfig:
    """
    Data shape config
    Args:
        latent_shape (list): Shape of the latent. Defaults to a new empty list for every instance.
        num_video_frames (int or None): Number of video frames. Defaults to None.
        height (int or None): Height (presumably of a video frame, in pixels -- confirm against the consumer). Defaults to None.
        width (int or None): Width (presumably of a video frame, in pixels -- confirm against the consumer). Defaults to None.
    """

    # attrs stores a plain default object as-is, so a literal `[]` here would be
    # a single list shared (and mutated) across every instance. A factory builds
    # a fresh list per instance instead.
    latent_shape: list = attrs.field(factory=list)
    num_video_frames: Union[None, int] = None
    height: Union[None, int] = None
    width: Union[None, int] = None
|
|
|
|
|
@attrs.define(slots=False)
class SamplingConfig:
    """
    Sampling config
    Args:
        temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
        top_k (int or None): Presumably the top-k cutoff for sampling, with None meaning no cutoff -- confirm against the sampler. Defaults to None.
        top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
        compile_prefill (bool): Flag that presumably enables compilation of the prefill step -- confirm against the consumer. Defaults to False.
        compile_sampling (bool): Flag that presumably enables compilation of the sampling step -- confirm against the consumer. Defaults to True.
        logprobs (bool): Flag indicating whether to compute token log probabilities. Defaults to False.
        echo (bool): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
    """

    temperature: float = 0.6
    # The default is None, so the annotation has to admit None as well.
    top_k: Union[None, int] = None
    top_p: float = 0.9
    compile_prefill: bool = False
    compile_sampling: bool = True
    logprobs: bool = False
    echo: bool = False
|
|
|
|
|
@attrs.define(slots=False)
class DiffusionDecoderSamplingConfig:
    """
    Diffusion decoder sampling config
    Args:
        guidance (float): Guidance scale for the diffusion process. Controls how much the model follows the conditioning. Defaults to 1.8.
        sigma_min (float): Minimum noise level for the diffusion process. Defaults to 0.02.
        sigma (float): Initial noise level for the diffusion process. Defaults to 8.
        num_steps (int): Number of denoising steps to perform. Defaults to 15.
        overlap (int): Number of overlapping frames between video chunks during processing. Defaults to 2.
        continuous_tokenizer_channel (int): Number of channels in the continuous tokenizer of diffusion decoder. Keyword-only. Defaults to 16.
        continuous_tokenizer_spatial_compression_ratio (int): Spatial compression ratio for the continuous tokenizer of diffusion decoder. Keyword-only. Defaults to 8.
        dd_train_num_video_frames (int): Number of video frames used during training for diffusion decoder. Defaults to 57.
        max_iter (int): Iteration cap; its consumer is not visible in this file -- confirm what it bounds. Defaults to 99.
        fps (int): Frames per second (presumably of the decoded video -- confirm against the consumer). Defaults to 24.
    """

    guidance: float = 1.8
    sigma_min: float = 0.02
    sigma: float = 8
    num_steps: int = 15
    overlap: int = 2
    # Without an annotation attrs.define does not turn an assignment into a
    # field: the two tokenizer settings could not be passed to __init__ and were
    # missing from repr/eq. They are declared keyword-only so the positional
    # order of the pre-existing fields stays exactly as before. Read them from
    # an instance; attrs does not keep field defaults as class attributes.
    continuous_tokenizer_channel: int = attrs.field(default=16, kw_only=True)
    continuous_tokenizer_spatial_compression_ratio: int = attrs.field(default=8, kw_only=True)
    dd_train_num_video_frames: int = 57
    max_iter: int = 99
    fps: int = 24
|
|
|
|
|
@attrs.define(slots=False)
class InferenceConfig:
    """
    Inference config
    Args:
        model_config (ModelConfig): Model config. Defaults to None.
        tokenizer_config (TokenizerConfig): Tokenizer config. Defaults to None.
        ckpt_path (str): Path to the checkpoint. Defaults to "".
        data_shape_config (DataShapeConfig): Data shape config. Defaults to None.
        defaults (List[Any]): List built fresh for every instance: "_self_" followed by
            single-key dicts for "data_val", "data_shape_config" and "eval_job".
            NOTE(review): this looks like a Hydra-style defaults list -- confirm
            against the config loader.
    """

    # NOTE(review): the config fields below default to None although their
    # annotations do not admit None; attrs itself does not validate types.
    # Confirm how the config loader treats these before tightening them.
    model_config: ModelConfig = None
    tokenizer_config: TokenizerConfig = None
    ckpt_path: str = ""
    data_shape_config: DataShapeConfig = None

    # A factory is used so that each instance gets its own list object.
    defaults: List[Any] = attrs.field(
        factory=lambda: [
            "_self_",
            {"data_val": None},
            {"data_shape_config": "video_shape_as_model_config"},
            {"eval_job": None},
        ]
    )
|
|