# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import gc
from typing import List, Optional

import torch

from cosmos1.models.autoregressive.configs.inference.inference_config import DiffusionDecoderSamplingConfig
from cosmos1.models.autoregressive.diffusion_decoder.model import LatentDiffusionDecoderModel
from cosmos1.models.autoregressive.diffusion_decoder.utils import linear_blend_video_list, split_with_overlap
from cosmos1.utils import log


def diffusion_decoder_process_tokens(
    model: LatentDiffusionDecoderModel,
    indices_tensor: List[torch.Tensor],
    dd_sampling_config: Optional[DiffusionDecoderSamplingConfig] = None,
    original_video_example: Optional[torch.Tensor] = None,
    t5_emb_batch: Optional[List[torch.Tensor]] = None,
):
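    """Decode lists of discrete video tokens into pixel-space videos.

    Args:
        model: Loaded latent diffusion decoder model.
        indices_tensor: List of per-sample token tensors; each entry is expanded
            to a (B, C, T, H, W) layout and split into overlapping temporal chunks.
        dd_sampling_config: Sampling hyperparameters; a default
            DiffusionDecoderSamplingConfig() is used when None.
        original_video_example: Reference video, used only for its (C, T, H, W)
            shape; must be provided despite the None default.
        t5_emb_batch: Per-sample T5 text embeddings; must be provided despite
            the None default.

    Returns:
        List of decoded videos, each of shape (C, T, H, W) with values in [0, 1].
    """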
    _, T, H, W = original_video_example.shape
    if dd_sampling_config is None:
        dd_sampling_config = DiffusionDecoderSamplingConfig()
    # Each entry of indices_tensor is a (T, H, W) token grid; it is expanded to a
    # (B, C, T, H, W) layout and split into overlapping temporal chunks.
    data_batch_list = []
    for sample_num, token_CTHW in enumerate(indices_tensor):
        token_BCTHW = token_CTHW.unsqueeze(0).unsqueeze(1)
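        # Temporal chunk length in token frames: the tokenizer compresses time 8x,
        # so N pixel frames map to (N - 1) // 8 + 1 token frames.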
        token_BCTHW = split_with_overlap(
            token_BCTHW,
            (dd_sampling_config.dd_train_num_video_frames - 1) // 8 + 1,
            overlap=dd_sampling_config.overlap,
            tobf16=False,
        )
        data_batch_list.append(
            {
                "token_chunks": token_BCTHW,
                "t5_text_embeddings": t5_emb_batch[sample_num].to(torch.bfloat16),
                "t5_text_mask": torch.ones(1, 512, dtype=torch.bfloat16).cuda(),
                # other conditions
                "image_size": torch.tensor([[H, W, H, W]] * 1, dtype=torch.bfloat16).cuda(),
                "fps": torch.tensor([dd_sampling_config.fps] * 1, dtype=torch.bfloat16).cuda(),
                "num_frames": torch.tensor(
                    [dd_sampling_config.dd_train_num_video_frames] * 1, dtype=torch.bfloat16
                ).cuda(),
                "padding_mask": torch.zeros((1, 1, H, W), dtype=torch.bfloat16).cuda(),
            }
        )

    out_videos_batch = []

    for idx, data_batch_template in enumerate(data_batch_list):
        full_length_sample = []
        iterations = min(len(data_batch_template["token_chunks"]), dd_sampling_config.max_iter)
        for chunk_idx in range(iterations):
            gc.collect()
            torch.cuda.empty_cache()

            data_batch = copy.deepcopy(data_batch_template)
            data_batch["video"] = data_batch_template["token_chunks"][chunk_idx].cuda()

            log.debug(f"Run chunk {chunk_idx} for video #{idx} at length {data_batch['video'].shape[2]}")
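            # Run the diffusion decoder on this token chunk. state_shape lists the
            # continuous latent dimensions as (channels, latent temporal length, H/8, W/8).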
            with torch.no_grad():
                samples_latent = model.generate_samples_from_batch(
                    data_batch,
                    guidance=dd_sampling_config.guidance,
                    sigma_min=dd_sampling_config.sigma_min,
                    state_shape=[
                        dd_sampling_config.continuous_tokenizer_channel,
                        dd_sampling_config.continuous_tokenizer_spatial_compression_ratio,
                        H // 8,
                        W // 8,
                    ],
                    apply_corruptor=False,
                    return_recon_x=False,
                    # corrupt_sigma=dd_sampling_config.sigma,
                    preencode_condition=True,  # We are using a discrete model, so the input is already pre-encoded
                    num_steps=dd_sampling_config.num_steps,
                )
                log.debug(f"Current sample shape {samples_latent.shape} for video #{idx}")
            full_length_sample.append(samples_latent.detach())

            # Disabled: context parallelism (CP) was removed, so no cross-rank barrier is needed
            # distributed.barrier()
            del data_batch

            torch.cuda.empty_cache()

        gc.collect()
        torch.cuda.empty_cache()

        # Decode full-length samples and free GPU memory
        full_length_sample_pixs = [model.decode(item).clamp(-1, 1).cpu() for item in full_length_sample]
        torch.cuda.empty_cache()

        # Linearly blend overlapping pixel chunks and trim to the original frame count T
        if len(full_length_sample_pixs) > 1:
            full_length_sample_pixel_blend = linear_blend_video_list(
                full_length_sample_pixs, dd_sampling_config.overlap
            )[:, :, :T]
        else:
            full_length_sample_pixel_blend = full_length_sample_pixs[0][:, :, :T]

        # Batch size of full_length_sample_pixel_blend is always 1; rescale from [-1, 1] to [0, 1]
        out_videos_batch.append((1 + full_length_sample_pixel_blend[0].cpu()) / 2)
    return out_videos_batch
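

# Usage sketch (illustrative, not from this repository): loading the decoder
# checkpoint and preparing T5 embeddings is handled by the surrounding Cosmos
# inference scripts; the tensors below are hypothetical stand-ins that only
# show the expected shapes and dtypes.
#
# tokens = [torch.randint(0, 64000, (16, 40, 64))]             # one (T, H, W) token grid
# t5_embs = [torch.zeros(1, 512, 1024, dtype=torch.bfloat16)]  # one T5 embedding
# ref_video = torch.zeros(3, 121, 320, 512)                    # (C, T, H, W) reference
# videos = diffusion_decoder_process_tokens(
#     model=model,  # a loaded LatentDiffusionDecoderModel
#     indices_tensor=tokens,
#     original_video_example=ref_video,
#     t5_emb_batch=t5_embs,
# )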