# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Residual vector quantizer implementation."""
from dataclasses import dataclass, field
import math
import typing as tp
import torch
from torch import nn
from .core_vq import ResidualVectorQuantization,LanguageVectorQuantization
@dataclass
class QuantizedResult:
    """Result bundle returned by ``ResidualVectorQuantizer.forward``/``infer``."""
    quantized: torch.Tensor  # quantized representation produced by the VQ layers
    codes: torch.Tensor  # codebook indices emitted by each quantizer stage
    bandwidth: torch.Tensor  # bandwidth in kb/s used, per batch item.
    penalty: tp.Optional[torch.Tensor] = None  # mean commitment loss (set by forward/infer)
    metrics: dict = field(default_factory=dict)  # extra metrics; unused in this file
class ResidualVectorQuantizer(nn.Module):
"""Residual Vector Quantizer.
Args:
dimension (int): Dimension of the codebooks.
n_q (int): Number of residual vector quantizers used.
bins (int): Codebook size.
decay (float): Decay for exponential moving average over the codebooks.
kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
kmeans_iters (int): Number of iterations used for kmeans initialization.
threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
that have an exponential moving average cluster size less than the specified threshold with
randomly selected vector from the current batch.
"""
def __init__(
self,
dimension: int = 256,
n_q: int = 8,
bins: int = 1024,
decay: float = 0.99,
kmeans_init: bool = True,
kmeans_iters: int = 50,
threshold_ema_dead_code: int = 2,
):
super().__init__()
self.n_q = n_q
self.dimension = dimension
self.bins = bins
self.decay = decay
self.kmeans_init = kmeans_init
self.kmeans_iters = kmeans_iters
self.threshold_ema_dead_code = threshold_ema_dead_code
# print(self.bins)
# breakpoint()
self.vq = LanguageVectorQuantization(
dim=self.dimension,
codebook_size=self.bins,
num_quantizers=self.n_q,
decay=self.decay,
kmeans_init=self.kmeans_init,
kmeans_iters=self.kmeans_iters,
threshold_ema_dead_code=self.threshold_ema_dead_code,
)
# self.vq = ResidualVectorQuantization(
# dim=self.dimension,
# codebook_size=self.bins,
# num_quantizers=self.n_q,
# decay=self.decay,
# kmeans_init=self.kmeans_init,
# kmeans_iters=self.kmeans_iters,
# threshold_ema_dead_code=self.threshold_ema_dead_code,
# )
def forward(self, x: torch.Tensor, frame_rate: int, bandwidth: tp.Optional[float] = None) -> QuantizedResult:
"""Residual vector quantization on the given input tensor.
Args:
x (torch.Tensor): Input tensor.
frame_rate (int): Sample rate of the input tensor.
bandwidth (float): Target bandwidth.
Returns:
QuantizedResult:
The quantized (or approximately quantized) representation with
the associated bandwidth and any penalty term for the loss.
"""
# breakpoint()
bw_per_q = self.get_bandwidth_per_quantizer(frame_rate)
n_q = self.get_num_quantizers_for_bandwidth(frame_rate, bandwidth)
# assert n_q==4
# breakpoint()
# nq_choice=[3,4,8]
nq_choice=[4,6,8]
if self.training:
# choice = int(torch.randint(0, 3, (1,)).item())
choice = int(torch.randint(0, 3, (1,)).item())
# breakpoint()
n_q=nq_choice[choice]
# breakpoint()
# n_q=8
quantized, codes, commit_loss = self.vq(x, n_q=n_q)
bw = torch.tensor(n_q * bw_per_q).to(x)
return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))
def infer(self, x: torch.Tensor, frame_rate: int, bandwidth: tp.Optional[float] = None) -> QuantizedResult:
"""Residual vector quantization on the given input tensor.
Args:
x (torch.Tensor): Input tensor.
frame_rate (int): Sample rate of the input tensor.
bandwidth (float): Target bandwidth.
Returns:
QuantizedResult:
The quantized (or approximately quantized) representation with
the associated bandwidth and any penalty term for the loss.
"""
bw_per_q = self.get_bandwidth_per_quantizer(frame_rate)
# n_q = self.get_num_quantizers_for_bandwidth(frame_rate, bandwidth)
# # assert n_q==4
# # breakpoint()
# # nq_choice=[3,4,8]
# nq_choice=[3,4,5,6,7,8]
# if self.training:
# # choice = int(torch.randint(0, 3, (1,)).item())
# choice = int(torch.randint(0, 6, (1,)).item())
# # breakpoint()
# n_q=nq_choice[choice]
n_q=1
quantized, codes, commit_loss = self.vq(x, n_q=n_q)
bw = torch.tensor(n_q * bw_per_q).to(x)
return QuantizedResult(quantized, codes, bw, penalty=torch.mean(commit_loss))
def get_num_quantizers_for_bandwidth(self, frame_rate: int, bandwidth: tp.Optional[float] = None) -> int:
"""Return n_q based on specified target bandwidth.
"""
bw_per_q = self.get_bandwidth_per_quantizer(frame_rate)
n_q = self.n_q
if bandwidth and bandwidth > 0.:
# bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
# bandwidth == 6.0
n_q = int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))
return n_q
def get_bandwidth_per_quantizer(self, frame_rate: int):
"""Return bandwidth per quantizer for a given input frame rate.
Each quantizer encodes a frame with lg(bins) bits.
"""
return math.log2(self.bins) * frame_rate
def encode(self, x: torch.Tensor, frame_rate: int, bandwidth: tp.Optional[float] = None) -> torch.Tensor:
"""Encode a given input tensor with the specified frame rate at the given bandwidth.
The RVQ encode method sets the appropriate number of quantizers to use
and returns indices for each quantizer.
"""
n_q = self.get_num_quantizers_for_bandwidth(frame_rate, bandwidth)
codes = self.vq.encode(x, n_q=n_q)
return codes
def decode(self, codes: torch.Tensor) -> torch.Tensor:
"""Decode the given codes to the quantized representation.
"""
quantized = self.vq.decode(codes)
return quantized