Commit 12dc48a
hugo flores garcia committed
Parent(s): 2d0bc4e

for use with sound objects

Files changed:
- app.py (+106 -33)
- vampnet/newmask.py (+365 -0)
app.py
CHANGED
@@ -57,6 +57,47 @@ def shift_pitch(signal, interval: int):
     return signal
 
 
+def onsets(sig: at.AudioSignal, hop_length: int):
+    assert sig.batch_size == 1, "batch size must be 1"
+    assert sig.num_channels == 1, "mono signals only"
+    import librosa
+    onset_frame_idxs = librosa.onset.onset_detect(
+        y=sig.samples[0][0].detach().cpu().numpy(), sr=sig.sample_rate,
+        hop_length=hop_length,
+        backtrack=True,
+    )
+    return onset_frame_idxs
+
+
+def new_vampnet_mask(self,
+    codes,
+    onset_idxs,
+    width: int = 5,
+    periodic_prompt=2,
+    upper_codebook_mask=1,
+    drop_amt: float = 0.1
+):
+    from vampnet.newmask import mask_and, mask_or, onset_mask, periodic_mask, drop_ones, codebook_mask
+    mask = mask_and(
+        periodic_mask(codes, periodic_prompt, 1, random_roll=False),
+        mask_or(  # this re-masks the onsets, according to a periodic schedule
+            onset_mask(onset_idxs, codes, width=width),
+            periodic_mask(codes, periodic_prompt, 1, random_roll=False),
+        )
+    ).int()
+    # make sure the onset idxs themselves are unmasked
+    # mask = 1 - mask
+    mask[:, :, onset_idxs] = 0
+    mask = mask.cpu()  # debug
+    mask = 1 - drop_ones(1 - mask, drop_amt)
+    mask = codebook_mask(mask, upper_codebook_mask)
+
+
+    # save mask as txt (ints)
+    np.savetxt("scratch/rms_mask.txt", mask[0].cpu().numpy(), fmt='%d')
+    mask = mask.to(self.device)
+    return mask[:, :, :]
+
 @spaces.GPU
 def _vamp(
     seed, input_audio, model_choice,
@@ -78,7 +119,7 @@ def _vamp(
     sr, input_audio = input_audio
     input_audio = input_audio / np.iinfo(input_audio.dtype).max
 
-    sig = at.AudioSignal(input_audio, sr)
+    sig = at.AudioSignal(input_audio, sr).to_mono()
 
     # reload the model if necessary
     interface.load_finetuned(model_choice)
@@ -88,18 +129,15 @@ def _vamp(
 
     codes = interface.encode(sig)
 
-    mask = 
-
-
-
-
-        periodic_prompt=
-
-
-
-        upper_codebook_mask=int(n_mask_codebooks),
-    )
-
+    mask = new_vampnet_mask(
+        interface,
+        codes,
+        onset_idxs=onsets(sig, hop_length=interface.codec.hop_length),
+        width=onset_mask_width,
+        periodic_prompt=periodic_p,
+        upper_codebook_mask=n_mask_codebooks,
+        drop_amt=dropout
+    ).long()
 
     # save the mask as a txt file
     interface.set_chunk_size(10.0)
@@ -145,24 +183,45 @@ def vamp(data):
         api=False,
     )
 
-def api_vamp(data):
+# def api_vamp(data):
+#     return _vamp(
+#         seed=data[seed],
+#         input_audio=data[input_audio],
+#         model_choice=data[model_choice],
+#         pitch_shift_amt=data[pitch_shift_amt],
+#         periodic_p=data[periodic_p],
+#         n_mask_codebooks=data[n_mask_codebooks],
+#         periodic_w=data[periodic_w],
+#         onset_mask_width=data[onset_mask_width],
+#         dropout=data[dropout],
+#         sampletemp=data[sampletemp],
+#         typical_filtering=data[typical_filtering],
+#         typical_mass=data[typical_mass],
+#         typical_min_tokens=data[typical_min_tokens],
+#         top_p=data[top_p],
+#         sample_cutoff=data[sample_cutoff],
+#         stretch_factor=data[stretch_factor],
+#         api=True,
+#     )
+
+def api_vamp(input_audio, sampletemp, top_p, periodic_p, periodic_w, dropout, stretch_factor, onset_mask_width, typical_filtering, typical_mass, typical_min_tokens, seed, model_choice, n_mask_codebooks, pitch_shift_amt, sample_cutoff):
     return _vamp(
-        seed=
-        input_audio=
-        model_choice=
-        pitch_shift_amt=
-        periodic_p=
-        n_mask_codebooks=
-        periodic_w=
-        onset_mask_width=
-        dropout=
-        sampletemp=
-        typical_filtering=
-        typical_mass=
-        typical_min_tokens=
-        top_p=
-        sample_cutoff=
-        stretch_factor=
+        seed=seed,
+        input_audio=input_audio,
+        model_choice=model_choice,
+        pitch_shift_amt=pitch_shift_amt,
+        periodic_p=periodic_p,
+        n_mask_codebooks=n_mask_codebooks,
+        periodic_w=periodic_w,
+        onset_mask_width=onset_mask_width,
+        dropout=dropout,
+        sampletemp=sampletemp,
+        typical_filtering=typical_filtering,
+        typical_mass=typical_mass,
+        typical_min_tokens=typical_min_tokens,
+        top_p=top_p,
+        sample_cutoff=sample_cutoff,
+        stretch_factor=stretch_factor,
         api=True,
     )
 
@@ -258,7 +317,7 @@ with gr.Blocks() as demo:
         minimum=0,
         maximum=100,
         step=1,
-        value=0, visible=
+        value=0, visible=True
    )
 
    n_mask_codebooks = gr.Slider(
@@ -419,8 +478,22 @@ with gr.Blocks() as demo:
    api_vamp_button = gr.Button("api vamp", visible=True)
    api_vamp_button.click(
        fn=api_vamp,
-       inputs=
-
+       inputs=[input_audio,
+               sampletemp, top_p,
+               periodic_p, periodic_w,
+               dropout,
+               stretch_factor,
+               onset_mask_width,
+               typical_filtering,
+               typical_mass,
+               typical_min_tokens,
+               seed,
+               model_choice,
+               n_mask_codebooks,
+               pitch_shift_amt,
+               sample_cutoff
+       ],
+       outputs=[audio_outs[0]],
        api_name="vamp"
    )
 
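Below is a minimal sketch of calling the new "vamp" endpoint from Python with gradio_client. The Space id is a placeholder, the argument values are illustrative defaults, and the positional order simply mirrors the inputs list wired to api_vamp above; the exact accepted types (e.g. how audio files are passed) depend on the Gradio version the Space runs.

from gradio_client import Client

client = Client("user/space-name")  # placeholder Space id (assumption)
result = client.predict(
    "input.wav",   # input_audio (local file path)
    1.0,           # sampletemp
    0.9,           # top_p
    2,             # periodic_p
    1,             # periodic_w
    0.0,           # dropout
    1,             # stretch_factor
    5,             # onset_mask_width
    False,         # typical_filtering
    0.15,          # typical_mass
    64,            # typical_min_tokens
    0,             # seed
    "default",     # model_choice (placeholder)
    3,             # n_mask_codebooks
    0,             # pitch_shift_amt
    1.0,           # sample_cutoff
    api_name="/vamp",
)
print(result)  # path to the generated audio (audio_outs[0])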
vampnet/newmask.py
ADDED
@@ -0,0 +1,365 @@
+from typing import Optional
+
+import torch
+
+from .util import scalar_to_batch_tensor
+
+def _gamma(r):
+    return (r * torch.pi / 2).cos().clamp(1e-10, 1.0)
+
+def _invgamma(y):
+    if not torch.is_tensor(y):
+        y = torch.tensor(y)[None]
+    return 2 * y.acos() / torch.pi
+
+def full_mask(x: torch.Tensor):
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    return torch.ones_like(x).int()
+
+def empty_mask(x: torch.Tensor):
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    return torch.zeros_like(x).int()
+
+def apply_mask(
+    x: torch.Tensor,
+    mask: torch.Tensor,
+    mask_token: int
+):
+    assert mask.ndim == 3, f"mask must be (batch, n_codebooks, seq), but got {mask.ndim}"
+    assert mask.shape == x.shape, f"mask must be same shape as x, but got {mask.shape} and {x.shape}"
+    assert mask.dtype == torch.int, f"mask must be int dtype, but got {mask.dtype}"
+    assert ~torch.any(mask > 1), "mask must be binary"
+    assert ~torch.any(mask < 0), "mask must be binary"
+    mask = mask.int()
+
+    fill_x = torch.full_like(x, mask_token)
+    x = x * (1 - mask) + fill_x * mask
+
+    return x
+
+def random(
+    x: torch.Tensor,
+    r: torch.Tensor
+):
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device)
+
+    r = _gamma(r)[:, None, None]
+    probs = torch.ones_like(x) * r
+
+    mask = torch.bernoulli(probs)
+    mask = mask.round().int()
+
+    return mask, torch.zeros_like(mask).bool()
+
+def random_along_time(x: torch.Tensor, r: torch.Tensor):
+    assert x.ndim == 3, "x must be (batch, channel, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device)
+
+    x = x[:, 0, :]
+    r = _gamma(r)[:, None]
+    probs = torch.ones_like(x) * r
+
+    mask = torch.bernoulli(probs)
+    mask = mask.round().int()
+
+    return mask
+
+
+def stemgen_random(x: torch.Tensor, r: torch.Tensor):
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device)
+
+    # Assuming x is your input tensor and r is the probability for the Bernoulli distribution
+    nb, nc, nt = x.shape
+
+    # Randomly sample a codebook level to infer for each item in the batch
+    c = torch.randint(0, nc, (nb,)).to(x.device)
+
+    # Create a mask tensor of the same shape as x, initially filled with ones
+    mask = torch.ones_like(x).long().to(x.device)
+    ignore_indices_mask = torch.zeros_like(x).long().to(x.device)
+
+    # Iterate over each item in the batch
+    for i in range(nb):
+        # Create the Bernoulli mask for the sampled level
+        level_mask = torch.bernoulli(torch.ones(nt).to(x.device) * r[i]).long()
+
+        # Apply the mask to the sampled level
+        mask[i, c[i]] = level_mask
+
+        # All levels below the sampled level are unmasked (zeros)
+        mask[i, :c[i]] = 0
+        ignore_indices_mask[i, :c[i]] = 1
+
+        # All levels above the sampled level are masked (ones)
+        mask[i, c[i]+1:] = 1
+        ignore_indices_mask[i, c[i]+1:] = 1
+
+    # save a debug mask to np txt
+    # import numpy as np
+    # np.savetxt("mask.txt", mask[0].cpu().numpy(), fmt="%d")
+    # np.savetxt("ignore_indices_mask.txt", ignore_indices_mask[0].cpu().numpy(), fmt="%d")
+
+    return mask.int(), ignore_indices_mask.bool()
+
+
+def hugo_random(x: torch.Tensor, r: torch.Tensor):
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
+
+    r = _gamma(r)[:, None, None]
+
+    nb, nc, nt = x.shape
+
+    probs = torch.ones_like(x) * r
+    mask = torch.bernoulli(probs)
+    # alternatively, the mask level could be the cumsum of the mask
+    mask = mask.round().long().to(x.device)
+    mask_levels = nc - mask.sum(dim=1) - 1
+
+    # create a new mask, where all levels below the mask level are masked
+    # shape (nb, nc, nt) where new_mask[i, CB:, t] = 1, CB = mask_level[i, t]
+    # mask = mask_levels[:, :, None] > torch.arange(nc)[None, None, :]
+    mask = (mask_levels[:, None, :] < torch.arange(nc, device=x.device)[None, :, None]).long()
+
+    ignore_levels = mask_levels + 1
+    ignore_indices_mask = (ignore_levels[:, None, :] < torch.arange(nc, device=x.device)[None, :, None]).long()
+
+    # for _b in range(nb):
+    #     for _t in range(nt):
+    #         for _c in range(nc):
+    #             if mask[_b, _c, _t] == 1:
+    #                 mask[_b, _c:, _t] = 1
+    #                 ignore_indices_mask[_b, _c + 1:, _t] = 1
+    #                 break
+
+    return mask.long(), ignore_indices_mask.bool()
+
+
+def better_cond_random_but_not_working(x: torch.Tensor, r: torch.Tensor):
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
+
+    r = _gamma(r)[:, None, None]
+
+    nb, nc, nt = x.shape
+
+    probs = torch.ones_like(x) * r
+    mask = torch.bernoulli(probs)
+
+    mask = mask.round().long().to(x.device)
+
+    # there cannot be anything unmasked if there's a masked token
+    # in the same timestep and below it
+    for i in range(nb):
+        for j in range(nc):
+            for t in range(nt):
+                if mask[i, j, t] == 1:
+                    mask[i, j:, t] = 1
+                    break
+
+    # an ignore indices mask, since we can truly only predict one token
+    # per timestep
+    ignore_indices = torch.zeros_like(x)
+    for i in range(nb):
+        for j in range(nc):
+            for t in range(nt):
+                if mask[i, j, t] == 1:
+                    ignore_indices[i, j, t+1:] = 1
+                    break
+    return mask.int(), ignore_indices
+
+
+@torch.jit.script_if_tracing
+def linear_random(
+    x: torch.Tensor,
+    r: torch.Tensor,
+):
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
+    r = r[:, None, None]
+
+    probs = torch.ones_like(x).to(x.device).float()
+    # expand to batch and codebook dims
+    probs = probs.expand(x.shape[0], x.shape[1], -1)
+    probs = probs * r
+
+    mask = torch.bernoulli(probs)
+    mask = mask.round().int()
+
+    return mask
+
+@torch.jit.script_if_tracing
+def inpaint(x: torch.Tensor, n_prefix: int, n_suffix: int):
+    assert n_prefix is not None
+    assert n_suffix is not None
+
+    mask = full_mask(x)
+
+    # if we have a prefix or suffix, set their mask prob to 0
+    if n_prefix > 0:
+        if not isinstance(n_prefix, torch.Tensor):
+            n_prefix = scalar_to_batch_tensor(n_prefix, x.shape[0]).to(x.device)
+        for i, n in enumerate(n_prefix):
+            if n > 0:
+                mask[i, :, :n] = 0.0
+    if n_suffix > 0:
+        if not isinstance(n_suffix, torch.Tensor):
+            n_suffix = scalar_to_batch_tensor(n_suffix, x.shape[0]).to(x.device)
+        for i, n in enumerate(n_suffix):
+            if n > 0:
+                mask[i, :, -n:] = 0.0
+    return mask
+
+@torch.jit.script_if_tracing
+def periodic_mask(x: torch.Tensor, period: int,
+                  width: int = 1, random_roll: bool = False):
+    mask = full_mask(x)
+    if period == 0:
+        return full_mask(x)
+
+    if not isinstance(period, torch.Tensor):
+        period = scalar_to_batch_tensor(period, x.shape[0])
+    if period.ndim == 0:
+        period = period[None]
+
+    for i, factor in enumerate(period):
+        if factor == 0:
+            continue
+        for j in range(mask.shape[-1]):
+            if j % factor == 0:
+                # figure out how wide the mask should be
+                j_start = max(0, j - width // 2)
+                j_end = min(mask.shape[-1] - 1, j + width // 2) + 1
+                # flip a coin for each position in the mask
+                j_mask = torch.bernoulli(torch.ones(j_end - j_start))
+                assert torch.all(j_mask == 1)
+                j_fill = torch.ones_like(j_mask) * (1 - j_mask)
+                assert torch.all(j_fill == 0)
+                # fill
+                mask[i, :, j_start:j_end] = j_fill
+
+    return mask
+
+def codebook_unmask(
+    mask: torch.Tensor,
+    n_conditioning_codebooks: int
+):
+    if n_conditioning_codebooks is None:
+        return mask
+    # if we have any conditioning codebooks, set their mask to 0
+    mask = mask.clone()
+    mask[:, :n_conditioning_codebooks, :] = 0
+    return mask
+
+def codebook_mask(mask: torch.Tensor, val1: int, val2: Optional[int] = None):
+    mask = mask.clone()
+    mask[:, val1:, :] = 1
+    # val2 = val2 or val1
+    # vs = torch.linspace(val1, val2, mask.shape[1])
+    # for t, v in enumerate(vs):
+    #     v = int(v)
+    #     mask[:, v:, t] = 1
+
+    return mask
+
+@torch.jit.script_if_tracing
+def mask_and(
+    mask1: torch.Tensor,
+    mask2: torch.Tensor
+):
+    assert mask1.shape == mask2.shape, "masks must be same shape"
+    return torch.min(mask1, mask2)
+
+def drop_ones(mask: torch.Tensor, p: float):
+    oldshp = mask.shape
+    mask = mask.view(-1)
+
+    # find ones idxs
+    ones_idxs = torch.where(mask == 1)[0]
+    # shuffle idxs
+    ones_idxs_idxs = torch.randperm(len(ones_idxs))
+    ones_idxs = ones_idxs[ones_idxs_idxs]
+    # drop p% of ones
+    ones_idxs = ones_idxs[:int(len(ones_idxs) * p)]
+    # set those idxs to 0
+    mask[ones_idxs] = 0
+
+    mask = mask.view(oldshp)
+    return mask
+
+
+def mask_or(
+    mask1: torch.Tensor,
+    mask2: torch.Tensor
+):
+    assert mask1.shape == mask2.shape, f"masks must be same shape, but got {mask1.shape} and {mask2.shape}"
+    assert mask1.max() <= 1, "mask1 must be binary"
+    assert mask2.max() <= 1, "mask2 must be binary"
+    assert mask1.min() >= 0, "mask1 must be binary"
+    assert mask2.min() >= 0, "mask2 must be binary"
+    return (mask1 + mask2).clamp(0, 1)
+
+def time_stretch_mask(
+    x: torch.Tensor,
+    stretch_factor: int,
+):
+    assert stretch_factor >= 1, "stretch factor must be >= 1"
+    c_seq_len = x.shape[-1]
+    x = x.repeat_interleave(stretch_factor, dim=-1)
+
+    # trim x back to the original length
+    x = x[:, :, :c_seq_len]
+
+    mask = periodic_mask(x, stretch_factor, width=1)
+    return mask
+
+def onset_mask(
+    onset_frame_idxs: torch.Tensor,
+    z: torch.Tensor,
+    width: int = 1,
+):
+    if len(onset_frame_idxs) == 0:
+        print("no onsets detected")
+    # print("onset_frame_idxs", onset_frame_idxs)
+    # print("mask shape", z.shape)
+
+    mask = torch.ones_like(z).int()
+    for idx in onset_frame_idxs:
+        mask[:, :, idx-width:idx+width] = 0
+
+    return mask.int()
+
+def tria_mask(
+    codes: torch.Tensor,
+    min_amt: float = 0.1,
+    max_amt: float = 0.4,
+):
+    """
+    unmasks a prefix of the codes tensor,
+    in the range provided
+    """
+
+    mask = full_mask(codes)
+    nb, nc, nt = codes.shape
+    for i in range(nb):
+        amt = torch.rand(1) * (max_amt - min_amt) + min_amt
+        amt = int(amt * nt)
+        mask[i, :, :amt] = 0
+
+    return mask
+
+
+if __name__ == "__main__":
+    from audiotools import AudioSignal  # import needed for this demo block; missing in the original
+    sig = AudioSignal("assets/example.wav")
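For reference, here is a small, self-contained sketch of how the primitives in newmask.py compose into the onset-aware mask that new_vampnet_mask builds in app.py (the drop_ones dropout step is omitted). The codes tensor, onset indices, and mask_token value are made up for illustration; in the app, onsets come from librosa onset detection and the mask token from the codec.

import torch
from vampnet.newmask import (
    periodic_mask, onset_mask, mask_and, mask_or, codebook_mask, apply_mask
)

codes = torch.randint(0, 1024, (1, 4, 100))   # (batch, n_codebooks, seq), dummy codes
onset_idxs = torch.tensor([10, 37, 64])       # made-up onset frame indices

# keep a periodic grid of prompt frames, plus extra context around onsets
mask = mask_and(
    periodic_mask(codes, period=7, width=1),
    mask_or(
        onset_mask(onset_idxs, codes, width=5),
        periodic_mask(codes, period=7, width=1),
    ),
).int()
mask[:, :, onset_idxs] = 0          # keep the onset frames themselves unmasked
mask = codebook_mask(mask, val1=3)  # always mask codebooks 3 and above

# replace masked positions with a (made-up) mask token id
masked_codes = apply_mask(codes, mask, mask_token=1024)
print(masked_codes.shape)  # torch.Size([1, 4, 100])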