Dionyssos committed
Commit 4eabff6 · 1 Parent(s): 0230db1
audiocraft/builders.py CHANGED
@@ -7,47 +7,36 @@ from huggingface_hub import hf_hub_download
 import os
 from omegaconf import OmegaConf
 from .encodec import EncodecModel
-from .lm import LMModel, TorchAutocast
+from .lm import LMModel
 from .seanet import SEANetDecoder
 from .vq import ResidualVectorQuantizer
 
-
-N_REPEAT = 4  # num (virtual batch_size) clones of audio sounds
+# torch.backends.cudnn.deterministic = True
+N_REPEAT = 2  # num (virtual batch_size) clones of audio sounds
 
 def _shift(x):
-    n = x.shape[0]
+    n = len(x)
     offset = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD
-    return torch.roll(x, offset, dims=0)
-
-
-def _delete_param(cfg, full_name):
-    parts = full_name.split('.')
-    for part in parts[:-1]:
-        if part in cfg:
-            cfg = cfg[part]
-        else:
-            return
-    OmegaConf.set_struct(cfg, False)
-    if parts[-1] in cfg:
-        del cfg[parts[-1]]
-    OmegaConf.set_struct(cfg, True)
-
+    if isinstance(x, torch.Tensor):
+        return torch.roll(x, offset, dims=0)
+    elif isinstance(x, str):
+        return x[offset:] + x[:offset]  # np.roll(x, offset)
 
 class AudioGen(torch.nn.Module):
 
     # https://huggingface.co/facebook/audiogen-medium
 
     def __init__(self):
 
         super().__init__()
-        self.autocast = TorchAutocast(
-            enabled=True, device_type='cuda', dtype=torch.float16)
+        # self.autocast = TorchAutocast(
+        #     enabled=True, device_type='cuda', dtype=torch.float16)
         # Vocoder
         _file_1 = hf_hub_download(
             repo_id='facebook/audiogen-medium',
             filename="compression_state_dict.bin",
             cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
             library_name="audiocraft",
             library_version='1.3.0a1')  # Found at __init__.py  # audiocraft.__version__
         pkg = torch.load(_file_1, map_location='cpu')
         # kwargs = OmegaConf.create(pkg['xp.cfg'])
@@ -66,58 +55,47 @@ class AudioGen(torch.nn.Module):
         self.resample_fn = torchaudio.transforms.Resample(16000, 24000)  # AudioGen = 16KHZ StyleTTS2 = 24 KHz / MMSTTS = 24 KHz
         # # T5 &
         # LM
 
         _file_2 = hf_hub_download(
             repo_id='facebook/audiogen-medium',
             filename="state_dict.bin",
             cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
             library_name="audiocraft",
             library_version='1.3.0a1')  # Found at __init__.py  # audiocraft.__version__
         pkg = torch.load(_file_2, map_location='cpu')
         cfg = OmegaConf.create(pkg['xp.cfg'])  # CFG inside torch bin
-        _delete_param(cfg, 'conditioners.self_wav.chroma_stem.cache_path')
-        _delete_param(cfg, 'conditioners.args.merge_text_conditions_p')
-        _delete_param(cfg, 'conditioners.args.drop_desc_p')
-        # print('___________________________CFG___________________', cfg, '\n=======================')
-        # kwargs = dict_from_config(getattr(cfg, 'transformer_lm'))
-        # print('___________________________Kwarg___________________', kwargs, '\n=======================')
         _best = pkg['best_state']
         # _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
         # _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
         _best['t5.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')  # .to(torch.float)
         _best['t5.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')  # .to(torch.float)
         self.lm = LMModel()  # to(torch.float16)
         self.lm.load_state_dict(pkg['best_state'],
                                 strict=True)
         #
         self.lm.eval()
         self.compression_model.eval()
 
+    @torch.no_grad()
     def generate(self,
                  prompt='dogs mewo',
                  duration=2.24,  ## seconds of audio
                  ):
-
-        with torch.no_grad():
-
-            with self.autocast:
-                # LM
-                gen_tokens = self.lm.generate(
-                    text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,  # '' for null condition, # ['trance', 'dogs meow', '', '']
-                    max_tokens=int(duration / (N_REPEAT * self.lm.n_draw) * self.compression_model.frame_rate))  # [bs, 4, 37 * self.lm.n_draw]
-
-            x = self.compression_model.decode(gen_tokens, None)  # [bs, 1, 11840]
-
-            x = x[:, 0, :]  # last samples have splash sounds DISCARD 25000 last samples
-
-            # AudioGen 16KHZ / StyleTTS2 24 KHz / MMSTTS 24 KHz
-
-            x = self.resample_fn(x)  # [N_REPEAT, duration]
-
-            x = x.reshape(-1)
-
-            # for _ in range(7):
-            #     x = _shift(x)
-
-            print(x.abs().max(), 'MAX')
-            return x / (x.abs().max() + 1e-7)
+        with torch.autocast(device_type='cuda', dtype=torch.float16):
+            gen_tokens = self.lm.generate(
+                text_condition=[prompt] + [prompt[:10] + _shift(prompt) for _ in range(N_REPEAT - 1)] + [''] * N_REPEAT,  # '' for null condition, # ['trance', 'dogs meow', '', '']
+                max_tokens=int(duration / (N_REPEAT * self.lm.n_draw) * self.compression_model.frame_rate))  # [bs, 4, 37 * self.lm.n_draw]
+
+        x = self.compression_model.decode(gen_tokens, None)  # [bs, 1, 11840]
+
+        x = x[:, 0, :]  # last samples have splash sounds DISCARD 25000 last samples
+
+        # AudioGen 16KHZ / StyleTTS2 24 KHz / MMSTTS 24 KHz
+
+        x = self.resample_fn(x)  # [N_REPEAT, duration]
+
+        x = x.reshape(-1)
+
+        # for _ in range(7):
+        #     x = _shift(x)
+        return x  # x / (x.abs().max() + 1e-7)
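
For orientation, a hypothetical usage sketch of the AudioGen wrapper that builders.py defines after this change. The prompt, output filename, the audiofile.write call, and the device handling are illustrative assumptions based on this diff and requirements.txt, not part of the commit itself.

import audiofile
from audiocraft.builders import AudioGen

model = AudioGen()                        # downloads EnCodec + LM weights from facebook/audiogen-medium
# model.cuda() may be needed depending on how the rest of the repo places the EnCodec decoder;
# lm.py already pins T5 to 'cuda:0' and generate() autocasts on 'cuda'.
wav = model.generate(prompt='dogs bark',  # text condition; '' is appended internally as the null condition
                     duration=2.24)       # seconds of audio
audiofile.write('out.wav', wav.cpu().numpy(), 24000)  # generate() resamples 16 kHz -> 24 kHz
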
audiocraft/encodec.py CHANGED
@@ -1,5 +1,4 @@
 import typing as tp
-from einops import rearrange
 import numpy as np
 import torch
 from torch import nn
audiocraft/lm.py CHANGED
@@ -3,40 +3,13 @@ from audiocraft.transformer import StreamingTransformer
 from torch import nn
 from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 
-class TorchAutocast:
-
-    def __init__(self, enabled: bool, *args, **kwargs):
-        self.autocast = torch.autocast(*args, **kwargs) if enabled else None
-
-    def __enter__(self):
-        if self.autocast is None:
-            return
-        try:
-            self.autocast.__enter__()
-        except RuntimeError:
-            device = self.autocast.device
-            dtype = self.autocast.fast_dtype
-            raise RuntimeError(
-                f"There was an error autocasting with dtype={dtype} device={device}\n"
-                "If you are on the FAIR Cluster, you might need to use autocast_dtype=float16"
-            )
-
-    def __exit__(self, *args, **kwargs):
-        if self.autocast is None:
-            return
-        self.autocast.__exit__(*args, **kwargs)
-
-
 class T5(nn.Module):
 
     def __init__(self):
         # run this from within lm so it autocasts thus match exact values of t5 in official audiogen
         super().__init__()
-
-        self.dim = 1024
-        self.output_dim = 1536
-        self.output_proj = nn.Linear(self.dim, self.output_dim)
-        self.autocast = TorchAutocast(enabled=True, device_type='cuda', dtype=torch.float)
+        self.output_proj = nn.Linear(1024,   # t5-large
+                                     1536)   # lm hidden
         self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-large', legacy=True)
         t5 = T5EncoderModel.from_pretrained('t5-large').train(mode=False)
 
@@ -45,26 +18,24 @@ class T5(nn.Module):
         self.__dict__['t5'] = t5.to('cuda:0')
 
     def forward(self, prompt):
-        with torch.set_grad_enabled(False), self.autocast:
+        with torch.set_grad_enabled(False), torch.autocast(device_type='cuda', dtype=torch.float32):
 
             bs = len(prompt) // 2
-
-            # txt 2 hidden
-
             d = self.t5_tokenizer(prompt,
                                   return_tensors='pt',
                                   padding=True).to(self.output_proj.bias.device)
             d['attention_mask'][bs:, :] = 0  # null condition t5 attn_mask should be zero
 
             x = self.t5(input_ids=d['input_ids'],
                         attention_mask=d['attention_mask']).last_hidden_state  # no kv
 
             # output_proj as float32
             print('BEF PROJ', x[0, :, :].sum(), x[1, :, :].sum(), self.output_proj.weight.sum(), self.output_proj.weight.dtype, self.output_proj.bias.sum(), 'GEN\n\n143')
             x = self.output_proj(x)  # nn.Linear() - produces different result if there is no duplicate txt condition here
             x[bs:, :, :] = 0  # venv/../site-packages/audiocraft/modules/conditioners.py -> tokenize()
             return x
 
+
 class LMModel(nn.Module):
 
     def __init__(self,
@@ -77,9 +48,9 @@ class LMModel(nn.Module):
         super().__init__()
         self.t5 = T5()
         self.card = card  # 2048 ?
-        self.n_draw = 6  # replicate so many times the generation of each text in batch
-        # the batch is more expensive than n_draw as it re-runs the model bs times
-        # n_draw just draws more phonemes from the multinomial - after running the lm
+        self.n_draw = 1  # draw additional tokens at each call:
+        # Batch size is slower than n_draw as it calls the transformer on larger batch
+        # n_draw instead draws more tokens/phonemes from torch.multinomial - after execution of lm
         embed_dim = self.card + 1
         self.n_q = n_q
         self.dim = dim
@@ -107,14 +78,13 @@
             cross_attention_src=condition_tensors,
             token_count=token_count
         )
-        print(out.sum(), out.dtype, 'TRSF out cust')
+
         logits = torch.stack([self.linears[k](self.out_norm(out)) for k in range(self.n_q)], dim=1)  # [2*bs,4,1,2048]
 
         logits = 3 * logits[:bs, :, :, :] - 2 * logits[bs:, :, :, :]  # [3, 4, 1, 2048]
-        print(logits.sum(0).sum(2), logits.shape, 'SAMPL custom')
 
         # SAMPLE TOP K
-        k = 1  # 400 # 450 is nice sound still train honk is clear!
+        k = 400  # 450 is nice sound still train honk is clear!
         p = torch.softmax(logits, dim=3)
         top_k_value, _ = torch.topk(p, k, dim=3)  # [3, 4, 1, k]
         min_value_top_k = top_k_value[:, :, :, -1:]
@@ -125,7 +95,7 @@
         p = p.reshape(bs * self.n_q, 2048)
         out = torch.multinomial(p,  # p=[bs,2048], out=[bs, num_samples]
                                 num_samples=self.n_draw,
-                                replacement=True)  # [bs*4, self.n_draw]
+                                replacement=False)  # [bs*4, self.n_draw]
         # print('DRAW','c', out)
         return out.reshape(bs, self.n_q, self.n_draw).transpose(1,2)  # [bs=3not6, self.n_draw, 4]
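
Since the sampling parameters change here (k moves from 1 to 400, replacement becomes False, n_draw defaults to 1), a self-contained sketch of that token-selection step may help: classifier-free guidance, top-k filtering, and a single torch.multinomial call that draws n_draw codes per codebook without re-running the transformer. The tensor sizes and the zeroing of sub-top-k probabilities are assumptions inferred from the visible context, not a verbatim copy of lm.py.

import torch

bs, n_q, card = 3, 4, 2048                     # illustrative sizes: batch, codebooks, vocabulary
n_draw, k = 1, 400

logits_cond = torch.randn(bs, n_q, 1, card)    # stand-in for the conditional half of the batch
logits_null = torch.randn(bs, n_q, 1, card)    # stand-in for the null-condition half
logits = 3 * logits_cond - 2 * logits_null     # same guidance rule as in lm.py

p = torch.softmax(logits, dim=3)
top_k_value, _ = torch.topk(p, k, dim=3)       # [bs, n_q, 1, k]
p[p < top_k_value[..., -1:]] = 0               # keep only the k largest probabilities (assumed step)
p = p.reshape(bs * n_q, card)
tokens = torch.multinomial(p, num_samples=n_draw, replacement=False)
tokens = tokens.reshape(bs, n_q, n_draw).transpose(1, 2)   # [bs, n_draw, n_q], matching lm.py's return shape
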
 
audiocraft/transformer.py CHANGED
@@ -3,7 +3,12 @@ import torch.nn as nn
 from torch.nn import functional as F
 from einops import rearrange
 
-def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000):
+torch.backends.cuda.enable_mem_efficient_sdp(True)
+
+def create_sin_embedding(positions,
+                         dim,
+                         max_period=10000
+                         ):
     assert dim % 2 == 0
     half_dim = dim // 2
     positions = positions.to(torch.float)
@@ -48,7 +53,7 @@ class StreamingMultiheadAttention(nn.Module):
             v = nn.functional.linear(value, self.in_proj_weight[2 * dim:])
 
             q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
-            # print(q.shape, k.shape, v.shape, q.sum(), k.sum(), v.sum(),'CROSS A5')
+
         else:
             # 1st projected makes k,v (instantaneous)
             # Here else is self_attention for audio with itself (above is cross attention txt)
@@ -56,7 +61,7 @@ class StreamingMultiheadAttention(nn.Module):
             # HISTORY - DIFFERENT FOR EACH TRANSF LAYER
 
             projected = nn.functional.linear(query, self.in_proj_weight, None)  # here we have different floating values from official
-
+            # print(query.sum(), projected.sum(), self.in_proj_weight.sum(), 'Lc')  # verified official AudioGen values
             bound_layout = "b h p t d"
             packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
             q, k, v = packed.unbind(dim=2)
@@ -83,8 +88,8 @@ class StreamingMultiheadAttention(nn.Module):
         # KV COMPLETION ONLY ON SELF ATTENTION
 
         x = torch.nn.functional.scaled_dot_product_attention(
-            q, k, v, is_causal=False, dropout_p=0
-        )
+            q, k, v, is_causal=False, dropout_p=0)
+
         x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
         x = self.out_proj(x)
         return x
@@ -124,7 +129,6 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
     def forward(self,
                 x,
                 cross_attention_src=None):  # txtcond
-        # x = src
         x = x + self.self_attn(self.norm1(x))
         x = x + self.cross_attention(query = self.norm_cross(x),
                                      key = cross_attention_src,
@@ -163,8 +167,7 @@ class StreamingTransformer(nn.Module):
                 x,
                 token_count=None,
                 cross_attention_src=None):
-        #cross_attention_src = (torch.arange(1536)[None, None, :] * torch.arange(6).reshape(2,3,1)).to(torch.float).to(x.device)
-        print(x.sum(), cross_attention_src[0, :, :].sum(), cross_attention_src[1, :, :].sum(), x.dtype, cross_attention_src.dtype, 'Xattnc')
+
        if self.positional_embedding in ['sin', 'sin_rope']:
             pos_emb = create_sin_embedding(torch.zeros(x.shape[0], 1, 1, device=x.device) + token_count,
                                            1536,
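
The first hunk above only reworks the signature of create_sin_embedding; as a reference point, here is a sketch of the standard sinusoidal-embedding body as implemented in upstream audiocraft. The body below is an assumption added for context (this commit does not show it), kept consistent with the visible assert/half_dim lines.

import torch

def create_sin_embedding(positions, dim, max_period=10000):
    # positions: [B, T, 1] token indices; returns [B, T, dim] with cos in the first half, sin in the second
    assert dim % 2 == 0
    half_dim = dim // 2
    positions = positions.to(torch.float)
    adim = torch.arange(half_dim, device=positions.device, dtype=torch.float).view(1, 1, -1)
    phase = positions / (max_period ** (adim / (half_dim - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
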
requirements.txt CHANGED
@@ -1,8 +1,8 @@
-torch
-torchaudio
-numpy
+torch==2.1.0
+numpy<2.0.0
 audiofile
 num2words
+huggingface_hub
 cached_path
 einops
 flask
@@ -12,9 +12,10 @@ sentencepiece
 omegaconf
 opencv-python
 soundfile
-transformers
+transformers==4.49.0
 audresample
 srt
 nltk
 phonemizer
 docx
+torchaudio