Spaces:

OpenSound
/

EzAudio

Running on Zero

App Files Files Community

OpenSound commited on Sep 16, 2024

Commit

a09029c

verified ·

1 Parent(s): 5099022

Update src/inference.py

Browse files

Files changed (1) hide show

src/inference.py +169 -169

src/inference.py CHANGED Viewed

@@ -1,169 +1,169 @@
-import os
-import random
-import pandas as pd
-import torch
-import librosa
-import numpy as np
-import soundfile as sf
-from tqdm import tqdm
-from utils import scale_shift_re
-def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
-    """
-    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
-    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
-    """
-    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
-    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
-    # rescale the results from guidance (fixes overexposure)
-    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
-    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
-    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
-    return noise_cfg
-@torch.no_grad()
-def inference(autoencoder, unet, gt, gt_mask,
-              tokenizer, text_encoder,
-              params, noise_scheduler,
-              text_raw, neg_text=None,
-              audio_frames=500,
-              guidance_scale=3, guidance_rescale=0.0,
-              ddim_steps=50, eta=1, random_seed=2024,
-              device='cuda',
-              ):
-    if neg_text is None:
-        neg_text = [""]
-    if tokenizer is not None:
-        text_batch = tokenizer(text_raw,
-                               max_length=params['text_encoder']['max_length'],
-                               padding="max_length", truncation=True, return_tensors="pt")
-        text, text_mask = text_batch.input_ids.to(device), text_batch.attention_mask.to(device).bool()
-        text = text_encoder(input_ids=text, attention_mask=text_mask).last_hidden_state
-        uncond_text_batch = tokenizer(neg_text,
-                                      max_length=params['text_encoder']['max_length'],
-                                      padding="max_length", truncation=True, return_tensors="pt")
-        uncond_text, uncond_text_mask = uncond_text_batch.input_ids.to(device), uncond_text_batch.attention_mask.to(device).bool()
-        uncond_text = text_encoder(input_ids=uncond_text,
-                                   attention_mask=uncond_text_mask).last_hidden_state
-    else:
-        text, text_mask = None, None
-        guidance_scale = None
-    codec_dim = params['model']['out_chans']
-    unet.eval()
-    if random_seed is not None:
-        generator = torch.Generator(device=device).manual_seed(random_seed)
-    else:
-        generator = torch.Generator(device=device)
-        generator.seed()
-    noise_scheduler.set_timesteps(ddim_steps)
-    # init noise
-    noise = torch.randn((1, codec_dim, audio_frames), generator=generator, device=device)
-    latents = noise
-    for t in noise_scheduler.timesteps:
-        latents = noise_scheduler.scale_model_input(latents, t)
-        if guidance_scale:
-            latents_combined = torch.cat([latents, latents], dim=0)
-            text_combined = torch.cat([text, uncond_text], dim=0)
-            text_mask_combined = torch.cat([text_mask, uncond_text_mask], dim=0)
-            if gt is not None:
-                gt_combined = torch.cat([gt, gt], dim=0)
-                gt_mask_combined = torch.cat([gt_mask, gt_mask], dim=0)
-            else:
-                gt_combined = None
-                gt_mask_combined = None
-            output_combined, _ = unet(latents_combined, t, text_combined, context_mask=text_mask_combined,
-                                      cls_token=None, gt=gt_combined, mae_mask_infer=gt_mask_combined)
-            output_text, output_uncond = torch.chunk(output_combined, 2, dim=0)
-            output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
-            if guidance_rescale > 0.0:
-                output_pred = rescale_noise_cfg(output_pred, output_text,
-                                                guidance_rescale=guidance_rescale)
-        else:
-            output_pred, mae_mask = unet(latents, t, text, context_mask=text_mask,
-                                         cls_token=None, gt=gt, mae_mask_infer=gt_mask)
-        latents = noise_scheduler.step(model_output=output_pred, timestep=t,
-                                       sample=latents,
-                                       eta=eta, generator=generator).prev_sample
-    pred = scale_shift_re(latents, params['autoencoder']['scale'],
-                          params['autoencoder']['shift'])
-    if gt is not None:
-        pred[~gt_mask] = gt[~gt_mask]
-    pred_wav = autoencoder(embedding=pred)
-    return pred_wav
-@torch.no_grad()
-def eval_udit(autoencoder, unet,
-              tokenizer, text_encoder,
-              params, noise_scheduler,
-              val_df, subset,
-              audio_frames, mae=False,
-              guidance_scale=3, guidance_rescale=0.0,
-              ddim_steps=50, eta=1, random_seed=2023,
-              device='cuda',
-              epoch=0, save_path='logs/eval/', val_num=5):
-    val_df = pd.read_csv(val_df)
-    val_df = val_df[val_df['split'] == subset]
-    if mae:
-        val_df = val_df[val_df['audio_length'] != 0]
-    save_path = save_path + str(epoch) + '/'
-    os.makedirs(save_path, exist_ok=True)
-    for i in tqdm(range(len(val_df))):
-        row = val_df.iloc[i]
-        text = [row['caption']]
-        if mae:
-            audio_path = params['data']['val_dir'] + str(row['audio_path'])
-            gt, sr = librosa.load(audio_path, sr=params['data']['sr'])
-            gt = gt / (np.max(np.abs(gt)) + 1e-9)
-            sf.write(save_path + text[0] + '_gt.wav', gt, samplerate=params['data']['sr'])
-            num_samples = 10 * sr
-            if len(gt) < num_samples:
-                padding = num_samples - len(gt)
-                gt = np.pad(gt, (0, padding), 'constant')
-            else:
-                gt = gt[:num_samples]
-            gt = torch.tensor(gt).unsqueeze(0).unsqueeze(1).to(device)
-            gt = autoencoder(audio=gt)
-            B, D, L = gt.shape
-            mask_len = int(L * 0.2)
-            gt_mask = torch.zeros(B, D, L).to(device)
-            for _ in range(2):
-                start = random.randint(0, L - mask_len)
-                gt_mask[:, :, start:start + mask_len] = 1
-            gt_mask = gt_mask.bool()
-        else:
-            gt = None
-            gt_mask = None
-        pred = inference(autoencoder, unet, gt, gt_mask,
-                         tokenizer, text_encoder,
-                         params, noise_scheduler,
-                         text, neg_text=None,
-                         audio_frames=audio_frames,
-                         guidance_scale=guidance_scale, guidance_rescale=guidance_rescale,
-                         ddim_steps=ddim_steps, eta=eta, random_seed=random_seed,
-                         device=device)
-        pred = pred.cpu().numpy().squeeze(0).squeeze(0)
-        sf.write(save_path + text[0] + '.wav', pred, samplerate=params['data']['sr'])
-        if i + 1 >= val_num:
-            break

+import os
+import random
+import pandas as pd
+import torch
+import librosa
+import numpy as np
+import soundfile as sf
+from tqdm import tqdm
+from .utils import scale_shift_re
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+@torch.no_grad()
+def inference(autoencoder, unet, gt, gt_mask,
+              tokenizer, text_encoder,
+              params, noise_scheduler,
+              text_raw, neg_text=None,
+              audio_frames=500,
+              guidance_scale=3, guidance_rescale=0.0,
+              ddim_steps=50, eta=1, random_seed=2024,
+              device='cuda',
+              ):
+    if neg_text is None:
+        neg_text = [""]
+    if tokenizer is not None:
+        text_batch = tokenizer(text_raw,
+                               max_length=params['text_encoder']['max_length'],
+                               padding="max_length", truncation=True, return_tensors="pt")
+        text, text_mask = text_batch.input_ids.to(device), text_batch.attention_mask.to(device).bool()
+        text = text_encoder(input_ids=text, attention_mask=text_mask).last_hidden_state
+        uncond_text_batch = tokenizer(neg_text,
+                                      max_length=params['text_encoder']['max_length'],
+                                      padding="max_length", truncation=True, return_tensors="pt")
+        uncond_text, uncond_text_mask = uncond_text_batch.input_ids.to(device), uncond_text_batch.attention_mask.to(device).bool()
+        uncond_text = text_encoder(input_ids=uncond_text,
+                                   attention_mask=uncond_text_mask).last_hidden_state
+    else:
+        text, text_mask = None, None
+        guidance_scale = None
+    codec_dim = params['model']['out_chans']
+    unet.eval()
+    if random_seed is not None:
+        generator = torch.Generator(device=device).manual_seed(random_seed)
+    else:
+        generator = torch.Generator(device=device)
+        generator.seed()
+    noise_scheduler.set_timesteps(ddim_steps)
+    # init noise
+    noise = torch.randn((1, codec_dim, audio_frames), generator=generator, device=device)
+    latents = noise
+    for t in noise_scheduler.timesteps:
+        latents = noise_scheduler.scale_model_input(latents, t)
+        if guidance_scale:
+            latents_combined = torch.cat([latents, latents], dim=0)
+            text_combined = torch.cat([text, uncond_text], dim=0)
+            text_mask_combined = torch.cat([text_mask, uncond_text_mask], dim=0)
+            if gt is not None:
+                gt_combined = torch.cat([gt, gt], dim=0)
+                gt_mask_combined = torch.cat([gt_mask, gt_mask], dim=0)
+            else:
+                gt_combined = None
+                gt_mask_combined = None
+            output_combined, _ = unet(latents_combined, t, text_combined, context_mask=text_mask_combined,
+                                      cls_token=None, gt=gt_combined, mae_mask_infer=gt_mask_combined)
+            output_text, output_uncond = torch.chunk(output_combined, 2, dim=0)
+            output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+            if guidance_rescale > 0.0:
+                output_pred = rescale_noise_cfg(output_pred, output_text,
+                                                guidance_rescale=guidance_rescale)
+        else:
+            output_pred, mae_mask = unet(latents, t, text, context_mask=text_mask,
+                                         cls_token=None, gt=gt, mae_mask_infer=gt_mask)
+        latents = noise_scheduler.step(model_output=output_pred, timestep=t,
+                                       sample=latents,
+                                       eta=eta, generator=generator).prev_sample
+    pred = scale_shift_re(latents, params['autoencoder']['scale'],
+                          params['autoencoder']['shift'])
+    if gt is not None:
+        pred[~gt_mask] = gt[~gt_mask]
+    pred_wav = autoencoder(embedding=pred)
+    return pred_wav
+@torch.no_grad()
+def eval_udit(autoencoder, unet,
+              tokenizer, text_encoder,
+              params, noise_scheduler,
+              val_df, subset,
+              audio_frames, mae=False,
+              guidance_scale=3, guidance_rescale=0.0,
+              ddim_steps=50, eta=1, random_seed=2023,
+              device='cuda',
+              epoch=0, save_path='logs/eval/', val_num=5):
+    val_df = pd.read_csv(val_df)
+    val_df = val_df[val_df['split'] == subset]
+    if mae:
+        val_df = val_df[val_df['audio_length'] != 0]
+    save_path = save_path + str(epoch) + '/'
+    os.makedirs(save_path, exist_ok=True)
+    for i in tqdm(range(len(val_df))):
+        row = val_df.iloc[i]
+        text = [row['caption']]
+        if mae:
+            audio_path = params['data']['val_dir'] + str(row['audio_path'])
+            gt, sr = librosa.load(audio_path, sr=params['data']['sr'])
+            gt = gt / (np.max(np.abs(gt)) + 1e-9)
+            sf.write(save_path + text[0] + '_gt.wav', gt, samplerate=params['data']['sr'])
+            num_samples = 10 * sr
+            if len(gt) < num_samples:
+                padding = num_samples - len(gt)
+                gt = np.pad(gt, (0, padding), 'constant')
+            else:
+                gt = gt[:num_samples]
+            gt = torch.tensor(gt).unsqueeze(0).unsqueeze(1).to(device)
+            gt = autoencoder(audio=gt)
+            B, D, L = gt.shape
+            mask_len = int(L * 0.2)
+            gt_mask = torch.zeros(B, D, L).to(device)
+            for _ in range(2):
+                start = random.randint(0, L - mask_len)
+                gt_mask[:, :, start:start + mask_len] = 1
+            gt_mask = gt_mask.bool()
+        else:
+            gt = None
+            gt_mask = None
+        pred = inference(autoencoder, unet, gt, gt_mask,
+                         tokenizer, text_encoder,
+                         params, noise_scheduler,
+                         text, neg_text=None,
+                         audio_frames=audio_frames,
+                         guidance_scale=guidance_scale, guidance_rescale=guidance_rescale,
+                         ddim_steps=ddim_steps, eta=eta, random_seed=random_seed,
+                         device=device)
+        pred = pred.cpu().numpy().squeeze(0).squeeze(0)
+        sf.write(save_path + text[0] + '.wav', pred, samplerate=params['data']['sr'])
+        if i + 1 >= val_num:
+            break