Synced repo using 'sync_with_huggingface' GitHub Action
Files changed:
- .gitattributes (+1, -0)
- app.py (+362, -0)
- nate_is_humming.wav (+3, -0)
- nate_is_singing_Gb_minor.wav (+0, -0, binary)
- pitch_correction_utils.py (+161, -0)
- requirements.txt (+6, -0)
- singing_songstarter_demo.ipynb (+78, -0)
.gitattributes CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+nate_is_humming.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED

@@ -0,0 +1,362 @@

###########################################
# For fast downloads from Hugging Face Hub
# **Requires the hf_transfer package**
###########################################
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
###########################################

import json
import random
import typing as tp
from datetime import datetime
from pathlib import Path
from functools import partial

import gradio as gr
import torch
import torchaudio
import numpy as np

from audiocraft.models import musicgen
from audiocraft.data.audio import audio_write
from audiocraft.utils.notebook import display_audio

from pitch_correction_utils import autotune, closest_pitch, aclosest_pitch_from_scale


def ta_to_librosa_format(waveform):
    """
    Convert an audio tensor from torchaudio format to librosa format.

    Args:
        waveform (torch.Tensor): Audio tensor from torchaudio with shape (n_channels, n_samples).

    Returns:
        np.ndarray: Audio array in librosa format with shape (n_samples,) or (2, n_samples).
    """
    # Ensure waveform is on CPU and convert to numpy
    waveform_np = waveform.numpy()

    # Check if audio is mono or stereo and transpose if necessary
    if waveform_np.shape[0] == 1:
        # Remove the channel dimension for mono
        waveform_np = waveform_np.squeeze(0)
    else:
        # Transpose to switch from (n_channels, n_samples) to (n_samples, n_channels)
        waveform_np = waveform_np.transpose()

    # Normalize to [-1, 1] if not already
    if waveform_np.dtype in [np.int16, np.int32]:
        waveform_np = waveform_np / np.iinfo(waveform_np.dtype).max

    return waveform_np


def librosa_to_ta_format(waveform_np):
    """
    Convert an audio array from librosa format to torchaudio format.

    Args:
        waveform_np (np.ndarray): Audio array from librosa with shape (n_samples,) or (2, n_samples).

    Returns:
        torch.Tensor: Audio tensor in torchaudio format with shape (n_channels, n_samples).
    """
    # Ensure it is a float32 array normalized to [-1, 1]
    waveform_np = np.array(waveform_np, dtype=np.float32)

    if waveform_np.ndim == 1:
        # Add a channel dimension for mono
        waveform_np = waveform_np[np.newaxis, :]
    else:
        # Transpose to switch from (n_samples, n_channels) to (n_channels, n_samples)
        waveform_np = waveform_np.transpose()

    # Convert numpy array to PyTorch tensor
    waveform = torch.from_numpy(waveform_np)
    return waveform


def run_autotune(y, sr, correction_method="closest", scale=None):
    # Only mono files are handled. If stereo files are supplied, only the first channel is used.
    if y.ndim > 1:
        y = y[0, :]

    # Pick the pitch adjustment strategy according to the arguments.
    correction_function = closest_pitch if correction_method == 'closest' else \
        partial(aclosest_pitch_from_scale, scale=scale)

    # Torchaudio -> librosa
    y = ta_to_librosa_format(y)
    # Autotune
    pitch_corrected_y = autotune(y, sr, correction_function, plot=False)
    # Librosa -> torchaudio
    pitch_corrected_y = librosa_to_ta_format(pitch_corrected_y)

    return pitch_corrected_y


def set_all_seeds(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def _preprocess_audio(
    audio_path, model: musicgen.MusicGen, duration: tp.Optional[int] = None
):
    wav, sr = torchaudio.load(audio_path)
    wav = torchaudio.functional.resample(wav, sr, model.sample_rate)
    wav = wav.mean(dim=0, keepdim=True)

    # Calculate duration in seconds if not provided
    if duration is None:
        duration = wav.shape[1] / model.sample_rate

    # Check if duration is more than 30 seconds
    if duration > 30:
        raise ValueError("Duration cannot be more than 30 seconds")

    end_sample = int(model.sample_rate * duration)
    wav = wav[:, :end_sample]

    assert wav.shape[0] == 1
    assert wav.shape[1] == model.sample_rate * duration

    wav = wav.cuda()
    wav = wav.unsqueeze(1)

    with torch.no_grad():
        gen_audio = model.compression_model.encode(wav)

    codes, scale = gen_audio

    assert scale is None

    return codes


def _get_stemmed_wav_patched(wav, sample_rate):
    print("Skipping stem separation!")
    return wav


class Pipeline:
    def __init__(self, model_id, max_batch_size=4, do_skip_demucs=True):
        self.model = musicgen.MusicGen.get_pretrained(model_id)
        self.max_batch_size = max_batch_size
        self.do_skip_demucs = do_skip_demucs

        if self.do_skip_demucs:
            self.model.lm.condition_provider.conditioners.self_wav._get_stemmed_wav = _get_stemmed_wav_patched

    def __call__(
        self,
        prompt,
        input_audio=None,
        scale=None,
        continuation=False,
        batch_size=1,
        duration=15,
        use_sampling=True,
        temperature=1.0,
        top_k=250,
        top_p=0.0,
        cfg_coef=3.0,
        output_dir="./samples",  # change to google drive if you'd like
        normalization_strategy="loudness",
        seed=-1,
        continuation_start=0,
        continuation_end=None,
    ):
        print("Prompt:", prompt)
        if scale == "closest":
            scale = None

        set_generation_params = lambda duration: self.model.set_generation_params(
            duration=duration,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            cfg_coef=cfg_coef,
        )

        if not seed or seed == -1:
            seed = torch.seed() % 2 ** 32 - 1
        set_all_seeds(seed)
        print(f"Using seed {seed}")
        if not input_audio:
            set_generation_params(duration)
            wav, tokens = self.model.generate([prompt] * batch_size, progress=True, return_tokens=True)
        else:
            input_audio, sr = torchaudio.load(input_audio)
            # Save a copy of the original input audio
            original_input_audio = input_audio.clone()
            print("Input audio shape:", input_audio.shape)
            if scale is None:
                print("Running pitch correction for 'closest' pitch")
                input_audio = run_autotune(input_audio, sr, correction_method="closest")
            else:
                print("Running pitch correction for 'scale' pitch")
                input_audio = run_autotune(input_audio, sr, correction_method="scale", scale=scale)
            print(f"...Done running pitch correction. Shape after is {input_audio.shape}.\n")
            input_audio = input_audio[None] if input_audio.dim() == 2 else input_audio

            continuation_start = 0 if not continuation_start else continuation_start
            if continuation_end is None or continuation_end == -1:
                continuation_end = input_audio.shape[2] / sr

            if continuation_start > continuation_end:
                raise ValueError(
                    "`continuation_start` must be less than or equal to `continuation_end`"
                )

            input_audio_wavform = input_audio[
                ..., int(sr * continuation_start) : int(sr * continuation_end)
            ]
            input_audio_wavform = input_audio_wavform.repeat(batch_size, 1, 1)
            # TODO - not using this - is that wrong??
            input_audio_duration = input_audio_wavform.shape[-1] / sr

            if continuation:
                set_generation_params(duration)  # + input_audio_duration) # SEE TODO above
                print("Continuation wavform shape!", input_audio_wavform.shape)
                wav, tokens = self.model.generate_continuation(
                    prompt=input_audio_wavform,
                    prompt_sample_rate=sr,
                    descriptions=[prompt] * batch_size,
                    progress=True,
                    return_tokens=True
                )
            else:
                print("Melody wavform shape!", input_audio_wavform.shape)
                set_generation_params(duration)
                wav, tokens = self.model.generate_with_chroma(
                    [prompt] * batch_size, input_audio_wavform, sr, progress=True, return_tokens=True
                )
        wav, tokens = wav.cpu(), tokens.cpu()
        # Write to files
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)
        dt_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        if input_audio is not None:
            outfile_path = output_dir / f"{dt_str}_input_raw"
            audio_write(
                outfile_path,
                original_input_audio,
                sr,
                strategy=normalization_strategy,
            )
            outfile_path = output_dir / f"{dt_str}_input_pitch_corrected"
            audio_write(
                outfile_path,
                input_audio_wavform[0],
                sr,
                strategy=normalization_strategy,
            )

        for i in range(batch_size):
            outfile_path = output_dir / f"{dt_str}_{i:02d}"
            audio_write(
                outfile_path,
                wav[i],
                self.model.sample_rate,
                strategy=normalization_strategy,
            )
        json_out_path = output_dir / f"{dt_str}.json"
        json_out_path.write_text(json.dumps(dict(
            prompt=prompt,
            batch_size=batch_size,
            duration=duration,
            use_sampling=use_sampling,
            temperature=temperature,
            top_k=top_k,
            cfg_coef=cfg_coef,
        )))

        to_return = [None] * (self.max_batch_size + 1)
        if input_audio is not None:
            print(f"trying to return input audio wavform of shape: {input_audio_wavform.shape}")
            to_return[0] = (sr, input_audio_wavform[0].T.numpy())

        for i in range(batch_size):
            to_return[i + 1] = (self.model.sample_rate, wav[i].T.numpy())
            print(wav[i].shape)
        return to_return


def main(model_id="nateraw/musicgen-songstarter-v0.2", max_batch_size=4, share=False, debug=False):
    pipeline = Pipeline(model_id, max_batch_size)
    interface = gr.Interface(
        fn=pipeline.__call__,
        inputs=[
            gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
            gr.Audio(
                sources=["microphone"],
                waveform_options=gr.WaveformOptions(
                    waveform_color="#01C6FF",
                    waveform_progress_color="#0066B4",
                    skip_length=2,
                    show_controls=False,
                ),
                type="filepath",
            ),
            gr.Dropdown(["closest", "A:maj", "A:min", "Bb:maj", "Bb:min", "B:maj", "B:min", "C:maj", "C:min", "Db:maj", "Db:min", "D:maj", "D:min", "Eb:maj", "Eb:min", "E:maj", "E:min", "F:maj", "F:min", "Gb:maj", "Gb:min", "G:maj", "G:min", "Ab:maj", "Ab:min"], label="Scale for pitch correction.", value="closest"),
            gr.Checkbox(label="Is Continuation", value=False),
            gr.Slider(label="Batch Size", value=1, minimum=1, maximum=pipeline.max_batch_size, step=1),
            gr.Slider(label="Duration", value=15, minimum=4, maximum=30),
            gr.Checkbox(label="Use Sampling", value=True),
            gr.Slider(label="Temperature", value=1.0, minimum=0.0, maximum=2.0),
            gr.Slider(label="Top K", value=250, minimum=0, maximum=1000),
            gr.Slider(label="Top P", value=0.0, minimum=0.0, maximum=1.0),
            gr.Slider(label="CFG Coef", value=3.0, minimum=0.0, maximum=10.0),
            gr.Textbox(label="Output Dir", value="./samples"),
            gr.Dropdown(["loudness", "clip", "peak", "rms"], value="loudness", label="Strategy for normalizing audio."),
            gr.Slider(label="random seed", minimum=-1, maximum=9e8),
        ],
        outputs=[gr.Audio(label=("Input " if i == 0 else "") + f"Audio {i}") for i in range(pipeline.max_batch_size + 1)],
        title="🎶 Generate song ideas with musicgen-songstarter-v0.2 🎶",
        description="Check out the repo [here](https://huggingface.co/nateraw/musicgen-songstarter-v0.2)",
        examples=[
            ["hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["acoustic, guitar, melody, rnb, trap, E minor, 85 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["synth, dark, hip hop, melody, trap, Gb minor, 140 bpm", "./nate_is_singing_Gb_minor.wav", "Gb:min", False, 1, 7, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["drill, layered, melody, songstarters, trap, C# minor, 130 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["hip hop, soul, rnb, neo soul, songstarters, B minor, 140 bpm", None, "closest", False, 1, 8, True, 1.0, 250, 0.0, 3.0, "./samples", "loudness", -1],
            ["music, mallets, bells, melody, dancehall, african, afropop & afrobeats", "./nate_is_singing_Gb_minor.wav", "Gb:min", False, 1, 7, True, 1.0, 250, 0.0, 4.5, "./samples", "loudness", -1],
        ]
    )
    interface.launch(share=share, debug=debug)


if __name__ == '__main__':
    from fire import Fire
    Fire(main)

# For testing

# pipe = Pipeline("nateraw/musicgen-songstarter-v0.2", max_batch_size=4)
# example_input = (
#     "hip hop, soul, piano, chords, jazz, neo jazz, G# minor, 140 bpm",
#     "nate_is_humming.wav",
#     "closest",
#     False,
#     1,
#     8,
#     True,
#     1.0,
#     250,
#     0.0,
#     3.0,
#     "./samples",
#     "loudness",
#     -1,
#     0,
#     None
# )
# out = pipe(*example_input)
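
A quick sanity check for the two format converters above (a minimal sketch, not one of the committed files; it assumes app.py is importable as `app` from the repo root):

import math

import torch

from app import ta_to_librosa_format, librosa_to_ta_format

# Synthesize one second of a 440 Hz mono tone in torchaudio layout: (n_channels, n_samples).
sr = 32000
t = torch.linspace(0, 1, sr)
wav = torch.sin(2 * math.pi * 440.0 * t).unsqueeze(0)

# torchaudio -> librosa: mono tensors become 1-D float arrays.
y = ta_to_librosa_format(wav)
assert y.ndim == 1 and y.shape[0] == sr

# librosa -> torchaudio: the channel dimension is restored and values are unchanged.
wav_back = librosa_to_ta_format(y)
assert wav_back.shape == (1, sr)
assert torch.allclose(wav, wav_back)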
nate_is_humming.wav ADDED

@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:a62520e3026bc71b06fa75a8120c3b46524a0a34dcac9661e3e27632e294b11f
size 1196036
nate_is_singing_Gb_minor.wav ADDED

Binary file (619 kB).
pitch_correction_utils.py ADDED

@@ -0,0 +1,161 @@

from functools import partial
from pathlib import Path

import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import scipy.signal as sig
import psola


SEMITONES_IN_OCTAVE = 12


def degrees_from(scale: str):
    """Return the pitch classes (degrees) that correspond to the given scale"""
    degrees = librosa.key_to_degrees(scale)
    # To properly perform pitch rounding to the nearest degree from the scale, we need to repeat
    # the first degree raised by an octave. Otherwise, pitches slightly lower than the base degree
    # would be incorrectly assigned.
    degrees = np.concatenate((degrees, [degrees[0] + SEMITONES_IN_OCTAVE]))
    return degrees


def closest_pitch(f0):
    """Round the given pitch values to the nearest MIDI note numbers"""
    midi_note = np.around(librosa.hz_to_midi(f0))
    # To preserve the nan values.
    nan_indices = np.isnan(f0)
    midi_note[nan_indices] = np.nan
    # Convert back to Hz.
    return librosa.midi_to_hz(midi_note)


def closest_pitch_from_scale(f0, scale):
    """Return the pitch closest to f0 that belongs to the given scale"""
    # Preserve nan.
    if np.isnan(f0):
        return np.nan
    degrees = degrees_from(scale)
    midi_note = librosa.hz_to_midi(f0)
    # Subtract the multiplicities of 12 so that we have the real-valued pitch class of the
    # input pitch.
    degree = midi_note % SEMITONES_IN_OCTAVE
    # Find the closest pitch class from the scale.
    degree_id = np.argmin(np.abs(degrees - degree))
    # Calculate the difference between the input pitch class and the desired pitch class.
    degree_difference = degree - degrees[degree_id]
    # Shift the input MIDI note number by the calculated difference.
    midi_note -= degree_difference
    # Convert to Hz.
    return librosa.midi_to_hz(midi_note)


def aclosest_pitch_from_scale(f0, scale):
    """Map each pitch in the f0 array to the closest pitch belonging to the given scale."""
    sanitized_pitch = np.zeros_like(f0)
    for i in np.arange(f0.shape[0]):
        sanitized_pitch[i] = closest_pitch_from_scale(f0[i], scale)
    # Perform median filtering to additionally smooth the corrected pitch.
    smoothed_sanitized_pitch = sig.medfilt(sanitized_pitch, kernel_size=11)
    # Remove the additional NaN values after median filtering.
    smoothed_sanitized_pitch[np.isnan(smoothed_sanitized_pitch)] = \
        sanitized_pitch[np.isnan(smoothed_sanitized_pitch)]
    return smoothed_sanitized_pitch


def autotune(audio, sr, correction_function, plot=False):
    # Set some basis parameters.
    frame_length = 2048
    hop_length = frame_length // 4
    fmin = librosa.note_to_hz('C2')
    fmax = librosa.note_to_hz('C7')

    # Pitch tracking using the PYIN algorithm.
    f0, voiced_flag, voiced_probabilities = librosa.pyin(audio,
                                                         frame_length=frame_length,
                                                         hop_length=hop_length,
                                                         sr=sr,
                                                         fmin=fmin,
                                                         fmax=fmax)

    # Apply the chosen adjustment strategy to the pitch.
    corrected_f0 = correction_function(f0)

    if plot:
        # Plot the spectrogram, overlaid with the original pitch trajectory and the adjusted
        # pitch trajectory.
        stft = librosa.stft(audio, n_fft=frame_length, hop_length=hop_length)
        time_points = librosa.times_like(stft, sr=sr, hop_length=hop_length)
        log_stft = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
        fig, ax = plt.subplots()
        img = librosa.display.specshow(log_stft, x_axis='time', y_axis='log', ax=ax, sr=sr, hop_length=hop_length, fmin=fmin, fmax=fmax)
        fig.colorbar(img, ax=ax, format="%+2.f dB")
        ax.plot(time_points, f0, label='original pitch', color='cyan', linewidth=2)
        ax.plot(time_points, corrected_f0, label='corrected pitch', color='orange', linewidth=1)
        ax.legend(loc='upper right')
        plt.ylabel('Frequency [Hz]')
        plt.xlabel('Time [M:SS]')
        plt.savefig('pitch_correction.png', dpi=300, bbox_inches='tight')

    # Pitch-shifting using the PSOLA algorithm.
    return psola.vocode(audio, sample_rate=int(sr), target_pitch=corrected_f0, fmin=fmin, fmax=fmax)


def main(
    vocals_file,
    plot=False,
    correction_method="closest",
    scale=None
):
    """Run autotune-like pitch correction on the given audio file.

    Args:
        vocals_file (str): Filepath to the audio file to be pitch-corrected.
        plot (bool, optional): Whether to plot the results. Defaults to False.
        correction_method (str, optional): The pitch correction method to use. Defaults to "closest".
            If set to "closest", the pitch will be rounded to the nearest MIDI note.
            If set to "scale", the pitch will be rounded to the nearest note in the given `scale`.
        scale (str, optional): The scale to use for pitch correction, e.g. `"C:min"` / `"A:maj"`. Defaults to None.
    """
    filepath = Path(vocals_file)

    # Load the audio file.
    y, sr = librosa.load(str(filepath), sr=None, mono=False)

    # Only mono files are handled. If stereo files are supplied, only the first channel is used.
    if y.ndim > 1:
        y = y[0, :]

    # Pick the pitch adjustment strategy according to the arguments.
    correction_function = closest_pitch if correction_method == 'closest' else \
        partial(aclosest_pitch_from_scale, scale=scale)

    # Perform the auto-tuning.
    pitch_corrected_y = autotune(y, sr, correction_function, plot)

    # Write the corrected audio to an output file.
    filepath = filepath.parent / (filepath.stem + '_pitch_corrected' + filepath.suffix)
    sf.write(str(filepath), pitch_corrected_y, sr)
    return pitch_corrected_y


if __name__ == '__main__':
    # Example: python pitch_correction_utils.py --vocals_file "./nate_is_humming.wav" --plot -c closest
    from fire import Fire
    Fire(main)
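
The utilities above can also be driven from Python rather than through Fire. A minimal sketch mirroring main(), where "vocals.wav" is a hypothetical mono input file and "Gb:min" matches the included nate_is_singing_Gb_minor.wav example:

from functools import partial

import librosa
import soundfile as sf

from pitch_correction_utils import autotune, aclosest_pitch_from_scale

# Load at the file's native sample rate, downmixed to mono (autotune expects 1-D input).
y, sr = librosa.load("vocals.wav", sr=None, mono=True)  # hypothetical filename

# Snap every detected pitch to the nearest note of Gb minor.
correction_function = partial(aclosest_pitch_from_scale, scale="Gb:min")
pitch_corrected_y = autotune(y, sr, correction_function, plot=False)

sf.write("vocals_pitch_corrected.wav", pitch_corrected_y, sr)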
requirements.txt ADDED

@@ -0,0 +1,6 @@

git+https://github.com/facebookresearch/audiocraft#egg=audiocraft
hf_transfer
gradio
psola
torchvision==0.16.0
fire
singing_songstarter_demo.ipynb ADDED

@@ -0,0 +1,78 @@

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "A100",
      "authorship_tag": "ABX9TyMm+2HEY3Dh8UBT+NJ/CIoa",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/nateraw/singing-songstarter/blob/main/singing_songstarter_demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Singing Songstarter Demo\n",
        "\n",
        "This is a demo of [`musicgen-songstarter-v0.2`](https://hf.co/nateraw/musicgen-songstarter-v0.2), a large stereo MusicGen model trained to be useful for music producers, applied to the task of voice-to-music.\n",
        "\n",
        "**Hum an idea, get a music sample!** 🚀\n",
        "\n",
        "### Usage\n",
        "\n",
        "1. Run the cell below.\n",
        "2. You can ignore the \"restart this runtime\" message when it pops up.\n",
        "3. Click the public share link. It should look like: `\"Running on public URL: https://<your-link-here>\"`\n",
        "4. Enjoy 🔥\n",
        "\n",
        "### If you think this notebook is cool, consider supporting me by:\n",
        "  - giving [the model](https://hf.co/nateraw/musicgen-songstarter-v0.2) a heart on Hugging Face ❤️\n",
        "  - following me on [GitHub](https://github.com/nateraw) 👨‍💻\n",
        "  - following me on [X/Twitter](https://twitter.com/nateraw)\n",
        "  - giving [the demo repo](https://github.com/nateraw/singing-songstarter) a star ⭐️\n",
        "\n",
        "If you have any questions/concerns about this demo, please [file an issue on GitHub](https://github.com/nateraw/singing-songstarter)."
      ],
      "metadata": {
        "id": "hBsE8AuVsgG8"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-fw0bpXysAUG"
      },
      "outputs": [],
      "source": [
        "%cd /content\n",
        "! git clone https://github.com/nateraw/singing-songstarter\n",
        "%cd /content/singing-songstarter\n",
        "! pip install -r requirements.txt\n",
        "! python app.py --share --debug"
      ]
    }
  ]
}
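
Outside of Colab, the same demo can be launched from Python instead of the notebook's shell commands. A minimal sketch, assuming the requirements above are installed and a CUDA GPU is available:

from app import main

# Equivalent to the notebook's `python app.py --share --debug`.
main(model_id="nateraw/musicgen-songstarter-v0.2", max_batch_size=4, share=True, debug=True)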