import gc
import os.path

import numpy as np
import parselmouth
import torch
import pyworld
import torchcrepe
from scipy import signal
from torch import Tensor

def get_f0_crepe_computation(
    x,
    f0_min,
    f0_max,
    p_len,
    sr,
    # 512 before. The hop length controls how often the pitch can jump to a
    # dramatically different value: lower hop lengths mean more pitch accuracy
    # but longer inference time.
    hop_length=128,
    model="full",  # either crepe-tiny ("tiny") or full crepe ("full"); default is "full"
):
    x = x.astype(np.float32)  # fixes the F.conv2d exception: torchcrepe needs float32, not double
    x /= np.quantile(np.abs(x), 0.999)
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
    audio = torch.from_numpy(x).to(torch_device, copy=True)
    audio = torch.unsqueeze(audio, dim=0)
    if audio.ndim == 2 and audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True).detach()
    audio = audio.detach()
| # print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) | |
    pitch: torch.Tensor = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        f0_min,
        f0_max,
        model,
        batch_size=hop_length * 2,
        device=torch_device,
        pad=True,
    )
    p_len = p_len or x.shape[0] // hop_length
    # Resize the pitch track for the final f0: mark unvoiced frames as NaN,
    # linearly resample the hop-rate track onto p_len frames, then zero the NaNs.
    source = pitch.squeeze(0).cpu().float().numpy()
    source[source < 0.001] = np.nan
    target = np.interp(
        np.arange(0, len(source) * p_len, len(source)) / p_len,
        np.arange(0, len(source)),
        source,
    )
    f0 = np.nan_to_num(target)
    return f0  # resized f0
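

# Example usage (a minimal sketch, not part of the original module): extract f0
# from one second of synthetic 16 kHz audio. The sample rate, f0 bounds, and
# test tone below are illustrative assumptions, not values this module mandates.
#
#     sr = 16000
#     t = np.arange(sr) / sr
#     x = np.sin(2 * np.pi * 220.0 * t)  # 220 Hz test tone
#     f0 = get_f0_crepe_computation(x, 50.0, 1100.0, p_len=None, sr=sr)
#     # f0 now holds x.shape[0] // 128 frames of pitch values in Hz
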
def get_mangio_crepe_f0(x, f0_min, f0_max, p_len, sr, crepe_hop_length, model="full"):
| # print("Performing crepe pitch extraction. (EXPERIMENTAL)") | |
| # print("CREPE PITCH EXTRACTION HOP LENGTH: " + str(crepe_hop_length)) | |
    x = x.astype(np.float32)
    x /= np.quantile(np.abs(x), 0.999)
    torch_device_index = 0
    if torch.cuda.is_available():
        torch_device = torch.device(f"cuda:{torch_device_index % torch.cuda.device_count()}")
    elif torch.backends.mps.is_available():
        torch_device = torch.device("mps")
    else:
        torch_device = torch.device("cpu")
    audio = torch.from_numpy(x).to(torch_device, copy=True)
    audio = torch.unsqueeze(audio, dim=0)
    if audio.ndim == 2 and audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True).detach()
    audio = audio.detach()
    # Pitch prediction
    pitch: Tensor = torchcrepe.predict(
        audio,
        sr,
        crepe_hop_length,
        f0_min,
        f0_max,
        model,
        batch_size=crepe_hop_length * 2,
        device=torch_device,
        pad=True,
    )
    p_len = p_len or x.shape[0] // crepe_hop_length
    # Resize the pitch track to p_len frames, as in get_f0_crepe_computation
    source = pitch.squeeze(0).cpu().float().numpy()
    source[source < 0.001] = np.nan
    target = np.interp(
        np.arange(0, len(source) * p_len, len(source)) / p_len,
        np.arange(0, len(source)),
        source,
    )
    return np.nan_to_num(target)
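

# Hop-length trade-off (a minimal sketch, not part of the original module):
# lower hop lengths give finer-grained pitch frames at the cost of inference
# time. The 16 kHz signal x and the hop values below are illustrative
# assumptions.
#
#     f0_fast = get_mangio_crepe_f0(x, 50.0, 1100.0, None, 16000, 512)          # coarser, faster
#     f0_fine = get_mangio_crepe_f0(x, 50.0, 1100.0, None, 16000, 64, "tiny")   # finer, lighter model
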
def pitch_extract(f0_method, x, f0_min, f0_max, p_len, time_step, sr, window, crepe_hop_length, filter_radius=3):
    f0s = []
    f0 = np.zeros(p_len)
    for method in f0_method if isinstance(f0_method, list) else [f0_method]:
        if method == "pm":
            f0 = (
                parselmouth.Sound(x, sr)
                .to_pitch_ac(
                    time_step=time_step / 1000,
                    voicing_threshold=0.6,
                    pitch_floor=f0_min,
                    pitch_ceiling=f0_max,
                )
                .selected_array["frequency"]
            )
            pad_size = (p_len - len(f0) + 1) // 2
            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
                f0 = np.pad(
                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
                )
        elif method in ["harvest", "dio"]:
            if method == "harvest":
                f0, t = pyworld.harvest(
                    x.astype(np.double),
                    fs=sr,
                    f0_ceil=f0_max,
                    f0_floor=f0_min,
                    frame_period=10,
                )
            elif method == "dio":
                f0, t = pyworld.dio(
                    x.astype(np.double),
                    fs=sr,
                    f0_ceil=f0_max,
                    f0_floor=f0_min,
                    frame_period=10,
                )
            f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
        elif method == "torchcrepe":
            f0 = get_f0_crepe_computation(x, f0_min, f0_max, p_len, sr, crepe_hop_length)
        elif method == "torchcrepe tiny":
            f0 = get_f0_crepe_computation(x, f0_min, f0_max, p_len, sr, crepe_hop_length, "tiny")
        elif method == "mangio-crepe":
            f0 = get_mangio_crepe_f0(x, f0_min, f0_max, p_len, sr, crepe_hop_length)
        elif method == "mangio-crepe tiny":
            f0 = get_mangio_crepe_f0(x, f0_min, f0_max, p_len, sr, crepe_hop_length, "tiny")
        elif method == "rmvpe":
            rmvpe_model_path = os.path.join("data", "models", "rmvpe")
            rmvpe_model_file = os.path.join(rmvpe_model_path, "rmvpe.pt")
            if not os.path.isfile(rmvpe_model_file):
                import huggingface_hub
                rmvpe_model_file = huggingface_hub.hf_hub_download(
                    "lj1995/VoiceConversionWebUI",
                    "rmvpe.pt",
                    local_dir=rmvpe_model_path,
                    local_dir_use_symlinks=False,
                )
            from modules.voice_conversion.rvc.rmvpe import RMVPE
            print("loading rmvpe model")
            model_rmvpe = RMVPE(rmvpe_model_file, is_half=True, device=None)
            f0 = model_rmvpe.infer_from_audio(x, thred=0.03)
            del model_rmvpe
            torch.cuda.empty_cache()
            gc.collect()
        f0s.append(f0)
    if not f0s:
        f0s = [f0]
    f0s_new = []
    for f0_val in f0s:
        _len = f0_val.shape[0]
        if _len == p_len:
            f0s_new.append(f0_val)
            continue
        if _len > p_len:
            f0s_new.append(f0_val[:p_len])
            continue
        # _len < p_len: this estimate cannot be aligned to p_len frames, so drop it
        print("WARNING: f0 length is less than p_len, skipping this f0")
    if not f0s_new:
        # every estimate was shorter than p_len; fall back to silence
        f0s_new = [np.zeros(p_len)]
    f0 = np.nanmedian(np.stack(f0s_new, axis=0), axis=0)
    if filter_radius >= 2:
        f0 = signal.medfilt(f0, filter_radius)
    return f0
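

# Example usage (a minimal sketch, not part of the original module): median-
# ensembling two extractors over the same signal. The p_len, time_step, window,
# and hop length values below are illustrative assumptions; real callers derive
# them from their model's hop size.
#
#     f0 = pitch_extract(
#         ["harvest", "mangio-crepe"],
#         x, 50.0, 1100.0,
#         p_len=100, time_step=10, sr=16000, window=160,
#         crepe_hop_length=160,
#     )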