Spaces:
Runtime error
Runtime error
File size: 2,541 Bytes
607ecc1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
from functools import partial
from typing import Callable, Optional, Sequence, Union
import gin
import librosa
import numpy as np
import torch
import torchcrepe
from .upsampling import linear_interpolation
from ...utils import apply
# Window length, in samples, passed to the interpolation function when
# upsampling CREPE's frame-wise outputs back to the audio's sample count.
# NOTE(review): presumably mirrors torchcrepe's fixed analysis window size —
# confirm against the installed torchcrepe version.
CREPE_WINDOW_LENGTH = 1024
@gin.configurable
def extract_f0_with_crepe(
    audio: np.ndarray,
    sample_rate: float,
    hop_length: int = 128,
    minimum_frequency: float = 50.0,
    maximum_frequency: float = 2000.0,
    full_model: bool = True,
    batch_size: int = 2048,
    device: Union[str, torch.device] = "cpu",
    interpolate_fn: Optional[Callable] = linear_interpolation,
):
    """Estimate fundamental frequency and confidence of ``audio`` with CREPE.

    Args:
        audio: Mono audio signal. A leading batch/channel axis is added
            before the signal is handed to ``torchcrepe.predict``.
        sample_rate: Sample rate of ``audio``, forwarded to torchcrepe.
        hop_length: Hop between analysis frames, forwarded to torchcrepe.
        minimum_frequency: Lower bound of the pitch search range.
        maximum_frequency: Upper bound of the pitch search range.
        full_model: Use the ``"full"`` CREPE model if true, else ``"tiny"``.
        batch_size: Number of frames per inference batch.
        device: Device on which torchcrepe runs inference.
        interpolate_fn: Optional callable invoked as
            ``interpolate_fn(x, CREPE_WINDOW_LENGTH, hop_length,
            original_length=n_samples)`` on both outputs. Pass ``None`` (or
            any falsy value) to get the raw frame-wise outputs.

    Returns:
        A ``(f0, confidence)`` tuple. Without interpolation both are 1-D
        NumPy arrays with one entry per analysis frame; otherwise they are
        whatever ``interpolate_fn`` returns.
    """
    # Add the batch/channel dimension torchcrepe requires. The explicit
    # float32 cast matters: NumPy arrays default to float64, and
    # torch.tensor() would preserve that dtype, which does not match the
    # float32 weights of the CREPE network.
    batched_audio = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
    n_samples = batched_audio.shape[-1]

    f0, confidence = torchcrepe.predict(
        batched_audio,
        sample_rate,
        hop_length,
        minimum_frequency,
        maximum_frequency,
        "full" if full_model else "tiny",
        batch_size=batch_size,
        device=device,
        # Alternative decoder: torchcrepe.decode.weighted_argmax
        decoder=torchcrepe.decode.viterbi,
        # NOTE(review): newer torchcrepe releases appear to name this flag
        # `return_periodicity` — confirm against the pinned version.
        return_harmonicity=True,
    )

    # Drop only the batch axis: a bare squeeze() would also collapse a
    # single-frame result to a 0-d array. Move to CPU before converting so
    # the NumPy conversion is valid regardless of where the result lives.
    f0 = f0.squeeze(0).cpu().numpy()
    confidence = confidence.squeeze(0).cpu().numpy()

    if interpolate_fn:
        f0 = interpolate_fn(
            f0, CREPE_WINDOW_LENGTH, hop_length, original_length=n_samples
        )
        confidence = interpolate_fn(
            confidence,
            CREPE_WINDOW_LENGTH,
            hop_length,
            original_length=n_samples,
        )

    return f0, confidence
@gin.configurable
def extract_f0_with_pyin(
    audio: np.ndarray,
    sample_rate: float,
    minimum_frequency: float = 65.0,  # recommended minimum freq from librosa docs
    maximum_frequency: float = 2093.0,  # recommended maximum freq from librosa docs
    frame_length: int = 1024,
    hop_length: int = 128,
    fill_na: Optional[float] = None,
    interpolate_fn: Optional[Callable] = linear_interpolation,
):
    """Run ``librosa.pyin`` on ``audio`` and optionally interpolate the result.

    Args:
        audio: Audio signal handed to ``librosa.pyin``.
        sample_rate: Sample rate of ``audio`` (librosa's ``sr``).
        minimum_frequency: Lower pitch bound (librosa's ``fmin``).
        maximum_frequency: Upper pitch bound (librosa's ``fmax``).
        frame_length: Analysis frame length, forwarded to librosa and to
            ``interpolate_fn``.
        hop_length: Hop between frames, forwarded to librosa and to
            ``interpolate_fn``.
        fill_na: Forwarded unchanged as librosa's ``fill_na``.
        interpolate_fn: Optional callable invoked as
            ``interpolate_fn(x, frame_length, hop_length,
            original_length=audio.shape[-1])`` on both outputs. A falsy
            value skips interpolation.

    Returns:
        A ``(f0, voiced_prob)`` tuple; the middle value returned by
        ``librosa.pyin`` is discarded.
    """
    pitch_track, _, voicing_track = librosa.pyin(
        audio,
        sr=sample_rate,
        fmin=minimum_frequency,
        fmax=maximum_frequency,
        frame_length=frame_length,
        hop_length=hop_length,
        fill_na=fill_na,
    )

    # Guard clause: nothing more to do when no interpolation was requested.
    if not interpolate_fn:
        return pitch_track, voicing_track

    # Interpolate the pitch track first, then the voicing track, with
    # identical arguments.
    n_samples = audio.shape[-1]
    pitch_track, voicing_track = (
        interpolate_fn(track, frame_length, hop_length, original_length=n_samples)
        for track in (pitch_track, voicing_track)
    )
    return pitch_track, voicing_track
|