akhaliq3
spaces demo
607ecc1
raw
history blame
2.54 kB
from functools import partial
from typing import Callable, Optional, Sequence, Union
import gin
import librosa
import numpy as np
import torch
import torchcrepe
from .upsampling import linear_interpolation
from ...utils import apply
CREPE_WINDOW_LENGTH = 1024
@gin.configurable
def extract_f0_with_crepe(
audio: np.ndarray,
sample_rate: float,
hop_length: int = 128,
minimum_frequency: float = 50.0,
maximum_frequency: float = 2000.0,
full_model: bool = True,
batch_size: int = 2048,
device: Union[str, torch.device] = "cpu",
interpolate_fn: Optional[Callable] = linear_interpolation,
):
# convert to torch tensor with channel dimension (necessary for CREPE)
audio = torch.tensor(audio).unsqueeze(0)
f0, confidence = torchcrepe.predict(
audio,
sample_rate,
hop_length,
minimum_frequency,
maximum_frequency,
"full" if full_model else "tiny",
batch_size=batch_size,
device=device,
decoder=torchcrepe.decode.viterbi,
# decoder=torchcrepe.decode.weighted_argmax,
return_harmonicity=True,
)
f0, confidence = f0.squeeze().numpy(), confidence.squeeze().numpy()
if interpolate_fn:
f0 = interpolate_fn(
f0, CREPE_WINDOW_LENGTH, hop_length, original_length=audio.shape[-1]
)
confidence = interpolate_fn(
confidence,
CREPE_WINDOW_LENGTH,
hop_length,
original_length=audio.shape[-1],
)
return f0, confidence
@gin.configurable
def extract_f0_with_pyin(
audio: np.ndarray,
sample_rate: float,
minimum_frequency: float = 65.0, # recommended minimum freq from librosa docs
maximum_frequency: float = 2093.0, # recommended maximum freq from librosa docs
frame_length: int = 1024,
hop_length: int = 128,
fill_na: Optional[float] = None,
interpolate_fn: Optional[Callable] = linear_interpolation,
):
f0, _, voiced_prob = librosa.pyin(
audio,
sr=sample_rate,
fmin=minimum_frequency,
fmax=maximum_frequency,
frame_length=frame_length,
hop_length=hop_length,
fill_na=fill_na,
)
if interpolate_fn:
f0 = interpolate_fn(
f0, frame_length, hop_length, original_length=audio.shape[-1]
)
voiced_prob = interpolate_fn(
voiced_prob,
frame_length,
hop_length,
original_length=audio.shape[-1],
)
return f0, voiced_prob