Spaces:

AnhP
/

RVC-GUI

Running

File size: 13,754 Bytes

e4d8df5

import math

import numba as nb
import numpy as np

from scipy import signal
from scipy.interpolate import interp1d

def dio(x, fs, f0_floor=50, f0_ceil=1100, channels_in_octave=2, target_fs=4000, frame_period=10, allowed_range=0.1):
    """Estimate a fundamental-frequency (F0) contour from a waveform.

    Parameters
    ----------
    x : 1-D sequence of samples.
    fs : sampling rate of ``x`` in Hz.
    f0_floor, f0_ceil : lower / upper bound of the F0 search range in Hz.
    channels_in_octave : number of band-edge frequencies analysed per octave.
    target_fs : approximate sampling rate the signal is decimated to.
    frame_period : spacing of the analysis frames in milliseconds.
    allowed_range : maximum relative F0 jump tolerated by the contour fixer.

    Returns
    -------
    tuple of two ``np.float32`` arrays: the F0 value for every frame and the
    frame positions in seconds.
    """
    temporal_positions = np.arange(0, int(1000 * len(x) / fs / frame_period + 1)) * frame_period / 1000
    boundary_f0_list = f0_floor * (2.0 ** ((np.arange(math.ceil(np.log2(f0_ceil / f0_floor) * channels_in_octave)) + 1) / channels_in_octave))

    # FilterForDecimate only carries coefficients for ratios 2..12 and falls
    # back to an all-zero filter otherwise, so keep the ratio inside that
    # range and skip decimation entirely when no reduction is needed.
    decimation_ratio = min(max(int(fs / target_fs), 1), 12)
    if decimation_ratio > 1:
        y = decimate(x, decimation_ratio)
    else:
        y = np.array(x, dtype=np.float64)

    # The integer ratio rarely lands exactly on target_fs (22050 Hz / 5 is
    # 4410 Hz, not 4000 Hz); every later stage must use the rate the
    # decimated signal really has, otherwise all F0 values and event times
    # are scaled by target_fs / actual_fs.
    actual_fs = fs / decimation_ratio

    y_spectrum = get_spectrum(y, actual_fs, f0_floor)
    raw_f0_candidate, raw_stability = get_candidate_and_stability(np.size(temporal_positions), boundary_f0_list, np.size(y), temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil)

    f0 = fix_f0_contour(sort_candidates(raw_f0_candidate, raw_stability), frame_period, f0_floor, allowed_range)
    return np.array(f0, dtype=np.float32), np.array(temporal_positions, dtype=np.float32)

def get_downsampled_signal(x, fs, target_fs):
    """Return a zero-mean, possibly decimated copy of ``x`` and its sampling rate.

    When ``fs`` is below ``target_fs`` the samples are only copied; otherwise
    they are decimated by ``int(fs / target_fs + 0.5)`` with ``decimate_matlab``
    (filter order 3). The mean of the result is subtracted before returning.

    Parameters
    ----------
    x : 1-D sequence of samples; it is never modified.
    fs : sampling rate of ``x`` in Hz.
    target_fs : desired sampling rate in Hz.

    Returns
    -------
    tuple ``(y, actual_fs)``: the processed samples and the rate they are at.
    """
    decimation_ratio = int(fs / target_fs + 0.5)

    if fs < target_fs:
        source = np.asarray(x)
        # Integer input must be promoted to float here: an integer working
        # copy makes the in-place mean removal below fail with a casting error.
        keeps_dtype = np.issubdtype(source.dtype, np.inexact)
        y = source.astype(source.dtype if keeps_dtype else np.float64)
        actual_fs = fs
    else:
        y = decimate_matlab(x, decimation_ratio, n=3)
        actual_fs = fs / decimation_ratio

    y -= np.mean(y)
    return y, actual_fs

def get_spectrum(x, fs, lowest_f0):
    """Return the FFT of ``x`` multiplied by the FFT of a low-cut filter.

    The FFT length is the smallest power of two (as computed via
    ``math.log(..., 2)``) that holds ``x`` plus four times the rounded half
    period of ``lowest_f0``. The low-cut filter is a unit impulse minus a
    normalised Hann window whose half width is ``int(fs / 50 + 0.5)`` samples,
    rotated so that its centre tap sits at index 0.

    Parameters
    ----------
    x : 1-D array of samples.
    fs : sampling rate of ``x`` in Hz.
    lowest_f0 : lowest frequency of interest in Hz.

    Returns
    -------
    Complex array of length ``fft_size``.
    """
    padded_length = np.size(x) + int(fs / lowest_f0 / 2 + 0.5) * 4
    fft_size = 2 ** math.ceil(math.log(padded_length, 2))
    half_width = int(fs / 50 + 0.5)

    # Hann window without its two zero end points, turned into
    # "impulse minus smoothing kernel".
    smoothing = signal.windows.hann(2 * half_width + 3)[1:-1]
    kernel = -smoothing / np.sum(smoothing)
    kernel[half_width] = kernel[half_width] + 1

    # Zero-pad to the FFT length, then rotate left so the centre tap is first.
    kernel = np.concatenate((kernel, np.zeros(fft_size - len(kernel))))
    kernel = np.roll(kernel, -half_width)

    signal_spectrum = np.fft.fft(x, fft_size)
    kernel_spectrum = np.fft.fft(kernel, fft_size)
    return signal_spectrum * kernel_spectrum

def get_candidate_and_stability(number_of_frames, boundary_f0_list, y_length, temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil):
    """Build the per-channel F0 candidate map and its stability map.

    For every entry of ``boundary_f0_list`` one row is filled from
    ``get_raw_event``: the candidate row holds the returned F0 track and the
    stability row holds ``exp(-deviation / max(f0, 1e-7))``.

    Returns
    -------
    tuple ``(raw_f0_candidate, raw_f0_stability)``, both of shape
    ``(number of boundaries, number_of_frames)``.
    """
    map_shape = (np.size(boundary_f0_list), number_of_frames)
    raw_f0_candidate = np.zeros(map_shape)
    raw_f0_stability = np.zeros(map_shape)

    for channel in range(map_shape[0]):
        f0_track, deviation = get_raw_event(
            boundary_f0_list[channel], actual_fs, y_spectrum, y_length,
            temporal_positions, f0_floor, f0_ceil,
        )
        raw_f0_candidate[channel, :] = f0_track
        # The floor on the denominator keeps frames with f0 == 0 finite.
        raw_f0_stability[channel, :] = np.exp(-(deviation / np.maximum(f0_track, 0.0000001)))

    return raw_f0_candidate, raw_f0_stability

def sort_candidates(f0_candidate_map, stability_map):
    """Reorder the candidates of every frame by decreasing stability.

    Parameters
    ----------
    f0_candidate_map : array of shape (candidates, frames).
    stability_map : array of the same shape; larger means more reliable.

    Returns
    -------
    Float array of shape (candidates, frames) whose row 0 holds, for each
    frame, the candidate with the highest stability.
    """
    n_candidates, n_frames = f0_candidate_map.shape
    ranking = np.argsort(-stability_map, axis=0, kind='quicksort')

    ranked = np.zeros((n_candidates, n_frames))
    # Pair every ranked row index with its own column, gathering all frames
    # in one indexing operation.
    ranked[:, :] = f0_candidate_map[ranking[:n_candidates], np.arange(n_frames)]
    return ranked

def get_raw_event(boundary_f0, fs, y_spectrum, y_length, temporal_positions, f0_floor, f0_ceil):
    """Compute the F0 track and its deviation for one band edge.

    The spectrum is multiplied by the spectrum of a ``nuttall`` window whose
    length depends on ``fs / boundary_f0``. Four event streams are then
    extracted from the filtered signal and from its first difference, each
    with both signs, and merged by ``get_f0_candidates``. Candidates outside
    ``[boundary_f0 / 2, boundary_f0]`` or outside ``[f0_floor, f0_ceil]`` are
    set to 0, and their deviation is set to 100000.

    Returns
    -------
    tuple ``(f0_candidate, f0_deviations)`` sampled at ``temporal_positions``.
    """
    low_pass_filter = nuttall(int(fs / boundary_f0 / 2 + 0.5) * 4)
    filter_spectrum = np.fft.fft(low_pass_filter, len(y_spectrum))
    band_signal = np.real(np.fft.ifft(filter_spectrum * y_spectrum))

    # Keep y_length samples starting one sample after the window's peak.
    wanted = low_pass_filter.argmax() + np.arange(1, y_length + 1)
    band_signal = band_signal[wanted]
    slope = np.diff(band_signal)

    neg_loc, neg_f0 = ZeroCrossingEngine(band_signal, fs)
    pos_loc, pos_f0 = ZeroCrossingEngine(-band_signal, fs)
    peak_loc, peak_f0 = ZeroCrossingEngine(slope, fs)
    dip_loc, dip_f0 = ZeroCrossingEngine(-slope, fs)

    f0_candidate, f0_deviations = get_f0_candidates(
        neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0,
        temporal_positions,
    )

    rejected = (
        (f0_candidate > boundary_f0)
        | (f0_candidate < (boundary_f0 / 2))
        | (f0_candidate > f0_ceil)
        | (f0_candidate < f0_floor)
    )
    f0_candidate[rejected] = 0
    # Every zero candidate (rejected here or already zero) gets a large
    # deviation.
    f0_deviations[f0_candidate == 0] = 100000

    return f0_candidate, f0_deviations

def get_f0_candidates(neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions):
    """Merge four event-based F0 tracks into one track plus a spread measure.

    Each ``(*_loc, *_f0)`` pair is linearly interpolated (and extrapolated)
    onto ``temporal_positions``. The result is the mean of the four
    interpolated tracks and their sample standard deviation (``ddof=1``).
    If any stream has two or fewer events, an all-zero track with a deviation
    of 1000 is returned instead.

    Returns
    -------
    tuple ``(interpolated_f0, f0_deviations)``, one value per temporal position.
    """
    # NOTE(review): the last count is taken from dip_f0 while the other three
    # use the *_loc arrays; kept as is to preserve behaviour.
    event_counts = (np.size(neg_loc), np.size(pos_loc), np.size(peak_loc), np.size(dip_f0))
    usable_channel = 1
    for count in event_counts:
        usable_channel *= max(0, count - 2)

    if usable_channel <= 0:
        return temporal_positions * 0, temporal_positions * 0 + 1000

    streams = ((neg_loc, neg_f0), (pos_loc, pos_f0), (peak_loc, peak_f0), (dip_loc, dip_f0))
    per_stream_f0 = np.zeros((4, np.size(temporal_positions)))
    for row, (locations, values) in enumerate(streams):
        interpolator = interp1d(locations, values, fill_value='extrapolate')
        per_stream_f0[row, :] = interpolator(temporal_positions)

    return np.mean(per_stream_f0, axis=0), np.std(per_stream_f0, axis=0, ddof=1)

# Compiled by numba in nopython mode for the explicit signature
# (float64 1-D array, float64 scalar); the compiled code is cached on disk.
@nb.jit((nb.float64[:], nb.float64), nopython=True, cache=True)
def ZeroCrossingEngine(x, fs):
    """Locate negative-going zero crossings of ``x`` and derive interval data.

    Parameters
    ----------
    x : 1-D float64 array.
    fs : sampling rate used to scale positions and intervals
        (presumably Hz, so positions come out in seconds; confirm with caller).

    Returns
    -------
    tuple of two arrays with one entry per pair of consecutive crossings:
    the midpoint of the two crossing positions divided by ``fs``, and
    ``fs`` divided by the distance between them.
    """
    # y is x advanced by one sample; the last sample is repeated so the final
    # product y * x is a square and can never flag a crossing there.
    y = np.empty_like(x)
    y[:-1] = x[1:]
    y[-1] = x[-1]

    # 1-based sample index wherever the sign changes between a sample and its
    # successor AND the successor is smaller (negative-going); 0 elsewhere.
    negative_going_points = np.arange(1, len(x) + 1) * ((y * x < 0) * (y < x))
    edge_list = negative_going_points[negative_going_points > 0]
    # Refine each crossing by linear interpolation between the two samples
    # that straddle zero (x[edge - 1] and x[edge]).
    fine_edge_list = (edge_list) - x[edge_list - 1] / (x[edge_list] - x[edge_list - 1])

    return (fine_edge_list[:len(fine_edge_list) - 1] + fine_edge_list[1:]) / 2 / fs, fs / np.diff(fine_edge_list)

def nuttall(N):
    """Return an ``N``-point four-term cosine-sum window.

    The window is ``sum_k c[k] * cos(k * 2 * pi * n / (N - 1))`` for
    ``n = 0 .. N - 1`` with ``c = [0.355768, -0.487396, 0.144232, -0.012604]``.

    Parameters
    ----------
    N : number of window samples.

    Returns
    -------
    1-D float array of length ``N``. ``N == 0`` gives an empty array and
    ``N == 1`` gives ``[1.0]`` (the formula itself would divide 0 by 0).
    """
    if N == 1:
        return np.ones(1)

    coefficients = np.array([0.355768, -0.487396, 0.144232, -0.012604])
    phase = np.arange(N) * 2 * math.pi / (N - 1)
    # Plain ndarray broadcasting replaces np.matrix, which NumPy no longer
    # recommends and which emits PendingDeprecationWarning on construction.
    harmonics = np.cos(np.arange(4)[:, np.newaxis] * phase)
    return coefficients @ harmonics

def fix_f0_contour(f0_candidates, frame_period, f0_floor, allowed_range):
    """Turn the sorted candidate map into a single cleaned F0 contour.

    Runs ``fix_step1`` .. ``fix_step4`` in order; the voiced sections used by
    steps 3 and 4 are counted on the output of step 2.

    Parameters
    ----------
    f0_candidates : array of shape (candidates, frames), best candidate first.
    frame_period : frame spacing in milliseconds.
    f0_floor : lowest F0 in Hz; sets the minimum voiced-run length in frames.
    allowed_range : maximum relative F0 jump tolerated between frames.

    Returns
    -------
    A copy of the contour produced by step 4.
    """
    voice_range_minimum = int(1 / (frame_period / 1000) / f0_floor + 0.5) * 2 + 1

    after_step1 = fix_step1(f0_candidates, voice_range_minimum, allowed_range)
    after_step2 = fix_step2(after_step1, voice_range_minimum)
    section_list = count_voiced_sections(after_step2)
    after_step3 = fix_step3(after_step2, f0_candidates, section_list, allowed_range)
    after_step4 = fix_step4(after_step3, f0_candidates, section_list, allowed_range)

    return np.copy(after_step4)

def fix_step1(f0_candidates, voice_range_minimum, allowed_range):
    """Remove abrupt jumps from the best-candidate contour.

    The first row of ``f0_candidates`` is copied, its first and last
    ``voice_range_minimum`` frames are zeroed, and every frame from index
    ``voice_range_minimum - 1`` on is zeroed when its relative change against
    the previous frame (values rounded to six decimals) exceeds
    ``allowed_range``.

    Parameters
    ----------
    f0_candidates : array of shape (candidates, frames); it is not modified.
    voice_range_minimum : number of frames blanked at each end.
    allowed_range : maximum tolerated relative change between frames.

    Returns
    -------
    New 1-D array with the filtered contour.
    """
    # Work on a copy: a plain row view would zero the edge frames of the
    # caller's candidate matrix, which later steps still read.
    f0_base = np.copy(f0_candidates[0])
    f0_base[:voice_range_minimum] = 0
    f0_base[-voice_range_minimum:] = 0

    f0_step1 = np.copy(f0_base)
    rounded = np.array([float("{0:.6f}".format(value)) for value in f0_base])

    # The test only reads the rounded base contour, never the output being
    # built, so all frames can be evaluated at once.
    frames = np.arange(voice_range_minimum - 1, len(f0_base))
    relative_change = np.abs((rounded[frames] - rounded[frames - 1]) / (0.000001 + rounded[frames]))
    f0_step1[frames[relative_change > allowed_range]] = 0

    return f0_step1

def fix_step2(f0_step1, voice_range_minimum):
    """Zero every frame whose surrounding window contains an unvoiced frame.

    For each frame far enough from both ends, the window of
    ``voice_range_minimum`` frames centred on it is inspected in
    ``f0_step1``; if any value in the window is 0 the frame is set to 0.

    Returns
    -------
    New array; ``f0_step1`` itself is left untouched.
    """
    half_span = (voice_range_minimum - 1) / 2
    centres = np.arange(half_span, len(f0_step1) - half_span).astype(int)
    offsets = np.arange(-half_span, half_span + 1).astype(int)

    f0_step2 = np.copy(f0_step1)
    # One row per centre frame, one column per offset; gathered from the
    # unmodified copy before anything is written.
    windows = f0_step2[centres[:, np.newaxis] + offsets]
    touches_unvoiced = (windows == 0).any(axis=1)
    f0_step2[centres[touches_unvoiced]] = 0

    return f0_step2

def fix_step3(f0_step2, f0_candidates, section_list, allowed_range):
    """Extend every voiced section forward in time.

    Starting at the last frame of each section, the next frame is filled with
    the candidate chosen by ``select_best_f0``. Extension stops at the first
    frame for which that choice is 0, or at the limit: one frame past the
    start of the next section, or the last frame for the final section.

    Parameters
    ----------
    f0_step2 : contour to extend (not modified).
    f0_candidates : array of shape (candidates, frames).
    section_list : array of ``[start, end]`` frame indices per voiced section.
    allowed_range : tolerance forwarded to ``select_best_f0``.

    Returns
    -------
    New array with the extended contour.
    """
    f0_step3 = np.copy(f0_step2)
    section_count = section_list.shape[0]
    final_frame = len(f0_step3) - 1

    for section in range(section_count):
        if section == section_count - 1:
            limit = final_frame
        else:
            limit = section_list[section + 1, 0] + 1

        for frame in np.arange(section_list[section, 1], limit).astype(int):
            f0_step3[frame + 1] = select_best_f0(
                f0_step3[frame], f0_step3[frame - 1], f0_candidates[:, frame + 1], allowed_range
            )
            if f0_step3[frame + 1] == 0:
                break

    return f0_step3

def fix_step4(f0_step3, f0_candidates, section_list, allowed_range):
    """Extend every voiced section backward in time.

    Sections are visited from last to first. Starting at the first frame of a
    section, the previous frame is filled with the candidate chosen by
    ``select_best_f0``. Extension stops at the first frame for which that
    choice is 0, or at the limit: the end of the preceding section, or frame
    1 for the first section.

    Parameters
    ----------
    f0_step3 : contour to extend (not modified).
    f0_candidates : array of shape (candidates, frames).
    section_list : array of ``[start, end]`` frame indices per voiced section.
    allowed_range : tolerance forwarded to ``select_best_f0``.

    Returns
    -------
    New array with the extended contour.
    """
    f0_step4 = np.copy(f0_step3)

    for section in reversed(range(section_list.shape[0])):
        if section == 0:
            limit = 1
        else:
            limit = section_list[section - 1, 1]

        for frame in np.arange(section_list[section, 0], limit - 1, -1).astype(int):
            f0_step4[frame - 1] = select_best_f0(
                f0_step4[frame], f0_step4[frame + 1], f0_candidates[:, frame - 1], allowed_range
            )
            if f0_step4[frame - 1] == 0:
                break

    return f0_step4

def select_best_f0(current_f0, past_f0, candidates, allowed_range):
    """Pick the candidate closest to a linear extrapolation of the contour.

    The reference value is ``(3 * current_f0 - past_f0) / 2``. The first
    candidate with the smallest absolute distance to it is selected; if its
    relative deviation from the reference exceeds ``allowed_range`` the
    result is 0 instead.

    Parameters
    ----------
    current_f0 : F0 of the frame adjacent to the one being filled.
    past_f0 : F0 of the frame one step further away.
    candidates : non-empty 1-D sequence of candidate F0 values.
    allowed_range : maximum tolerated relative deviation.

    Returns
    -------
    The chosen candidate, or 0 when none is close enough.
    """
    import sys

    target = (current_f0 * 3 - past_f0) / 2

    winner = candidates[0]
    smallest_gap = abs(target - winner)
    for candidate in candidates[1:]:
        gap = abs(target - candidate)
        # Strict comparison: on ties the earlier candidate wins.
        if gap < smallest_gap:
            smallest_gap, winner = gap, candidate

    # The epsilon keeps the ratio defined when the target is exactly 0.
    if abs(1 - winner / (target + sys.float_info.epsilon)) > allowed_range:
        return 0
    return winner

def count_voiced_sections(f0):
    """List the voiced (non-zero) runs of an F0 contour.

    Parameters
    ----------
    f0 : 1-D array; zero marks an unvoiced frame.

    Returns
    -------
    Float array of shape (sections, 2). Each row is built from a consecutive
    pair of entries in the boundary list (index 0, every index where the
    voiced/unvoiced flag changes, and ``len(f0) - 2``): the first entry plus
    one, and the second entry.
    """
    voiced_flag = np.copy(f0)
    voiced_flag[voiced_flag != 0] = 1
    transitions = np.diff(voiced_flag)

    change_points = np.where(transitions != 0)[0]
    boundary_list = np.concatenate(([0], change_points, [len(voiced_flag) - 2]))

    # 1.0 when the first transition is voiced -> unvoiced (contour starts
    # voiced), otherwise -0.0.
    first_section = np.ceil(-0.5 * transitions[boundary_list[1]])
    skipped = 1 - first_section
    number_of_voiced_sections = np.floor((len(boundary_list) - skipped) / 2).astype(int)

    # Sections are consecutive boundary pairs, starting at index `skipped`.
    first_edge = int(skipped)
    edge_pairs = boundary_list[first_edge:first_edge + 2 * number_of_voiced_sections]
    voiced_section_list = edge_pairs.reshape(number_of_voiced_sections, 2).astype(float)
    voiced_section_list[:, 0] += 1

    return voiced_section_list

def decimate_matlab(x, q, n=None, axis=-1):
    """Low-pass filter ``x`` and keep every ``q``-th sample along ``axis``.

    A Chebyshev type I filter (0.05 dB ripple, cutoff ``0.8 / q``) is applied
    forwards and backwards with ``scipy.signal.filtfilt``. The first kept
    sample is chosen so that the last sample along ``axis`` is always kept.

    Parameters
    ----------
    x : array of samples.
    q : integer decimation factor.
    n : integer filter order; ``None`` selects order 8, the default that
        ``scipy.signal.decimate`` uses for IIR filters.
    axis : axis along which to filter and decimate.

    Returns
    -------
    The decimated array (a strided view of the filtered data).

    Raises
    ------
    TypeError
        If ``q`` or a given ``n`` is not an ``int``.
    """
    if not isinstance(q, int):
        raise TypeError("q must be an integer")
    if n is None:
        # Without a default the filter design call fails on None.
        n = 8
    elif not isinstance(n, int):
        raise TypeError("n must be an integer")

    system = signal.dlti(*signal.cheby1(n, 0.05, 0.8 / q))
    y = signal.filtfilt(system.num, system.den, x, axis=axis, padlen=3 * (max(len(system.den), len(system.num)) - 1))

    # Measure and slice along the filtered axis; using len(y) and a leading
    # slice is only correct for 1-D input.
    nd = y.shape[axis]
    first_kept = int(q - (q * np.ceil(nd / q) - nd)) - 1
    selector = [slice(None)] * y.ndim
    selector[axis] = slice(first_kept, None, q)
    return y[tuple(selector)]

# Recursive-filter coefficients per decimation ratio: ratio -> (a, b), where
# `a` holds the three feedback taps and `b` the two distinct feed-forward taps.
_DECIMATE_FILTER_COEFFICIENTS = {
    2: ((0.041156734567757189, -0.42599112459189636, 0.041037215479961225),
        (0.16797464681802227, 0.50392394045406674)),
    3: ((0.95039378983237421, -0.67429146741526791, 0.15412211621346475),
        (0.071221945171178636, 0.21366583551353591)),
    4: ((1.4499664446880227, -0.98943497080950582, 0.24578252340690215),
        (0.036710750339322612, 0.11013225101796784)),
    5: ((1.7610939654280557, -1.2554914843859768, 0.3237186507788215),
        (0.021334858522387423, 0.06400457556716227)),
    6: ((1.9715352749512141, -1.4686795689225347, 0.3893908434965701),
        (0.013469181309343825, 0.040407543928031475)),
    7: ((2.1225239019534703, -1.6395144861046302, 0.44469707800587366),
        (0.0090366882681608418, 0.027110064804482525)),
    8: ((2.2357462340187593, -1.7780899984041358, 0.49152555365968692),
        (0.0063522763407111993, 0.019056829022133598)),
    9: ((2.3236003491759578, -1.8921545617463598, 0.53148928133729068),
        (0.0046331164041389372, 0.013899349212416812)),
    10: ((2.3936475118069387, -1.9873904075111861, 0.5658879979027055),
         (0.0034818622251927556, 0.010445586675578267)),
    11: ((2.450743295230728, -2.06794904601978, 0.59574774438332101),
         (0.0026822508007163792, 0.0080467524021491377)),
    12: ((2.4981398605924205, -2.1368928194784025, 0.62187513816221485),
         (0.0021097275904709001, 0.0063291827714127002)),
}


def FilterForDecimate(x,r):
    """Run the fixed third-order recursive low-pass filter for ratio ``r``.

    Parameters
    ----------
    x : 1-D array of samples.
    r : decimation ratio selecting the coefficient set. Ratios outside
        2..12 select all-zero coefficients, so the output is all zeros.

    Returns
    -------
    Array shaped like ``x`` holding the filtered samples.
    """
    feedback, feedforward = _DECIMATE_FILTER_COEFFICIENTS.get(r, ((0.0, 0.0, 0.0), (0.0, 0.0)))
    a = np.array(feedback)
    b = np.array(feedforward)

    state = np.zeros(3)
    y_prime = np.zeros_like(x)

    for index, sample in enumerate(x):
        wt = sample + a[0] * state[0] + a[1] * state[1] + a[2] * state[2]
        # The feed-forward taps are symmetric: b0, b1, b1, b0.
        y_prime[index] = b[0] * wt + b[1] * state[0] + b[1] * state[1] + b[0] * state[2]
        # Shift the delay line by one sample.
        state[2], state[1], state[0] = state[1], state[0], wt

    return y_prime

def decimate(x,r):
    """Low-pass filter ``x`` forwards and backwards and keep every ``r``-th sample.

    The signal is extended by nine mirrored samples at each end, filtered
    with ``FilterForDecimate``, reversed, filtered again and reversed back,
    then sub-sampled with step ``r``.

    Parameters
    ----------
    x : 1-D sequence with at least ten samples.
    r : integer decimation ratio; ``FilterForDecimate`` only has coefficients
        for 2..12.

    Returns
    -------
    1-D float array with the decimated samples.
    """
    kNFact = 9
    x = np.asarray(x)
    x_length = len(x)
    padded_length = x_length + 2 * kNFact

    # Reflect the signal about its first and last sample to soften the
    # filter's start-up transient at both ends.
    mirror = np.arange(kNFact)
    tmp1 = np.zeros(padded_length)
    tmp1[:kNFact] = 2 * x[0] - x[kNFact - mirror]
    tmp1[kNFact:kNFact + x_length] = x
    tmp1[kNFact + x_length:] = 2 * x[-1] - x[x_length - 2 - mirror]

    # Forward pass, reverse, backward pass, reverse.
    for _ in range(2):
        tmp1 = FilterForDecimate(tmp1, r)[::-1]

    # First kept sample, using the same ceil(length / ratio) convention as
    # decimate_matlab. The previous ceil(x_length / r + 1) was one too large:
    # the output then started r samples early, inside the mirrored padding,
    # and for r >= 10 the first index could become negative and wrap around
    # to the end of the buffer.
    nbeg = int(r - r * np.ceil(x_length / r) + x_length)

    # nbeg >= 1, so the start index is non-negative and a plain strided slice
    # is safe; the stop excludes the very last padded sample as before.
    return np.array(tmp1[nbeg + kNFact - 1:padded_length - 1:r])