# AnhP's picture
# Upload 82 files
# e4d8df5 verified
# raw
# history blame
# 13.8 kB
import math
import numba as nb
import numpy as np
from scipy import signal
from scipy.interpolate import interp1d
def dio(x, fs, f0_floor=50, f0_ceil=1100, channels_in_octave=2, target_fs=4000, frame_period=10, allowed_range=0.1):
    """Estimate an F0 contour of ``x`` with the DIO zero-crossing method.

    Parameters
    ----------
    x : 1-D sequence of samples.
    fs : sampling rate of ``x``.
    f0_floor, f0_ceil : lowest / highest F0 accepted as a candidate.
    channels_in_octave : number of band-limiting filters per octave between
        ``f0_floor`` and ``f0_ceil``.
    target_fs : rate the signal is decimated towards before analysis.
    frame_period : frame shift; it is divided by 1000 to build the frame
        times, i.e. it is treated as milliseconds when ``fs`` is in Hz.
    allowed_range : relative F0 jump tolerated by the contour fixing steps.

    Returns
    -------
    (f0, temporal_positions) : two float32 arrays with one entry per frame.
    """
    number_of_frames = int(1000 * len(x) / fs / frame_period + 1)
    temporal_positions = np.arange(0, number_of_frames) * frame_period / 1000

    number_of_channels = math.ceil(np.log2(f0_ceil / f0_floor) * channels_in_octave)
    boundary_f0_list = f0_floor * (2.0 ** ((np.arange(number_of_channels) + 1) / channels_in_octave))

    # FilterForDecimate only has coefficients for ratios 2..12; any other
    # ratio would silently produce an all-zero signal, so clamp to 1..12 and
    # bypass the decimator entirely when no decimation is needed.
    decimation_ratio = max(min(int(fs / target_fs), 12), 1)
    if decimation_ratio == 1:
        y = np.array(x, dtype=np.float64)
    else:
        y = decimate(x, decimation_ratio)

    # The decimated signal runs at fs / ratio, which differs from target_fs
    # whenever fs is not an exact multiple of it (e.g. 22050 or 44100).
    # Using the real rate keeps the estimated F0 values unbiased.
    actual_fs = fs / decimation_ratio

    y_spectrum = get_spectrum(y, actual_fs, f0_floor)
    raw_f0_candidate, raw_stability = get_candidate_and_stability(
        np.size(temporal_positions), boundary_f0_list, np.size(y),
        temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil)

    f0_candidates = sort_candidates(raw_f0_candidate, raw_stability)
    f0 = fix_f0_contour(f0_candidates, frame_period, f0_floor, allowed_range)
    return np.array(f0, dtype=np.float32), np.array(temporal_positions, dtype=np.float32)
def get_downsampled_signal(x, fs, target_fs):
    """Return a zero-mean copy of ``x`` downsampled towards ``target_fs``.

    When ``fs`` is already below ``target_fs`` the signal is only copied;
    otherwise it is decimated by ``int(fs / target_fs + 0.5)`` using
    ``decimate_matlab`` with a 3rd-order filter.

    Returns
    -------
    (y, actual_fs) : the mean-removed signal and the rate it is sampled at.
        The input ``x`` is never modified.
    """
    decimation_ratio = int(fs / target_fs + 0.5)
    if fs < target_fs:
        y = np.array(x)  # independent copy, keeps the input untouched
        # Integer input cannot hold the mean-removed result (the in-place
        # subtraction used to raise), so promote it to float64 first.
        if not np.issubdtype(y.dtype, np.inexact):
            y = y.astype(np.float64)
        actual_fs = fs
    else:
        y = decimate_matlab(x, decimation_ratio, n=3)
        actual_fs = fs / decimation_ratio
    y = y - np.mean(y)
    return y, actual_fs
def get_spectrum(x, fs, lowest_f0):
    """Return the FFT of ``x`` multiplied by the FFT of a low-cut filter.

    The FFT size is the next power of two that holds ``x`` plus
    ``4 * int(fs / lowest_f0 / 2 + 0.5)`` extra samples.  The low-cut filter
    is a unit impulse minus a normalised Hann window, centred on sample 0
    (circularly), so its coefficients sum to zero.
    """
    margin = int(fs / lowest_f0 / 2 + 0.5) * 4
    fft_size = 2 ** math.ceil(math.log(np.size(x) + margin, 2))

    half_width = int(fs / 50 + 0.5)
    # Hann window without its two zero end points: 2 * half_width + 1 taps.
    smoother = signal.windows.hann(2 * half_width + 3)[1:-1]
    kernel = -smoother / np.sum(smoother)
    kernel[half_width] = kernel[half_width] + 1

    # Zero-pad to the FFT size, then rotate so the centre tap sits at index 0.
    padded = np.concatenate((kernel, np.zeros(fft_size - len(kernel))))
    centred = np.concatenate((padded[half_width:], padded[:half_width]))

    signal_spectrum = np.fft.fft(x, fft_size)
    filter_spectrum = np.fft.fft(centred, fft_size)
    return signal_spectrum * filter_spectrum
def get_candidate_and_stability(number_of_frames, boundary_f0_list, y_length, temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil):
    """Collect one F0 candidate track and one stability track per channel.

    For every entry of ``boundary_f0_list`` ``get_raw_event`` is called; its
    F0 track becomes a row of the candidate map and
    ``exp(-deviation / max(f0, 1e-7))`` becomes the matching row of the
    stability map.  Both maps have shape
    ``(len(boundary_f0_list), number_of_frames)``.
    """
    channel_count = np.size(boundary_f0_list)
    map_shape = (channel_count, number_of_frames)
    candidate_map = np.zeros(map_shape)
    stability_map = np.zeros(map_shape)

    for channel in range(channel_count):
        channel_f0, channel_deviation = get_raw_event(
            boundary_f0_list[channel], actual_fs, y_spectrum, y_length,
            temporal_positions, f0_floor, f0_ceil)
        candidate_map[channel, :] = channel_f0
        # The floor on the denominator avoids dividing by a zero F0.
        safe_f0 = np.maximum(channel_f0, 0.0000001)
        stability_map[channel, :] = np.exp(-(channel_deviation / safe_f0))

    return candidate_map, stability_map
def sort_candidates(f0_candidate_map, stability_map):
    """Reorder each frame's candidates by decreasing stability.

    Both inputs are ``(candidates, frames)`` maps.  The result is a float64
    array of the same shape whose column ``i`` holds the candidates of frame
    ``i`` sorted so that the one with the highest stability comes first.
    """
    candidate_count, frame_count = f0_candidate_map.shape
    # Negating turns argsort's ascending order into "most stable first".
    ranking = np.argsort(-stability_map, axis=0, kind='quicksort')[:candidate_count]
    frame_index = np.arange(frame_count)

    ordered = np.zeros((candidate_count, frame_count))
    ordered[:, :] = f0_candidate_map[ranking, frame_index]
    return ordered
def get_raw_event(boundary_f0, fs, y_spectrum, y_length, temporal_positions, f0_floor, f0_ceil):
    """Compute the F0 candidate track of one channel.

    The spectrum is multiplied by the spectrum of a Nuttall window whose
    length depends on ``boundary_f0``, the result is brought back to the time
    domain, and four event series (zero crossings of the signal, of its
    negation, of its difference and of the negated difference) are turned
    into an F0 track and a deviation track by ``get_f0_candidates``.

    Candidates outside ``[boundary_f0 / 2, boundary_f0]`` or outside
    ``[f0_floor, f0_ceil]`` are set to 0, and every zero candidate gets a
    deviation of 100000.
    """
    window = nuttall(int(fs / boundary_f0 / 2 + 0.5) * 4)
    window_spectrum = np.fft.fft(window, len(y_spectrum))
    band_limited = np.fft.ifft(window_spectrum * y_spectrum).real
    # Skip the filter delay (position of the window peak) and keep y_length samples.
    band_limited = band_limited[window.argmax() + np.arange(1, y_length + 1)]
    slope = np.diff(band_limited)

    neg_loc, neg_f0 = ZeroCrossingEngine(band_limited, fs)
    pos_loc, pos_f0 = ZeroCrossingEngine(-band_limited, fs)
    peak_loc, peak_f0 = ZeroCrossingEngine(slope, fs)
    dip_loc, dip_f0 = ZeroCrossingEngine(-slope, fs)

    f0_candidate, f0_deviations = get_f0_candidates(
        neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0,
        temporal_positions)

    out_of_channel = (f0_candidate > boundary_f0) | (f0_candidate < (boundary_f0 / 2))
    out_of_search_range = (f0_candidate > f0_ceil) | (f0_candidate < f0_floor)
    f0_candidate[out_of_channel | out_of_search_range] = 0
    f0_deviations[f0_candidate == 0] = 100000
    return f0_candidate, f0_deviations
def get_f0_candidates(neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions):
    """Merge four event-based F0 series into one track with a deviation.

    Each ``(*_loc, *_f0)`` pair is linearly interpolated (and extrapolated)
    onto ``temporal_positions``.  The result is the per-frame mean of the four
    interpolated series and their sample standard deviation (``ddof=1``).

    If any series has two or fewer points the channel is unusable: the F0
    track is all zeros and the deviation is 1000 everywhere.
    """
    # The fourth size is taken from dip_f0 (not dip_loc), as in the original.
    series_sizes = (np.size(neg_loc), np.size(pos_loc), np.size(peak_loc), np.size(dip_f0))
    if not all(size > 2 for size in series_sizes):
        return temporal_positions * 0, temporal_positions * 0 + 1000

    event_series = (
        (neg_loc, neg_f0),
        (pos_loc, pos_f0),
        (peak_loc, peak_f0),
        (dip_loc, dip_f0),
    )
    per_event_f0 = np.zeros((4, np.size(temporal_positions)))
    for row, (locations, values) in enumerate(event_series):
        interpolator = interp1d(locations, values, fill_value='extrapolate')
        per_event_f0[row, :] = interpolator(temporal_positions)

    return np.mean(per_event_f0, axis=0), np.std(per_event_f0, axis=0, ddof=1)
@nb.jit((nb.float64[:], nb.float64), nopython=True, cache=True)
def ZeroCrossingEngine(x, fs):
    """Locate negative-going zero crossings of ``x`` and the rate between them.

    Returns a pair of arrays with one entry per pair of consecutive
    crossings: the midpoint of the two crossing positions divided by ``fs``,
    and ``fs`` divided by the distance between them.  Both arrays are empty
    when fewer than two crossings are found.
    """
    # y is x advanced by one sample (y[i] == x[i + 1]); the last entry just
    # repeats x[-1], so the final sample can never be flagged as a crossing.
    y = np.empty_like(x)
    y[:-1] = x[1:]
    y[-1] = x[-1]
    # Keep the 1-based index of every sample whose successor has the opposite
    # sign (y * x < 0) and is smaller (y < x); all other entries become 0.
    negative_going_points = np.arange(1, len(x) + 1) * ((y * x < 0) * (y < x))
    edge_list = negative_going_points[negative_going_points > 0]
    # Refine each crossing by linear interpolation between the sample before
    # the sign change (x[edge - 1]) and the sample after it (x[edge]).
    fine_edge_list = (edge_list) - x[edge_list - 1] / (x[edge_list] - x[edge_list - 1])
    return (fine_edge_list[:len(fine_edge_list) - 1] + fine_edge_list[1:]) / 2 / fs, fs / np.diff(fine_edge_list)
def nuttall(N):
    """Return an ``N``-point four-term cosine-sum (Nuttall) window.

    The window is ``sum_k c[k] * cos(2 * pi * k * n / (N - 1))`` for
    ``n = 0 .. N - 1`` with the coefficients listed below.  The result is a
    plain 1-D ``ndarray`` (squeezed, so ``N == 1`` yields a 0-d array).
    """
    coefficients = np.array([0.355768, -0.487396, 0.144232, -0.012604])
    phase = np.arange(N) * 2 * math.pi / (N - 1)
    # Row k holds cos(k * phase).  Plain ndarrays replace the former
    # np.matrix / np.asmatrix round trip, which NumPy no longer recommends.
    harmonics = np.cos(np.outer(np.arange(4), phase))
    return np.squeeze(coefficients @ harmonics)
def fix_f0_contour(f0_candidates, frame_period, f0_floor, allowed_range):
    """Turn the sorted candidate map into a single F0 contour.

    Runs the four fixing steps in order: ``fix_step1`` and ``fix_step2``
    prune the best-candidate track, ``count_voiced_sections`` finds the
    remaining voiced runs, and ``fix_step3`` / ``fix_step4`` extend those runs
    forwards and backwards using the other candidates.  A copy of the final
    contour is returned.
    """
    # Window length in frames used by the first two steps; always odd.
    voice_range_minimum = int(1 / (frame_period / 1000) / f0_floor + 0.5) * 2 + 1

    pruned = fix_step1(f0_candidates, voice_range_minimum, allowed_range)
    pruned = fix_step2(pruned, voice_range_minimum)
    section_list = count_voiced_sections(pruned)

    extended = fix_step3(pruned, f0_candidates, section_list, allowed_range)
    extended = fix_step4(extended, f0_candidates, section_list, allowed_range)
    return np.copy(extended)
def fix_step1(f0_candidates, voice_range_minimum, allowed_range):
    """Drop frames of the best-candidate track that jump too much.

    The first row of ``f0_candidates`` is used as the base track.  Its first
    and last ``voice_range_minimum`` frames are set to 0, then every frame
    (from index ``voice_range_minimum - 1`` on) whose value, rounded to six
    decimals, differs from the previous frame by more than ``allowed_range``
    relative to itself is zeroed in the returned copy.

    NOTE: the edge zeroing is written into ``f0_candidates[0]`` itself, so the
    caller's candidate map is modified in place.
    """
    best_track = f0_candidates[0]  # a view: the two writes below reach the caller
    best_track[:voice_range_minimum] = 0
    best_track[-voice_range_minimum:] = 0

    cleaned = np.copy(best_track)
    rounded = np.array([float(f"{value:.6f}") for value in best_track])

    for frame in range(voice_range_minimum - 1, len(best_track)):
        # The small constant keeps the ratio finite when the frame is 0.
        relative_jump = abs((rounded[frame] - rounded[frame - 1]) / (0.000001 + rounded[frame]))
        if relative_jump > allowed_range:
            cleaned[frame] = 0
    return cleaned
def fix_step2(f0_step1, voice_range_minimum):
    """Zero every frame whose surrounding window contains an unvoiced frame.

    For each frame that has a full window of ``voice_range_minimum`` frames
    centred on it, the frame is set to 0 in the returned copy if any frame of
    ``f0_step1`` inside that window is 0.  Frames too close to either end to
    have a full window are copied unchanged.
    """
    source = np.asarray(f0_step1)
    half_window = (voice_range_minimum - 1) / 2
    centres = np.arange(half_window, len(source) - half_window).astype(int)
    offsets = np.arange(-half_window, half_window + 1).astype(int)

    result = np.copy(source)
    for centre in centres:
        if np.any(source[centre + offsets] == 0):
            result[centre] = 0
    return result
def fix_step3(f0_step2, f0_candidates, section_list, allowed_range):
    """Extend each voiced section forwards in time.

    Starting at the last frame of every section in ``section_list`` (column
    1), the next frame is filled with the candidate chosen by
    ``select_best_f0`` from the two preceding contour values.  Extension of a
    section stops at the first frame for which no candidate is accepted
    (value 0), or at the limit derived from the next section's start (the end
    of the contour for the last section).
    """
    contour = np.copy(f0_step2)
    section_count = section_list.shape[0]

    for section in range(section_count):
        if section == section_count - 1:
            stop = len(contour) - 1
        else:
            stop = section_list[section + 1, 0] + 1

        for j in np.arange(section_list[section, 1], stop).astype(int):
            contour[j + 1] = select_best_f0(
                contour[j], contour[j - 1], f0_candidates[:, j + 1], allowed_range)
            if contour[j + 1] == 0:
                break
    return contour
def fix_step4(f0_step3, f0_candidates, section_list, allowed_range):
    """Extend each voiced section backwards in time.

    Sections are visited from last to first.  Starting at the first frame of
    a section (column 0 of ``section_list``), the previous frame is filled
    with the candidate chosen by ``select_best_f0`` from the two following
    contour values.  Extension stops at the first rejected frame (value 0) or
    at the limit given by the previous section's end (frame 1 for the first
    section).
    """
    contour = np.copy(f0_step3)

    for section in reversed(range(section_list.shape[0])):
        if section == 0:
            stop = 1
        else:
            stop = section_list[section - 1, 1]

        for j in np.arange(section_list[section, 0], stop - 1, -1).astype(int):
            contour[j - 1] = select_best_f0(
                contour[j], contour[j + 1], f0_candidates[:, j - 1], allowed_range)
            if contour[j - 1] == 0:
                break
    return contour
def select_best_f0(current_f0, past_f0, candidates, allowed_range):
    """Pick the candidate closest to a linear prediction of the next F0.

    The prediction is ``(3 * current_f0 - past_f0) / 2``.  The candidate with
    the smallest absolute distance to it wins (the earliest one on ties).  If
    the winner deviates from the prediction by more than ``allowed_range``
    (relative), 0 is returned instead.
    """
    import sys

    predicted = (current_f0 * 3 - past_f0) / 2

    winner = candidates[0]
    smallest_gap = abs(predicted - winner)
    for candidate in candidates[1:]:
        gap = abs(predicted - candidate)
        if gap < smallest_gap:
            smallest_gap, winner = gap, candidate

    # Epsilon keeps the ratio defined when the prediction is exactly 0.
    relative_gap = abs(1 - winner / (predicted + sys.float_info.epsilon))
    return 0 if relative_gap > allowed_range else winner
def count_voiced_sections(f0):
    """List the voiced runs (non-zero stretches) of an F0 contour.

    Returns a float array of shape ``(sections, 2)``.  Each row holds the
    index pair derived from two consecutive voiced/unvoiced transition
    points: ``1 + boundary`` for the start and the following boundary for the
    end.  An all-unvoiced contour yields an empty ``(0, 2)`` array.  The
    input is not modified.
    """
    voiced_flag = np.copy(f0)
    voiced_flag[voiced_flag != 0] = 1
    transitions = np.diff(voiced_flag)

    change_points = np.where(transitions != 0)[0]
    boundary_list = np.append(np.append([0], change_points), [len(voiced_flag) - 2])

    # first_section is 1 when the first transition is voiced -> unvoiced
    # (the contour starts voiced), otherwise 0.
    first_section = np.ceil(-0.5 * transitions[boundary_list[1]])
    # When the contour starts unvoiced, the leading 0 boundary is skipped.
    skipped = int(1 - first_section)
    section_count = int(np.floor((len(boundary_list) - skipped) / 2))

    sections = np.zeros((section_count, 2))
    opening = skipped + 2 * np.arange(section_count)
    sections[:, 0] = 1 + boundary_list[opening]
    sections[:, 1] = boundary_list[opening + 1]
    return sections
def decimate_matlab(x, q, n=None, axis=-1):
    """Low-pass filter ``x`` and keep every ``q``-th sample along ``axis``.

    A Chebyshev type I filter (0.05 dB ripple, cutoff ``0.8 / q`` of Nyquist)
    is applied forwards and backwards with ``filtfilt``.  The kept samples are
    chosen so that the last sample of the filtered signal is always included.

    Parameters
    ----------
    x : array_like signal; must be longer than ``3 * n`` along ``axis``.
    q : int decimation factor.
    n : int filter order.  ``None`` selects order 8, the usual default of
        IIR decimators; previously ``None`` was passed on and crashed.
    axis : axis to filter and downsample along.

    Raises
    ------
    TypeError : if ``q`` or ``n`` is not an ``int``.
    """
    if not isinstance(q, int):
        raise TypeError("q must be an int")
    if n is not None and not isinstance(n, int):
        raise TypeError("n must be an int or None")
    if n is None:
        n = 8

    b, a = signal.cheby1(n, 0.05, 0.8 / q)
    padlen = 3 * (max(len(a), len(b)) - 1)
    y = signal.filtfilt(b, a, x, axis=axis, padlen=padlen)

    # Downsample along the same axis that was filtered (the length used to be
    # taken from axis 0 regardless of ``axis``).
    nd = y.shape[axis]
    start = int(q - (q * np.ceil(nd / q) - nd)) - 1
    selector = [slice(None)] * y.ndim
    selector[axis] = slice(start, None, q)
    return y[tuple(selector)]
def FilterForDecimate(x, r):
    """Run ``x`` through the fixed third-order IIR filter for ratio ``r``.

    Coefficients exist for ``r`` from 2 to 12.  For any other ``r`` all
    coefficients are 0, so the output is all zeros.  The result has the same
    shape and dtype as ``x`` (``np.zeros_like``).
    """
    # (ratio, feedback coefficients a[0..2], feed-forward coefficients b[0..1])
    coefficient_table = (
        (11, (2.450743295230728, -2.06794904601978, 0.59574774438332101),
         (0.0026822508007163792, 0.0080467524021491377)),
        (12, (2.4981398605924205, -2.1368928194784025, 0.62187513816221485),
         (0.0021097275904709001, 0.0063291827714127002)),
        (10, (2.3936475118069387, -1.9873904075111861, 0.5658879979027055),
         (0.0034818622251927556, 0.010445586675578267)),
        (9, (2.3236003491759578, -1.8921545617463598, 0.53148928133729068),
         (0.0046331164041389372, 0.013899349212416812)),
        (8, (2.2357462340187593, -1.7780899984041358, 0.49152555365968692),
         (0.0063522763407111993, 0.019056829022133598)),
        (7, (2.1225239019534703, -1.6395144861046302, 0.44469707800587366),
         (0.0090366882681608418, 0.027110064804482525)),
        (6, (1.9715352749512141, -1.4686795689225347, 0.3893908434965701),
         (0.013469181309343825, 0.040407543928031475)),
        (5, (1.7610939654280557, -1.2554914843859768, 0.3237186507788215),
         (0.021334858522387423, 0.06400457556716227)),
        (4, (1.4499664446880227, -0.98943497080950582, 0.24578252340690215),
         (0.036710750339322612, 0.11013225101796784)),
        (3, (0.95039378983237421, -0.67429146741526791, 0.15412211621346475),
         (0.071221945171178636, 0.21366583551353591)),
        (2, (0.041156734567757189, -0.42599112459189636, 0.041037215479961225),
         (0.16797464681802227, 0.50392394045406674)),
    )

    a, b = np.zeros(3), np.zeros(2)  # unsupported ratios keep these zeros
    for ratio, feedback, feed_forward in coefficient_table:
        if r == ratio:
            a[:] = feedback
            b[:] = feed_forward
            break

    # w holds the three most recent internal states, newest first.
    w = np.zeros(3)
    filtered = np.zeros_like(x)
    for index in range(len(x)):
        newest = x[index] + a[0] * w[0] + a[1] * w[1] + a[2] * w[2]
        filtered[index] = b[0] * newest + b[1] * w[0] + b[1] * w[1] + b[0] * w[2]
        w[2], w[1], w[0] = w[1], w[0], newest
    return filtered
def decimate(x, r):
    """Low-pass filter ``x`` forwards and backwards, then keep every ``r``-th sample.

    The signal is extended by 9 samples on each side (odd reflection about
    the first / last sample), filtered with ``FilterForDecimate`` in both
    directions, and then sampled with step ``r``.

    Parameters
    ----------
    x : 1-D sequence with at least 10 samples (shorter input raises
        ``IndexError`` while building the padding).
    r : int decimation ratio.

    Returns
    -------
    1-D float64 array of the retained samples.
    """
    kNFact = 9  # number of padding samples on each side
    x = np.asarray(x, dtype=np.float64)
    x_length = len(x)

    # Odd reflection: head uses x[9] .. x[1], tail uses x[-2] .. x[-10].
    # Index arrays (not slices) are used so that too-short input still raises.
    head = 2 * x[0] - x[np.arange(kNFact, 0, -1)]
    tail = 2 * x[-1] - x[x_length - 2 - np.arange(kNFact)]
    padded = np.concatenate((head, x, tail))

    # Forward pass, reverse, backward pass, reverse again.
    padded = FilterForDecimate(padded, r)[::-1]
    padded = FilterForDecimate(padded, r)[::-1]

    # First retained sample.  The earlier expression ceil(x_length / r + 1)
    # was one period too early: it always took a sample from the padding and,
    # for r >= 10, could produce a negative index that wrapped around to the
    # end of the buffer.
    nout = (x_length - 1) // r + 1
    nbeg = r - r * nout + x_length

    first = nbeg + kNFact - 1
    stop = x_length + 2 * kNFact - 1
    return np.array(padded[first:stop:r])