Spaces:

AnhP
/

RVC-GUI

Running

App Files Files Community

RVC-GUI / main /library /predictors /pyworld /dio.py

AnhP

Upload 82 files

e4d8df5 verified 4 months ago

raw

history blame

13.8 kB

	import math

	import numba as nb
	import numpy as np

	from scipy import signal
	from scipy.interpolate import interp1d

	def dio(x, fs, f0_floor=50, f0_ceil=1100, channels_in_octave=2, target_fs=4000, frame_period=10, allowed_range=0.1):
	    """Estimate an F0 contour from a waveform with the DIO method.

	    Args:
	        x: Sequence of waveform samples.
	        fs: Sampling rate of ``x`` in Hz.
	        f0_floor: Lowest F0 in Hz kept as a candidate.
	        f0_ceil: Highest F0 in Hz kept as a candidate.
	        channels_in_octave: Number of band boundaries generated per octave.
	        target_fs: Approximate rate in Hz the analysis is run at.
	        frame_period: Spacing between analysis frames in milliseconds.
	        allowed_range: Largest relative F0 jump accepted between frames.

	    Returns:
	        Tuple ``(f0, temporal_positions)``, both float32 arrays with one
	        entry per frame; ``temporal_positions`` is in seconds.
	    """
	    frame_count = int(1000 * len(x) / fs / frame_period + 1)
	    temporal_positions = np.arange(0, frame_count) * frame_period / 1000

	    channel_count = math.ceil(np.log2(f0_ceil / f0_floor) * channels_in_octave)
	    boundary_f0_list = f0_floor * (2.0 ** ((np.arange(channel_count) + 1) / channels_in_octave))

	    # The decimation filter only has coefficients for ratios 2..12; any
	    # other ratio would yield an all-zero signal, so clamp into that range
	    # and skip the decimator entirely when no rate reduction is needed.
	    decimation_ratio = min(max(int(fs / target_fs), 1), 12)
	    if decimation_ratio == 1:
	        y = np.array(x, dtype=np.float64)
	    else:
	        y = decimate(x, decimation_ratio)

	    # The integer ratio rarely lands exactly on target_fs (22050 / 5 is
	    # 4410, not 4000), so every later stage must use the rate the
	    # decimated signal really has or all F0 values come out scaled.
	    actual_fs = fs / decimation_ratio

	    y_spectrum = get_spectrum(y, actual_fs, f0_floor)
	    raw_f0_candidate, raw_stability = get_candidate_and_stability(
	        np.size(temporal_positions), boundary_f0_list, np.size(y), temporal_positions,
	        actual_fs, y_spectrum, f0_floor, f0_ceil)

	    f0 = fix_f0_contour(sort_candidates(raw_f0_candidate, raw_stability), frame_period, f0_floor, allowed_range)
	    return np.array(f0, dtype=np.float32), np.array(temporal_positions, dtype=np.float32)

	def get_downsampled_signal(x, fs, target_fs):
	    """Return a zero-mean copy of ``x`` resampled to roughly ``target_fs``.

	    Args:
	        x: Array-like of waveform samples.
	        fs: Sampling rate of ``x`` in Hz.
	        target_fs: Desired sampling rate in Hz.

	    Returns:
	        Tuple ``(y, actual_fs)``: the mean-removed signal and the sampling
	        rate it really has. When ``fs`` is already below ``target_fs`` the
	        signal is only copied and ``actual_fs`` equals ``fs``.
	    """
	    if fs < target_fs:
	        source = np.asarray(x)
	        # The mean is subtracted in place below, which fails on integer
	        # buffers; promote anything that is not already float/complex.
	        work_dtype = source.dtype if np.issubdtype(source.dtype, np.inexact) else np.float64
	        y = source.astype(work_dtype)  # astype copies, so x is left untouched
	        actual_fs = fs
	    else:
	        decimation_ratio = int(fs / target_fs + 0.5)
	        y = decimate_matlab(x, decimation_ratio, n=3)
	        actual_fs = fs / decimation_ratio

	    y -= np.mean(y)
	    return y, actual_fs

	def get_spectrum(x, fs, lowest_f0):
	    """Return the zero-padded spectrum of ``x`` with a low-cut filter applied.

	    Args:
	        x: Signal samples.
	        fs: Sampling rate of ``x`` in Hz.
	        lowest_f0: Lowest F0 in Hz; sets how much zero padding is reserved.

	    Returns:
	        Complex array of length ``fft_size`` (a power of two): the FFT of
	        ``x`` multiplied by the FFT of a zero-phase low-cut kernel.
	    """
	    required_length = np.size(x) + int(fs / lowest_f0 / 2 + 0.5) * 4
	    # Next power of two in pure integer arithmetic; ceil(log(n, 2)) depends
	    # on floating-point rounding and can overshoot on exact powers of two.
	    fft_size = 1 << (max(int(required_length), 1) - 1).bit_length()

	    cutoff_in_sample = int(fs / 50 + 0.5)

	    # Unit impulse minus a normalised Hann smoother: the taps sum to zero,
	    # so the DC component is removed.
	    low_cut_filter = signal.windows.hann(2 * cutoff_in_sample + 3)[1:-1]
	    low_cut_filter = -low_cut_filter / np.sum(low_cut_filter)
	    low_cut_filter[cutoff_in_sample] += 1

	    # Zero-pad to the FFT length, then rotate so the centre tap sits at
	    # index 0 and the kernel introduces no delay.
	    padded_filter = np.zeros(fft_size)
	    padded_filter[:len(low_cut_filter)] = low_cut_filter
	    padded_filter = np.roll(padded_filter, -cutoff_in_sample)

	    return np.fft.fft(x, fft_size) * np.fft.fft(padded_filter, fft_size)

	def get_candidate_and_stability(number_of_frames, boundary_f0_list, y_length, temporal_positions, actual_fs, y_spectrum, f0_floor, f0_ceil):
	    """Collect one F0 track and one stability track per band boundary.

	    Each entry of ``boundary_f0_list`` is analysed with ``get_raw_event``.
	    The stability score is ``exp(-deviation / f0)``, with the F0 floored at
	    1e-7 so unvoiced frames (F0 of zero) do not divide by zero.

	    Returns:
	        Tuple ``(raw_f0_candidate, raw_f0_stability)``, both of shape
	        ``(number of boundaries, number_of_frames)``.
	    """
	    channel_count = np.size(boundary_f0_list)
	    grid_shape = (channel_count, number_of_frames)
	    raw_f0_candidate = np.zeros(grid_shape)
	    raw_f0_stability = np.zeros(grid_shape)

	    for channel in range(channel_count):
	        channel_f0, channel_deviation = get_raw_event(
	            boundary_f0_list[channel], actual_fs, y_spectrum, y_length,
	            temporal_positions, f0_floor, f0_ceil)
	        safe_f0 = np.maximum(channel_f0, 0.0000001)
	        raw_f0_candidate[channel, :] = channel_f0
	        raw_f0_stability[channel, :] = np.exp(-(channel_deviation / safe_f0))

	    return raw_f0_candidate, raw_f0_stability

	def sort_candidates(f0_candidate_map, stability_map):
	    """Reorder the candidates of every frame by decreasing stability.

	    Args:
	        f0_candidate_map: Array of shape (candidates, frames).
	        stability_map: Stability scores, same layout as the candidates.

	    Returns:
	        Float array of shape (candidates, frames); row 0 holds the most
	        stable candidate of each frame.
	    """
	    candidate_count, frame_count = f0_candidate_map.shape
	    # Negating the scores turns the ascending argsort into a descending one.
	    ranking = np.argsort(-stability_map, axis=0, kind='quicksort')[:candidate_count]

	    ordered = np.zeros((candidate_count, frame_count))
	    ordered[...] = np.take_along_axis(f0_candidate_map, ranking, axis=0)
	    return ordered

	def get_raw_event(boundary_f0, fs, y_spectrum, y_length, temporal_positions, f0_floor, f0_ceil):
	    """Derive an F0 track and its deviation for a single band boundary.

	    The signal spectrum is multiplied by the spectrum of a Nuttall window
	    whose length is tied to ``boundary_f0``; intervals between zero
	    crossings, peaks and dips of the filtered signal are then turned into
	    F0 estimates at ``temporal_positions``.

	    Returns:
	        Tuple ``(f0_candidate, f0_deviations)``. Candidates outside
	        ``[boundary_f0 / 2, boundary_f0]`` or ``[f0_floor, f0_ceil]`` are set
	        to 0 and their deviation to 100000.
	    """
	    half_period = int(fs / boundary_f0 / 2 + 0.5)
	    low_pass_filter = nuttall(half_period * 4)

	    filter_spectrum = np.fft.fft(low_pass_filter, len(y_spectrum))
	    filtered_signal = np.real(np.fft.ifft(filter_spectrum * y_spectrum))
	    # Skip past the filter's peak position so the output lines up with the input.
	    delay = low_pass_filter.argmax()
	    filtered_signal = filtered_signal[delay + np.arange(1, y_length + 1)]
	    slope = np.diff(filtered_signal)

	    neg_loc, neg_f0 = ZeroCrossingEngine(filtered_signal, fs)
	    pos_loc, pos_f0 = ZeroCrossingEngine(-filtered_signal, fs)
	    peak_loc, peak_f0 = ZeroCrossingEngine(slope, fs)
	    dip_loc, dip_f0 = ZeroCrossingEngine(-slope, fs)

	    f0_candidate, f0_deviations = get_f0_candidates(
	        neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions)

	    out_of_band = (f0_candidate > boundary_f0) | (f0_candidate < (boundary_f0 / 2))
	    out_of_range = (f0_candidate > f0_ceil) | (f0_candidate < f0_floor)
	    f0_candidate[out_of_band | out_of_range] = 0
	    f0_deviations[f0_candidate == 0] = 100000

	    return f0_candidate, f0_deviations

	def get_f0_candidates(neg_loc, neg_f0, pos_loc, pos_f0, peak_loc, peak_f0, dip_loc, dip_f0, temporal_positions):
	    """Merge four interval-based F0 tracks into a mean track and a spread.

	    Each ``*_loc`` / ``*_f0`` pair is linearly interpolated (with
	    extrapolation) onto ``temporal_positions``.

	    Returns:
	        Tuple ``(interpolated_f0, f0_deviations)``: the mean of the four
	        tracks and their sample standard deviation (``ddof=1``). If any
	        track has fewer than three points, the result is all zeros with a
	        deviation of 1000.
	    """
	    # The last entry is sized from dip_f0 rather than dip_loc, exactly as
	    # the original expression does.
	    spare_points = [max(0, np.size(events) - 2) for events in (neg_loc, pos_loc, peak_loc, dip_f0)]
	    if 0 in spare_points:
	        return temporal_positions * 0, temporal_positions * 0 + 1000

	    tracks = ((neg_loc, neg_f0), (pos_loc, pos_f0), (peak_loc, peak_f0), (dip_loc, dip_f0))
	    interpolated_f0_list = np.zeros((4, np.size(temporal_positions)))
	    for row, (locations, values) in enumerate(tracks):
	        interpolated_f0_list[row, :] = interp1d(locations, values, fill_value='extrapolate')(temporal_positions)

	    interpolated_f0 = np.mean(interpolated_f0_list, axis=0)
	    f0_deviations = np.std(interpolated_f0_list, axis=0, ddof=1)
	    return interpolated_f0, f0_deviations

	@nb.jit((nb.float64[:], nb.float64), nopython=True, cache=True)
	def ZeroCrossingEngine(x, fs):
	    """Locate negative-going zero crossings of ``x`` and the F0 they imply.

	    Args:
	        x: float64 signal samples.
	        fs: Sampling rate in Hz.

	    Returns:
	        Tuple ``(locations, f0)``: the midpoint of each pair of consecutive
	        crossings in seconds, and ``fs`` divided by the distance between
	        them in samples.
	    """
	    # successor[i] is x[i + 1]; the last sample is paired with itself so it
	    # can never be reported as a crossing.
	    successor = np.empty_like(x)
	    successor[:-1] = x[1:]
	    successor[-1] = x[-1]

	    # A sign change combined with a falling value marks a crossing; the
	    # multiplication leaves the 1-based position where one occurs and 0
	    # everywhere else.
	    falls_through_zero = (successor * x < 0) * (successor < x)
	    marked_positions = np.arange(1, len(x) + 1) * falls_through_zero
	    edge_list = marked_positions[marked_positions > 0]

	    # Linear interpolation between the two samples around each crossing.
	    before = x[edge_list - 1]
	    fine_edge_list = edge_list - before / (x[edge_list] - before)

	    interval_locations = (fine_edge_list[:-1] + fine_edge_list[1:]) / 2 / fs
	    return interval_locations, fs / np.diff(fine_edge_list)

	def nuttall(N):
	    """Return an ``N``-point four-term cosine-sum (Nuttall) window.

	    The window is ``sum_k c_k * cos(2 * pi * k * n / (N - 1))`` for
	    ``n = 0 .. N - 1`` with coefficients 0.355768, -0.487396, 0.144232 and
	    -0.012604.

	    Args:
	        N: Number of window samples.

	    Returns:
	        1-D float array of length ``N``. ``N == 1`` yields ``[1.0]`` rather
	        than dividing by zero.
	    """
	    if N == 1:
	        # N - 1 would be zero below; a one-sample window is a single one,
	        # the same convention SciPy's window functions use.
	        return np.ones(1)

	    coefficients = np.array([0.355768, -0.487396, 0.144232, -0.012604])
	    phase = np.arange(N) * 2 * math.pi / (N - 1)
	    # Plain ndarrays instead of the discouraged np.matrix type: row k of
	    # the outer product holds the k-th harmonic of the phase ramp.
	    harmonics = np.cos(np.outer(np.arange(4), phase))
	    return coefficients @ harmonics

	def fix_f0_contour(f0_candidates, frame_period, f0_floor, allowed_range):
	    """Turn ranked F0 candidates into a single cleaned-up contour.

	    Runs the four correction passes in order: ``fix_step1`` and
	    ``fix_step2`` prune the top-ranked contour, then ``fix_step3`` and
	    ``fix_step4`` revisit the voiced sections found by
	    ``count_voiced_sections`` using the full candidate set.

	    Args:
	        f0_candidates: Array of shape (candidates, frames), best first.
	        frame_period: Frame spacing in milliseconds.
	        f0_floor: Lowest allowed F0 in Hz.
	        allowed_range: Largest relative F0 jump accepted between frames.

	    Returns:
	        A fresh 1-D array with one F0 value per frame.
	    """
	    # Number of frames covered by one period of the lowest allowed F0.
	    frames_per_floor_period = int(1 / (frame_period / 1000) / f0_floor + 0.5)
	    voice_range_minimum = frames_per_floor_period * 2 + 1

	    f0_step1 = fix_step1(f0_candidates, voice_range_minimum, allowed_range)
	    f0_step2 = fix_step2(f0_step1, voice_range_minimum)
	    section_list = count_voiced_sections(f0_step2)
	    f0_step3 = fix_step3(f0_step2, f0_candidates, section_list, allowed_range)
	    f0_step4 = fix_step4(f0_step3, f0_candidates, section_list, allowed_range)

	    return np.copy(f0_step4)

	def fix_step1(f0_candidates, voice_range_minimum, allowed_range):
	    """Remove abrupt F0 jumps from the top-ranked contour.

	    Args:
	        f0_candidates: Array of shape (candidates, frames); only row 0 is
	            read and the array is left unmodified.
	        voice_range_minimum: Number of frames forced to 0 at each end.
	        allowed_range: Largest accepted relative change between a frame
	            and the frame before it.

	    Returns:
	        New 1-D array: row 0 with both edges zeroed and every frame whose
	        relative change exceeds ``allowed_range`` set to 0.
	    """
	    # Work on a copy: row 0 is a view, and zeroing it in place would also
	    # alter the candidate matrix that the later refinement steps still use.
	    f0_base = np.copy(f0_candidates[0])
	    f0_base[:voice_range_minimum] = 0
	    f0_base[-voice_range_minimum:] = 0

	    f0_step1 = np.copy(f0_base)
	    # Jumps are measured on values rounded to six decimals.
	    rounded = np.array([float("{0:.6f}".format(value)) for value in f0_base])

	    frames = np.arange(voice_range_minimum - 1, len(f0_base))
	    relative_jump = np.abs((rounded[frames] - rounded[frames - 1]) / (0.000001 + rounded[frames]))
	    f0_step1[frames[relative_jump > allowed_range]] = 0

	    return f0_step1

	def fix_step2(f0_step1, voice_range_minimum):
	    """Zero every frame that has an unvoiced frame within its window.

	    A frame is cleared when any frame within ``(voice_range_minimum - 1) / 2``
	    positions of it (itself included) is 0 in ``f0_step1``. Frames closer
	    to either end than that half-width are copied unchanged.

	    Returns:
	        New array; ``f0_step1`` is not modified.
	    """
	    half_width = (voice_range_minimum - 1) / 2
	    centres = np.arange(half_width, len(f0_step1) - half_width).astype(int)
	    offsets = np.arange(-half_width, half_width + 1).astype(int)

	    f0_step2 = np.copy(f0_step1)
	    for centre in centres:
	        # any() stops at the first zero, like the early break it replaces.
	        if any(f0_step1[centre + shift] == 0 for shift in offsets):
	            f0_step2[centre] = 0

	    return f0_step2

	def fix_step3(f0_step2, f0_candidates, section_list, allowed_range):
	    """Extend each voiced section forward in time.

	    Starting at the end frame of every section (column 1 of
	    ``section_list``), the following frame is filled with the candidate
	    chosen by ``select_best_f0``; the extension of a section stops as soon
	    as that choice is 0.

	    Returns:
	        New array; ``f0_step2`` is not modified.
	    """
	    f0_step3 = np.empty_like(f0_step2)
	    np.copyto(f0_step3, f0_step2)

	    section_count = section_list.shape[0]
	    final_frame = len(f0_step3) - 1

	    for section in np.arange(section_count):
	        if section == section_count - 1:
	            stop = final_frame
	        else:
	            stop = section_list[section + 1, 0] + 1

	        for frame in np.arange(section_list[section, 1], stop).astype(int):
	            following = frame + 1
	            f0_step3[following] = select_best_f0(
	                f0_step3[frame], f0_step3[frame - 1], f0_candidates[:, following], allowed_range)
	            if f0_step3[following] == 0:
	                break

	    return f0_step3

	def fix_step4(f0_step3, f0_candidates, section_list, allowed_range):
	    """Extend each voiced section backward in time.

	    Sections are visited from last to first. Starting at the first frame
	    of a section (column 0 of ``section_list``), the preceding frame is
	    filled with the candidate chosen by ``select_best_f0``; the extension
	    stops as soon as that choice is 0.

	    Returns:
	        New array; ``f0_step3`` is not modified.
	    """
	    f0_step4 = np.copy(f0_step3)

	    for section in reversed(range(section_list.shape[0])):
	        if section == 0:
	            limit = 1
	        else:
	            limit = section_list[section - 1, 1]

	        for frame in np.arange(section_list[section, 0], limit - 1, -1).astype(int):
	            preceding = frame - 1
	            f0_step4[preceding] = select_best_f0(
	                f0_step4[frame], f0_step4[frame + 1], f0_candidates[:, preceding], allowed_range)
	            if f0_step4[preceding] == 0:
	                break

	    return f0_step4

	def select_best_f0(current_f0, past_f0, candidates, allowed_range):
	    """Pick the candidate closest to a linear prediction of the next F0.

	    The prediction is ``(3 * current_f0 - past_f0) / 2``. On ties the
	    earliest candidate wins.

	    Returns:
	        The chosen candidate, or 0 when it differs from the prediction by
	        more than ``allowed_range`` in relative terms.
	    """
	    import sys

	    reference_f0 = (current_f0 * 3 - past_f0) / 2

	    best_f0 = candidates[0]
	    smallest_gap = abs(reference_f0 - best_f0)
	    for candidate in candidates[1:]:
	        gap = abs(reference_f0 - candidate)
	        if gap < smallest_gap:
	            smallest_gap, best_f0 = gap, candidate

	    # Machine epsilon in the denominator guards against a prediction of exactly 0.
	    relative_error = abs(1 - best_f0 / (reference_f0 + sys.float_info.epsilon))
	    if relative_error > allowed_range:
	        return 0
	    return best_f0

	def count_voiced_sections(f0):
	    """List the frame ranges where the contour is non-zero.

	    Args:
	        f0: 1-D F0 contour; a value of 0 marks an unvoiced frame.

	    Returns:
	        Float array of shape (sections, 2); each row holds the first and
	        last frame index of one voiced run.
	    """
	    voiced_flag = np.copy(f0)
	    voiced_flag[voiced_flag != 0] = 1
	    transitions = np.diff(voiced_flag)

	    # Indices where voicing flips, bracketed by 0 and the last diff index.
	    change_points = np.where(transitions != 0)[0]
	    boundary_list = np.concatenate(([0], change_points, [len(voiced_flag) - 2]))

	    # 1 when the first transition is voiced -> unvoiced (difference of -1),
	    # otherwise 0.
	    first_section = np.ceil(-0.5 * transitions[boundary_list[1]])
	    shift = 1 - first_section
	    number_of_voiced_sections = np.floor((len(boundary_list) - shift) / 2).astype(int)

	    voiced_section_list = np.zeros((number_of_voiced_sections, 2))
	    for section in range(number_of_voiced_sections):
	        opening = int((section - 1) * 2 + 1 + shift) + 1
	        closing = int((section * 2) + shift) + 1
	        voiced_section_list[section, 0] = 1 + boundary_list[opening]
	        voiced_section_list[section, 1] = boundary_list[closing]

	    return voiced_section_list

	def decimate_matlab(x, q, n=None, axis=-1):
	    """Down-sample ``x`` by ``q`` after zero-phase Chebyshev low-pass filtering.

	    Args:
	        x: Array-like signal.
	        q: Integer down-sampling factor.
	        n: Order of the Chebyshev type I filter. ``None`` selects 8, the
	            default order ``scipy.signal.decimate`` uses for its IIR filter.
	        axis: Axis of ``x`` along which to filter and down-sample.

	    Returns:
	        Every ``q``-th sample of the filtered signal along ``axis``, with
	        the phase chosen so that the last sample is always kept.

	    Raises:
	        TypeError: If ``q`` or ``n`` is not an integer.
	    """
	    if not isinstance(q, (int, np.integer)):
	        raise TypeError("q must be an integer")
	    if n is None:
	        # Previously None was handed straight to cheby1, which cannot
	        # design a filter without an order.
	        n = 8
	    elif not isinstance(n, (int, np.integer)):
	        raise TypeError("n must be an integer")

	    system = signal.dlti(*signal.cheby1(n, 0.05, 0.8 / q))
	    pad_length = 3 * (max(len(system.den), len(system.num)) - 1)
	    y = signal.filtfilt(system.num, system.den, x, axis=axis, padlen=pad_length)

	    # Slice along the filtered axis (not always axis 0) and start at the
	    # offset that makes the final sample part of the output.
	    axis_length = y.shape[axis]
	    first_kept = (axis_length - 1) % q
	    selector = [slice(None)] * y.ndim
	    selector[axis] = slice(first_kept, None, q)
	    return y[tuple(selector)]

	def FilterForDecimate(x,r):
	    """Run the fixed third-order recursive low-pass filter for ratio ``r``.

	    Args:
	        x: Input samples.
	        r: Decimation ratio. Coefficients exist for 2 to 12; any other
	            value selects all-zero coefficients, so the output is all zeros.

	    Returns:
	        Array shaped like ``x`` holding the filtered samples.
	    """
	    # ratio -> ((a0, a1, a2), (b0, b1)): feedback and feed-forward terms.
	    coefficient_table = {
	        2: ((0.041156734567757189, -0.42599112459189636, 0.041037215479961225),
	            (0.16797464681802227, 0.50392394045406674)),
	        3: ((0.95039378983237421, -0.67429146741526791, 0.15412211621346475),
	            (0.071221945171178636, 0.21366583551353591)),
	        4: ((1.4499664446880227, -0.98943497080950582, 0.24578252340690215),
	            (0.036710750339322612, 0.11013225101796784)),
	        5: ((1.7610939654280557, -1.2554914843859768, 0.3237186507788215),
	            (0.021334858522387423, 0.06400457556716227)),
	        6: ((1.9715352749512141, -1.4686795689225347, 0.3893908434965701),
	            (0.013469181309343825, 0.040407543928031475)),
	        7: ((2.1225239019534703, -1.6395144861046302, 0.44469707800587366),
	            (0.0090366882681608418, 0.027110064804482525)),
	        8: ((2.2357462340187593, -1.7780899984041358, 0.49152555365968692),
	            (0.0063522763407111993, 0.019056829022133598)),
	        9: ((2.3236003491759578, -1.8921545617463598, 0.53148928133729068),
	            (0.0046331164041389372, 0.013899349212416812)),
	        10: ((2.3936475118069387, -1.9873904075111861, 0.5658879979027055),
	             (0.0034818622251927556, 0.010445586675578267)),
	        11: ((2.450743295230728, -2.06794904601978, 0.59574774438332101),
	             (0.0026822508007163792, 0.0080467524021491377)),
	        12: ((2.4981398605924205, -2.1368928194784025, 0.62187513816221485),
	             (0.0021097275904709001, 0.0063291827714127002)),
	    }
	    feedback, feed_forward = coefficient_table.get(r, ((0.0, 0.0, 0.0), (0.0, 0.0)))
	    a = np.array(feedback)
	    b = np.array(feed_forward)

	    # state[0] is the newest internal value, state[2] the oldest.
	    state = np.zeros(3)
	    y_prime = np.zeros_like(x)

	    for index, sample in enumerate(x):
	        newest = sample + a[0] * state[0] + a[1] * state[1] + a[2] * state[2]
	        y_prime[index] = b[0] * newest + b[1] * state[0] + b[1] * state[1] + b[0] * state[2]
	        state[2], state[1], state[0] = state[1], state[0], newest

	    return y_prime

	def decimate(x,r):
	    """Low-pass filter ``x`` forward and backward, then keep every ``r``-th sample.

	    Args:
	        x: 1-D sequence of samples; must hold at least 10 values because 9
	            mirrored samples are taken from each end for padding.
	        r: Integer decimation ratio, passed on to ``FilterForDecimate``.

	    Returns:
	        1-D float64 array with the decimated signal.

	    Raises:
	        IndexError: If ``x`` is too short to build the edge padding.
	    """
	    kNFact = 9
	    samples = np.asarray(x)
	    x_length = len(samples)
	    if x_length <= kNFact:
	        raise IndexError("decimate needs at least %d samples, got %d" % (kNFact + 1, x_length))

	    # Pad both ends with the neighbouring samples mirrored through the edge
	    # value, so the recursive filter does not start from an abrupt step.
	    head = 2 * samples[0] - samples[1:kNFact + 1][::-1]
	    tail = 2 * samples[-1] - samples[::-1][1:kNFact + 1]
	    padded = np.concatenate((head, samples, tail)).astype(np.float64)

	    # Filter, reverse, filter again, reverse back: the second pass cancels
	    # the delay introduced by the first.
	    filtered = FilterForDecimate(padded, r)[::-1]
	    filtered = FilterForDecimate(filtered, r)[::-1]

	    nbeg = int(r - r * np.ceil(x_length / r + 1) + x_length)
	    picks = np.arange(nbeg, x_length + kNFact, r, dtype=int) + (kNFact - 1)
	    # nbeg can be as low as 1 - r, so for ratios of 10 or more the first
	    # pick may be negative; unclamped it would wrap around and read a
	    # sample from the end of the buffer.
	    return filtered[np.maximum(picks, 0)]