Spaces:
Running
Running
File size: 4,551 Bytes
0e1dba2 1fd74cb 0e1dba2 1fd74cb 0fb6a95 0e1dba2 0fb6a95 e9f94b1 0e1dba2 e9f94b1 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb 0e1dba2 1fd74cb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import os
import numpy as np
from scipy.io import wavfile
from scipy import signal
import resampy
from hparams import hparams as hp
def load_wav(path, sr):
    """
    Read a WAV file and return its samples as float32 at the requested rate.

    Args:
        path: Location of the WAV file; handed directly to ``wavfile.read``.
        sr: Desired sample rate in Hz. When the file's own rate differs, the
            audio is resampled with ``resampy``.

    Returns:
        A float32 array laid out as ``wavfile.read`` produced it (samples on
        axis 0). Signed integer PCM is divided by the dtype's maximum value,
        unsigned PCM is re-centred around zero and scaled, and any other dtype
        is only cast to float32.

    Raises:
        ValueError: If the file contains fewer than 100 samples.
    """
    orig_sr, audio = wavfile.read(path)
    if len(audio) < 100:  # Arbitrary threshold (can be higher for safety)
        raise ValueError(f"Input audio too short: {len(audio)} samples")

    kind = audio.dtype.kind
    if kind == 'i':
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    elif kind == 'u':
        # Unsigned PCM (8-bit WAV) rests at mid-scale, not at zero. Without
        # re-centring, the samples would come back as raw 0..255 floats.
        midpoint = (np.iinfo(audio.dtype).max + 1) / 2
        audio = (audio.astype(np.float32) - midpoint) / midpoint
    else:
        audio = audio.astype(np.float32)

    if orig_sr != sr:
        # wavfile.read puts time on axis 0; resampy defaults to the last axis,
        # which would be the channel axis for multi-channel files.
        audio = resampy.resample(audio, orig_sr, sr, axis=0)
    return audio
def save_wav(wav, path, sr):
    """
    Write a floating-point waveform to ``path`` as a 16-bit PCM WAV file.

    Samples are multiplied by 32767, limited to [-32767, 32767] and cast to
    int16 before being handed to ``wavfile.write`` together with ``sr``.
    """
    scaled = wav * 32767
    limited = scaled.clip(-32767, 32767)
    pcm = limited.astype(np.int16)
    wavfile.write(path, sr, pcm)
def preemphasis(wav, k, preemphasize=True):
    """
    Apply the FIR filter ``y[n] = x[n] - k * x[n-1]`` to ``wav``.

    When ``preemphasize`` is falsy the input is returned untouched.
    """
    if not preemphasize:
        return wav
    numerator = [1, -k]
    denominator = [1]
    return signal.lfilter(numerator, denominator, wav)
def inv_preemphasis(wav, k, inv_preemphasize=True):
    """
    Apply the IIR filter ``y[n] = x[n] + k * y[n-1]`` to ``wav``.

    This is ``lfilter`` with the numerator and denominator of ``preemphasis``
    swapped. When ``inv_preemphasize`` is falsy the input is returned untouched.
    """
    if not inv_preemphasize:
        return wav
    numerator = [1]
    denominator = [1, -k]
    return signal.lfilter(numerator, denominator, wav)
def get_hop_size():
    """
    Return the STFT hop size in samples.

    ``hp.hop_size`` is used when it is set; otherwise the value is derived
    from ``hp.frame_shift_ms`` (milliseconds) and ``hp.sample_rate``.
    """
    configured = hp.hop_size
    if configured is not None:
        return configured
    assert hp.frame_shift_ms is not None
    return int(hp.frame_shift_ms / 1000 * hp.sample_rate)
def linearspectrogram(wav):
    """
    Compute a dB-scaled magnitude spectrogram of ``wav``.

    The waveform is pre-emphasised, passed through ``_stft``, converted to dB
    and offset by ``hp.ref_level_db``. The result goes through ``_normalize``
    only when ``hp.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    magnitude = np.abs(_stft(emphasized))
    level_db = _amp_to_db(magnitude) - hp.ref_level_db
    if hp.signal_normalization:
        return _normalize(level_db)
    return level_db
def melspectrogram(wav):
    """
    Compute a dB-scaled mel spectrogram of ``wav``.

    The waveform is pre-emphasised, passed through ``_stft``, projected with
    ``_linear_to_mel``, converted to dB and offset by ``hp.ref_level_db``. The
    result goes through ``_normalize`` only when ``hp.signal_normalization``
    is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    mel_magnitude = _linear_to_mel(np.abs(_stft(emphasized)))
    level_db = _amp_to_db(mel_magnitude) - hp.ref_level_db
    if hp.signal_normalization:
        return _normalize(level_db)
    return level_db
def _lws_processor():
    """
    Build an ``lws`` processor from the hyperparameters.

    ``lws`` is imported here rather than at module level, so it is only
    needed when this function is actually called.
    """
    import lws
    frame_size = hp.n_fft
    hop = get_hop_size()
    processor = lws.lws(frame_size, hop, fftsize=hp.win_size, mode="speech")
    return processor
def _stft(y):
    """
    Short-time Fourier transform of ``y``.

    Uses ``librosa.stft`` unless ``hp.use_lws`` is set, in which case the
    transposed output of the ``lws`` processor is returned instead.
    """
    if not hp.use_lws:
        import librosa  # Safe to import inside function
        return librosa.stft(
            y=y,
            n_fft=hp.n_fft,
            hop_length=get_hop_size(),
            win_length=hp.win_size,
        )
    processor = _lws_processor()
    return processor.stft(y).T
def num_frames(length, fsize, fshift):
    """
    Count the frames produced for a signal of ``length`` samples.

    The signal is treated as padded by ``fsize - fshift`` on both sides. One
    extra frame is counted when ``length`` is not a multiple of ``fshift``.
    """
    pad = fsize - fshift
    whole_shifts = (length + pad * 2 - fsize) // fshift
    extra = 1 if length % fshift == 0 else 2
    return whole_shifts + extra
def pad_lr(x, fsize, fshift):
    """
    Return the ``(left, right)`` padding amounts for ``x``.

    The left pad is ``fsize - fshift``. The right pad is the same amount plus
    whatever is needed for the frame count from ``num_frames`` to fit.
    """
    frame_count = num_frames(len(x), fsize, fshift)
    left = fsize - fshift
    padded_len = len(x) + 2 * left
    remainder = (frame_count - 1) * fshift + fsize - padded_len
    return left, left + remainder
def librosa_pad_lr(x, fsize, fshift):
    """
    Return ``(0, right)`` padding amounts for ``x``.

    ``right`` is the distance from ``x.shape[0]`` up to the next multiple of
    ``fshift`` strictly greater than it. ``fsize`` is accepted but unused.
    """
    n_samples = x.shape[0]
    padded_len = (n_samples // fshift + 1) * fshift
    return 0, padded_len - n_samples
# Mel filterbank matrix, built on first use by _linear_to_mel.
_mel_basis = None


def _linear_to_mel(spectrogram):
    """
    Project ``spectrogram`` with the mel filterbank.

    The filterbank is built by ``_build_mel_basis`` on the first call and
    kept in the module-level ``_mel_basis`` for later calls.
    """
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _mel_basis = _build_mel_basis()
    return np.dot(basis, spectrogram)
def _build_mel_basis():
    """
    Build the mel filterbank from the hyperparameters via ``librosa``.

    Asserts that ``hp.fmax`` does not exceed half of ``hp.sample_rate``.
    """
    from librosa.filters import mel as build_filterbank  # Imported only when needed
    assert hp.fmax <= hp.sample_rate // 2
    filterbank = build_filterbank(
        sr=hp.sample_rate,
        n_fft=hp.n_fft,
        n_mels=hp.num_mels,
        fmin=hp.fmin,
        fmax=hp.fmax,
    )
    return filterbank
def _amp_to_db(x):
    """
    Convert amplitudes to decibels.

    Values are floored at the amplitude corresponding to ``hp.min_level_db``
    before taking the logarithm.
    """
    floor_amplitude = np.exp(hp.min_level_db / 20 * np.log(10))
    floored = np.maximum(floor_amplitude, x)
    return 20 * np.log10(floored)
def _db_to_amp(x):
    """
    Convert decibels to amplitudes, i.e. ``10 ** (x / 20)``.
    """
    exponent = x * 0.05
    return np.power(10.0, exponent)
def _normalize(S):
    """
    Rescale a dB spectrogram using ``hp.min_level_db`` and ``hp.max_abs_value``.

    With ``hp.symmetric_mels`` the mapping is onto
    ``[-hp.max_abs_value, hp.max_abs_value]``, otherwise onto
    ``[0, hp.max_abs_value]``. When ``hp.allow_clipping_in_normalization`` is
    set, the result is clipped to that range. Otherwise the input is asserted
    to lie within ``[hp.min_level_db, 0]`` and no clipping is done.
    """
    clipping = hp.allow_clipping_in_normalization
    if not clipping:
        assert S.max() <= 0 and S.min() - hp.min_level_db >= 0

    # Fraction of the way from min_level_db (0.0) up to 0 dB (1.0).
    fraction = (S - hp.min_level_db) / (-hp.min_level_db)

    if hp.symmetric_mels:
        scaled = (2 * hp.max_abs_value) * fraction - hp.max_abs_value
        lower = -hp.max_abs_value
    else:
        scaled = hp.max_abs_value * fraction
        lower = 0

    if clipping:
        return np.clip(scaled, lower, hp.max_abs_value)
    return scaled
def _denormalize(D):
    """
    Undo ``_normalize``: map normalized values back to a dB spectrogram.

    When ``hp.allow_clipping_in_normalization`` is set, the input is first
    clipped to the normalized range: ``[-hp.max_abs_value, hp.max_abs_value]``
    with ``hp.symmetric_mels``, otherwise ``[0, hp.max_abs_value]``.
    """
    symmetric = hp.symmetric_mels
    values = D
    if hp.allow_clipping_in_normalization:
        lower = -hp.max_abs_value if symmetric else 0
        values = np.clip(values, lower, hp.max_abs_value)

    if symmetric:
        shifted = values + hp.max_abs_value
        return (shifted * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db
    return (values * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db
|