|
import glob
|
|
import librosa
|
|
import tqdm
|
|
import numpy as np
|
|
import torchaudio
|
|
import torch
|
|
|
|
|
|
import warnings
|
|
|
|
# Install a global "ignore" filter with no category or module restriction, so
# warnings are suppressed process-wide from the moment this module is imported.
warnings.filterwarnings("ignore")
|
|
|
|
import concurrent.futures
|
|
import glob
|
|
import os
|
|
import librosa
|
|
import numpy as np
|
|
import onnxruntime as ort
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
|
|
# Sample rate (Hz) that audio is loaded at / resampled to before scoring.
SAMPLING_RATE = 16000
# Duration, in seconds, of each analysis window fed to the models
# (int(INPUT_LENGTH * SAMPLING_RATE) samples per window).
INPUT_LENGTH = 9.01
|
|
|
|
|
|
class DNSMOSComputer:
    """Compute DNSMOS-style quality scores for an audio clip.

    Two ONNX models are run over sliding windows of the clip: a primary model
    that yields three raw scores (reported as SIG, BAK and OVRL) and a second
    model fed with a mel spectrogram (reported as P808_MOS). Per-window scores
    are averaged over the clip.
    """

    def __init__(
        self, primary_model_path, p808_model_path, device="cuda", device_id=0
    ) -> None:
        """Load both ONNX sessions and build the mel-spectrogram transform.

        Args:
            primary_model_path: Path to the ONNX model producing the three
                raw scores.
            p808_model_path: Path to the ONNX model producing the P808 score.
            device: ``"cpu"`` runs everything on the CPU; any other value
                (default ``"cuda"``) selects the CUDA execution provider.
            device_id: CUDA device index used by both ONNX sessions and by the
                mel-spectrogram transform. Ignored when ``device`` is "cpu".
        """
        if str(device).lower() == "cpu":
            providers = ["CPUExecutionProvider"]
            provider_options = [{}]
            self.device = torch.device("cpu")
        else:
            providers = ["CUDAExecutionProvider"]
            provider_options = [{"device_id": device_id}]
            self.device = torch.device(f"cuda:{device_id}")

        # Pass the provider options at construction time so the sessions are
        # created directly on the requested device instead of being created
        # on the default device first and then rebuilt.
        self.onnx_sess = ort.InferenceSession(
            primary_model_path,
            providers=providers,
            provider_options=provider_options,
        )
        self.p808_onnx_sess = ort.InferenceSession(
            p808_model_path,
            providers=providers,
            provider_options=provider_options,
        )

        mel_kwargs = {
            "sample_rate": 16000,
            "hop_length": 160,
            "n_fft": 320 + 1,
            "n_mels": 120,
            "mel_scale": "slaney",
        }
        self.mel_transform = torchaudio.transforms.MelSpectrogram(**mel_kwargs).to(
            self.device
        )

    def audio_melspec(
        self, audio, n_mels=120, frame_size=320, hop_length=160, sr=16000, to_db=True
    ):
        """Return the transposed mel spectrogram of ``audio``.

        Args:
            audio: Waveform samples (anything ``torch.as_tensor`` accepts).
            n_mels, frame_size, hop_length, sr: Kept for interface
                compatibility only; they are not read. The spectrogram
                settings are those fixed in ``__init__``.
            to_db: When true, convert power to dB relative to the maximum and
                rescale with ``(dB + 40) / 40``.

        Returns:
            The transposed spectrogram: a NumPy array when ``to_db`` is true,
            otherwise a CPU torch tensor.
        """
        # Move the waveform to the same device as the transform; a bare
        # ``.cuda()`` would target the default GPU and break for device_id != 0.
        waveform = torch.as_tensor(audio, dtype=torch.float32).to(self.device)
        mel_spec = self.mel_transform(waveform).cpu()
        if to_db:
            mel_spec = (librosa.power_to_db(mel_spec.numpy(), ref=np.max) + 40) / 40
        return mel_spec.T

    def get_polyfit_val(self, sig, bak, ovr, is_personalized_MOS):
        """Map raw model scores through the fitted calibration polynomials.

        Args:
            sig, bak, ovr: Raw scores from the primary model.
            is_personalized_MOS: Selects the cubic (personalized) coefficient
                set instead of the quadratic default set.

        Returns:
            Tuple ``(sig_poly, bak_poly, ovr_poly)`` of calibrated scores.
        """
        if is_personalized_MOS:
            p_ovr = np.poly1d([-0.00533021, 0.005101, 1.18058466, -0.11236046])
            p_sig = np.poly1d([-0.01019296, 0.02751166, 1.19576786, -0.24348726])
            p_bak = np.poly1d([-0.04976499, 0.44276479, -0.1644611, 0.96883132])
        else:
            p_ovr = np.poly1d([-0.06766283, 1.11546468, 0.04602535])
            p_sig = np.poly1d([-0.08397278, 1.22083953, 0.0052439])
            p_bak = np.poly1d([-0.13166888, 1.60915514, -0.39604546])

        return p_sig(sig), p_bak(bak), p_ovr(ovr)

    def compute(self, audio, sampling_rate, is_personalized_MOS=False):
        """Score a clip and return the per-clip averages.

        Args:
            audio: Either a path (``str`` or ``os.PathLike``) loaded with
                librosa at ``SAMPLING_RATE``, or an in-memory sequence of
                samples.
            sampling_rate: Sample rate of an in-memory ``audio``; it is
                resampled when this differs from ``SAMPLING_RATE``. Not used
                when ``audio`` is a path.
            is_personalized_MOS: Forwarded to ``get_polyfit_val``.

        Returns:
            Dict with keys ``filename``, ``len_in_sec``, ``sr``, ``num_hops``,
            ``OVRL_raw``, ``SIG_raw``, ``BAK_raw``, ``OVRL``, ``SIG``, ``BAK``
            and ``P808_MOS``.

        Raises:
            ValueError: If the audio contains no samples.
        """
        from_file = isinstance(audio, (str, os.PathLike))
        # Reject empty input up front: the padding loop below doubles the
        # signal until it is long enough and would never end on zero samples.
        if not from_file and len(audio) == 0:
            raise ValueError("audio is empty; cannot compute DNSMOS scores")

        fs = SAMPLING_RATE
        if from_file:
            audio, _ = librosa.load(audio, sr=fs)
        elif sampling_rate != fs:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=fs)
        if len(audio) == 0:
            raise ValueError("audio is empty; cannot compute DNSMOS scores")

        actual_audio_len = len(audio)
        len_samples = int(INPUT_LENGTH * fs)
        # Clips shorter than one window are repeated (doubled) until they fit.
        while len(audio) < len_samples:
            audio = np.append(audio, audio)

        num_hops = int(np.floor(len(audio) / fs) - INPUT_LENGTH) + 1
        hop_len_samples = fs  # windows advance by one second of samples

        predicted_mos_sig_seg_raw = []
        predicted_mos_bak_seg_raw = []
        predicted_mos_ovr_seg_raw = []
        predicted_mos_sig_seg = []
        predicted_mos_bak_seg = []
        predicted_mos_ovr_seg = []
        predicted_p808_mos = []

        for idx in range(num_hops):
            start = int(idx * hop_len_samples)
            stop = int((idx + INPUT_LENGTH) * hop_len_samples)
            audio_seg = audio[start:stop]
            # Skip any window that came out shorter than a full window.
            if len(audio_seg) < len_samples:
                continue

            input_features = np.array(audio_seg).astype("float32")[np.newaxis, :]
            # The spectrogram model sees the window minus its last 160 samples.
            p808_input_features = np.array(
                self.audio_melspec(audio=audio_seg[:-160])
            ).astype("float32")[np.newaxis, :, :]

            p808_mos = self.p808_onnx_sess.run(
                None, {"input_1": p808_input_features}
            )[0][0][0]
            mos_sig_raw, mos_bak_raw, mos_ovr_raw = self.onnx_sess.run(
                None, {"input_1": input_features}
            )[0][0]
            mos_sig, mos_bak, mos_ovr = self.get_polyfit_val(
                mos_sig_raw, mos_bak_raw, mos_ovr_raw, is_personalized_MOS
            )

            predicted_mos_sig_seg_raw.append(mos_sig_raw)
            predicted_mos_bak_seg_raw.append(mos_bak_raw)
            predicted_mos_ovr_seg_raw.append(mos_ovr_raw)
            predicted_mos_sig_seg.append(mos_sig)
            predicted_mos_bak_seg.append(mos_bak)
            predicted_mos_ovr_seg.append(mos_ovr)
            predicted_p808_mos.append(p808_mos)

        return {
            "filename": "audio_clip",
            "len_in_sec": actual_audio_len / fs,
            "sr": fs,
            "num_hops": num_hops,
            "OVRL_raw": np.mean(predicted_mos_ovr_seg_raw),
            "SIG_raw": np.mean(predicted_mos_sig_seg_raw),
            "BAK_raw": np.mean(predicted_mos_bak_seg_raw),
            "OVRL": np.mean(predicted_mos_ovr_seg),
            "SIG": np.mean(predicted_mos_sig_seg),
            "BAK": np.mean(predicted_mos_bak_seg),
            "P808_MOS": np.mean(predicted_p808_mos),
        }
|
|
|