# Listing metadata: file size 5,162 bytes, revision c113ea5.
from datasets import load_dataset
import librosa
import IPython.display as ipd
from IPython.display import Audio, display
import random
from concurrent.futures import ProcessPoolExecutor
import numpy as np
import json
# Load the dataset only when this file is executed as a script.
# With the "spawn"/"forkserver" start methods every ProcessPoolExecutor worker
# re-imports the main module; an unguarded load here would be repeated in each
# worker process (and on any plain `import` of this file).
if __name__ == '__main__':
    ds0 = load_dataset('espnet/yodas', 'ja000')
    print("finished loading ja000")
def wada_snr(wav):
    """Blindly estimate the SNR (in dB) of a speech signal with WADA-SNR.

    Paper on WADA SNR:
        http://www.cs.cmu.edu/~robust/Papers/KimSternIS08.pdf
    Adapted from this MATLAB code:
        https://labrosa.ee.columbia.edu/projects/snreval/#9

    Args:
        wav: Array-like of audio samples (any numeric dtype). The input is
            not modified.

    Returns:
        numpy.float64: The estimated SNR in dB, limited to the range of the
        lookup table, i.e. [-20, 100]. All-zero input yields the table floor
        (-20) instead of NaN.

    Raises:
        ValueError: If ``wav`` is empty (NumPy cannot take the maximum of a
            zero-size array).
    """
    eps = 1e-10

    # Lookup table: g_vals[i] is the value of log(E[|z|]) - E[log|z|] that
    # corresponds to an SNR of db_vals[i] dB (curve derived from a gamma
    # distribution -- see paper).
    db_vals = np.arange(-20, 101)
    g_vals = np.array([
        0.40974774, 0.40986926, 0.40998566, 0.40969089, 0.40986186,
        0.40999006, 0.41027138, 0.41052627, 0.41101024, 0.41143264,
        0.41231718, 0.41337272, 0.41526426, 0.4178192, 0.42077252,
        0.42452799, 0.42918886, 0.43510373, 0.44234195, 0.45161485,
        0.46221153, 0.47491647, 0.48883809, 0.50509236, 0.52353709,
        0.54372088, 0.56532427, 0.58847532, 0.61346212, 0.63954496,
        0.66750818, 0.69583724, 0.72454762, 0.75414799, 0.78323148,
        0.81240985, 0.84219775, 0.87166406, 0.90030504, 0.92880418,
        0.95655449, 0.9835349, 1.01047155, 1.0362095, 1.06136425,
        1.08579312, 1.1094819, 1.13277995, 1.15472826, 1.17627308,
        1.19703503, 1.21671694, 1.23535898, 1.25364313, 1.27103891,
        1.28718029, 1.30302865, 1.31839527, 1.33294817, 1.34700935,
        1.3605727, 1.37345513, 1.38577122, 1.39733504, 1.40856397,
        1.41959619, 1.42983624, 1.43958467, 1.44902176, 1.45804831,
        1.46669568, 1.47486938, 1.48269965, 1.49034339, 1.49748214,
        1.50435106, 1.51076426, 1.51698915, 1.5229097, 1.528578,
        1.53389835, 1.5391211, 1.5439065, 1.54858517, 1.55310776,
        1.55744391, 1.56164927, 1.56566348, 1.56938671, 1.57307767,
        1.57654764, 1.57980083, 1.58304129, 1.58602496, 1.58880681,
        1.59162477, 1.5941969, 1.59693155, 1.599446, 1.60185011,
        1.60408668, 1.60627134, 1.60826199, 1.61004547, 1.61192472,
        1.61369656, 1.61534074, 1.61688905, 1.61838916, 1.61985374,
        1.62135878, 1.62268119, 1.62390423, 1.62513143, 1.62632463,
        1.6274027, 1.62842767, 1.62945532, 1.6303307, 1.63128026,
        1.63204102,
    ])

    wav = np.asarray(wav)
    if np.issubdtype(wav.dtype, np.integer):
        # abs() of the most negative signed integer wraps around in integer
        # arithmetic, which would corrupt the peak; work in floating point.
        wav = wav.astype(np.float64)

    # Peak-normalize, take the magnitude and clip its lower bound so that the
    # logarithm below is always finite.
    max_val = np.abs(wav).max()
    if max_val == 0:
        max_val = eps
    wav = wav / max_val
    abs_wav = np.abs(wav)
    abs_wav[abs_wav < eps] = eps

    # Statistic used for the table lookup: log(E[|z|]) - E[log|z|].
    v1 = max(eps, abs_wav.mean())
    v2 = np.log(abs_wav).mean()
    v3 = np.log(v1) - v2

    # Table lookup: take the last entry whose g value lies below v3. The table
    # is not strictly increasing at its low end, so a binary search cannot be
    # used here.
    below = np.flatnonzero(g_vals < v3)
    if below.size == 0:
        # v3 is below every table entry: clamp to the lowest SNR.
        wav_snr = db_vals[0]
    else:
        idx = below[-1]
        if idx == len(db_vals) - 1:
            # v3 is above every table entry: clamp to the highest SNR.
            wav_snr = db_vals[-1]
        else:
            # Linear interpolation between the two neighbouring entries.
            # g_vals[idx] < v3 <= g_vals[idx + 1], so the slope is non-zero.
            wav_snr = db_vals[idx] + (
                (v3 - g_vals[idx])
                / (g_vals[idx + 1] - g_vals[idx])
                * (db_vals[idx + 1] - db_vals[idx])
            )

    # The reference code splits the total energy E into noise E / (1 + f) and
    # signal E * f / (1 + f), with f = 10 ** (wav_snr / 10), and returns
    # 10 * log10(signal / noise). E cancels in that ratio, which is exactly
    # wav_snr, so it is returned directly. This avoids a Python-level sum over
    # every sample and the 0 / 0 = NaN result for all-zero input.
    return np.float64(wav_snr)
def preprocess_audio(data):
    """Convert raw audio samples to floating point and downmix to mono.

    Args:
        data: NumPy array of audio samples.

    Returns:
        ``data`` itself when no conversion applies; otherwise a new array.
        int16 / int32 samples become float32 scaled by the maximum value of
        their integer type, and a 2-D array is averaged over axis 1.
    """
    # Integer PCM -> float32, scaled by the largest value of the integer type.
    for int_type in (np.int16, np.int32):
        if data.dtype == int_type:
            data = data.astype(np.float32) / np.iinfo(int_type).max
            break

    # Stereo -> mono when needed.
    # NOTE(review): averaging axis 1 assumes a (samples, channels) layout --
    # confirm against what the dataset actually delivers.
    if data.ndim == 2:
        data = data.mean(axis=1)

    return data
# Preprocess one dataset record and compute its SNR.
def process_audio_data(item):
    """Build the SNR report entry for a single dataset record.

    Args:
        item: Mapping that provides ``item['audio']['array']`` (the samples),
            ``item['utt_id']`` and ``item['text']``.

    Returns:
        ``None`` when the record holds no samples; otherwise a dict with the
        utterance id, the WADA-SNR value and the transcription.
    """
    samples = item['audio']['array']

    # An empty clip cannot be analysed; signal that with None.
    if not len(samples):
        return None

    # Normalise the samples first, then estimate the SNR with WADA-SNR.
    snr = wada_snr(preprocess_audio(samples))

    return {
        "ファイル名": item['utt_id'],           # file name (utterance id)
        "SNR値": snr,                           # SNR value
        "トランスクリプション": item['text'],   # transcription
    }
import os
# Script entry point: compute the WADA-SNR of every training utterance in
# parallel and store the results as JSON.
if __name__ == '__main__':
    ds = load_dataset('espnet/yodas', 'ja000', trust_remote_code=True)
    train_split = ds['train']

    # Report the number of examples. `dataset_size` is the size of the Arrow
    # data in bytes, not a count, so `num_rows` is used instead.
    print("データ数: ", train_split.num_rows)

    # Use one worker process per CPU core.
    cpu_count = os.cpu_count()

    # Run the per-record analysis in parallel.
    # NOTE(review): ProcessPoolExecutor.map collects its input immediately, so
    # every decoded record of the split is queued up front; consider feeding
    # the pool in batches if memory becomes a problem.
    with ProcessPoolExecutor(max_workers=cpu_count) as executor:
        results = list(executor.map(process_audio_data, train_split))

    # Drop records that were skipped (returned None).
    results = [result for result in results if result is not None]

    # Save the results as JSON. The keys are non-ASCII and ensure_ascii=False
    # writes them verbatim, so the encoding is pinned to UTF-8 instead of
    # depending on the platform default.
    with open('audio_analysis_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    print("JSONファイルが保存されました")