import os
import torch
import librosa
import warnings
import numpy as np
import pandas as pd
import gradio as gr
from model import EvalNet, t_EvalNet
from utils import (
    get_modelist,
    find_files,
    embed,
    _L,
    MODEL_DIR,
    SAMPLE_RATE,
    HOP_LENGTH,
    TIME_LENGTH,
    TRANSLATE,
    CLASSES,
)


def logMel(y, sr=SAMPLE_RATE):
    # Log-scaled mel spectrogram (dB relative to peak), fmin aligned with A0 (27.5 Hz)
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
    )
    return librosa.power_to_db(mel, ref=np.max)


def logCqt(y, sr=SAMPLE_RATE):
    # Log-scaled CQT over 88 semitone bins (piano range from A0), rescaled to [0, 1]
    cqt = librosa.cqt(
        y,
        sr=sr,
        hop_length=HOP_LENGTH,
        fmin=27.5,
        n_bins=88,
        bins_per_octave=12,
    )
    return ((1.0 / 80.0) * librosa.core.amplitude_to_db(np.abs(cqt), ref=np.max)) + 1.0


def logChroma(y, sr=SAMPLE_RATE):
    # Log-scaled chromagram (12 pitch classes), rescaled to [0, 1] like logCqt
    chroma = librosa.feature.chroma_stft(
        y=y,
        sr=sr,
        hop_length=HOP_LENGTH,
    )
    return (
        (1.0 / 80.0) * librosa.core.amplitude_to_db(np.abs(chroma), ref=np.max)
    ) + 1.0
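
# Illustrative sketch (not part of the original app): all three front-ends above
# return a 2-D (n_bins, n_frames) array with time on the last axis, which is the
# layout chunk_data() below expects. Assuming y was loaded at SAMPLE_RATE:
#
#   y, _ = librosa.load("example.wav", sr=SAMPLE_RATE)   # "example.wav" is a placeholder
#   logMel(y).shape     # (128, n_frames), librosa's default of 128 mel bands
#   logCqt(y).shape     # (88, n_frames), one bin per piano key from A0
#   logChroma(y).shape  # (12, n_frames), one bin per pitch class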


def RoW_norm(data):
    # Per-bin mean and standard deviation, accumulated over non-silent frames of the batch
    common_sum = 0
    square_sum = 0
    tfle = 0
    for i in range(len(data)):
        tfle += (data[i].sum(-1).sum(0) != 0).astype("float").sum()
        common_sum += data[i].sum(-1).sum(-1)
        square_sum += (data[i] ** 2).sum(-1).sum(-1)

    common_avg = common_sum / tfle
    square_avg = square_sum / tfle
    std = np.sqrt(square_avg - common_avg**2)
    return common_avg, std


def norm(data):
    # Standardize each frequency bin of a (n_chunks, n_bins, n_frames, 1) batch
    size = data.shape
    avg, std = RoW_norm(data)
    avg = np.tile(avg.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    std = np.tile(std.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    return (data - avg) / std
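
# Illustrative sketch: norm() standardizes a batch of chunks bin-wise, using the
# statistics RoW_norm() accumulates over non-silent frames. Shapes below assume a
# hypothetical log-mel batch laid out as load() produces it:
#
#   batch = np.random.rand(4, 128, 129, 1)   # (n_chunks, n_bins, n_frames, 1), made-up sizes
#   normed = norm(batch)
#   normed.shape                              # (4, 128, 129, 1), same as the input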


def chunk_data(f):
    # Split a (n_bins, n_frames) spectrogram into zero-padded chunks of
    # s = SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH frames each
    x = []
    xdata = np.transpose(f)
    s = SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH
    length = int(np.ceil((int(len(xdata) / s) + 1) * s))
    app = np.zeros((length - xdata.shape[0], xdata.shape[1]))
    xdata = np.concatenate((xdata, app), 0)
    for i in range(int(length / s)):
        data = xdata[int(i * s) : int(i * s + s)]
        x.append(np.transpose(data[:s, :]))

    return np.array(x)
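
# Sketch of the chunking arithmetic with assumed constants (the real values come
# from utils): with SAMPLE_RATE = 22050, TIME_LENGTH = 3 and HOP_LENGTH = 512,
# each chunk spans s = 22050 * 3 // 512 = 129 frames, and the spectrogram is
# zero-padded so that it splits into a whole number of chunks:
#
#   spec = logMel(y)            # (128, n_frames)
#   chunks = chunk_data(spec)   # (n_chunks, 128, 129)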


def load(audio_path: str, converto="mel"):
    # Load the audio, compute the requested front-end ("mel", "cqt" or "chroma"),
    # chunk it, add a channel axis and normalize the resulting batch
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    spec = eval("log%s(y, sr)" % converto.capitalize())
    x_spec = chunk_data(spec)
    Xtr_spec = np.expand_dims(x_spec, axis=3)
    return list(norm(Xtr_spec))
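
# Illustrative usage: load() ties the steps above together and returns a list of
# normalized (n_bins, s, 1) chunks ready for embed()/EvalNet below. The accepted
# converto values are "mel", "cqt" and "chroma", matching logMel/logCqt/logChroma.
#
#   chunks = load("example.wav", converto="cqt")   # "example.wav" is a placeholder
#   chunks[0].shape                                # (88, s, 1) for the CQT front-end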


def format_second(seconds):
    # Format a float second count as HH:MM:SS.mmm
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms // 1000, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{total_ms % 1000:03}"


def infer(audio_path: str, log_name: str):
    status = "Success"
    filename = result = None
    try:
        if not audio_path:
            raise ValueError("请输入音频!")  # "Please upload an audio file!"

        # The log name encodes backbone and front-end, e.g. "VGG19_mel"
        backbone = "_".join(log_name.split("_")[:-1])
        spec = log_name.split("_")[-1]
        input = load(audio_path, converto=spec)
        dur = librosa.get_duration(path=audio_path)
        frames_per_3s = input[0].shape[1]
        # Transformer backbones use the t_EvalNet wrapper, CNN backbones use EvalNet
        if "vit" in backbone or "swin" in backbone:
            eval_net = t_EvalNet(
                backbone,
                len(TRANSLATE),
                frames_per_3s,
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )
        else:
            eval_net = EvalNet(
                backbone,
                len(TRANSLATE),
                frames_per_3s,
                weight_path=f"{MODEL_DIR}/{log_name}.pt",
            )

        input_size = eval_net.get_input_size()
        embeded_input = embed(input, input_size)
        output = []
        for x in embeded_input:
            output.append(eval_net.forward(x))

        # Map each frame-level prediction back to a time span and a technique label
        index = 0
        outputs = []
        for y in output:
            preds = list(y.T)
            for pred in preds:
                start = index * TIME_LENGTH / frames_per_3s
                if start > dur:
                    break

                to = (index + 1) * TIME_LENGTH / frames_per_3s
                outputs.append(
                    {
                        _L("帧数"): f"{format_second(start)} - {format_second(to)}",  # "Frames"
                        _L("技法"): TRANSLATE[CLASSES[torch.argmax(pred).item()]],  # "Technique"
                    }
                )
                index += 1

        filename = os.path.basename(audio_path)
        result = pd.DataFrame(outputs)

    except Exception as e:
        status = f"{e}"

    return status, filename, result
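
# Sketch of calling infer() outside the Gradio UI (the audio path is a placeholder
# and "VGG19_mel" assumes the corresponding checkpoint exists under MODEL_DIR):
#
#   status, filename, result = infer("example.wav", "VGG19_mel")
#   if status == "Success":
#       print(result)   # one technique prediction per time span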


if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist(assign_model="VGG19_mel")
    examples = []
    example_wavs = find_files()
    for wav in example_wavs:
        examples.append([wav, models[0]])

    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label=_L("上传录音"), type="filepath"),  # "Upload a recording"
                gr.Dropdown(choices=models, label=_L("选择模型"), value=models[0]),  # "Select a model"
            ],
            outputs=[
                gr.Textbox(label=_L("状态栏"), show_copy_button=True),  # "Status"
                gr.Textbox(label=_L("音频文件名"), show_copy_button=True),  # "Audio filename"
                gr.Dataframe(label=_L("古筝演奏技法逐帧检测")),  # "Frame-level guzheng playing technique detection"
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title=_L("建议录音时长不要过长"),  # "Recommended: keep the recording short"
        )
        gr.Markdown(
            f"# {_L('引用')}"  # "Cite"
            + """
```bibtex
@article{Zhou-2025,
    author  = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
    title   = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
    journal = {Transactions of the International Society for Music Information Retrieval},
    volume  = {8},
    number  = {1},
    pages   = {22--38},
    month   = {Mar},
    year    = {2025},
    url     = {https://doi.org/10.5334/tismir.194},
    doi     = {10.5334/tismir.194}
}
```"""
        )

    demo.launch()