# Guzheng_Tech99 / app.py
# NOTE(review): lines below were HuggingFace Space page chrome captured by the
# scrape ("admin", "sync ms", commit dca18d5, "raw", "history blame", "6.28 kB");
# converted to comments so the module parses.
import os
import torch
import librosa
import warnings
import numpy as np
import pandas as pd
import gradio as gr
from model import EvalNet, t_EvalNet
from utils import (
get_modelist,
find_files,
embed,
_L,
MODEL_DIR,
SAMPLE_RATE,
HOP_LENGTH,
TIME_LENGTH,
TRANSLATE,
CLASSES,
)
def logMel(y, sr=SAMPLE_RATE):
    """Return the log-scaled (dB) mel spectrogram of waveform ``y``."""
    power_spec = librosa.feature.melspectrogram(
        y=y, sr=sr, hop_length=HOP_LENGTH, fmin=27.5
    )
    # Convert power to decibels relative to the spectrogram's peak.
    return librosa.power_to_db(power_spec, ref=np.max)
def logCqt(y, sr=SAMPLE_RATE):
    """Return an 88-bin CQT magnitude spectrogram, dB-scaled into roughly [0, 1]."""
    cqt_mag = np.abs(
        librosa.cqt(
            y,
            sr=sr,
            hop_length=HOP_LENGTH,
            fmin=27.5,  # A0, the lowest piano key
            n_bins=88,
            bins_per_octave=12,
        )
    )
    db = librosa.core.amplitude_to_db(cqt_mag, ref=np.max)
    # dB range is [-80, 0]; shift/scale it to [0, 1].
    return (1.0 / 80.0) * db + 1.0
def logChroma(y, sr=SAMPLE_RATE):
    """Return a chromagram of ``y``, dB-scaled into roughly [0, 1]."""
    chroma_mag = np.abs(
        librosa.feature.chroma_stft(y=y, sr=sr, hop_length=HOP_LENGTH)
    )
    db = librosa.core.amplitude_to_db(chroma_mag, ref=np.max)
    # dB range is [-80, 0]; shift/scale it to [0, 1].
    return (1.0 / 80.0) * db + 1.0
def RoW_norm(data):
    """Compute per-bin mean and standard deviation over the frames of ``data``.

    Args:
        data: iterable of spectrogram chunks, each shaped (bins, frames, 1).

    Returns:
        (avg, std): per-frequency-bin mean and standard deviation, each (bins,).
    """
    common_sum = 0
    square_sum = 0
    tfle = 0  # total number of non-silent time frames across all chunks
    # Iterate the chunks directly instead of indexing with range(len(...)).
    for chunk in data:
        tfle += (chunk.sum(-1).sum(0) != 0).astype("float").sum()
        common_sum += chunk.sum(-1).sum(-1)
        square_sum += (chunk**2).sum(-1).sum(-1)
    common_avg = common_sum / tfle
    square_avg = square_sum / tfle
    # std via the identity Var[x] = E[x^2] - E[x]^2
    std = np.sqrt(square_avg - common_avg**2)
    return common_avg, std


def norm(data):
    """Standardize ``data`` per frequency bin using the global stats from RoW_norm.

    Args:
        data: array shaped (n_chunks, bins, frames, 1).

    Returns:
        Array of the same shape, zero-mean / unit-variance per bin.
    """
    size = data.shape
    avg, std = RoW_norm(data)
    # Tile the (bins,) statistics to the full data shape before normalizing.
    avg = np.tile(avg.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    std = np.tile(std.reshape((1, -1, 1, 1)), (size[0], 1, size[2], size[3]))
    return (data - avg) / std
def chunk_data(f):
    """Split a (bins, frames) spectrogram into fixed-length chunks.

    The tail is zero-padded so every chunk has exactly ``s`` frames, where
    ``s`` is the number of hops covering TIME_LENGTH seconds.

    Returns:
        Array shaped (n_chunks, bins, s).
    """
    frames = np.transpose(f)  # -> (time, bins)
    s = SAMPLE_RATE * TIME_LENGTH // HOP_LENGTH
    # Always pad up to one chunk beyond the input length.
    padded_len = int(np.ceil((int(len(frames) / s) + 1) * s))
    padding = np.zeros((padded_len - frames.shape[0], frames.shape[1]))
    frames = np.concatenate((frames, padding), 0)
    chunks = [
        np.transpose(frames[int(i * s) : int(i * s + s)][:s, :])
        for i in range(int(padded_len / s))
    ]
    return np.array(chunks)
def load(audio_path: str, converto="mel"):
    """Load an audio file and return a list of normalized spectrogram chunks.

    Args:
        audio_path: path to the audio file.
        converto: spectrogram type — "mel", "cqt" or "chroma" (case-insensitive).

    Returns:
        List of arrays shaped (bins, frames, 1), normalized per frequency bin.

    Raises:
        KeyError: if ``converto`` is not a supported spectrogram type.
    """
    # Explicit dispatch table instead of eval() on an interpolated string:
    # safer and makes the supported spectrogram types obvious.
    converters = {"mel": logMel, "cqt": logCqt, "chroma": logChroma}
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    spec = converters[converto.lower()](y, sr)
    x_spec = chunk_data(spec)
    Xtr_spec = np.expand_dims(x_spec, axis=3)
    return list(norm(Xtr_spec))
def format_second(seconds):
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    Works in whole milliseconds. Fixes the original bug where the fractional
    part was rendered as a second "0.xxx" literal after the dot, producing
    e.g. "00:00:01.0.500" for 1.5 seconds instead of "00:00:01.500".
    """
    total_ms = round(seconds * 1000)
    secs, ms = divmod(total_ms, 1000)
    hours, remainder = divmod(secs, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{ms:03}"
def infer(audio_path: str, log_name: str):
    """Run frame-wise guzheng playing-technique detection on an audio file.

    Args:
        audio_path: path to the uploaded audio file (empty/None rejected).
        log_name: model identifier of the form "<backbone>_<spec>",
            e.g. "VGG19_mel".

    Returns:
        (status, filename, result): status message, audio basename and a
        DataFrame of per-frame predictions; the latter two are None on failure.
    """
    status = "Success"
    filename = result = None
    try:
        if not audio_path:
            raise ValueError("请输入音频!")
        # "<backbone>_<spec>": everything before the last "_" is the backbone,
        # the suffix selects the spectrogram type.
        backbone = "_".join(log_name.split("_")[:-1])
        spec = log_name.split("_")[-1]
        # Renamed from `input` to avoid shadowing the builtin.
        chunks = load(audio_path, converto=spec)
        dur = librosa.get_duration(path=audio_path)
        frames_per_3s = chunks[0].shape[1]
        # Transformer backbones use t_EvalNet; CNN backbones use EvalNet.
        # Both constructors take the same arguments, so pick the class once.
        net_cls = t_EvalNet if ("vit" in backbone or "swin" in backbone) else EvalNet
        eval_net = net_cls(
            backbone,
            len(TRANSLATE),
            frames_per_3s,
            weight_path=f"{MODEL_DIR}/{log_name}.pt",
        )
        input_size = eval_net.get_input_size()
        embeded_input = embed(chunks, input_size)
        outputs = [eval_net.forward(x) for x in embeded_input]
        index = 0
        rows = []
        for y in outputs:
            for pred in list(y.T):
                start = index * TIME_LENGTH / frames_per_3s
                if start > dur:  # stop once past the real audio duration
                    break
                to = (index + 1) * TIME_LENGTH / frames_per_3s
                rows.append(
                    {
                        _L("帧数"): f"{format_second(start)} - {format_second(to)}",
                        _L("技法"): TRANSLATE[CLASSES[torch.argmax(pred).item()]],
                    }
                )
                index += 1
        filename = os.path.basename(audio_path)
        result = pd.DataFrame(rows)
    except Exception as e:
        # Deliberate broad catch: surface any failure in the UI status box
        # instead of crashing the Gradio app.
        status = f"{e}"
    return status, filename, result
if __name__ == "__main__":
    warnings.filterwarnings("ignore")
    models = get_modelist(assign_model="VGG19_mel")
    # Pair every bundled sample recording with the default (first) model.
    examples = [[wav, models[0]] for wav in find_files()]
    with gr.Blocks() as demo:
        gr.Interface(
            fn=infer,
            inputs=[
                gr.Audio(label=_L("上传录音"), type="filepath"),
                gr.Dropdown(choices=models, label=_L("选择模型"), value=models[0]),
            ],
            outputs=[
                gr.Textbox(label=_L("状态栏"), show_copy_button=True),
                gr.Textbox(label=_L("音频文件名"), show_copy_button=True),
                gr.Dataframe(label=_L("古筝演奏技法逐帧检测")),
            ],
            examples=examples,
            cache_examples=False,
            flagging_mode="never",
            title=_L("建议录音时长不要过长"),
        )
        # Citation section rendered below the interface.
        bibtex = """
```bibtex
@article{Zhou-2025,
author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
journal = {Transactions of the International Society for Music Information Retrieval},
volume = {8},
number = {1},
pages = {22--38},
month = {Mar},
year = {2025},
url = {https://doi.org/10.5334/tismir.194},
doi = {10.5334/tismir.194}
}
```"""
        gr.Markdown(f"# {_L('引用')}" + bibtex)

    demo.launch()