admin committed · Commit dca18d5 · Parent(s): ece4103 · sync ms
app.py
CHANGED
@@ -5,25 +5,19 @@ import warnings
 import numpy as np
 import pandas as pd
 import gradio as gr
-import librosa.display
 from model import EvalNet, t_EvalNet
-from utils import …
-…
-CLASSES = list(TRANSLATE.keys())
-TEMP_DIR = "./__pycache__/tmp"
-SAMPLE_RATE = 44100
-HOP_LENGTH = 512
-TIME_LENGTH = 3
+from utils import (
+    get_modelist,
+    find_files,
+    embed,
+    _L,
+    MODEL_DIR,
+    SAMPLE_RATE,
+    HOP_LENGTH,
+    TIME_LENGTH,
+    TRANSLATE,
+    CLASSES,
+)


 def logMel(y, sr=SAMPLE_RATE):
@@ -113,12 +107,14 @@ def format_second(seconds):


 def infer(audio_path: str, log_name: str):
-    …
-    backbone = "_".join(log_name.split("_")[:-1])
-    spec = log_name.split("_")[-1]
+    status = "Success"
+    filename = result = None
     try:
+        if not audio_path:
+            raise ValueError("请输入音频!")
+
+        backbone = "_".join(log_name.split("_")[:-1])
+        spec = log_name.split("_")[-1]
         input = load(audio_path, converto=spec)
         dur = librosa.get_duration(path=audio_path)
         frames_per_3s = input[0].shape[1]
@@ -126,7 +122,7 @@ def infer(audio_path: str, log_name: str):
         eval_net = t_EvalNet(
             backbone,
             len(TRANSLATE),
-            …
+            frames_per_3s,
             weight_path=f"{MODEL_DIR}/{log_name}.pt",
         )

@@ -134,36 +130,41 @@ def infer(audio_path: str, log_name: str):
         eval_net = EvalNet(
             backbone,
             len(TRANSLATE),
-            …
+            frames_per_3s,
             weight_path=f"{MODEL_DIR}/{log_name}.pt",
         )

         input_size = eval_net.get_input_size()
         embeded_input = embed(input, input_size)
-        output = …
+        output = []
+        for x in embeded_input:
+            output.append(eval_net.forward(x))
+
+        index = 0
+        outputs = []
+        for y in output:
+            preds = list(y.T)
+            for pred in preds:
+                start = index * TIME_LENGTH / frames_per_3s
+                if start > dur:
+                    break
+
+                to = (index + 1) * TIME_LENGTH / frames_per_3s
+                outputs.append(
+                    {
+                        _L("帧数"): f"{format_second(start)} - {format_second(to)}",
+                        _L("技法"): TRANSLATE[CLASSES[torch.argmax(pred).item()]],
+                    }
+                )
+            index += 1
+
+        filename = os.path.basename(audio_path)
+        result = pd.DataFrame(outputs)

     except Exception as e:
-        …
-    index = 0
-    outputs = []
-    for y in output:
-        preds = list(y.T)
-        for pred in preds:
-            start = index * TIME_LENGTH / frames_per_3s
-            if start > dur:
-                break
-
-            to = (index + 1) * TIME_LENGTH / frames_per_3s
-            outputs.append(
-                {
-                    "Frame": f"{format_second(start)} - {format_second(to)}",
-                    "Tech": TRANSLATE[CLASSES[torch.argmax(pred).item()]],
-                }
-            )
-        index += 1
+        status = f"{e}"

-    return …
+    return status, filename, result


 if __name__ == "__main__":
@@ -178,36 +179,37 @@ if __name__ == "__main__":
     gr.Interface(
         fn=infer,
         inputs=[
-            gr.Audio(label="…
-            gr.Dropdown(choices=models, label="…
+            gr.Audio(label=_L("上传录音"), type="filepath"),
+            gr.Dropdown(choices=models, label=_L("选择模型"), value=models[0]),
         ],
         outputs=[
-            gr.Textbox(label="…
-            gr.…
+            gr.Textbox(label=_L("状态栏"), show_copy_button=True),
+            gr.Textbox(label=_L("音频文件名"), show_copy_button=True),
+            gr.Dataframe(label=_L("古筝演奏技法逐帧检测")),
         ],
         examples=examples,
         cache_examples=False,
         flagging_mode="never",
-        title="…
+        title=_L("建议录音时长不要过长"),
     )

     gr.Markdown(
-        ""…
-        …
-        ```bibtex
-        @article{Zhou-2025,
-            …
-        }
-        ```"""
+        f"# {_L('引用')}"
+        + """
+        ```bibtex
+        @article{Zhou-2025,
+            author  = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
+            title   = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
+            journal = {Transactions of the International Society for Music Information Retrieval},
+            volume  = {8},
+            number  = {1},
+            pages   = {22--38},
+            month   = {Mar},
+            year    = {2025},
+            url     = {https://doi.org/10.5334/tismir.194},
+            doi     = {10.5334/tismir.194}
+        }
+        ```"""
     )

     demo.launch()
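Note on the reworked infer(): predictions are read column by column from the transposed model output, so each column covers TIME_LENGTH / frames_per_3s seconds, and the loop stops once a window start passes the clip duration. A minimal, standalone sketch of that time-window bookkeeping follows; the frames_per_3s value and the simplified format_second below are illustrative assumptions, since app.py takes them from input[0].shape[1] and a helper defined earlier in the file.

# Sketch only: how infer() maps each prediction column to a time window.
TIME_LENGTH = 3        # seconds per spectrogram chunk, as in utils.py
frames_per_3s = 128    # hypothetical frame count; app.py reads it from input[0].shape[1]


def format_second(seconds: float) -> str:
    # Simplified stand-in for app.py's format_second helper
    m, s = divmod(seconds, 60)
    return f"{int(m):02d}:{s:05.2f}"


for index in range(3):
    start = index * TIME_LENGTH / frames_per_3s
    to = (index + 1) * TIME_LENGTH / frames_per_3s
    print(f"{format_second(start)} - {format_second(to)}")

Because the loop breaks as soon as start exceeds the duration returned by librosa.get_duration, padding frames appended by embed never reach the resulting DataFrame.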
model.py
CHANGED
@@ -3,7 +3,9 @@ import torch.nn as nn
 import torch.nn.functional as F
 import torchvision.models as models
 import numpy as np
+from modelscope.msdatasets import MsDataset
 from datasets import load_dataset
+from utils import EN_US


 class Interpolate(nn.Module):
@@ -79,7 +81,11 @@ class EvalNet:
         raise ValueError("[Backbone not found] Please check if --model is correct!")

     def _model_info(self, backbone: str):
-        backbone_list = …
+        backbone_list = (
+            load_dataset("monetjoe/cv_backbones", split="train")
+            if EN_US
+            else MsDataset.load("monetjoe/cv_backbones", split="v1")
+        )
         backbone_info = self._get_backbone(backbone, backbone_list)
         return (
             str(backbone_info["type"]),
@@ -228,7 +234,11 @@ class t_EvalNet:
         raise ValueError("[Backbone not found] Please check if --model is correct!")

     def _model_info(self, backbone: str):
-        backbone_list = …
+        backbone_list = (
+            load_dataset("monetjoe/cv_backbones", split="train")
+            if EN_US
+            else MsDataset.load("monetjoe/cv_backbones", split="v1")
+        )
         backbone_info = self._get_backbone(backbone, backbone_list)
         return (
             str(backbone_info["type"]),
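Note: both _model_info methods now choose the backbone-metadata source by locale, the Hugging Face datasets hub when EN_US is true and the ModelScope mirror otherwise. A standalone sketch of that gated load is below; treating rows as plain dicts is an assumption, and "type" is the only field name taken from the surrounding return tuple.

import os

EN_US = os.getenv("LANG") != "zh_CN.UTF-8"

if EN_US:
    from datasets import load_dataset

    backbone_list = load_dataset("monetjoe/cv_backbones", split="train")
else:
    from modelscope.msdatasets import MsDataset

    backbone_list = MsDataset.load("monetjoe/cv_backbones", split="v1")

for row in backbone_list:
    print(row["type"])  # field name taken from _model_info's return tuple
    break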
requirements.txt
CHANGED
@@ -1,5 +1,7 @@
-torch
-…
+torch==2.6.0+cu118
+-f https://download.pytorch.org/whl/torch
+torchvision==0.21.0+cu118
+-f https://download.pytorch.org/whl/torchvision
 librosa
 matplotlib
-…
+modelscope[framework]==1.21.0
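Note: the pins now target the CUDA 11.8 wheels, pulled in through the extra -f find-links lines. A quick post-install sanity check (sketch only; on a CPU-only machine is_available() simply reports False):

import torch
import torchvision

print(torch.__version__)        # expected 2.6.0+cu118 per requirements.txt
print(torchvision.__version__)  # expected 0.21.0+cu118
print(torch.cuda.is_available())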
utils.py
CHANGED
@@ -1,15 +1,64 @@
 import os
 import torch
+import modelscope
+import huggingface_hub
 import numpy as np
 from torchvision.transforms import Compose, Resize, Normalize
-from huggingface_hub import snapshot_download

-…
-…
-…
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
+
+ZH2EN = {
+    "上传录音": "Upload a recording",
+    "选择模型": "Select a model",
+    "状态栏": "Status",
+    "音频文件名": "Audio filename",
+    "古筝演奏技法逐帧检测": "Frame-level guzheng playing technique detection",
+    "建议录音时长不要过长": "It is suggested that the recording time should not be too long",
+    "引用": "Cite",
+    "颤音": "Vibrato",
+    "拨弦": "Plucks",
+    "上滑音": "Upward Portamento",
+    "下滑音": "Downward Portamento",
+    "花指\刮奏\连抹\连托": "Glissando",
+    "摇指": "Tremolo",
+    "点音": "Point Note",
+    "帧数": "Frame",
+    "技法": "Tech",
+}
+
+MODEL_DIR = (
+    huggingface_hub.snapshot_download(
+        "ccmusic-database/Guzheng_Tech99",
+        cache_dir="./__pycache__",
+    )
+    if EN_US
+    else modelscope.snapshot_download(
+        "ccmusic-database/Guzheng_Tech99",
+        cache_dir="./__pycache__",
+    )
 )


+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
+
+
+TRANSLATE = {
+    "chanyin": _L("颤音"),  # Vibrato
+    "boxian": _L("拨弦"),  # Plucks
+    "shanghua": _L("上滑音"),  # Upward Portamento
+    "xiahua": _L("下滑音"),  # Downward Portamento
+    "huazhi/guazou/lianmo/liantuo": _L("花指\刮奏\连抹\连托"),  # Glissando
+    "yaozhi": _L("摇指"),  # Tremolo
+    "dianyin": _L("点音"),  # Point Note
+}
+CLASSES = list(TRANSLATE.keys())
+TEMP_DIR = "./__pycache__/tmp"
+SAMPLE_RATE = 44100
+HOP_LENGTH = 512
+TIME_LENGTH = 3
+
+
 def toCUDA(x):
     if hasattr(x, "cuda"):
         if torch.cuda.is_available():
@@ -54,6 +103,7 @@ def embed(input: list, img_size: int):
     for x in input:
         x = np.array(x).transpose(2, 0, 1)
         x = torch.from_numpy(x).repeat(3, 1, 1)
-        …
+        x = torch.tensor(np.array([compose(x).float()]))
+        inputs.append(toCUDA(x))

-    return …
+    return inputs
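Note: all user-facing strings are keyed in Chinese and routed through _L, which falls back to the ZH2EN map whenever LANG is not zh_CN.UTF-8. A minimal, self-contained sketch of that lookup (only a small subset of ZH2EN is reproduced here):

import os

EN_US = os.getenv("LANG") != "zh_CN.UTF-8"

ZH2EN = {
    "状态栏": "Status",
    "帧数": "Frame",
    "技法": "Tech",
}


def _L(zh_txt: str) -> str:
    return ZH2EN[zh_txt] if EN_US else zh_txt


print(_L("状态栏"))  # "Status" unless LANG is set to zh_CN.UTF-8

Because TRANSLATE is built from _L(...) values, the technique names and the DataFrame column labels produced in app.py follow the same locale switch without further changes.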