admin committed
Commit dca18d5 · 1 Parent(s): ece4103
Files changed (4)
  1. app.py +69 -67
  2. model.py +12 -2
  3. requirements.txt +5 -3
  4. utils.py +56 -6
app.py CHANGED
@@ -5,25 +5,19 @@ import warnings
  import numpy as np
  import pandas as pd
  import gradio as gr
- import librosa.display
  from model import EvalNet, t_EvalNet
- from utils import get_modelist, find_files, embed, MODEL_DIR
-
-
- TRANSLATE = {
-     "chanyin": "Vibrato",  # 颤音
-     "boxian": "Plucks",  # 拨弦
-     "shanghua": "Upward Portamento",  # 上滑音
-     "xiahua": "Downward Portamento",  # 下滑音
-     "huazhi/guazou/lianmo/liantuo": "Glissando",  # 花指\刮奏\连抹\连托
-     "yaozhi": "Tremolo",  # 摇指
-     "dianyin": "Point Note",  # 点音
- }
- CLASSES = list(TRANSLATE.keys())
- TEMP_DIR = "./__pycache__/tmp"
- SAMPLE_RATE = 44100
- HOP_LENGTH = 512
- TIME_LENGTH = 3
+ from utils import (
+     get_modelist,
+     find_files,
+     embed,
+     _L,
+     MODEL_DIR,
+     SAMPLE_RATE,
+     HOP_LENGTH,
+     TIME_LENGTH,
+     TRANSLATE,
+     CLASSES,
+ )


  def logMel(y, sr=SAMPLE_RATE):
@@ -113,12 +107,14 @@ def format_second(seconds):


  def infer(audio_path: str, log_name: str):
-     if not audio_path:
-         return "Please input an audio!", None
-
-     backbone = "_".join(log_name.split("_")[:-1])
-     spec = log_name.split("_")[-1]
+     status = "Success"
+     filename = result = None
      try:
+         if not audio_path:
+             raise ValueError("请输入音频!")
+
+         backbone = "_".join(log_name.split("_")[:-1])
+         spec = log_name.split("_")[-1]
          input = load(audio_path, converto=spec)
          dur = librosa.get_duration(path=audio_path)
          frames_per_3s = input[0].shape[1]
@@ -126,7 +122,7 @@ def infer(audio_path: str, log_name: str):
          eval_net = t_EvalNet(
              backbone,
              len(TRANSLATE),
-             input[0].shape[1],
+             frames_per_3s,
              weight_path=f"{MODEL_DIR}/{log_name}.pt",
          )

@@ -134,36 +130,41 @@ def infer(audio_path: str, log_name: str):
          eval_net = EvalNet(
              backbone,
              len(TRANSLATE),
-             input[0].shape[1],
+             frames_per_3s,
              weight_path=f"{MODEL_DIR}/{log_name}.pt",
          )

          input_size = eval_net.get_input_size()
          embeded_input = embed(input, input_size)
-         output = list(eval_net.forward(embeded_input))
+         output = []
+         for x in embeded_input:
+             output.append(eval_net.forward(x))
+
+         index = 0
+         outputs = []
+         for y in output:
+             preds = list(y.T)
+             for pred in preds:
+                 start = index * TIME_LENGTH / frames_per_3s
+                 if start > dur:
+                     break
+
+                 to = (index + 1) * TIME_LENGTH / frames_per_3s
+                 outputs.append(
+                     {
+                         _L("帧数"): f"{format_second(start)} - {format_second(to)}",
+                         _L("技法"): TRANSLATE[CLASSES[torch.argmax(pred).item()]],
+                     }
+                 )
+                 index += 1
+
+         filename = os.path.basename(audio_path)
+         result = pd.DataFrame(outputs)

      except Exception as e:
-         return f"{e}", None
-
-     index = 0
-     outputs = []
-     for y in output:
-         preds = list(y.T)
-         for pred in preds:
-             start = index * TIME_LENGTH / frames_per_3s
-             if start > dur:
-                 break
-
-             to = (index + 1) * TIME_LENGTH / frames_per_3s
-             outputs.append(
-                 {
-                     "Frame": f"{format_second(start)} - {format_second(to)}",
-                     "Tech": TRANSLATE[CLASSES[torch.argmax(pred).item()]],
-                 }
-             )
-             index += 1
+         status = f"{e}"

-     return os.path.basename(audio_path), pd.DataFrame(outputs)
+     return status, filename, result


  if __name__ == "__main__":
@@ -178,36 +179,37 @@ if __name__ == "__main__":
      gr.Interface(
          fn=infer,
          inputs=[
-             gr.Audio(label="Upload audio", type="filepath"),
-             gr.Dropdown(choices=models, label="Select a model", value=models[0]),
+             gr.Audio(label=_L("上传录音"), type="filepath"),
+             gr.Dropdown(choices=models, label=_L("选择模型"), value=models[0]),
          ],
          outputs=[
-             gr.Textbox(label="Audio filename", show_copy_button=True),
-             gr.Dataframe(label="Frame-level guzheng playing technique detection"),
+             gr.Textbox(label=_L("状态栏"), show_copy_button=True),
+             gr.Textbox(label=_L("音频文件名"), show_copy_button=True),
+             gr.Dataframe(label=_L("古筝演奏技法逐帧检测")),
          ],
          examples=examples,
          cache_examples=False,
          flagging_mode="never",
-         title="It is suggested that the recording time should not be too long",
+         title=_L("建议录音时长不要过长"),
      )

      gr.Markdown(
-         """
-         # Cite
-         ```bibtex
-         @article{Zhou-2025,
-             author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
-             title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
-             journal = {Transactions of the International Society for Music Information Retrieval},
-             volume = {8},
-             number = {1},
-             pages = {22--38},
-             month = {Mar},
-             year = {2025},
-             url = {https://doi.org/10.5334/tismir.194},
-             doi = {10.5334/tismir.194}
-         }
-         ```"""
+         f"# {_L('引用')}"
+         + """
+         ```bibtex
+         @article{Zhou-2025,
+             author = {Monan Zhou and Shenyang Xu and Zhaorui Liu and Zhaowen Wang and Feng Yu and Wei Li and Baoqiang Han},
+             title = {CCMusic: An Open and Diverse Database for Chinese Music Information Retrieval Research},
+             journal = {Transactions of the International Society for Music Information Retrieval},
+             volume = {8},
+             number = {1},
+             pages = {22--38},
+             month = {Mar},
+             year = {2025},
+             url = {https://doi.org/10.5334/tismir.194},
+             doi = {10.5334/tismir.194}
+         }
+         ```"""
      )

      demo.launch()
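Note on the new timestamp logic in `infer`: each column of a model output is treated as one frame of a 3-second analysis window, so a frame index is converted to seconds via `TIME_LENGTH / frames_per_3s`. A minimal standalone sketch of that mapping (the helper name `frame_to_span` is hypothetical, not part of this commit):

```python
TIME_LENGTH = 3  # seconds covered by one analysis window, as defined in utils.py


def frame_to_span(index: int, frames_per_3s: int, dur: float):
    """Map a frame index to a (start, end) span in seconds; None once past the audio end."""
    start = index * TIME_LENGTH / frames_per_3s
    if start > dur:
        return None
    return start, (index + 1) * TIME_LENGTH / frames_per_3s


# e.g. with 150 frames per 3-second window, frame 10 of a 30 s clip spans 0.20-0.22 s
print(frame_to_span(10, 150, 30.0))
```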
model.py CHANGED
@@ -3,7 +3,9 @@ import torch.nn as nn
  import torch.nn.functional as F
  import torchvision.models as models
  import numpy as np
+ from modelscope.msdatasets import MsDataset
  from datasets import load_dataset
+ from utils import EN_US


  class Interpolate(nn.Module):
@@ -79,7 +81,11 @@ class EvalNet:
          raise ValueError("[Backbone not found] Please check if --model is correct!")

      def _model_info(self, backbone: str):
-         backbone_list = load_dataset("monetjoe/cv_backbones", split="train")
+         backbone_list = (
+             load_dataset("monetjoe/cv_backbones", split="train")
+             if EN_US
+             else MsDataset.load("monetjoe/cv_backbones", split="v1")
+         )
          backbone_info = self._get_backbone(backbone, backbone_list)
          return (
              str(backbone_info["type"]),
@@ -228,7 +234,11 @@ class t_EvalNet:
          raise ValueError("[Backbone not found] Please check if --model is correct!")

      def _model_info(self, backbone: str):
-         backbone_list = load_dataset("monetjoe/cv_backbones", split="train")
+         backbone_list = (
+             load_dataset("monetjoe/cv_backbones", split="train")
+             if EN_US
+             else MsDataset.load("monetjoe/cv_backbones", split="v1")
+         )
          backbone_info = self._get_backbone(backbone, backbone_list)
          return (
              str(backbone_info["type"]),
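The `_model_info` change mirrors the locale switch introduced in utils.py: when `EN_US` is true the backbone metadata comes from the Hugging Face Hub, otherwise from ModelScope. A standalone sketch of the same pattern, assuming both client libraries are installed (repo name and split values are taken from the diff):

```python
import os

from datasets import load_dataset
from modelscope.msdatasets import MsDataset

# Same switch as utils.py: any LANG other than zh_CN.UTF-8 is treated as an English locale.
EN_US = os.getenv("LANG") != "zh_CN.UTF-8"

backbone_list = (
    load_dataset("monetjoe/cv_backbones", split="train")  # Hugging Face Hub
    if EN_US
    else MsDataset.load("monetjoe/cv_backbones", split="v1")  # ModelScope mirror
)
print(len(list(backbone_list)))
```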
requirements.txt CHANGED
@@ -1,5 +1,7 @@
- torch
- pillow
+ torch==2.6.0+cu118
+ -f https://download.pytorch.org/whl/torch
+ torchvision==0.21.0+cu118
+ -f https://download.pytorch.org/whl/torchvision
  librosa
  matplotlib
- torchvision
+ modelscope[framework]==1.21.0
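The pins now point at CUDA 11.8 wheels pulled from PyTorch's find-links pages (the `-f` lines), and ModelScope is added for the non-Hugging-Face download path. A quick sanity check, not part of the commit, that the intended builds resolved after `pip install -r requirements.txt`:

```python
# Verify the pinned CUDA 11.8 builds were actually installed.
import torch
import torchvision

print(torch.__version__)        # expected: 2.6.0+cu118
print(torchvision.__version__)  # expected: 0.21.0+cu118
print(torch.cuda.is_available())
```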
utils.py CHANGED
@@ -1,15 +1,64 @@
  import os
  import torch
+ import modelscope
+ import huggingface_hub
  import numpy as np
  from torchvision.transforms import Compose, Resize, Normalize
- from huggingface_hub import snapshot_download

- MODEL_DIR = snapshot_download(
-     "ccmusic-database/Guzheng_Tech99",
-     cache_dir="./__pycache__",
+ EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
+
+ ZH2EN = {
+     "上传录音": "Upload a recording",
+     "选择模型": "Select a model",
+     "状态栏": "Status",
+     "音频文件名": "Audio filename",
+     "古筝演奏技法逐帧检测": "Frame-level guzheng playing technique detection",
+     "建议录音时长不要过长": "It is suggested that the recording time should not be too long",
+     "引用": "Cite",
+     "颤音": "Vibrato",
+     "拨弦": "Plucks",
+     "上滑音": "Upward Portamento",
+     "下滑音": "Downward Portamento",
+     "花指\刮奏\连抹\连托": "Glissando",
+     "摇指": "Tremolo",
+     "点音": "Point Note",
+     "帧数": "Frame",
+     "技法": "Tech",
+ }
+
+ MODEL_DIR = (
+     huggingface_hub.snapshot_download(
+         "ccmusic-database/Guzheng_Tech99",
+         cache_dir="./__pycache__",
+     )
+     if EN_US
+     else modelscope.snapshot_download(
+         "ccmusic-database/Guzheng_Tech99",
+         cache_dir="./__pycache__",
+     )
  )


+ def _L(zh_txt: str):
+     return ZH2EN[zh_txt] if EN_US else zh_txt
+
+
+ TRANSLATE = {
+     "chanyin": _L("颤音"),  # Vibrato
+     "boxian": _L("拨弦"),  # Plucks
+     "shanghua": _L("上滑音"),  # Upward Portamento
+     "xiahua": _L("下滑音"),  # Downward Portamento
+     "huazhi/guazou/lianmo/liantuo": _L("花指\刮奏\连抹\连托"),  # Glissando
+     "yaozhi": _L("摇指"),  # Tremolo
+     "dianyin": _L("点音"),  # Point Note
+ }
+ CLASSES = list(TRANSLATE.keys())
+ TEMP_DIR = "./__pycache__/tmp"
+ SAMPLE_RATE = 44100
+ HOP_LENGTH = 512
+ TIME_LENGTH = 3
+
+
  def toCUDA(x):
      if hasattr(x, "cuda"):
          if torch.cuda.is_available():
@@ -54,6 +103,7 @@ def embed(input: list, img_size: int):
      for x in input:
          x = np.array(x).transpose(2, 0, 1)
          x = torch.from_numpy(x).repeat(3, 1, 1)
-         inputs.append(compose(x).float())
+         x = torch.tensor(np.array([compose(x).float()]))
+         inputs.append(toCUDA(x))

-     return toCUDA(torch.tensor(np.array(inputs)))
+     return inputs
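For context, `_L` is the localization helper the rest of the commit leans on: labels are written in Chinese at the call site and mapped through `ZH2EN` unless `LANG` is `zh_CN.UTF-8`, and `EN_US` is evaluated once at import time. A small usage sketch, assuming utils.py is on the import path:

```python
import os

# EN_US is computed when utils is imported, so set LANG before the import.
os.environ["LANG"] = "zh_CN.UTF-8"

from utils import _L, TRANSLATE

print(_L("状态栏"))           # -> "状态栏" here; "Status" under any other LANG
print(TRANSLATE["chanyin"])   # -> "颤音" here; "Vibrato" under any other LANG
```

Note also that `embed` now returns a list of single-sample tensors, each already moved to CUDA when available, instead of one stacked batch, which is why `infer` in app.py now loops over `embeded_input` and calls `forward` per item.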