HoneyTian committed
Commit 9d169ba · Parent: d804263
examples/sound_classification_by_lstm/run.sh CHANGED
@@ -161,6 +161,11 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     --model_dir "${file_dir}/best" \
     --serialization_dir "${file_dir}" \
 
+  python3 step_6_export_onnx_model.py \
+    --vocabulary_dir "${vocabulary_dir}" \
+    --model_dir "${file_dir}/best" \
+    --serialization_dir "${file_dir}" \
+
 fi
 
 
@@ -175,6 +180,8 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
 
   cp "${file_dir}/evaluation.xlsx" "${final_model_dir}/evaluation.xlsx"
 
+  cp "${file_dir}/model.onnx" "${final_model_dir}/model.onnx"
+
   cp "${file_dir}/trace_model.zip" "${final_model_dir}/trace_model.zip"
   cp "${file_dir}/trace_quant_model.zip" "${final_model_dir}/trace_quant_model.zip"
   cp "${file_dir}/script_model.zip" "${final_model_dir}/script_model.zip"
examples/sound_classification_by_lstm/run_batch.sh ADDED
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+
+
+# pretrained voicemail
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/*/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml"
+
+
+# voicemail ch64
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-ph-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-PH/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-sg-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-SG/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-en-us-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/en-US/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-mx-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/es-MX/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-es-pe-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/es-PE/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-fi-fi-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/fi-FI/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-id-id-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/id-ID/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ja-jp-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ja-JP/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ko-kr-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ko-KR/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-ms-my-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/ms-MY/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-pt-br-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/pt-BR/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-th-th-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/th-TH/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
+sh run.sh --stage 0 --stop_stage 6 --system_version centos --file_folder_name file_dir --final_model_name voicemail-zh-tw-2-ch64-lstm \
+--filename_patterns "/data/tianxing/PycharmProjects/datasets/voicemail/zh-TW/wav_finished/*/*.wav" \
+--label_plan 2-voicemail \
+--config_file "yaml/lstm-classifier-2-ch64.yaml" \
+--pretrained_model "/data/tianxing/PycharmProjects/cc_audio_8/trained_models/voicemail-2-ch64-lstm.zip"
+
examples/sound_classification_by_lstm/step_3_train_model.py CHANGED
@@ -27,8 +27,8 @@ from toolbox.torch.modules.loss import FocalLoss, HingeLoss, HingeLinear
 from toolbox.torch.training.metrics.categorical_accuracy import CategoricalAccuracy
 from toolbox.torch.utils.data.vocabulary import Vocabulary
 from toolbox.torch.utils.data.dataset.wave_classifier_excel_dataset import WaveClassifierExcelDataset
-from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import LSTMClassifierPretrainedModel
-from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import LSTMClassifierConfig
+from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import WaveClassifierPretrainedModel
+from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import WaveClassifierConfig
 
 
 def get_args():
@@ -171,7 +171,7 @@ def main():
 
     # models
     logger.info(f"prepare models. config_file: {args.config_file}")
-    config = LSTMClassifierConfig.from_pretrained(
+    config = WaveClassifierConfig.from_pretrained(
         pretrained_model_name_or_path=args.config_file,
         # num_labels=vocabulary.get_vocab_size(namespace="labels")
     )
@@ -180,7 +180,7 @@ def main():
         vocabulary.get_vocab_size(namespace="labels"),
         config.cls_head_param["num_labels"],
     ))
-    model = LSTMClassifierPretrainedModel(
+    model = WaveClassifierPretrainedModel(
         config=config,
     )
 
examples/sound_classification_by_lstm/step_4_evaluation_model.py CHANGED
@@ -21,7 +21,7 @@ import torch
 from tqdm import tqdm
 
 from toolbox.torch.utils.data.vocabulary import Vocabulary
-from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import LSTMClassifierPretrainedModel
+from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import WaveClassifierPretrainedModel
 
 
 def get_args():
@@ -64,7 +64,7 @@ def main():
     logger.info("prepare vocabulary, model")
     vocabulary = Vocabulary.from_files(args.vocabulary_dir)
 
-    model = LSTMClassifierPretrainedModel.from_pretrained(
+    model = WaveClassifierPretrainedModel.from_pretrained(
         pretrained_model_name_or_path=args.model_dir,
     )
     model.to(device)
examples/sound_classification_by_lstm/step_5_export_models.py CHANGED
@@ -19,7 +19,7 @@ import numpy as np
 import torch
 
 from toolbox.torch.utils.data.vocabulary import Vocabulary
-from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import LSTMClassifierPretrainedModel
+from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import WaveClassifierPretrainedModel
 
 
 def get_args():
@@ -61,7 +61,7 @@ def main():
     logger.info("prepare vocabulary, model")
     vocabulary = Vocabulary.from_files(args.vocabulary_dir)
 
-    model = LSTMClassifierPretrainedModel.from_pretrained(
+    model = WaveClassifierPretrainedModel.from_pretrained(
         pretrained_model_name_or_path=args.model_dir,
         num_labels=vocabulary.get_vocab_size(namespace="labels")
     )
@@ -99,8 +99,11 @@ def main():
     )
     script_quant_model = torch.jit.script(quantized_model)
     script_quant_model.save(serialization_dir / "script_quant_model.zip")
+
+    # onnx
+
     return
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
examples/sound_classification_by_lstm/step_6_export_onnx_model.py ADDED
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+from collections import defaultdict
+import json
+import logging
+from logging.handlers import TimedRotatingFileHandler
+import os
+import platform
+from pathlib import Path
+import sys
+import shutil
+from typing import List
+
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+
+import numpy as np
+import onnxruntime as ort
+import torch
+
+from toolbox.torch.utils.data.vocabulary import Vocabulary
+from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import WaveClassifierExport
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--vocabulary_dir", default="file_dir/best/vocabulary", type=str)
+    parser.add_argument("--model_dir", default="file_dir/best", type=str)
+
+    parser.add_argument("--serialization_dir", default="file_dir", type=str)
+
+    args = parser.parse_args()
+    return args
+
+
+def logging_config():
+    fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"
+
+    logging.basicConfig(format=fmt,
+                        datefmt="%m/%d/%Y %H:%M:%S",
+                        level=logging.DEBUG)
+    stream_handler = logging.StreamHandler()
+    stream_handler.setLevel(logging.INFO)
+    stream_handler.setFormatter(logging.Formatter(fmt))
+
+    logger = logging.getLogger(__name__)
+
+    return logger
+
+
+def main():
+    args = get_args()
+
+    serialization_dir = Path(args.serialization_dir)
+    output_file = serialization_dir / "model.onnx"
+
+    logger = logging_config()
+
+    logger.info("export models on CPU")
+    device = torch.device("cpu")
+
+    logger.info("prepare vocabulary, model")
+    vocabulary = Vocabulary.from_files(args.vocabulary_dir)
+
+    model_export = WaveClassifierExport.from_pretrained(
+        pretrained_model_name_or_path=args.model_dir,
+        num_labels=vocabulary.get_vocab_size(namespace="labels")
+    )
+    model_export.to(device)
+    model_export.eval()
+
+    waveform = 0 + 25 * np.random.randn(16000,)
+    waveform = np.array(waveform, dtype=np.int16)
+    waveform = waveform / (1 << 15)
+    waveform = torch.tensor(waveform, dtype=torch.float32)
+    waveform = torch.unsqueeze(waveform, dim=0)
+    waveform = waveform.to(device)
+
+    spec = model_export.wave_encoder.wave_to_mel_spectrogram(waveform) + 1e-6
+    spec = spec.log()
+    # shape = [b, f, t]
+    spec = spec.transpose(1, 2)
+    # shape = [b, t, f]
+
+    logger.info("export onnx model")
+
+    inputs = spec
+
+    lstm_layer_param = model_export.config.lstm_layer_param
+    num_layers = lstm_layer_param["num_layers"]
+    hidden_size = lstm_layer_param["hidden_size"]
+    h = torch.rand(size=(num_layers, 1, hidden_size), dtype=torch.float32)
+    c = torch.rand(size=(num_layers, 1, hidden_size), dtype=torch.float32)
+
+    # onnx
+    torch.onnx.export(model_export,
+                      args=(inputs, h, c),
+                      f=output_file.as_posix(),
+                      input_names=["inputs", "h", "c"],
+                      output_names=[
+                          "logits", "new_h", "new_c",
+                      ],
+                      dynamic_axes={
+                          "inputs": {0: "batch_size", 1: "time_steps"},
+                          "h": {1: "batch_size"},
+                          "c": {1: "batch_size"},
+                          "logits": {0: "batch_size"},
+                          "new_h": {1: "batch_size"},
+                          "new_c": {1: "batch_size"},
+                      })
+
+    ort_session = ort.InferenceSession(output_file.as_posix())
+    input_feed = {
+        "inputs": inputs.numpy(),
+        "h": h.numpy(),
+        "c": c.numpy(),
+    }
+    output_names = [
+        "logits", "new_h", "new_c"
+    ]
+    logits, new_h, new_c = ort_session.run(output_names, input_feed)
+    print(f"logits: {logits.shape}")
+    print(f"new_h: {new_h.shape}")
+    print(f"new_c: {new_c.shape}")
+    return
+
+
+if __name__ == "__main__":
+    main()
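
Note: the smoke test above only checks output shapes. A parity check against the eager PyTorch forward pass would also catch numerical export bugs. A minimal sketch, reusing model_export, inputs, h, c and the ONNX Runtime outputs from the script above (the tolerances are assumptions; LSTM kernels differ slightly between backends):

    # Sketch, not part of the commit: compare ONNX Runtime outputs with the
    # eager PyTorch forward pass on the same inputs.
    with torch.no_grad():
        torch_logits, torch_h, torch_c = model_export.forward(inputs, h, c)

    np.testing.assert_allclose(logits, torch_logits.numpy(), rtol=1e-3, atol=1e-5)
    np.testing.assert_allclose(new_h, torch_h.numpy(), rtol=1e-3, atol=1e-5)
    np.testing.assert_allclose(new_c, torch_c.numpy(), rtol=1e-3, atol=1e-5)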
examples/sound_classification_by_lstm/{step_6_infer.py → step_7_test_jit_model.py} RENAMED
File without changes
examples/sound_classification_by_lstm/step_8_test_onnx_model.py ADDED
@@ -0,0 +1,118 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import os
+from pathlib import Path
+import shutil
+import sys
+import tempfile
+import zipfile
+
+pwd = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(pwd, "../../"))
+
+import onnxruntime as ort
+from scipy.io import wavfile
+import torch
+import torchaudio
+
+from project_settings import project_path
+from toolbox.torch.utils.data.vocabulary import Vocabulary
+from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import WaveClassifierConfig
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model_file",
+        # default=(project_path / "trained_models/cc_audio_8.zip").as_posix(),
+        default=(project_path / "trained_models/voicemail-ms-my-2-ch64-lstm.zip").as_posix(),
+        type=str
+    )
+    parser.add_argument(
+        "--wav_file",
+        default=r"C:\Users\tianx\Desktop\a073d03d-d280-46df-9b2d-d904965f4500_zh-CN_h3f25ivhb0c0_1719478037746.wav",
+        type=str
+    )
+
+    parser.add_argument("--device", default="cpu", type=str)
+
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    model_file = Path(args.model_file)
+
+    device = torch.device(args.device)
+
+    with zipfile.ZipFile(model_file, "r") as f_zip:
+        out_root = Path(tempfile.gettempdir()) / "cc_audio_8"
+        print(out_root.as_posix())
+        if out_root.exists():
+            shutil.rmtree(out_root.as_posix())
+        out_root.mkdir(parents=True, exist_ok=True)
+        f_zip.extractall(path=out_root)
+
+    tgt_path = out_root / model_file.stem
+    config_file = tgt_path / "config.yaml"
+    onnx_model_file = tgt_path / "model.onnx"
+    vocab_path = tgt_path / "vocabulary"
+
+    config = WaveClassifierConfig.from_pretrained(config_file.as_posix())
+    ort_session = ort.InferenceSession(onnx_model_file.as_posix())
+    vocabulary = Vocabulary.from_files(vocab_path.as_posix())
+
+    # transform
+    wave_to_mel_spectrogram = torchaudio.transforms.MelSpectrogram(
+        sample_rate=config.mel_spectrogram_param["sample_rate"],
+        n_fft=config.mel_spectrogram_param["n_fft"],
+        win_length=config.mel_spectrogram_param["win_length"],
+        hop_length=config.mel_spectrogram_param["hop_length"],
+        f_min=config.mel_spectrogram_param["f_min"],
+        f_max=config.mel_spectrogram_param["f_max"],
+        window_fn=torch.hamming_window if config.mel_spectrogram_param["window_fn"] == "hamming" else torch.hann_window,
+        n_mels=config.mel_spectrogram_param["n_mels"],
+    )
+
+    # infer
+    sample_rate, waveform = wavfile.read(args.wav_file)
+    waveform = waveform[:16000]
+    waveform = waveform / (1 << 15)
+    waveform = torch.tensor(waveform, dtype=torch.float32)
+    waveform = torch.unsqueeze(waveform, dim=0)
+    waveform = waveform.to(device)
+
+    spec = wave_to_mel_spectrogram(waveform) + 1e-6
+    spec = spec.log()
+    # shape = [b, f, t]
+    spec = spec.transpose(1, 2)
+    # shape = [b, t, f]
+    inputs = spec
+
+    lstm_layer_param = config.lstm_layer_param
+    num_layers = lstm_layer_param["num_layers"]
+    hidden_size = lstm_layer_param["hidden_size"]
+    h = torch.zeros(size=(num_layers, 1, hidden_size), dtype=torch.float32)
+    c = torch.zeros(size=(num_layers, 1, hidden_size), dtype=torch.float32)
+
+    input_feed = {
+        "inputs": inputs.numpy(),
+        "h": h.numpy(),
+        "c": c.numpy(),
+    }
+    output_names = [
+        "logits", "new_h", "new_c"
+    ]
+    logits, new_h, new_c = ort_session.run(output_names, input_feed)
+    print(f"logits: {logits.shape}")
+    print(f"new_h: {new_h.shape}")
+    print(f"new_c: {new_c.shape}")
+
+    return
+
+
+if __name__ == "__main__":
+    main()
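
Note: the test above runs a single one-second chunk with zero-initialized states. Because the exported graph takes h/c as inputs and returns new_h/new_c, the same session can process a long recording chunk by chunk with the LSTM state carried forward. A sketch under that assumption, reusing ort_session, wave_to_mel_spectrogram, num_layers and hidden_size from the script above, and assuming waveform is the full [1, num_samples] float tensor (without the 16000-sample truncation):

    # Sketch, not part of the commit: stateful chunk-by-chunk inference.
    # Keep h/c as numpy arrays so each run's outputs feed straight back in.
    import numpy as np

    h = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)
    c = np.zeros((num_layers, 1, hidden_size), dtype=np.float32)
    chunk_size = 16000  # one second at 16 kHz (assumption)
    for start in range(0, waveform.shape[-1], chunk_size):
        chunk = waveform[:, start: start + chunk_size]
        if chunk.shape[-1] < chunk_size:
            break  # skip a trailing chunk shorter than the STFT window
        spec = (wave_to_mel_spectrogram(chunk) + 1e-6).log().transpose(1, 2)
        logits, h, c = ort_session.run(
            ["logits", "new_h", "new_c"],
            {"inputs": spec.numpy(), "h": h, "c": c},
        )
        print(start, logits)  # per-chunk logits; the LSTM state persists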
examples/sound_classification_by_lstm/{step_7_test_model.py → step_9_test_model.py} RENAMED
@@ -16,7 +16,7 @@ import torch
 
 from project_settings import project_path
 from toolbox.torch.utils.data.vocabulary import Vocabulary
-from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import LSTMClassifierPretrainedModel
+from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import WaveClassifierPretrainedModel
 
 
 def get_args():
@@ -58,7 +58,7 @@ def main():
 
     vocabulary = Vocabulary.from_files(vocab_path.as_posix())
 
-    model = LSTMClassifierPretrainedModel.from_pretrained(
+    model = WaveClassifierPretrainedModel.from_pretrained(
         pretrained_model_name_or_path=tgt_path.as_posix(),
     )
     model.to(device)
requirements.txt CHANGED
@@ -14,3 +14,5 @@ python-dotenv
 numpy
 onnxruntime
 scipy
+onnx
+onnxruntime
toolbox/torchaudio/models/lstm_audio_classifier/configuration_lstm_audio_classifier.py CHANGED
@@ -3,7 +3,7 @@
 from toolbox.torchaudio.configuration_utils import PretrainedConfig
 
 
-class LSTMClassifierConfig(PretrainedConfig):
+class WaveClassifierConfig(PretrainedConfig):
     def __init__(self,
                  mel_spectrogram_param: dict,
                  lstm_layer_param: dict,
@@ -11,7 +11,7 @@ class LSTMClassifierConfig(PretrainedConfig):
                  cls_head_param: dict,
                  **kwargs
                  ):
-        super(LSTMClassifierConfig, self).__init__(**kwargs)
+        super(WaveClassifierConfig, self).__init__(**kwargs)
         self.mel_spectrogram_param = mel_spectrogram_param
         self.lstm_layer_param = lstm_layer_param
         self.pooling_layer_param = pooling_layer_param
toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py CHANGED
@@ -7,7 +7,7 @@ import torch
 import torchaudio
 import torch.nn as nn
 from toolbox.torchaudio.configuration_utils import CONFIG_FILE, PretrainedConfig
-from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import LSTMClassifierConfig
+from toolbox.torchaudio.models.lstm_audio_classifier.configuration_lstm_audio_classifier import WaveClassifierConfig
 
 
 MODEL_FILE = "model.pt"
@@ -163,16 +163,18 @@ class WaveEncoder(nn.Module):
         )
 
     def forward(self, inputs: torch.Tensor):
-        # x: [batch_size, spec_dim, seq_length]
+        # x: [b, num_samples]
         x = inputs
 
         with torch.no_grad():
-            # shape = [batch_size, spec_dim, seq_length]
+            # shape = [b, f, t]
            x = self.wave_to_mel_spectrogram(x) + 1e-6
            x = x.log()
-            x = x - torch.mean(x, dim=-1, keepdim=True)
+            # x = x - torch.mean(x, dim=-1, keepdim=True)
 
+        # shape = [b, f, t]
         x = x.transpose(1, 2)
+        # shape = [b, t, f]
 
         features, h, c = self.lstm_layer.forward(x)
         # features: shape, [b, t, hidden_size]
@@ -216,13 +218,13 @@ class ClsHead(nn.Module):
         return logits
 
 
-class LSTMClassifier(nn.Module):
+class WaveClassifier(nn.Module):
     def __init__(self,
                  wave_encoder: WaveEncoder,
                  pooling_layer: PoolingLayer,
                  cls_head: ClsHead
                  ):
-        super(LSTMClassifier, self).__init__()
+        super(WaveClassifier, self).__init__()
         self.wave_encoder = wave_encoder
         self.pooling_layer = pooling_layer
         self.cls_head = cls_head
@@ -240,11 +242,11 @@ class LSTMClassifier(nn.Module):
         return logits
 
 
-class LSTMClassifierPretrainedModel(LSTMClassifier):
+class WaveClassifierPretrainedModel(WaveClassifier):
     def __init__(self,
-                 config: LSTMClassifierConfig,
+                 config: WaveClassifierConfig,
                  ):
-        super(LSTMClassifierPretrainedModel, self).__init__(
+        super(WaveClassifierPretrainedModel, self).__init__(
             wave_encoder=WaveEncoder(
                 mel_spectrogram_param=config.mel_spectrogram_param,
                 lstm_layer_param=config.lstm_layer_param,
@@ -265,7 +267,7 @@ class LSTMClassifierPretrainedModel(LSTMClassifier):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
-        config = LSTMClassifierConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        config = WaveClassifierConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
         model = cls(config)
 
@@ -301,14 +303,41 @@ class LSTMClassifierPretrainedModel(LSTMClassifier):
         return save_directory
 
 
+class WaveClassifierExport(WaveClassifierPretrainedModel):
+    def __init__(self, config: WaveClassifierConfig):
+        super(WaveClassifierExport, self).__init__(config=config)
+
+    def forward(self,
+                inputs: torch.Tensor,
+                h: torch.Tensor = None,
+                c: torch.Tensor = None,
+                ):
+        # inputs shape = [b, t, f]
+        features, h, c = self.wave_encoder.lstm_layer.forward(inputs, h=h, c=c)
+        # features: shape, [b, t, hidden_size]
+        # h: shape, [num_layers, b, hidden_size]
+        # c: shape, [num_layers, b, hidden_size]
+
+        # features shape: [b, t, f]
+        feature = self.pooling_layer.forward(features)
+        # feature shape: [b, f]
+        logits = self.cls_head.forward(feature)
+        # logits shape: [batch_size, num_classes]
+        return logits, h, c
+
+
 def main():
-    config = LSTMClassifierConfig.from_pretrained("examples/lstm_classifier.yaml")
-    model = LSTMClassifierPretrainedModel(config)
-    print(model)
-
-    inputs = torch.rand(size=(1, 16000), dtype=torch.float32)
-    outputs = model.forward(inputs)
-    print(outputs)
+    config = WaveClassifierConfig.from_pretrained("examples/lstm_classifier.yaml")
+    # model = WaveClassifierPretrainedModel(config)
+    # inputs = torch.rand(size=(1, 16000), dtype=torch.float32)
+    # outputs = model.forward(inputs)
+    # print(outputs)
+
+    model = WaveClassifierExport(config)
+    inputs = torch.rand(size=(1, 201, 80), dtype=torch.float32)
+    logits, h, c = model.forward(inputs)
+    print(logits)
+
     return
 
 
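
Note: WaveClassifierExport moves the mel frontend out of the exported graph and exposes the LSTM state, presumably to allow stateful (streaming) inference. Feeding an LSTM its previous (h, c) reproduces the full-sequence state exactly, which is what makes that safe. A self-contained sketch with a plain nn.LSTM standing in for the toolbox LSTM wrapper (an assumption; the sizes are illustrative):

    # Sketch, not part of the commit: the final LSTM state is identical
    # whether a sequence is processed whole or in stateful chunks.
    import torch
    import torch.nn as nn

    lstm = nn.LSTM(input_size=80, hidden_size=64, num_layers=2, batch_first=True)
    x = torch.rand(1, 200, 80)

    with torch.no_grad():
        _, (h_full, c_full) = lstm(x)

        state = None
        for chunk in x.split(100, dim=1):
            _, state = lstm(chunk, state)

    assert torch.allclose(h_full, state[0], atol=1e-6)
    assert torch.allclose(c_full, state[1], atol=1e-6)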