Spaces:
Sleeping
Sleeping
Christian J. Steinmetz
committed on
Commit
·
3c4fcfb
1
Parent(s):
4d2eb76
updating classifier configs and adding in kwargs to pretrained models
Browse files- cfg/model/cls_panns_16k.yaml +1 -1
- cfg/model/{cls_panns_44k.yaml → cls_panns_48k.yaml} +0 -0
- cfg/model/cls_panns_48k_64.yaml +17 -0
- cfg/model/{cls_panns_44k_mixup.yaml → cls_panns_48k_mixup.yaml} +0 -0
- cfg/model/cls_panns_48k_specaugment.yaml +16 -0
- cfg/model/cls_panns_48k_specaugment_label_smoothing.yaml +17 -0
- cfg/model/cls_panns_pt.yaml +1 -0
- remfx/classifier.py +8 -4
cfg/model/cls_panns_16k.yaml
CHANGED
|
@@ -11,5 +11,5 @@ model:
|
|
| 11 |
hop_length: 512
|
| 12 |
n_mels: 128
|
| 13 |
sample_rate: ${sample_rate}
|
| 14 |
-
model_sample_rate:
|
| 15 |
|
|
|
|
| 11 |
hop_length: 512
|
| 12 |
n_mels: 128
|
| 13 |
sample_rate: ${sample_rate}
|
| 14 |
+
model_sample_rate: 16000
|
| 15 |
|
cfg/model/{cls_panns_44k.yaml → cls_panns_48k.yaml}
RENAMED
|
File without changes
|
cfg/model/cls_panns_48k_64.yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
model:
|
| 3 |
+
_target_: remfx.models.FXClassifier
|
| 4 |
+
lr: 3e-4
|
| 5 |
+
lr_weight_decay: 1e-3
|
| 6 |
+
sample_rate: ${sample_rate}
|
| 7 |
+
mixup: False
|
| 8 |
+
network:
|
| 9 |
+
_target_: remfx.classifier.Cnn14
|
| 10 |
+
num_classes: ${num_classes}
|
| 11 |
+
n_fft: 2048
|
| 12 |
+
hop_length: 512
|
| 13 |
+
n_mels: 64
|
| 14 |
+
sample_rate: ${sample_rate}
|
| 15 |
+
model_sample_rate: ${sample_rate}
|
| 16 |
+
specaugment: False
|
| 17 |
+
|
cfg/model/{cls_panns_44k_mixup.yaml → cls_panns_48k_mixup.yaml}
RENAMED
|
File without changes
|
cfg/model/cls_panns_48k_specaugment.yaml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
model:
|
| 3 |
+
_target_: remfx.models.FXClassifier
|
| 4 |
+
lr: 3e-4
|
| 5 |
+
lr_weight_decay: 1e-3
|
| 6 |
+
sample_rate: ${sample_rate}
|
| 7 |
+
mixup: False
|
| 8 |
+
network:
|
| 9 |
+
_target_: remfx.classifier.Cnn14
|
| 10 |
+
num_classes: ${num_classes}
|
| 11 |
+
n_fft: 2048
|
| 12 |
+
hop_length: 512
|
| 13 |
+
n_mels: 128
|
| 14 |
+
sample_rate: ${sample_rate}
|
| 15 |
+
model_sample_rate: ${sample_rate}
|
| 16 |
+
specaugment: True
|
cfg/model/cls_panns_48k_specaugment_label_smoothing.yaml
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# @package _global_
|
| 2 |
+
model:
|
| 3 |
+
_target_: remfx.models.FXClassifier
|
| 4 |
+
lr: 3e-4
|
| 5 |
+
lr_weight_decay: 1e-3
|
| 6 |
+
sample_rate: ${sample_rate}
|
| 7 |
+
mixup: False
|
| 8 |
+
label_smoothing: 0.15
|
| 9 |
+
network:
|
| 10 |
+
_target_: remfx.classifier.Cnn14
|
| 11 |
+
num_classes: ${num_classes}
|
| 12 |
+
n_fft: 2048
|
| 13 |
+
hop_length: 512
|
| 14 |
+
n_mels: 128
|
| 15 |
+
sample_rate: ${sample_rate}
|
| 16 |
+
model_sample_rate: ${sample_rate}
|
| 17 |
+
specaugment: True
|
cfg/model/cls_panns_pt.yaml
CHANGED
|
@@ -4,6 +4,7 @@ model:
|
|
| 4 |
lr: 3e-4
|
| 5 |
lr_weight_decay: 1e-3
|
| 6 |
sample_rate: ${sample_rate}
|
|
|
|
| 7 |
network:
|
| 8 |
_target_: remfx.classifier.PANNs
|
| 9 |
num_classes: ${num_classes}
|
|
|
|
| 4 |
lr: 3e-4
|
| 5 |
lr_weight_decay: 1e-3
|
| 6 |
sample_rate: ${sample_rate}
|
| 7 |
+
mixup: False
|
| 8 |
network:
|
| 9 |
_target_: remfx.classifier.PANNs
|
| 10 |
num_classes: ${num_classes}
|
remfx/classifier.py
CHANGED
|
@@ -31,7 +31,7 @@ class PANNs(torch.nn.Module):
|
|
| 31 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 32 |
)
|
| 33 |
|
| 34 |
-
def forward(self, x: torch.Tensor):
|
| 35 |
with torch.no_grad():
|
| 36 |
x = self.resample(x)
|
| 37 |
embed = panns_hear.get_scene_embeddings(x.view(x.shape[0], -1), self.model)
|
|
@@ -59,7 +59,7 @@ class Wav2CLIP(nn.Module):
|
|
| 59 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 60 |
)
|
| 61 |
|
| 62 |
-
def forward(self, x: torch.Tensor):
|
| 63 |
with torch.no_grad():
|
| 64 |
x = self.resample(x)
|
| 65 |
embed = wav2clip_hear.get_scene_embeddings(
|
|
@@ -89,7 +89,7 @@ class VGGish(nn.Module):
|
|
| 89 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 90 |
)
|
| 91 |
|
| 92 |
-
def forward(self, x: torch.Tensor):
|
| 93 |
with torch.no_grad():
|
| 94 |
x = self.resample(x)
|
| 95 |
embed = hearbaseline.vggish.get_scene_embeddings(
|
|
@@ -119,7 +119,7 @@ class wav2vec2(nn.Module):
|
|
| 119 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 120 |
)
|
| 121 |
|
| 122 |
-
def forward(self, x: torch.Tensor):
|
| 123 |
with torch.no_grad():
|
| 124 |
x = self.resample(x)
|
| 125 |
embed = hearbaseline.wav2vec2.get_scene_embeddings(
|
|
@@ -179,6 +179,10 @@ class Cnn14(nn.Module):
|
|
| 179 |
orig_freq=sample_rate, new_freq=model_sample_rate
|
| 180 |
)
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
def init_weight(self):
|
| 183 |
init_bn(self.bn0)
|
| 184 |
init_layer(self.fc1)
|
|
|
|
| 31 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 32 |
)
|
| 33 |
|
| 34 |
+
def forward(self, x: torch.Tensor, **kwargs):
|
| 35 |
with torch.no_grad():
|
| 36 |
x = self.resample(x)
|
| 37 |
embed = panns_hear.get_scene_embeddings(x.view(x.shape[0], -1), self.model)
|
|
|
|
| 59 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 60 |
)
|
| 61 |
|
| 62 |
+
def forward(self, x: torch.Tensor, **kwargs):
|
| 63 |
with torch.no_grad():
|
| 64 |
x = self.resample(x)
|
| 65 |
embed = wav2clip_hear.get_scene_embeddings(
|
|
|
|
| 89 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 90 |
)
|
| 91 |
|
| 92 |
+
def forward(self, x: torch.Tensor, **kwargs):
|
| 93 |
with torch.no_grad():
|
| 94 |
x = self.resample(x)
|
| 95 |
embed = hearbaseline.vggish.get_scene_embeddings(
|
|
|
|
| 119 |
torch.nn.Linear(hidden_dim, num_classes),
|
| 120 |
)
|
| 121 |
|
| 122 |
+
def forward(self, x: torch.Tensor, **kwargs):
|
| 123 |
with torch.no_grad():
|
| 124 |
x = self.resample(x)
|
| 125 |
embed = hearbaseline.wav2vec2.get_scene_embeddings(
|
|
|
|
| 179 |
orig_freq=sample_rate, new_freq=model_sample_rate
|
| 180 |
)
|
| 181 |
|
| 182 |
+
if self.specaugment:
|
| 183 |
+
self.freq_mask = torchaudio.transforms.FrequencyMasking(64, True)
|
| 184 |
+
self.time_mask = torchaudio.transforms.TimeMasking(128, True)
|
| 185 |
+
|
| 186 |
def init_weight(self):
|
| 187 |
init_bn(self.bn0)
|
| 188 |
init_layer(self.fc1)
|