Add code and weights

- example.ipynb +116 -0
- models/baseline.py +260 -0
- models/helpers/utils.py +11 -0
- models/mel.py +97 -0
- split10/config.json +32 -0
- split10/model.safetensors +3 -0
- split100/config.json +32 -0
- split100/model.safetensors +3 -0
- split25/config.json +32 -0
- split25/model.safetensors +3 -0
- split5/config.json +32 -0
- split5/model.safetensors +3 -0
- split50/config.json +32 -0
- split50/model.safetensors +3 -0
example.ipynb
ADDED
@@ -0,0 +1,116 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09225787-6a4b-4484-b00b-d0f731915a81",
   "metadata": {},
   "outputs": [],
   "source": [
    "from models.baseline import Network\n",
    "from models.mel import AugmentMelSTFT\n",
    "import soundfile as sf\n",
    "import torch\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c377b699-2c2e-468e-88b0-6767338988c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "audio_path = \"/path/to/audio.wav\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa950347-df0d-4135-801a-d54525c57e58",
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import display, Audio\n",
    "\n",
    "display(Audio(audio_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79faad26-0f20-439d-b152-10f4666db41d",
   "metadata": {},
   "outputs": [],
   "source": [
    "mel = AugmentMelSTFT().eval()\n",
    "model = Network.from_pretrained(\"split5\").eval()\n",
    "\n",
    "audio, sr = sf.read(audio_path, dtype=np.float32)\n",
    "assert sr == 32_000\n",
    "\n",
    "audio = torch.as_tensor(audio)\n",
    "\n",
    "# audio.shape: (samples,)\n",
    "\n",
    "audio = audio.unsqueeze(0)\n",
    "\n",
    "# audio.shape: (1, samples)\n",
    "\n",
    "with torch.no_grad():\n",
    "    mel_spec = mel(audio)\n",
    "\n",
    "# mel_spec.shape: (1, mel_bins, frames)\n",
    "\n",
    "mel_spec = mel_spec.unsqueeze(0)\n",
    "\n",
    "# mel_spec.shape: (1, 1, mel_bins, frames)\n",
    "\n",
    "with torch.no_grad():\n",
    "    logits = model(mel_spec)\n",
    "\n",
    "# logits.shape: (1, classes)\n",
    "\n",
    "logits = logits.squeeze(0)\n",
    "\n",
    "tau2022_classes = [\n",
    "    \"airport\",\n",
    "    \"bus\",\n",
    "    \"metro\",\n",
    "    \"metro_station\",\n",
    "    \"park\",\n",
    "    \"public_square\",\n",
    "    \"shopping_mall\",\n",
    "    \"street_pedestrian\",\n",
    "    \"street_traffic\",\n",
    "    \"tram\"\n",
    "]\n",
    "\n",
    "best_prediction_idx = torch.argmax(logits)\n",
    "\n",
    "scores = torch.softmax(logits, dim=0)\n",
    "\n",
    "print(f\"Prediction: {tau2022_classes[best_prediction_idx]} (score: {scores[best_prediction_idx]:0.2f})\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
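Note: the final cell keeps only the argmax. If a ranked view of the class scores is useful, a small extension of that cell (a sketch, not part of the committed notebook) could be:

    # hypothetical follow-up cell: print the top-3 scene classes
    top = torch.topk(scores, k=3)
    for score, idx in zip(top.values, top.indices):
        print(f"{tau2022_classes[idx]}: {score:0.2f}")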
models/baseline.py
ADDED
@@ -0,0 +1,260 @@
import torch
import torch.nn as nn
from torchvision.ops.misc import Conv2dNormActivation

from .helpers.utils import make_divisible
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig


def initialize_weights(m):
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight, mode="fan_out")
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm, nn.LayerNorm)):
        nn.init.ones_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, 0, 0.01)
        if m.bias is not None:
            nn.init.zeros_(m.bias)


class Block(nn.Module):
    def __init__(self, in_channels, out_channels, expansion_rate, stride):
        super().__init__()
        exp_channels = make_divisible(in_channels * expansion_rate, 8)

        # create the three factorized convs that make up the inverted bottleneck block
        exp_conv = Conv2dNormActivation(
            in_channels,
            exp_channels,
            kernel_size=1,
            stride=1,
            norm_layer=nn.BatchNorm2d,
            activation_layer=nn.ReLU,
            inplace=False,
        )

        # depthwise convolution with possible stride
        depth_conv = Conv2dNormActivation(
            exp_channels,
            exp_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=exp_channels,
            norm_layer=nn.BatchNorm2d,
            activation_layer=nn.ReLU,
            inplace=False,
        )

        # pointwise projection back to out_channels, no activation
        proj_conv = Conv2dNormActivation(
            exp_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            norm_layer=nn.BatchNorm2d,
            activation_layer=None,
            inplace=False,
        )
        self.after_block_activation = nn.ReLU()

        if in_channels == out_channels:
            self.use_shortcut = True
            if stride == 1 or stride == (1, 1):
                self.shortcut = nn.Sequential()
            else:
                # average pooling required for the shortcut to match the strided branch
                self.shortcut = nn.Sequential(
                    nn.AvgPool2d(kernel_size=3, stride=stride, padding=1),
                    nn.Sequential(),
                )
        else:
            self.use_shortcut = False

        self.block = nn.Sequential(exp_conv, depth_conv, proj_conv)

    def forward(self, x):
        if self.use_shortcut:
            x = self.block(x) + self.shortcut(x)
        else:
            x = self.block(x)
        x = self.after_block_activation(x)
        return x


class NetworkConfig(PretrainedConfig):
    def __init__(
        self,
        n_classes=10,
        in_channels=1,
        base_channels=32,
        channels_multiplier=2.3,
        expansion_rate=3.0,
        n_blocks=(3, 2, 1),
        strides=dict(b2=(1, 1), b3=(1, 2), b4=(2, 1)),
        add_feats=False,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.n_classes = n_classes
        self.in_channels = in_channels
        self.base_channels = base_channels
        self.channels_multiplier = channels_multiplier
        self.expansion_rate = expansion_rate
        self.n_blocks = n_blocks
        self.strides = strides
        self.add_feats = add_feats


class Network(PreTrainedModel):
    config_class = NetworkConfig

    def __init__(self, config):
        super().__init__(config)
        n_classes = config.n_classes
        in_channels = config.in_channels
        base_channels = config.base_channels
        channels_multiplier = config.channels_multiplier
        expansion_rate = config.expansion_rate
        n_blocks = config.n_blocks
        strides = config.strides
        n_stages = len(n_blocks)

        self.add_feats = config.add_feats

        base_channels = make_divisible(base_channels, 8)
        channels_per_stage = [base_channels] + [
            make_divisible(base_channels * channels_multiplier**stage_id, 8)
            for stage_id in range(n_stages)
        ]
        self.total_block_count = 0

        self.in_c = nn.Sequential(
            Conv2dNormActivation(
                in_channels,
                channels_per_stage[0] // 4,
                activation_layer=torch.nn.ReLU,
                kernel_size=3,
                stride=2,
                inplace=False,
            ),
            Conv2dNormActivation(
                channels_per_stage[0] // 4,
                channels_per_stage[0],
                activation_layer=torch.nn.ReLU,
                kernel_size=3,
                stride=2,
                inplace=False,
            ),
        )

        self.stages = nn.Sequential()
        for stage_id in range(n_stages):
            stage = self._make_stage(
                channels_per_stage[stage_id],
                channels_per_stage[stage_id + 1],
                n_blocks[stage_id],
                strides=strides,
                expansion_rate=expansion_rate,
            )
            self.stages.add_module(f"s{stage_id + 1}", stage)

        ff_list = []
        ff_list += [
            nn.Conv2d(
                channels_per_stage[-1],
                n_classes,
                kernel_size=(1, 1),
                stride=(1, 1),
                padding=0,
                bias=False,
            ),
            nn.BatchNorm2d(n_classes),
        ]

        ff_list.append(nn.AdaptiveAvgPool2d((1, 1)))

        self.feed_forward = nn.Sequential(*ff_list)

        self.apply(initialize_weights)

    def _make_stage(self, in_channels, out_channels, n_blocks, strides, expansion_rate):
        stage = nn.Sequential()
        for index in range(n_blocks):
            block_id = self.total_block_count + 1
            bname = f"b{block_id}"
            self.total_block_count = self.total_block_count + 1
            if bname in strides:
                stride = strides[bname]
            else:
                stride = (1, 1)

            block = self._make_block(
                in_channels, out_channels, stride=stride, expansion_rate=expansion_rate
            )
            stage.add_module(bname, block)

            in_channels = out_channels
        return stage

    def _make_block(self, in_channels, out_channels, stride, expansion_rate):
        block = Block(in_channels, out_channels, expansion_rate, stride)
        return block

    def _forward_conv(self, x):
        x = self.in_c(x)
        x = self.stages(x)
        return x

    def forward(self, x):
        y = self._forward_conv(x)
        x = self.feed_forward(y)
        logits = x.squeeze(2).squeeze(2)
        if self.add_feats:
            return logits, y
        else:
            return logits


def get_model(
    n_classes=10,
    in_channels=1,
    base_channels=32,
    channels_multiplier=2.3,
    expansion_rate=3.0,
    n_blocks=(3, 2, 1),
    strides=None,
    add_feats=False,
):
    """
    @param n_classes: number of classes to predict
    @param in_channels: number of input channels to the network; for audio this is 1 by default
    @param base_channels: number of channels after in_c
    @param channels_multiplier: controls the increase in the width of the network after each stage
    @param expansion_rate: determines the expansion rate in inverted bottleneck blocks
    @param n_blocks: number of blocks in each stage
    @param strides: strides for specific blocks; the default value is set below
    @return: full neural network model based on the specified configs
    """

    if strides is None:
        strides = dict(b2=(1, 1), b3=(1, 2), b4=(2, 1))

    model_config = {
        "n_classes": n_classes,
        "in_channels": in_channels,
        "base_channels": base_channels,
        "channels_multiplier": channels_multiplier,
        "expansion_rate": expansion_rate,
        "n_blocks": n_blocks,
        "strides": strides,
        "add_feats": add_feats,
    }

    m = Network(NetworkConfig(**model_config))
    return m
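For orientation, the network consumes a 4-D batch of log-mel spectrograms and returns one logit per class. A minimal smoke test (a sketch, not part of the file; the shapes are illustrative, with 128 mel bins matching the AugmentMelSTFT default):

    model = get_model()                  # defaults: 10 classes, 1 input channel
    dummy = torch.zeros(1, 1, 128, 100)  # (batch, channels, mel_bins, frames)
    with torch.no_grad():
        logits = model(dummy)
    print(logits.shape)                  # torch.Size([1, 10])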
models/helpers/utils.py
ADDED
@@ -0,0 +1,11 @@
from typing import Optional


def make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
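make_divisible rounds a (possibly fractional) channel count to the nearest multiple of divisor, never returning less than min_value and never shrinking the input by more than 10%. A few illustrative values (inputs chosen for this sketch, not taken from the repo):

    assert make_divisible(73.6, 8) == 72  # 32 * 2.3, the default base_channels * channels_multiplier
    assert make_divisible(30, 8) == 32    # rounds to the nearest multiple of 8
    assert make_divisible(23, 16) == 32   # 16 would be a >10% drop from 23, so the guard rounds up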
models/mel.py
ADDED
@@ -0,0 +1,97 @@
import torch
import torch.nn as nn
import torchaudio


class AugmentMelSTFT(nn.Module):
    def __init__(self, n_mels=128, sr=32000, win_length=800, hopsize=320, n_fft=1024, freqm=48, timem=192,
                 fmin=0.0, fmax=None, fmin_aug_range=10, fmax_aug_range=2000, norm_mel=False):
        """
        :param n_mels: number of mel bins
        :param sr: sampling rate used (same as passed as argument to dataset)
        :param win_length: fft window length in samples
        :param hopsize: fft hop size in samples
        :param n_fft: length of fft
        :param freqm: maximum possible length of mask along frequency dimension
        :param timem: maximum possible length of mask along time dimension
        :param fmin: minimum frequency used
        :param fmax: maximum frequency used
        :param fmin_aug_range: randomly changes min frequency
        :param fmax_aug_range: randomly changes max frequency
        """
        torch.nn.Module.__init__(self)
        # adapted from: https://github.com/CPJKU/kagglebirds2020/commit/70f8308b39011b09d41eb0f4ace5aa7d2b0e806e

        self.win_length = win_length
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.sr = sr
        self.fmin = fmin
        if fmax is None:
            # fmax defaults to sampling_rate / 2 -> Nyquist!
            fmax = sr // 2 - fmax_aug_range // 2
            print(f"Warning: FMAX is None, setting it to {fmax}")
        self.fmax = fmax
        self.hopsize = hopsize
        # buffers are not trained by the optimizer; persistent=False also keeps them out of the model's state dict
        self.register_buffer('window',
                             torch.hann_window(win_length, periodic=False),
                             persistent=False)
        assert fmin_aug_range >= 1, f"fmin_aug_range={fmin_aug_range} should be >=1; 1 means no augmentation"
        assert fmax_aug_range >= 1, f"fmax_aug_range={fmax_aug_range} should be >=1; 1 means no augmentation"
        self.fmin_aug_range = fmin_aug_range
        self.fmax_aug_range = fmax_aug_range

        self.register_buffer("preemphasis_coefficient", torch.as_tensor([[[-.97, 1]]]), persistent=False)
        if freqm == 0:
            self.freqm = torch.nn.Identity()
        else:
            self.freqm = torchaudio.transforms.FrequencyMasking(freqm, iid_masks=True)
        if timem == 0:
            self.timem = torch.nn.Identity()
        else:
            self.timem = torchaudio.transforms.TimeMasking(timem, iid_masks=True)
        self.norm_mel = norm_mel

    def forward(self, x):
        # shape: batch size x samples
        # most of the energy sits at the lower end of the spectrum; pre-emphasis compensates for the
        # average spectral shape
        x = nn.functional.conv1d(x.unsqueeze(1), self.preemphasis_coefficient).squeeze(1)

        # Short-Time Fourier Transform using a Hann window
        x = torch.stft(x, self.n_fft, hop_length=self.hopsize, win_length=self.win_length,
                       center=True, normalized=False, window=self.window, return_complex=False)
        # shape: batch size x freqs (n_fft/2 + 1) x timeframes (samples/hop_length) x 2 (real and imaginary components)

        # calculate the power spectrum
        x = (x ** 2).sum(dim=-1)
        fmin = self.fmin + torch.randint(self.fmin_aug_range, (1,)).item()
        fmax = self.fmax + self.fmax_aug_range // 2 - torch.randint(self.fmax_aug_range, (1,)).item()

        if not self.training:
            # don't augment eval data
            fmin = self.fmin
            fmax = self.fmax

        # create the mel filterbank
        mel_basis, _ = torchaudio.compliance.kaldi.get_mel_banks(self.n_mels, self.n_fft, self.sr,
                                                                 fmin, fmax, vtln_low=100.0, vtln_high=-500.,
                                                                 vtln_warp_factor=1.0)
        mel_basis = torch.as_tensor(torch.nn.functional.pad(mel_basis, (0, 1), mode='constant', value=0),
                                    device=x.device)
        if self.norm_mel:
            mel_basis = mel_basis / mel_basis.sum(1)[:, None]
        # apply the mel filterbank to the power spectrogram
        with torch.cuda.amp.autocast(enabled=False):
            melspec = torch.matmul(mel_basis, x)
        # calculate the log mel spectrogram
        melspec = (melspec + 0.00001).log()

        if self.training:
            # frequency and time masking are applied only on training data
            melspec = self.freqm(melspec)
            melspec = self.timem(melspec)

        melspec = (melspec + 4.5) / 5.  # fast normalization
        return melspec
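As a quick sanity check of the output shape (a sketch assuming the defaults: 32 kHz input, 128 mel bins, hop size 320; the 10-second duration is just an example):

    mel = AugmentMelSTFT().eval()      # eval() disables the random fmin/fmax jitter and masking
    waveform = torch.zeros(1, 320000)  # (batch, samples): 10 s at 32 kHz
    with torch.no_grad():
        spec = mel(waveform)
    print(spec.shape)                  # torch.Size([1, 128, 1000])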
split10/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_feats": false,
  "architectures": [
    "Network"
  ],
  "base_channels": 32,
  "channels_multiplier": 1.8,
  "expansion_rate": 2.1,
  "in_channels": 1,
  "n_blocks": [
    3,
    2,
    1
  ],
  "n_classes": 10,
  "strides": {
    "b2": [
      1,
      1
    ],
    "b3": [
      1,
      2
    ],
    "b4": [
      2,
      1
    ]
  },
  "torch_dtype": "float16",
  "transformers_version": "4.37.1"
}
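The five split*/config.json files below are identical; the checkpoints differ only in their trained weights. Any of the directories can therefore be loaded the same way the notebook loads split5, e.g. (local checkpoint directory):

    model = Network.from_pretrained("split10").eval()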
split10/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6b57c03e4b9ecd67b1c33d8ed471bed10b4e5dfe872b2fa9bb1bb6bc3405b9b0
size 139504
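The model.safetensors entries are Git LFS pointer files; the 139,504-byte float16 weight blobs themselves are fetched by an LFS-enabled clone, e.g.:

    git lfs install
    git lfs pull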
split100/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_feats": false,
  "architectures": [
    "Network"
  ],
  "base_channels": 32,
  "channels_multiplier": 1.8,
  "expansion_rate": 2.1,
  "in_channels": 1,
  "n_blocks": [
    3,
    2,
    1
  ],
  "n_classes": 10,
  "strides": {
    "b2": [
      1,
      1
    ],
    "b3": [
      1,
      2
    ],
    "b4": [
      2,
      1
    ]
  },
  "torch_dtype": "float16",
  "transformers_version": "4.37.1"
}
split100/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:02791b0e81bad11f603806e00252e76ceb4fb62f0d4880c115ed95513262b172
size 139504
split25/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_feats": false,
  "architectures": [
    "Network"
  ],
  "base_channels": 32,
  "channels_multiplier": 1.8,
  "expansion_rate": 2.1,
  "in_channels": 1,
  "n_blocks": [
    3,
    2,
    1
  ],
  "n_classes": 10,
  "strides": {
    "b2": [
      1,
      1
    ],
    "b3": [
      1,
      2
    ],
    "b4": [
      2,
      1
    ]
  },
  "torch_dtype": "float16",
  "transformers_version": "4.37.1"
}
split25/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:10a46c2d2b99b39965a4fa4fbac6171275efb01bd7da08a7f768779df5effba0
size 139504
split5/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_feats": false,
  "architectures": [
    "Network"
  ],
  "base_channels": 32,
  "channels_multiplier": 1.8,
  "expansion_rate": 2.1,
  "in_channels": 1,
  "n_blocks": [
    3,
    2,
    1
  ],
  "n_classes": 10,
  "strides": {
    "b2": [
      1,
      1
    ],
    "b3": [
      1,
      2
    ],
    "b4": [
      2,
      1
    ]
  },
  "torch_dtype": "float16",
  "transformers_version": "4.37.1"
}
split5/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5401136f83a10102de48f37c724a50e188e4b89186177a9ceb2945ddc57f5b49
size 139504
split50/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_feats": false,
  "architectures": [
    "Network"
  ],
  "base_channels": 32,
  "channels_multiplier": 1.8,
  "expansion_rate": 2.1,
  "in_channels": 1,
  "n_blocks": [
    3,
    2,
    1
  ],
  "n_classes": 10,
  "strides": {
    "b2": [
      1,
      1
    ],
    "b3": [
      1,
      2
    ],
    "b4": [
      2,
      1
    ]
  },
  "torch_dtype": "float16",
  "transformers_version": "4.37.1"
}
split50/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:513d8bc217266fbb3a3f7354e3884eb5478550c47d9daa44cf1cdb7b08c54984
size 139504