Commit 51f4763
Parent(s): 17a2a7d

updated production build to use multiple overlapping samples

Files changed:
  app.py                           +23 -5
  models/config/train_local.yaml    +8 -9
  models/residual.py                +2 -5
  preprocessing/dataset.py          +1 -5
  preprocessing/pipelines.py       +10 -6
app.py  CHANGED

@@ -7,7 +7,7 @@ from functools import cache
 from pathlib import Path
 from models.residual import ResidualDancer
 from models.training_environment import TrainingEnvironment
-from preprocessing.pipelines import SpectrogramProductionPipeline
+from preprocessing.pipelines import SpectrogramProductionPipeline, WaveformPreprocessing
 import torch
 from torch import nn
 import yaml

@@ -17,6 +17,8 @@ CONFIG_FILE = Path("models/weights/ResidualDancer/multilabel/config.yaml")
 
 DANCE_MAPPING_FILE = Path("data/dance_mapping.csv")
 
+MIN_DURATION = 3.0
+
 
 class DancePredictor:
     def __init__(

@@ -37,6 +39,9 @@ class DancePredictor:
         self.labels = np.array(labels)
         self.device = device
         self.model = self.get_model(weight_path)
+        self.process_waveform = WaveformPreprocessing(
+            resample_frequency * expected_duration
+        )
         self.extractor = SpectrogramProductionPipeline()
 
     def get_model(self, weight_path: str) -> nn.Module:

@@ -87,10 +92,21 @@ class DancePredictor:
         waveform = torchaudio.functional.resample(
             waveform, sample_rate, self.resample_frequency
         )
-
-
+        window_size = self.resample_frequency * self.expected_duration
+        n_preds = int(waveform.shape[1] // (window_size / 2))
+        step_size = int(waveform.shape[1] / n_preds)
+
+        inputs = [
+            waveform[:, i * step_size : i * step_size + window_size]
+            for i in range(n_preds)
+        ]
+        features = [self.extractor(window) for window in inputs]
+        features = torch.stack(features).to(self.device)
         results = self.model(features)
-
+        # Convert to probabilities
+        results = nn.functional.softmax(results, dim=1)
+        # Take average prediction over all of the windows
+        results = results.mean(dim=0)
         results = results.detach().cpu().numpy()
 
         result_mask = results > self.threshold

@@ -116,6 +132,9 @@ def predict(audio: tuple[int, np.ndarray]) -> list[str]:
     if audio is None:
         return "Dance Not Found"
     sample_rate, waveform = audio
+    duration = len(waveform) / sample_rate
+    if duration < MIN_DURATION:
+        return f"Please record at least {MIN_DURATION} seconds of audio"
 
     model = get_model(CONFIG_FILE)
     results = model(waveform, sample_rate)

@@ -133,7 +152,6 @@ def demo():
 
     recording_interface = gr.Interface(
         fn=predict,
-        description="Record at least **6 seconds** of the song.",
         inputs=gr.Audio(source="microphone", label="Song Recording"),
         outputs=gr.Label(label="Dances"),
         examples=example_audio,
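The core of this change is the overlapping-window inference above: rather than classifying a single clip, the predictor slices the resampled waveform into roughly half-overlapping windows, runs the model on each, and averages the softmax outputs. A minimal standalone sketch of that window arithmetic follows; the 16 kHz resample frequency and 6-second duration are assumed values, since neither appears in this diff.

import torch

resample_frequency = 16_000   # assumed; not shown in this diff
expected_duration = 6         # assumed; not shown in this diff
waveform = torch.randn(1, 20 * resample_frequency)  # 20 s of mono audio

window_size = resample_frequency * expected_duration
# One prediction per half-window of audio -> roughly 50% overlap.
n_preds = int(waveform.shape[1] // (window_size / 2))
step_size = int(waveform.shape[1] / n_preds)

windows = [
    waveform[:, i * step_size : i * step_size + window_size]
    for i in range(n_preds)
]
print(n_preds, step_size, [w.shape[1] for w in windows])
# -> 6 53333 [96000, 96000, 96000, 96000, 96000, 53335]

Note that the last window can run past the end of the waveform and come back short; the WaveformPreprocessing module constructed in __init__ pads short clips to the expected length, which is presumably why it was added in the same commit.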
    	
models/config/train_local.yaml  CHANGED

@@ -1,12 +1,15 @@
 training_fn: residual.train_residual_dancer
-checkpoint: lightning_logs/version_176/checkpoints/epoch=12-step=40404.ckpt
 device: mps
 seed: 42
 dance_ids: &dance_ids
   - BCH
+  - BOL
   - CHA
-  - JIV
   - ECS
+  - HST
+  - LHP
+  - NC2
+  - JIV
   - QST
   - RMB
   - SFT

@@ -20,8 +23,7 @@ dance_ids: &dance_ids
 data_module:
   batch_size: 128
   num_workers: 10
-
-  test_proportion: 0.001
+  test_proportion: 0.15
 
 datasets:
   preprocessing.dataset.BestBallroomDataset:

@@ -31,7 +33,7 @@ datasets:
 
   preprocessing.dataset.Music4DanceDataset:
     song_data_path: data/songs_cleaned.csv
-    song_audio_path: data/samples 
+    song_audio_path: data/samples
     class_list: *dance_ids
     multi_label: True
    min_votes: 1

@@ -56,7 +58,4 @@ trainer:
   # overfit_batches: 1
 
 training_environment:
-  learning_rate: 0.
-  # loggers:
-  #   models.training_environment.SpectrogramLogger:
-  #     frequency: 100
+  learning_rate: 0.00053
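The dance_ids list is declared with a YAML anchor (&dance_ids) and reused through aliases (*dance_ids), so adding BOL, HST, LHP, and NC2 in one place propagates to every dataset's class_list. A quick demonstration of that mechanism with PyYAML:

import yaml

config = yaml.safe_load("""
dance_ids: &dance_ids
  - BCH
  - BOL
  - CHA
class_list: *dance_ids
""")
# The alias resolves to the same list object parsed at the anchor.
assert config["class_list"] == ["BCH", "BOL", "CHA"]
assert config["class_list"] is config["dance_ids"]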
    	
models/residual.py  CHANGED

@@ -119,14 +119,11 @@ def train_residual_dancer(config: dict):
     data = DanceDataModule(dataset, **config["data_module"])
     model = ResidualDancer(n_classes=len(TARGET_CLASSES), **config["model"])
     label_weights = data.get_label_weights().to(DEVICE)
-    criterion = LabelWeightedBCELoss(
-        label_weights
-    )  # nn.CrossEntropyLoss(label_weights)
+    criterion = LabelWeightedBCELoss(label_weights)
 
     train_env = TrainingEnvironment(model, criterion, config)
     callbacks = [
-
-        cb.EarlyStopping("val/loss", patience=1),
+        cb.EarlyStopping("val/loss", patience=2),
         cb.StochasticWeightAveraging(1e-2),
         cb.RichProgressBar(),
     ]
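Besides the tidied criterion call, the EarlyStopping patience bump from 1 to 2 gives validation loss one extra epoch to recover before training halts. LabelWeightedBCELoss itself is not part of this commit; a common shape for a per-label weighted BCE in multi-label classification looks roughly like the sketch below, which is an assumption about the idea, not a copy of the repo's class.

import torch
from torch import nn

class WeightedBCESketch(nn.Module):
    """Hypothetical stand-in for LabelWeightedBCELoss (not this repo's code)."""

    def __init__(self, label_weights: torch.Tensor):
        super().__init__()
        # (n_classes,) weights, e.g. inverse class frequency
        self.label_weights = label_weights

    def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        # Element-wise BCE, then scale each class column by its weight.
        bce = nn.functional.binary_cross_entropy_with_logits(
            logits, targets, reduction="none"
        )
        return (bce * self.label_weights).mean()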
    	
preprocessing/dataset.py  CHANGED

@@ -424,11 +424,7 @@ def record_audio_durations(folder: str):
     music_files = iglob(os.path.join(folder, "**", "*.wav"), recursive=True)
     for file in music_files:
         meta = ta.info(file)
-        durations[file] = meta.num_frames / meta.sample_rate
+        durations[os.path.relpath(file, folder)] = meta.num_frames / meta.sample_rate
 
     with open(os.path.join(folder, "audio_durations.json"), "w") as f:
         json.dump(durations, f)
-
-
-class GTZAN:
-    pass
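The durations dict now keys on paths relative to the scanned folder instead of whatever prefix iglob returned, so audio_durations.json stays valid if the data directory moves. For example (paths hypothetical):

import os

folder = "data/audio"
file = os.path.join(folder, "ballroom", "waltz", "song.wav")
print(os.path.relpath(file, folder))  # ballroom/waltz/song.wav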
    	
preprocessing/pipelines.py  CHANGED

@@ -95,23 +95,27 @@ class WaveformPreprocessing(torch.nn.Module):
         self.expected_sample_length = expected_sample_length
 
     def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        c_dim = 1 if len(waveform.shape) == 3 else 0
         # Take out extra channels
-        if waveform.shape[
-            waveform = waveform.mean(
+        if waveform.shape[c_dim] > 1:
+            waveform = waveform.mean(c_dim, keepdim=True)
 
         # ensure it is the correct length
-        waveform = self._rectify_duration(waveform)
+        waveform = self._rectify_duration(waveform, c_dim)
         return waveform
 
-    def _rectify_duration(self, waveform: torch.Tensor):
+    def _rectify_duration(self, waveform: torch.Tensor, channel_dim: int):
         expected_samples = self.expected_sample_length
-        sample_count = waveform.shape[1]
+        sample_count = waveform.shape[channel_dim + 1]
         if expected_samples == sample_count:
             return waveform
         elif expected_samples > sample_count:
             pad_amount = expected_samples - sample_count
             return torch.nn.functional.pad(
-                waveform,
+                waveform,
+                (channel_dim + 1) * [0] + [pad_amount],
+                mode="constant",
+                value=0.0,
             )
         else:
             return waveform[:, :expected_samples]
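The new padding call relies on torch.nn.functional.pad reading its pad list in (left, right) pairs starting from the last dimension, so [0, pad_amount] right-pads the sample axis of a (channels, samples) waveform:

import torch

waveform = torch.randn(1, 90_000)  # (channels, samples), channel_dim == 0
padded = torch.nn.functional.pad(
    waveform, [0, 6_000], mode="constant", value=0.0
)
print(padded.shape)  # torch.Size([1, 96000])

One caveat: F.pad requires an even-length pad list, and (channel_dim + 1) * [0] + [pad_amount] yields three elements when channel_dim == 1, so only the unbatched 2-D path appears to pad correctly; likewise the truncation branch always slices dimension 1.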