port to support hf
Files changed:
- .github/workflows/pre-commit.yaml +14 -0
- .github/workflows/run-pytest.yaml +24 -0
- .gitignore +5 -1
- .pre-commit-config.yaml +22 -0
- examples/example.jsonl +1 -0
- examples/predict_from_jsonl.py +9 -0
- examples/predict_single_file.py +8 -0
- pyproject.toml +25 -23
- sample_audio/libritts_spk-3170.wav +0 -0
- sample_audio/libritts_spk-84.wav +0 -0
- sample_audio/test.jsonl +1 -0
- src/audiobox_aesthetics/cli.py +1 -1
- src/audiobox_aesthetics/demo.py +86 -0
- src/audiobox_aesthetics/export_model_to_hf.py +77 -0
- src/audiobox_aesthetics/infer.py +6 -4
- src/audiobox_aesthetics/inference.py +217 -0
- src/audiobox_aesthetics/model/aes_wavlm.py +2 -2
- src/audiobox_aesthetics/model/wavlm.py +13 -13
- test/test_inference.py +74 -0
- uv.lock +0 -0
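
The net effect of the port is a Hugging Face-style interface: the model is wrapped in a PyTorchModelHubMixin-based class (src/audiobox_aesthetics/inference.py) so it can be saved with save_pretrained and reloaded with from_pretrained. A minimal usage sketch, assuming the checkpoint has already been exported locally under the name "audiobox-aesthetics" as export_model_to_hf.py does:

from audiobox_aesthetics.inference import AudioBoxAesthetics, AudioFile

# Load the exported checkpoint (a local directory or a Hub repo id).
model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
model.eval()

# Score one file; the result is a list with one dict of CE/CU/PC/PQ scores.
predictions = model.predict_from_files(AudioFile(path="sample_audio/libritts_spk-84.wav"))
print(predictions[0])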
.github/workflows/pre-commit.yaml
ADDED
@@ -0,0 +1,14 @@
+name: Pre-commit
+on:
+  pull_request:
+  push:
+    branches: [main]
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - uses: pre-commit/[email protected]
.github/workflows/run-pytest.yaml
ADDED
@@ -0,0 +1,24 @@
+name: PyTest
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  run-pytest:
+    name: python
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+
+      - name: Install the project
+        run: uv sync --all-extras --dev
+
+      - name: Run tests
+        run: uv run pytest test/
.gitignore
CHANGED
@@ -1,3 +1,7 @@
+.gradio/
+*.pt
+*.pth
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -161,4 +165,4 @@ cython_debug/
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 # .idea/
 .vscode/
-.ruff_cache/
+.ruff_cache/
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,22 @@
+repos:
+  - repo: https://github.com/google/yamlfmt
+    rev: v0.16.0
+    hooks:
+      - id: yamlfmt
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.23.3
+    hooks:
+      - id: gitleaks
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    # uv version.
+    rev: 0.5.30
+    hooks:
+      - id: uv-lock
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.9.6
+    hooks:
+      - id: ruff
+        types_or: [python, pyi]
+        args: [--fix]
+      - id: ruff-format
+        types_or: [python, pyi]
examples/example.jsonl
ADDED
@@ -0,0 +1 @@
+{"path": "sample_audio/libritts_spk-84.wav"}
examples/predict_from_jsonl.py
ADDED
@@ -0,0 +1,9 @@
+from audiobox_aesthetics.inference import AudioBoxAesthetics, AudioFileList
+
+model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
+model.eval()
+
+
+audio_file_list = AudioFileList.from_jsonl("examples/example.jsonl")
+predictions = model.predict_from_files(audio_file_list)
+print(predictions)
examples/predict_single_file.py
ADDED
@@ -0,0 +1,8 @@
+from audiobox_aesthetics.inference import AudioBoxAesthetics
+
+model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
+model.eval()
+
+wav = model.load_audio("sample_audio/libritts_spk-84.wav")
+predictions = model.predict_from_wavs(wav)
+print(predictions)
pyproject.toml
CHANGED
@@ -6,23 +6,21 @@ build-backend = "setuptools.build_meta"
 name = "audiobox_aesthetics"
 version = "0.0.1"
 authors = [
-    {name="Andros Tjandra", email="[email protected]"},
-    {name="Yi-Chiao Wu"},
-    {name="Baishan Guo"},
-    {name="John Hoffman"},
-    {name="Brian Ellis"},
-    {name="Apoorv Vyas"},
-    {name="Bowen Shi"},
-    {name="Sanyuan Chen"},
-    {name="Matt Le"},
-    {name="Nick Zacharov"},
-    {name="Carleigh Wood"},
-    {name="Ann Lee"},
-    {name="Wei-ning Hsu"}
-]
-maintainers = [
-    {name="Andros Tjandra", email="[email protected]"}
+    { name = "Andros Tjandra", email = "[email protected]" },
+    { name = "Yi-Chiao Wu" },
+    { name = "Baishan Guo" },
+    { name = "John Hoffman" },
+    { name = "Brian Ellis" },
+    { name = "Apoorv Vyas" },
+    { name = "Bowen Shi" },
+    { name = "Sanyuan Chen" },
+    { name = "Matt Le" },
+    { name = "Nick Zacharov" },
+    { name = "Carleigh Wood" },
+    { name = "Ann Lee" },
+    { name = "Wei-ning Hsu" },
 ]
+maintainers = [{ name = "Andros Tjandra", email = "[email protected]" }]
 description = "Unified automatic quality assessment for speech, music, and sound."
 requires-python = ">=3.9"
 classifiers = [
@@ -30,14 +28,17 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 readme = "README.md"
-license = {file = "LICENSE"}
+license = { file = "LICENSE" }
 
 dependencies = [
-
-
-
-
-
+    "numpy",
+    "torch>=2.2.0",
+    "torchaudio",
+    "tqdm",
+    "submitit",
+    "huggingface-hub>=0.28.1",
+    "pydantic>=2.10.6",
+    "safetensors>=0.5.2",
 ]
 
 [project.scripts]
@@ -47,4 +48,5 @@ audio-aes = "audiobox_aesthetics.cli:app"
 Homepage = "https://github.com/facebookresearch/audiobox-aesthetics"
 Issues = "https://github.com/facebookresearch/audiobox-aesthetics/issues"
 
-
+[dependency-groups]
+dev = ["gradio>=4.44.1", "ipykernel>=6.29.5", "pytest>=8.3.4"]
sample_audio/libritts_spk-3170.wav
ADDED
Binary file (292 kB).
sample_audio/libritts_spk-84.wav
ADDED
Binary file (287 kB).
sample_audio/test.jsonl
ADDED
@@ -0,0 +1 @@
+{"path": "sample_audio/libritts_spk-84.wav"}
src/audiobox_aesthetics/cli.py
CHANGED
@@ -14,7 +14,7 @@ import requests
 
 import submitit
 from tqdm import tqdm
-from .infer import load_dataset, main_predict
+from audiobox_aesthetics.infer import load_dataset, main_predict
 
 logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
 
src/audiobox_aesthetics/demo.py
ADDED
@@ -0,0 +1,86 @@
+import gradio as gr
+from audiobox_aesthetics.inference import (
+    AudioBoxAesthetics,
+    AudioFile,
+    AXIS_NAME_LOOKUP,
+)
+
+# Load the pre-trained model
+model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
+model.eval()
+
+
+def predict_aesthetics(audio_file):
+    # Create an AudioFile instance
+    audio_file_instance = AudioFile(path=audio_file)
+
+    # Predict using the model
+    predictions = model.predict_from_files(audio_file_instance)
+
+    single_prediction = predictions[0]
+
+    data_view = [
+        [AXIS_NAME_LOOKUP[key], value] for key, value in single_prediction.items()
+    ]
+
+    return single_prediction, data_view
+
+
+def create_demo():
+    # Create a Gradio Blocks interface
+    with gr.Blocks() as demo:
+        gr.Markdown("# AudioBox Aesthetics Prediction")
+        with gr.Group():
+            gr.Markdown("""Upload an audio file to predict its aesthetic scores.
+
+            This demo uses the AudioBox Aesthetics model to predict aesthetic scores for audio files along 4 axes:
+            - Content Enjoyment (CE)
+            - Content Usefulness (CU)
+            - Production Complexity (PC)
+            - Production Quality (PQ)
+
+            Scores range from 0 to 10.
+
+            For more details, see the [paper](https://arxiv.org/abs/2502.05139) or [code](https://github.com/facebookresearch/audiobox-aesthetics/tree/main).
+            """)
+
+        with gr.Row():
+            with gr.Group():
+                with gr.Column():
+                    audio_input = gr.Audio(
+                        sources="upload", type="filepath", label="Upload Audio"
+                    )
+                    submit_button = gr.Button("Predict", variant="primary")
+            with gr.Group():
+                with gr.Column():
+                    output_data = gr.Dataframe(
+                        headers=["Axes name", "Score"],
+                        datatype=["str", "number"],
+                        label="Aesthetic Scores",
+                    )
+                    output_text = gr.Textbox(label="Raw prediction", interactive=False)
+
+        submit_button.click(
+            predict_aesthetics,
+            inputs=audio_input,
+            outputs=[output_text, output_data],
+        )
+
+        # Add examples
+        gr.Examples(
+            examples=[
+                "sample_audio/libritts_spk-84.wav",
+                "sample_audio/libritts_spk-3170.wav",
+            ],
+            inputs=audio_input,
+            outputs=[output_text, output_data],
+            fn=predict_aesthetics,
+            cache_examples=True,
+        )
+
+    return demo
+
+
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()
src/audiobox_aesthetics/export_model_to_hf.py
ADDED
@@ -0,0 +1,77 @@
+import requests
+import os
+import argparse
+import torch
+
+from audiobox_aesthetics.inference import AudioBoxAesthetics
+
+if __name__ == "__main__":
+    # Set up argument parser
+    parser = argparse.ArgumentParser(
+        description="Download and test AudioBox Aesthetics model"
+    )
+    parser.add_argument(
+        "--checkpoint-url",
+        default="https://dl.fbaipublicfiles.com/audiobox-aesthetics/checkpoint.pt",
+        help="URL for the base checkpoint",
+    )
+    parser.add_argument(
+        "--model-name",
+        default="audiobox-aesthetics",
+        help="Name to save/load the pretrained model",
+    )
+    parser.add_argument(
+        "--push-to-hub",
+        action="store_true",
+        help="Push the model to the Hugging Face Hub",
+    )
+    args = parser.parse_args()
+
+    checkpoint_local_path = "base_checkpoint.pth"
+
+    if not os.path.exists(checkpoint_local_path):
+        print("Downloading base checkpoint")
+        response = requests.get(args.checkpoint_url)
+        with open(checkpoint_local_path, "wb") as f:
+            f.write(response.content)
+
+    # get model config from the base checkpoint
+    checkpoint = torch.load(
+        checkpoint_local_path, map_location="cpu", weights_only=True
+    )
+    model_cfg = checkpoint["model_cfg"]
+
+    # extract normalization params from the base checkpoint
+    target_transform = checkpoint["target_transform"]
+
+    target_transform = {
+        axis: {
+            "mean": checkpoint["target_transform"][axis]["mean"],
+            "std": checkpoint["target_transform"][axis]["std"],
+        }
+        for axis in target_transform.keys()
+    }
+
+    model = AudioBoxAesthetics(
+        sample_rate=16_000, target_transform=target_transform, **model_cfg
+    )
+
+    model._load_base_checkpoint(checkpoint_local_path)
+    print("✅ Loaded model from base checkpoint")
+
+    model.save_pretrained(args.model_name, push_to_hub=args.push_to_hub)
+    print(f"✅ Saved model to {args.model_name}")
+    if args.push_to_hub:
+        model.push_to_hub(args.model_name)
+        print(f"✅ Pushed model to Hub under {args.model_name}")
+
+    # test load from pretrained
+    model = AudioBoxAesthetics.from_pretrained(args.model_name)
+    model.eval()
+    print(f"✅ Loaded model from pretrained {args.model_name}")
+
+    # test inference
+    wav = model.load_audio("sample_audio/libritts_spk-84.wav")
+    predictions = model.predict_from_wavs(wav)
+    print(predictions)
+    print("✅ Inference test passed")
src/audiobox_aesthetics/infer.py
CHANGED
@@ -14,7 +14,7 @@ import torch
 import torchaudio
 import torch.nn.functional as F
 
-from .model.aes_wavlm import Normalize, WavlmAudioEncoderMultiOutput
+from audiobox_aesthetics.model.aes_wavlm import Normalize, WavlmAudioEncoderMultiOutput
 
 Batch = Dict[str, Any]
 
@@ -113,6 +113,8 @@ class AesWavlmPredictorMultiOutput:
             "bf16": torch.bfloat16,
         }.get(self.precision)
 
+        print("using precision", self.precision)
+
         self.target_transform = {
             axis: Normalize(
                 mean=ckpt["target_transform"][axis]["mean"],
@@ -205,8 +207,8 @@ def main_predict(input_file, ckpt, batch_size=10):
     for ii in tqdm(range(0, len(metadata), batch_size)):
         output = predictor.forward(metadata[ii : ii + batch_size])
         outputs.extend(output)
-    assert len(outputs) == len(
-        metadata
-    )
+    assert len(outputs) == len(metadata), (
+        f"Output {len(outputs)} != input {len(metadata)} length"
+    )
 
     return outputs
src/audiobox_aesthetics/inference.py
ADDED
@@ -0,0 +1,217 @@
+import re
+
+import torch
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+
+from audiobox_aesthetics.model.aes_wavlm import Normalize, WavlmAudioEncoderMultiOutput
+from audiobox_aesthetics.infer import make_inference_batch
+
+from pydantic import BaseModel
+import torchaudio
+
+from pydantic import BaseModel, Field
+from typing import Optional, List
+import json
+
+AXIS_NAME_LOOKUP = {
+    "CE": "Content Enjoyment",
+    "CU": "Content Usefulness",
+    "PC": "Production Complexity",
+    "PQ": "Production Quality",
+}
+
+
+class AudioFile(BaseModel):
+    """
+    Audio file to be processed
+    """
+
+    path: str
+    start_time: Optional[float] = Field(None, description="Start time in seconds")
+    end_time: Optional[float] = Field(None, description="End time in seconds")
+
+
+class AudioFileList(BaseModel):
+    """
+    List of audio files to be processed
+    """
+
+    files: List[AudioFile]
+
+    @classmethod
+    def from_jsonl(cls, filename: str) -> "AudioFileList":
+        audio_files = []
+        with open(filename, "r") as f:
+            for line in f:
+                data = json.loads(line.strip())
+                audio_file = AudioFile(**data)
+                audio_files.append(audio_file)
+        return cls(files=audio_files)
+
+
+# model
+
+
+class AudioBoxAesthetics(
+    nn.Module,
+    PyTorchModelHubMixin,
+    library_name="audiobox-aesthetics",
+    repo_url="https://github.com/facebookresearch/audiobox-aesthetics",
+):
+    def __init__(
+        self,
+        proj_num_layer: int = 1,
+        proj_ln: bool = False,
+        proj_act_fn: str = "gelu",
+        proj_dropout: float = 0.0,
+        nth_layer: int = 13,
+        use_weighted_layer_sum: bool = True,
+        precision: str = "32",
+        normalize_embed: bool = True,
+        output_dim: int = 1,
+        target_transform: dict = None,
+        sample_rate: int = 16_000,
+    ):
+        super().__init__()
+        self.sample_rate = sample_rate
+        self.encoder = WavlmAudioEncoderMultiOutput(
+            proj_num_layer=proj_num_layer,
+            proj_ln=proj_ln,
+            proj_act_fn=proj_act_fn,
+            proj_dropout=proj_dropout,
+            nth_layer=nth_layer,
+            use_weighted_layer_sum=use_weighted_layer_sum,
+            precision=precision,
+            normalize_embed=normalize_embed,
+            output_dim=output_dim,
+        )
+        self.target_transform = {
+            axis: Normalize(
+                mean=target_transform[axis]["mean"],
+                std=target_transform[axis]["std"],
+            )
+            for axis in target_transform.keys()
+        }
+
+    def _load_base_checkpoint(self, checkpoint_pth: str):
+        with open(checkpoint_pth, "rb") as fin:
+            ckpt = torch.load(fin, map_location="cpu", weights_only=True)
+        state_dict = {
+            re.sub("^model.", "", k): v for (k, v) in ckpt["state_dict"].items()
+        }
+
+        self.encoder.load_state_dict(state_dict)
+
+    def forward(self, batch, inference_mode: bool = True):
+        if inference_mode:
+            with torch.inference_mode():
+                result = self.encoder(batch)
+        else:
+            result = self.encoder(batch)
+        return result
+
+    def _process_single_audio(self, wav: torch.Tensor, sample_rate: int):
+        """
+        Process a single audio file to the target sample rate and return a tensor of shape (1, 1, T)
+        """
+        target_sample_rate = self.sample_rate
+        wav = torchaudio.functional.resample(wav, sample_rate, target_sample_rate)
+
+        # convert to mono
+        if wav.shape[0] > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+        return wav, target_sample_rate
+
+    def load_audio(self, path: str, start_time: float = None, end_time: float = None):
+        """
+        Load an audio file from path
+
+        Args:
+            path: str - path to the audio file
+            start_time: float - start time in seconds
+            end_time: float - end time in seconds
+        Returns:
+            wav: torch.Tensor - audio tensor of shape (1, 1, T)
+        """
+        wav, sample_rate = torchaudio.load(path)
+        if start_time is not None and end_time is not None:
+            if start_time and end_time:
+                wav = wav[
+                    :, int(start_time * sample_rate) : int(end_time * sample_rate)
+                ]
+            elif start_time:
+                wav = wav[:, int(start_time * sample_rate) :]
+            elif end_time:
+                wav = wav[:, : int(end_time * sample_rate)]
+
+        wav, _sr = self._process_single_audio(wav, sample_rate)
+
+        return wav
+
+    def predict_from_files(
+        self, audio_file_list: AudioFileList | AudioFile
+    ) -> List[dict]:
+        """
+        Predict the aesthetic score for a list of audio files
+        """
+        if isinstance(audio_file_list, AudioFile):
+            audio_file_list = AudioFileList(files=[audio_file_list])
+
+        wavs = [
+            self.load_audio(file.path, file.start_time, file.end_time)
+            for file in audio_file_list.files
+        ]
+
+        return self.predict_from_wavs(wavs)
+
+    def predict_from_wavs(self, wavs: List[torch.Tensor] | torch.Tensor):
+        """
+        Predict the aesthetic score for a single audio file
+
+        Args:
+            wavs: List[torch.Tensor] - list of audio tensors of shape (1, 1, T) - must be at the sample rate of the model
+        Returns:
+            preds: List[dict] - list of dictionaries containing the aesthetic scores for each axis
+        """
+
+        if isinstance(wavs, torch.Tensor):
+            wavs = [wavs]
+
+        n_wavs = len(wavs)
+
+        wavs, masks, weights, bids = make_inference_batch(
+            wavs,
+            10,
+            10,
+            sample_rate=self.sample_rate,
+        )
+
+        # stack wavs, masks, weights, bids
+        wavs = torch.stack(wavs)
+        masks = torch.stack(masks)
+        weights = torch.tensor(weights)
+        bids = torch.tensor(bids)
+
+        if not wavs.shape[0] == masks.shape[0] == weights.shape[0] == bids.shape[0]:
+            raise ValueError("Batch size mismatch")
+
+        preds_all = self.forward({"wav": wavs, "mask": masks})
+        all_result = {}
+
+        # predict scores across all axes
+        for axis in self.target_transform.keys():
+            preds = self.target_transform[axis].inverse(preds_all[axis])
+            weighted_preds = []
+            for bii in range(n_wavs):
+                weights_bii = weights[bids == bii]
+                weighted_preds.append(
+                    (
+                        (preds[bids == bii] * weights_bii).sum() / weights_bii.sum()
+                    ).item()
+                )
+            all_result[axis] = weighted_preds
+        # re-arrange results
+        preds = [dict(zip(all_result.keys(), vv)) for vv in zip(*all_result.values())]
+
+        return preds
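
Besides whole files, AudioFile carries optional start_time and end_time fields, so a segment of a longer recording can be scored directly. A small sketch of that use, assuming the same locally exported "audiobox-aesthetics" checkpoint and a clip longer than the chosen window:

from audiobox_aesthetics.inference import AudioBoxAesthetics, AudioFile

model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
model.eval()

# Trim to the 0.5 s - 2.5 s window before resampling and scoring (times in seconds).
segment = AudioFile(path="sample_audio/libritts_spk-84.wav", start_time=0.5, end_time=2.5)
predictions = model.predict_from_files(segment)
print(predictions[0])  # {"CE": ..., "CU": ..., "PC": ..., "PQ": ...}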
src/audiobox_aesthetics/model/aes_wavlm.py
CHANGED
@@ -9,8 +9,8 @@ import sys
 from torch import nn
 import torch
 
-from .utils import create_mlp_block
-from .wavlm import WavLM, WavLMConfig
+from audiobox_aesthetics.model.utils import create_mlp_block
+from audiobox_aesthetics.model.wavlm import WavLM, WavLMConfig
 
 
 DEFAULT_AUDIO_CFG = WavLMConfig(
src/audiobox_aesthetics/model/wavlm.py
CHANGED
@@ -244,17 +244,17 @@ def quant_noise(module, p, block_size):
 
     # 2D matrix
     if not is_conv:
-        assert (
-
-        )
+        assert module.weight.size(1) % block_size == 0, (
+            "Input features must be a multiple of block sizes"
+        )
 
     # 4D matrix
     else:
         # 1x1 convolutions
         if module.kernel_size == (1, 1):
-            assert (
-
-            )
+            assert module.in_channels % block_size == 0, (
+                "Input channels must be a multiple of block sizes"
+            )
         # regular convolutions
         else:
             k = module.kernel_size[0] * module.kernel_size[1]
@@ -356,16 +356,16 @@ class MultiheadAttention(nn.Module):
         self.head_dim = embed_dim // num_heads
         self.q_head_dim = self.head_dim
        self.k_head_dim = self.head_dim
-        assert (
-
-        )
+        assert self.head_dim * num_heads == self.embed_dim, (
+            "embed_dim must be divisible by num_heads"
+        )
         self.scaling = self.head_dim**-0.5
 
         self.self_attention = self_attention
         self.encoder_decoder_attention = encoder_decoder_attention
 
         assert not self.self_attention or self.qkv_same_dim, (
-            "Self-attention requires query, key and
+            "Self-attention requires query, key and value to be of the same size"
         )
 
         k_bias = True
@@ -1255,9 +1255,9 @@ class ConvFeatureExtractionModel(nn.Module):
                 nn.init.kaiming_normal_(conv.weight)
                 return conv
 
-            assert (
-
-            )
+            assert (is_layer_norm and is_group_norm) is False, (
+                "layer norm and group norm are exclusive"
+            )
 
             if is_layer_norm:
                 return nn.Sequential(
test/test_inference.py
ADDED
@@ -0,0 +1,74 @@
+from audiobox_aesthetics.inference import AudioBoxAesthetics, AudioFileList, AudioFile
+
+# cached results from running the CLI
+cli_results = {
+    "sample_audio/libritts_spk-84.wav": {
+        "CE": 6.1027421951293945,
+        "CU": 6.3574299812316895,
+        "PC": 1.7401179075241089,
+        "PQ": 6.733065128326416,
+    },
+}
+
+
+def test_inference():
+    audio_path = "sample_audio/libritts_spk-84.wav"
+    audio_file = AudioFile(path=audio_path)
+    model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
+    model.eval()
+
+    predictions = model.predict_from_files(audio_file)
+    single_pred = predictions[0]
+
+    print(single_pred)
+
+    assert single_pred["CE"] == cli_results[audio_path]["CE"]
+    assert single_pred["CU"] == cli_results[audio_path]["CU"]
+    assert single_pred["PC"] == cli_results[audio_path]["PC"]
+    assert single_pred["PQ"] == cli_results[audio_path]["PQ"]
+
+
+def test_inference_load_from_jsonl():
+    audio_file_list = AudioFileList.from_jsonl("sample_audio/test.jsonl")
+    model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
+    model.eval()
+
+    predictions = model.predict_from_files(audio_file_list)
+
+    single_pred = predictions[0]
+    assert single_pred["CE"] == cli_results[audio_file_list.files[0].path]["CE"]
+    assert single_pred["CU"] == cli_results[audio_file_list.files[0].path]["CU"]
+    assert single_pred["PC"] == cli_results[audio_file_list.files[0].path]["PC"]
+    assert single_pred["PQ"] == cli_results[audio_file_list.files[0].path]["PQ"]
+
+
+def test_inference_twice_on_same_audio_yields_same_result():
+    audio_file = AudioFile(path="sample_audio/libritts_spk-84.wav")
+    model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
+    model.eval()
+
+    predictions_a = model.predict_from_files(audio_file)
+    predictions_b = model.predict_from_files(audio_file)
+
+    single_pred_a = predictions_a[0]
+    single_pred_b = predictions_b[0]
+
+    assert single_pred_a["CE"] == single_pred_b["CE"]
+    assert single_pred_a["CU"] == single_pred_b["CU"]
+    assert single_pred_a["PC"] == single_pred_b["PC"]
+    assert single_pred_a["PQ"] == single_pred_b["PQ"]
+
+
+def test_loading_from_wav():
+    audio_path = "sample_audio/libritts_spk-84.wav"
+    model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
+    model.eval()
+
+    wav = model.load_audio(audio_path)
+    predictions = model.predict_from_wavs(wav)
+
+    single_pred = predictions[0]
+    assert single_pred["CE"] == cli_results[audio_path]["CE"]
+    assert single_pred["CU"] == cli_results[audio_path]["CU"]
+    assert single_pred["PC"] == cli_results[audio_path]["PC"]
+    assert single_pred["PQ"] == cli_results[audio_path]["PQ"]
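
The tests above compare against cached CLI floats with exact equality, which can be brittle across PyTorch builds and hardware. A hedged sketch of a tolerance-based variant, assuming it lives in the same test module (so cli_results and the imports above are in scope); pytest.approx is standard pytest and is not part of this commit:

import pytest


def test_inference_close_to_cli():
    # Same setup as test_inference, but allow a small absolute tolerance per axis.
    audio_path = "sample_audio/libritts_spk-84.wav"
    model = AudioBoxAesthetics.from_pretrained("audiobox-aesthetics")
    model.eval()

    single_pred = model.predict_from_files(AudioFile(path=audio_path))[0]
    for axis, expected in cli_results[audio_path].items():
        assert single_pred[axis] == pytest.approx(expected, abs=1e-4)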
uv.lock
ADDED
The diff for this file is too large to render.