lucadellalib
/

focalcodec_12_5hz

Audio-to-Audio

Safetensors

torch

Model card Files Files and versions Community

lucadellalib committed 11 days ago

Commit

78dc93d

verified ·

1 Parent(s): da86e1b

Update README.md

Browse files

Files changed (1) hide show

README.md +3 -333

README.md CHANGED Viewed

@@ -3,6 +3,8 @@ license: apache-2.0
 base_model:
 - microsoft/wavlm-large
 pipeline_tag: audio-to-audio
 ---
 # FocalCodec
@@ -25,16 +27,6 @@ See the readme at: https://github.com/lucadellalib/focalcodec
 ---------------------------------------------------------------------------------------------------------
-## 📌 Available Checkpoints
-|       Checkpoint        | Token Rate (Hz) | Bitrate (kbps) |   Dataset   |
-|:-----------------------:|:---------------:|:--------------:|:-----------:|
-|  **LibriTTS960_50Hz**   |       50.0      |      0.65      | LibriTTS960 |
-|  **LibriTTS960_25Hz**   |      25.0       |      0.33      | LibriTTS960 |
-| **LibriTTS960_12_5Hz**  |      12.5       |      0.16      | LibriTTS960 |
----------------------------------------------------------------------------------------------------------
 ## @ Citing
 ```
@@ -52,326 +44,4 @@ See the readme at: https://github.com/lucadellalib/focalcodec
 [[email protected]](mailto:[email protected])
----------------------------------------------------------------------------------------------------------
-# File information
-The repository contains the following file information:
-Filename: LibriTTS960_25Hz.json
-Content: {
-  "encoder_name": "WavLM",
-  "encoder_config": {
-    "hidden_dims": [
-      512,
-      512,
-      512,
-      512,
-      512,
-      512,
-      512
-    ],
-    "kernel_sizes": [
-      10,
-      3,
-      3,
-      3,
-      3,
-      2,
-      2
-    ],
-    "strides": [
-      5,
-      2,
-      2,
-      2,
-      2,
-      2,
-      2
-    ],
-    "num_layers": 6,
-    "dim": 1024,
-    "ffn_dim": 4096,
-    "num_heads": 16,
-    "num_buckets": 320,
-    "max_distance": 800,
-    "dropout": 0.0,
-    "conv_pos": 128,
-    "conv_pos_groups": 16
-  },
-  "compressor_name": "FocalEncoder",
-  "compressor_config": {
-    "input_dim": 1024,
-    "output_dim": 13,
-    "hidden_dims": [
-      1024,
-      512,
-      256
-    ],
-    "downscale_factors": [
-      2,
-      1,
-      1
-    ],
-    "focal_window": 7,
-    "focal_level": 2,
-    "focal_factor": 2,
-    "dropout": 0.0,
-    "use_post_norm": false,
-    "use_layerscale": false,
-    "layerscale_init": 0.0001,
-    "normalize_modulator": false
-  },
-  "quantizer_name": "BinarySphericalQuantizer",
-  "quantizer_config": {
-    "codebook_size": 8192
-  },
-  "decompressor_name": "FocalDecoder",
-  "decompressor_config": {
-    "input_dim": 13,
-    "output_dim": 1024,
-    "hidden_dims": [
-      256,
-      512,
-      1024
-    ],
-    "upscale_factors": [
-      1,
-      1,
-      2
-    ],
-    "focal_window": 7,
-    "focal_level": 2,
-    "focal_factor": 2,
-    "dropout": 0.0,
-    "use_post_norm": false,
-    "use_layerscale": false,
-    "layerscale_init": 0.0001,
-    "normalize_modulator": false
-  },
-  "decoder_name": "Vocos",
-  "decoder_config": {
-    "input_channels": 1024,
-    "num_layers": 8,
-    "dim": 512,
-    "ffn_dim": 1536,
-    "kernel_size": 7,
-    "padding": 3,
-    "layerscale_init": null,
-    "n_fft": 1024,
-    "hop_length": 320
-  }
-}
-Filename: focalcodec.png
-Content: "Content of the file is larger than 50 KB, too long to display."
-Filename: LibriTTS960_50Hz.json
-Content: {
-  "encoder_name": "WavLM",
-  "encoder_config": {
-    "hidden_dims": [
-      512,
-      512,
-      512,
-      512,
-      512,
-      512,
-      512
-    ],
-    "kernel_sizes": [
-      10,
-      3,
-      3,
-      3,
-      3,
-      2,
-      2
-    ],
-    "strides": [
-      5,
-      2,
-      2,
-      2,
-      2,
-      2,
-      2
-    ],
-    "num_layers": 6,
-    "dim": 1024,
-    "ffn_dim": 4096,
-    "num_heads": 16,
-    "num_buckets": 320,
-    "max_distance": 800,
-    "dropout": 0.0,
-    "conv_pos": 128,
-    "conv_pos_groups": 16
-  },
-  "compressor_name": "FocalEncoder",
-  "compressor_config": {
-    "input_dim": 1024,
-    "output_dim": 13,
-    "hidden_dims": [
-      1024,
-      512,
-      256
-    ],
-    "downscale_factors": [
-      1,
-      1,
-      1
-    ],
-    "focal_window": 7,
-    "focal_level": 2,
-    "focal_factor": 2,
-    "dropout": 0.0,
-    "use_post_norm": false,
-    "use_layerscale": false,
-    "layerscale_init": 0.0001,
-    "normalize_modulator": false
-  },
-  "quantizer_name": "BinarySphericalQuantizer",
-  "quantizer_config": {
-    "codebook_size": 8192
-  },
-  "decompressor_name": "FocalDecoder",
-  "decompressor_config": {
-    "input_dim": 13,
-    "output_dim": 1024,
-    "hidden_dims": [
-      256,
-      512,
-      1024
-    ],
-    "upscale_factors": [
-      1,
-      1,
-      1
-    ],
-    "focal_window": 7,
-    "focal_level": 2,
-    "focal_factor": 2,
-    "dropout": 0.0,
-    "use_post_norm": false,
-    "use_layerscale": false,
-    "layerscale_init": 0.0001,
-    "normalize_modulator": false
-  },
-  "decoder_name": "Vocos",
-  "decoder_config": {
-    "input_channels": 1024,
-    "num_layers": 8,
-    "dim": 512,
-    "ffn_dim": 1536,
-    "kernel_size": 7,
-    "padding": 3,
-    "layerscale_init": null,
-    "n_fft": 1024,
-    "hop_length": 320
-  }
-}
-Filename: LibriTTS960_12_5Hz.json
-Content: {
-  "encoder_name": "WavLM",
-  "encoder_config": {
-    "hidden_dims": [
-      512,
-      512,
-      512,
-      512,
-      512,
-      512,
-      512
-    ],
-    "kernel_sizes": [
-      10,
-      3,
-      3,
-      3,
-      3,
-      2,
-      2
-    ],
-    "strides": [
-      5,
-      2,
-      2,
-      2,
-      2,
-      2,
-      2
-    ],
-    "num_layers": 6,
-    "dim": 1024,
-    "ffn_dim": 4096,
-    "num_heads": 16,
-    "num_buckets": 320,
-    "max_distance": 800,
-    "dropout": 0.0,
-    "conv_pos": 128,
-    "conv_pos_groups": 16
-  },
-  "compressor_name": "FocalEncoder",
-  "compressor_config": {
-    "input_dim": 1024,
-    "output_dim": 13,
-    "hidden_dims": [
-      1024,
-      512,
-      256
-    ],
-    "downscale_factors": [
-      2,
-      2,
-      1
-    ],
-    "focal_window": 7,
-    "focal_level": 2,
-    "focal_factor": 2,
-    "dropout": 0.0,
-    "use_post_norm": false,
-    "use_layerscale": false,
-    "layerscale_init": 0.0001,
-    "normalize_modulator": false
-  },
-  "quantizer_name": "BinarySphericalQuantizer",
-  "quantizer_config": {
-    "codebook_size": 8192
-  },
-  "decompressor_name": "FocalDecoder",
-  "decompressor_config": {
-    "input_dim": 13,
-    "output_dim": 1024,
-    "hidden_dims": [
-      256,
-      512,
-      1024
-    ],
-    "upscale_factors": [
-      1,
-      2,
-      2
-    ],
-    "focal_window": 7,
-    "focal_level": 2,
-    "focal_factor": 2,
-    "dropout": 0.0,
-    "use_post_norm": false,
-    "use_layerscale": false,
-    "layerscale_init": 0.0001,
-    "normalize_modulator": false
-  },
-  "decoder_name": "Vocos",
-  "decoder_config": {
-    "input_channels": 1024,
-    "num_layers": 8,
-    "dim": 512,
-    "ffn_dim": 1536,
-    "kernel_size": 7,
-    "padding": 3,
-    "layerscale_init": null,
-    "n_fft": 1024,
-    "hop_length": 320
-  }
-}

 base_model:
 - microsoft/wavlm-large
 pipeline_tag: audio-to-audio
+datasets:
+- mythicinfinity/libritts
 ---
 # FocalCodec
 ---------------------------------------------------------------------------------------------------------
 ## @ Citing
 ```
 [[email protected]](mailto:[email protected])
+---------------------------------------------------------------------------------------------------------