Audio-to-Audio
Safetensors
torch
lucadellalib committed on
Commit
78dc93d
·
verified ·
1 Parent(s): da86e1b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +3 -333
README.md CHANGED
@@ -3,6 +3,8 @@ license: apache-2.0
3
  base_model:
4
  - microsoft/wavlm-large
5
  pipeline_tag: audio-to-audio
 
 
6
  ---
7
 
8
  # FocalCodec
@@ -25,16 +27,6 @@ See the readme at: https://github.com/lucadellalib/focalcodec
25
 
26
  ---------------------------------------------------------------------------------------------------------
27
 
28
- ## 📌 Available Checkpoints
29
-
30
- | Checkpoint | Token Rate (Hz) | Bitrate (kbps) | Dataset |
31
- |:-----------------------:|:---------------:|:--------------:|:-----------:|
32
- | **LibriTTS960_50Hz** | 50.0 | 0.65 | LibriTTS960 |
33
- | **LibriTTS960_25Hz** | 25.0 | 0.33 | LibriTTS960 |
34
- | **LibriTTS960_12_5Hz** | 12.5 | 0.16 | LibriTTS960 |
35
-
36
- ---------------------------------------------------------------------------------------------------------
37
-
38
  ## @ Citing
39
 
40
  ```
@@ -52,326 +44,4 @@ See the readme at: https://github.com/lucadellalib/focalcodec
52
 
53
54
 
55
- ---------------------------------------------------------------------------------------------------------
56
-
57
- # File information
58
-
59
- The repository contains the following file information:
60
-
61
- Filename: LibriTTS960_25Hz.json
62
- Content: {
63
- "encoder_name": "WavLM",
64
- "encoder_config": {
65
- "hidden_dims": [
66
- 512,
67
- 512,
68
- 512,
69
- 512,
70
- 512,
71
- 512,
72
- 512
73
- ],
74
- "kernel_sizes": [
75
- 10,
76
- 3,
77
- 3,
78
- 3,
79
- 3,
80
- 2,
81
- 2
82
- ],
83
- "strides": [
84
- 5,
85
- 2,
86
- 2,
87
- 2,
88
- 2,
89
- 2,
90
- 2
91
- ],
92
- "num_layers": 6,
93
- "dim": 1024,
94
- "ffn_dim": 4096,
95
- "num_heads": 16,
96
- "num_buckets": 320,
97
- "max_distance": 800,
98
- "dropout": 0.0,
99
- "conv_pos": 128,
100
- "conv_pos_groups": 16
101
- },
102
- "compressor_name": "FocalEncoder",
103
- "compressor_config": {
104
- "input_dim": 1024,
105
- "output_dim": 13,
106
- "hidden_dims": [
107
- 1024,
108
- 512,
109
- 256
110
- ],
111
- "downscale_factors": [
112
- 2,
113
- 1,
114
- 1
115
- ],
116
- "focal_window": 7,
117
- "focal_level": 2,
118
- "focal_factor": 2,
119
- "dropout": 0.0,
120
- "use_post_norm": false,
121
- "use_layerscale": false,
122
- "layerscale_init": 0.0001,
123
- "normalize_modulator": false
124
- },
125
- "quantizer_name": "BinarySphericalQuantizer",
126
- "quantizer_config": {
127
- "codebook_size": 8192
128
- },
129
- "decompressor_name": "FocalDecoder",
130
- "decompressor_config": {
131
- "input_dim": 13,
132
- "output_dim": 1024,
133
- "hidden_dims": [
134
- 256,
135
- 512,
136
- 1024
137
- ],
138
- "upscale_factors": [
139
- 1,
140
- 1,
141
- 2
142
- ],
143
- "focal_window": 7,
144
- "focal_level": 2,
145
- "focal_factor": 2,
146
- "dropout": 0.0,
147
- "use_post_norm": false,
148
- "use_layerscale": false,
149
- "layerscale_init": 0.0001,
150
- "normalize_modulator": false
151
- },
152
- "decoder_name": "Vocos",
153
- "decoder_config": {
154
- "input_channels": 1024,
155
- "num_layers": 8,
156
- "dim": 512,
157
- "ffn_dim": 1536,
158
- "kernel_size": 7,
159
- "padding": 3,
160
- "layerscale_init": null,
161
- "n_fft": 1024,
162
- "hop_length": 320
163
- }
164
- }
165
-
166
- Filename: focalcodec.png
167
- Content: "Content of the file is larger than 50 KB, too long to display."
168
-
169
- Filename: LibriTTS960_50Hz.json
170
- Content: {
171
- "encoder_name": "WavLM",
172
- "encoder_config": {
173
- "hidden_dims": [
174
- 512,
175
- 512,
176
- 512,
177
- 512,
178
- 512,
179
- 512,
180
- 512
181
- ],
182
- "kernel_sizes": [
183
- 10,
184
- 3,
185
- 3,
186
- 3,
187
- 3,
188
- 2,
189
- 2
190
- ],
191
- "strides": [
192
- 5,
193
- 2,
194
- 2,
195
- 2,
196
- 2,
197
- 2,
198
- 2
199
- ],
200
- "num_layers": 6,
201
- "dim": 1024,
202
- "ffn_dim": 4096,
203
- "num_heads": 16,
204
- "num_buckets": 320,
205
- "max_distance": 800,
206
- "dropout": 0.0,
207
- "conv_pos": 128,
208
- "conv_pos_groups": 16
209
- },
210
- "compressor_name": "FocalEncoder",
211
- "compressor_config": {
212
- "input_dim": 1024,
213
- "output_dim": 13,
214
- "hidden_dims": [
215
- 1024,
216
- 512,
217
- 256
218
- ],
219
- "downscale_factors": [
220
- 1,
221
- 1,
222
- 1
223
- ],
224
- "focal_window": 7,
225
- "focal_level": 2,
226
- "focal_factor": 2,
227
- "dropout": 0.0,
228
- "use_post_norm": false,
229
- "use_layerscale": false,
230
- "layerscale_init": 0.0001,
231
- "normalize_modulator": false
232
- },
233
- "quantizer_name": "BinarySphericalQuantizer",
234
- "quantizer_config": {
235
- "codebook_size": 8192
236
- },
237
- "decompressor_name": "FocalDecoder",
238
- "decompressor_config": {
239
- "input_dim": 13,
240
- "output_dim": 1024,
241
- "hidden_dims": [
242
- 256,
243
- 512,
244
- 1024
245
- ],
246
- "upscale_factors": [
247
- 1,
248
- 1,
249
- 1
250
- ],
251
- "focal_window": 7,
252
- "focal_level": 2,
253
- "focal_factor": 2,
254
- "dropout": 0.0,
255
- "use_post_norm": false,
256
- "use_layerscale": false,
257
- "layerscale_init": 0.0001,
258
- "normalize_modulator": false
259
- },
260
- "decoder_name": "Vocos",
261
- "decoder_config": {
262
- "input_channels": 1024,
263
- "num_layers": 8,
264
- "dim": 512,
265
- "ffn_dim": 1536,
266
- "kernel_size": 7,
267
- "padding": 3,
268
- "layerscale_init": null,
269
- "n_fft": 1024,
270
- "hop_length": 320
271
- }
272
- }
273
-
274
- Filename: LibriTTS960_12_5Hz.json
275
- Content: {
276
- "encoder_name": "WavLM",
277
- "encoder_config": {
278
- "hidden_dims": [
279
- 512,
280
- 512,
281
- 512,
282
- 512,
283
- 512,
284
- 512,
285
- 512
286
- ],
287
- "kernel_sizes": [
288
- 10,
289
- 3,
290
- 3,
291
- 3,
292
- 3,
293
- 2,
294
- 2
295
- ],
296
- "strides": [
297
- 5,
298
- 2,
299
- 2,
300
- 2,
301
- 2,
302
- 2,
303
- 2
304
- ],
305
- "num_layers": 6,
306
- "dim": 1024,
307
- "ffn_dim": 4096,
308
- "num_heads": 16,
309
- "num_buckets": 320,
310
- "max_distance": 800,
311
- "dropout": 0.0,
312
- "conv_pos": 128,
313
- "conv_pos_groups": 16
314
- },
315
- "compressor_name": "FocalEncoder",
316
- "compressor_config": {
317
- "input_dim": 1024,
318
- "output_dim": 13,
319
- "hidden_dims": [
320
- 1024,
321
- 512,
322
- 256
323
- ],
324
- "downscale_factors": [
325
- 2,
326
- 2,
327
- 1
328
- ],
329
- "focal_window": 7,
330
- "focal_level": 2,
331
- "focal_factor": 2,
332
- "dropout": 0.0,
333
- "use_post_norm": false,
334
- "use_layerscale": false,
335
- "layerscale_init": 0.0001,
336
- "normalize_modulator": false
337
- },
338
- "quantizer_name": "BinarySphericalQuantizer",
339
- "quantizer_config": {
340
- "codebook_size": 8192
341
- },
342
- "decompressor_name": "FocalDecoder",
343
- "decompressor_config": {
344
- "input_dim": 13,
345
- "output_dim": 1024,
346
- "hidden_dims": [
347
- 256,
348
- 512,
349
- 1024
350
- ],
351
- "upscale_factors": [
352
- 1,
353
- 2,
354
- 2
355
- ],
356
- "focal_window": 7,
357
- "focal_level": 2,
358
- "focal_factor": 2,
359
- "dropout": 0.0,
360
- "use_post_norm": false,
361
- "use_layerscale": false,
362
- "layerscale_init": 0.0001,
363
- "normalize_modulator": false
364
- },
365
- "decoder_name": "Vocos",
366
- "decoder_config": {
367
- "input_channels": 1024,
368
- "num_layers": 8,
369
- "dim": 512,
370
- "ffn_dim": 1536,
371
- "kernel_size": 7,
372
- "padding": 3,
373
- "layerscale_init": null,
374
- "n_fft": 1024,
375
- "hop_length": 320
376
- }
377
- }
 
3
  base_model:
4
  - microsoft/wavlm-large
5
  pipeline_tag: audio-to-audio
6
+ datasets:
7
+ - mythicinfinity/libritts
8
  ---
9
 
10
  # FocalCodec
 
27
 
28
  ---------------------------------------------------------------------------------------------------------
29
 
 
 
 
 
 
 
 
 
 
 
30
  ## @ Citing
31
 
32
  ```
 
44
 
45
46
 
47
+ ---------------------------------------------------------------------------------------------------------