marcosegura34 committed
Commit 24d8378 (verified) · Parent(s): 8ef6882

Upload 2 files

Files changed (2)
  1. app.py +441 -0
  2. requirements.txt +15 -0
app.py ADDED
@@ -0,0 +1,441 @@
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.

import spaces
import gradio as gr
import pandas as pd
import torch
import os
import sys

# Allow importing modules from the parent directory
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(parent_dir)

from meldataset import get_mel_spectrogram, MAX_WAV_VALUE
from bigvgan import BigVGAN
import librosa
import numpy as np
from utils import plot_spectrogram
import PIL

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.backends.cudnn.benchmark = False
    print("Using GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

def inference_gradio(input, model_choice):  # Input is audio waveform in [T, channel]
    sr, audio = input  # Unpack input to sampling rate and audio itself
    audio = np.transpose(audio)  # Transpose to [channel, T] for librosa
    audio = audio / MAX_WAV_VALUE  # Convert int16 to float range used by BigVGAN

    model = dict_model[model_choice]

    if sr != model.h.sampling_rate:  # Convert audio to model's sampling rate
        audio = librosa.resample(audio, orig_sr=sr, target_sr=model.h.sampling_rate)
    if len(audio.shape) == 2:  # Stereo
        audio = librosa.to_mono(audio)  # Convert to mono if stereo
    audio = librosa.util.normalize(audio) * 0.95

    output, spec_gen = inference_model(
        audio, model
    )  # Output is generated audio in ndarray, int16

    spec_plot_gen = plot_spectrogram(spec_gen)

    output_audio = (model.h.sampling_rate, output)  # Tuple for gr.Audio output

    buffer = spec_plot_gen.canvas.buffer_rgba()
    output_image = PIL.Image.frombuffer(
        "RGBA", spec_plot_gen.canvas.get_width_height(), buffer, "raw", "RGBA", 0, 1
    )

    return output_audio, output_image


@spaces.GPU(duration=120)
def inference_model(audio_input, model):
    # Load model to device
    model.to(device)

    with torch.inference_mode():
        wav = torch.FloatTensor(audio_input)
        # Compute mel spectrogram from the ground truth audio
        spec_gt = get_mel_spectrogram(wav.unsqueeze(0), model.h).to(device)

        y_g_hat = model(spec_gt)

        audio_gen = y_g_hat.squeeze().cpu()
        spec_gen = get_mel_spectrogram(audio_gen.unsqueeze(0), model.h)
        audio_gen = audio_gen.numpy()  # [T], float [-1, 1]
        audio_gen = (audio_gen * MAX_WAV_VALUE).astype("int16")  # [T], int16
        spec_gen = spec_gen.squeeze().numpy()  # [C, T_frame]

    # Unload to CPU
    model.to("cpu")
    # Delete GPU tensor
    del spec_gt, y_g_hat

    return audio_gen, spec_gen

css = """
a {
    color: inherit;
    text-decoration: underline;
}
.gradio-container {
    font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
    color: white;
    border-color: #000000;
    background: #000000;
}
input[type='range'] {
    accent-color: #000000;
}
.dark input[type='range'] {
    accent-color: #dfdfdf;
}
.container {
    max-width: 730px;
    margin: auto;
    padding-top: 1.5rem;
}
#gallery {
    min-height: 22rem;
    margin-bottom: 15px;
    margin-left: auto;
    margin-right: auto;
    border-bottom-right-radius: .5rem !important;
    border-bottom-left-radius: .5rem !important;
}
#gallery>div>.h-full {
    min-height: 20rem;
}
.details:hover {
    text-decoration: underline;
}
.gr-button {
    white-space: nowrap;
}
.gr-button:focus {
    border-color: rgb(147 197 253 / var(--tw-border-opacity));
    outline: none;
    box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
    --tw-border-opacity: 1;
    --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
    --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
    --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
    --tw-ring-opacity: .5;
}
#advanced-btn {
    font-size: .7rem !important;
    line-height: 19px;
    margin-top: 12px;
    margin-bottom: 12px;
    padding: 2px 8px;
    border-radius: 14px !important;
}
#advanced-options {
    margin-bottom: 20px;
}
.footer {
    margin-bottom: 45px;
    margin-top: 35px;
    text-align: center;
    border-bottom: 1px solid #e5e5e5;
}
.footer>p {
    font-size: .8rem;
    display: inline-block;
    padding: 0 10px;
    transform: translateY(10px);
    background: white;
}
.dark .footer {
    border-color: #303030;
}
.dark .footer>p {
    background: #0b0f19;
}
.acknowledgments h4 {
    margin: 1.25em 0 .25em 0;
    font-weight: bold;
    font-size: 115%;
}
#container-advanced-btns {
    display: flex;
    flex-wrap: wrap;
    justify-content: space-between;
    align-items: center;
}
.animate-spin {
    animation: spin 1s linear infinite;
}
@keyframes spin {
    from {
        transform: rotate(0deg);
    }
    to {
        transform: rotate(360deg);
    }
}
#share-btn-container {
    display: flex;
    padding-left: 0.5rem !important;
    padding-right: 0.5rem !important;
    background-color: #000000;
    justify-content: center;
    align-items: center;
    border-radius: 9999px !important;
    width: 13rem;
    margin-top: 10px;
    margin-left: auto;
}
#share-btn {
    all: initial;
    color: #ffffff;
    font-weight: 600;
    cursor: pointer;
    font-family: 'IBM Plex Sans', sans-serif;
    margin-left: 0.5rem !important;
    padding-top: 0.25rem !important;
    padding-bottom: 0.25rem !important;
    right: 0;
}
#share-btn * {
    all: unset;
}
#share-btn-container div:nth-child(-n+2) {
    width: auto !important;
    min-height: 0px !important;
}
#share-btn-container .wrap {
    display: none !important;
}
.gr-form {
    flex: 1 1 50%;
    border-top-right-radius: 0;
    border-bottom-right-radius: 0;
}
#prompt-container {
    gap: 0;
}
#generated_id {
    min-height: 700px;
}
#setting_id {
    margin-bottom: 12px;
    text-align: center;
    font-weight: 900;
}
"""

# Script for loading the models

LIST_MODEL_ID = [
    "bigvgan_24khz_100band",
    "bigvgan_base_24khz_100band",
    "bigvgan_22khz_80band",
    "bigvgan_base_22khz_80band",
    "bigvgan_v2_22khz_80band_256x",
    "bigvgan_v2_22khz_80band_fmax8k_256x",
    "bigvgan_v2_24khz_100band_256x",
    "bigvgan_v2_44khz_128band_256x",
    "bigvgan_v2_44khz_128band_512x",
]

dict_model = {}
dict_config = {}

for model_name in LIST_MODEL_ID:

    generator = BigVGAN.from_pretrained("nvidia/" + model_name)
    generator.remove_weight_norm()
    generator.eval()

    dict_model[model_name] = generator
    dict_config[model_name] = generator.h

# Script for Gradio UI

iface = gr.Blocks(css=css, title="BigVGAN - Demo")

with iface:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 900px; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.5rem;
                "
            >
                <h1 style="font-weight: 700; margin-bottom: 7px; line-height: normal;">
                    BigVGAN: A Universal Neural Vocoder with Large-Scale Training
                </h1>
            </div>
            <p style="margin-bottom: 10px; font-size: 125%">
                <a href="https://arxiv.org/abs/2206.04658">[Paper]</a> <a href="https://github.com/NVIDIA/BigVGAN">[Code]</a> <a href="https://bigvgan-demo.github.io/">[Demo]</a> <a href="https://research.nvidia.com/labs/adlr/projects/bigvgan/">[Project page]</a>
            </p>
        </div>
        """
    )
    gr.HTML(
        """
        <div>
            <h3>News</h3>
            <p>[Jul 2024] We release BigVGAN-v2 along with pretrained checkpoints. Below are the highlights:</p>
            <ul>
                <li>Custom CUDA kernel for inference: we provide a fused upsampling + activation kernel written in CUDA for accelerated inference speed. Our tests show a 1.5-3x speedup on a single A100 GPU.</li>
                <li>Improved discriminator and loss: BigVGAN-v2 is trained using a <a href="https://arxiv.org/abs/2311.14957" target="_blank">multi-scale sub-band CQT discriminator</a> and a <a href="https://arxiv.org/abs/2306.06546" target="_blank">multi-scale mel spectrogram loss</a>.</li>
                <li>Larger training data: BigVGAN-v2 is trained using datasets containing diverse audio types, including speech in multiple languages, environmental sounds, and instruments.</li>
                <li>We provide pretrained checkpoints of BigVGAN-v2 using diverse audio configurations, supporting up to a 44 kHz sampling rate and a 512x upsampling ratio. See the table below for the links.</li>
            </ul>
        </div>
        """
    )
    gr.HTML(
        """
        <div>
            <h3>Model Overview</h3>
            BigVGAN is a universal neural vocoder model that generates audio waveforms using mel spectrograms as input.
            <center><img src="https://user-images.githubusercontent.com/15963413/218609148-881e39df-33af-4af9-ab95-1427c4ebf062.png" width="800" style="margin-top: 20px; border-radius: 15px;"></center>
        </div>
        """
    )
    with gr.Accordion("Input"):

        model_choice = gr.Dropdown(
            label="Select the model to use",
            info="The default model is bigvgan_v2_24khz_100band_256x",
            value="bigvgan_v2_24khz_100band_256x",
            choices=LIST_MODEL_ID,
            interactive=True,
        )

        audio_input = gr.Audio(
            label="Input Audio", elem_id="input-audio", interactive=True
        )

        button = gr.Button("Submit")

    with gr.Accordion("Output"):
        with gr.Column():
            output_audio = gr.Audio(label="Output Audio", elem_id="output-audio")
            output_image = gr.Image(
                label="Output Mel Spectrogram", elem_id="output-image-gen"
            )

    button.click(
        inference_gradio,
        inputs=[audio_input, model_choice],
        outputs=[output_audio, output_image],
        concurrency_limit=10,
    )

    gr.Examples(
        [
            [
                os.path.join(os.path.dirname(__file__), "examples/jensen_24k.wav"),
                "bigvgan_v2_24khz_100band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/libritts_24k.wav"),
                "bigvgan_v2_24khz_100band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/queen_24k.wav"),
                "bigvgan_v2_24khz_100band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/dance_24k.wav"),
                "bigvgan_v2_24khz_100band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/megalovania_24k.wav"),
                "bigvgan_v2_24khz_100band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/hifitts_44k.wav"),
                "bigvgan_v2_44khz_128band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/musdbhq_44k.wav"),
                "bigvgan_v2_44khz_128band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/musiccaps1_44k.wav"),
                "bigvgan_v2_44khz_128band_256x",
            ],
            [
                os.path.join(os.path.dirname(__file__), "examples/musiccaps2_44k.wav"),
                "bigvgan_v2_44khz_128band_256x",
            ],
        ],
        fn=inference_gradio,
        inputs=[audio_input, model_choice],
        outputs=[output_audio, output_image],
    )

    # Define the data for the table
    data = {
        "Model Name": [
            "bigvgan_v2_44khz_128band_512x",
            "bigvgan_v2_44khz_128band_256x",
            "bigvgan_v2_24khz_100band_256x",
            "bigvgan_v2_22khz_80band_256x",
            "bigvgan_v2_22khz_80band_fmax8k_256x",
            "bigvgan_24khz_100band",
            "bigvgan_base_24khz_100band",
            "bigvgan_22khz_80band",
            "bigvgan_base_22khz_80band",
        ],
        "Sampling Rate": [
            "44 kHz",
            "44 kHz",
            "24 kHz",
            "22 kHz",
            "22 kHz",
            "24 kHz",
            "24 kHz",
            "22 kHz",
            "22 kHz",
        ],
        "Mel band": [128, 128, 100, 80, 80, 100, 100, 80, 80],
        "fmax": [22050, 22050, 12000, 11025, 8000, 12000, 12000, 8000, 8000],
        "Upsampling Ratio": [512, 256, 256, 256, 256, 256, 256, 256, 256],
        "Parameters": [
            "122M",
            "112M",
            "112M",
            "112M",
            "112M",
            "112M",
            "14M",
            "112M",
            "14M",
        ],
        "Dataset": [
            "Large-scale Compilation",
            "Large-scale Compilation",
            "Large-scale Compilation",
            "Large-scale Compilation",
            "Large-scale Compilation",
            "LibriTTS",
            "LibriTTS",
            "LibriTTS + VCTK + LJSpeech",
            "LibriTTS + VCTK + LJSpeech",
        ],
        "Fine-Tuned": ["No", "No", "No", "No", "No", "No", "No", "No", "No"],
    }

    base_url = "https://huggingface.co/nvidia/"

    df = pd.DataFrame(data)
    df["Model Name"] = df["Model Name"].apply(
        lambda x: f'<a href="{base_url}{x}">{x}</a>'
    )

    html_table = gr.HTML(
        f"""
        <div style="text-align: center;">
            {df.to_html(index=False, escape=False, classes='border="1" cellspacing="0" cellpadding="5" style="margin-left: auto; margin-right: auto;')}
            <p><b>NOTE: The v1 models are trained using speech audio datasets ONLY! (24 kHz models: LibriTTS, 22 kHz models: LibriTTS + VCTK + LJSpeech).</b></p>
        </div>
        """
    )

iface.queue()
iface.launch()
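
Editor's note: for readers who want to reproduce the vocoding step outside the Gradio/Spaces wrapper, below is a minimal standalone sketch of the same inference path implemented in inference_model above. It is not part of this commit; it assumes the repository's meldataset and bigvgan modules are importable and that soundfile is installed (it is listed in requirements.txt), and the input/output file names are placeholders.

# standalone_bigvgan_inference.py -- illustrative sketch, not part of this commit
import torch
import librosa
import soundfile as sf

from bigvgan import BigVGAN
from meldataset import get_mel_spectrogram, MAX_WAV_VALUE

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the demo's default checkpoint and prepare it for inference
model = BigVGAN.from_pretrained("nvidia/bigvgan_v2_24khz_100band_256x")
model.remove_weight_norm()
model.eval().to(device)

# Load audio the same way the demo does: mono, resampled to the model's rate, peak-normalized
wav, _ = librosa.load("input.wav", sr=model.h.sampling_rate, mono=True)  # placeholder path
wav = librosa.util.normalize(wav) * 0.95

with torch.inference_mode():
    # Mel spectrogram computed with the model's own config, then vocoded back to a waveform
    mel = get_mel_spectrogram(torch.FloatTensor(wav).unsqueeze(0), model.h).to(device)
    audio_gen = model(mel).squeeze().cpu().numpy()  # [T], float in [-1, 1]

# Convert to int16 and write to disk, as inference_model does before returning
sf.write("output.wav", (audio_gen * MAX_WAV_VALUE).astype("int16"), model.h.sampling_rate)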
requirements.txt ADDED
@@ -0,0 +1,15 @@
torch
numpy
librosa>=0.8.1
scipy
tensorboard
soundfile
matplotlib
pesq
auraloss
tqdm
nnAudio
ninja
huggingface_hub>=0.23.4
gradio>=4.38.1
spaces>=0.28.3