KingNish committed on
Commit 85b4489 · verified · 1 Parent(s): a1a370d

Update app.py

Files changed (1)
  1. app.py +38 -74
app.py CHANGED
@@ -67,8 +67,8 @@ import time
import copy
from collections import Counter
from models.soundstream_hubert_new import SoundStream
- from vocoder import build_codec_model, process_audio
- from post_process_audio import replace_low_freq_with_energy_matched
+ #from vocoder import build_codec_model, process_audio # removed vocoder
+ #from post_process_audio import replace_low_freq_with_energy_matched # removed post process

device = "cuda:0"

@@ -82,9 +82,9 @@ model.eval()

basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
- config_path = './xcodec_mini_infer/decoders/config.yaml'
- vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth'
- inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth'
+ #config_path = './xcodec_mini_infer/decoders/config.yaml' # removed vocoder
+ #vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth' # removed vocoder
+ #inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth' # removed vocoder

mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")

@@ -97,14 +97,15 @@ codec_model.load_state_dict(parameter_dict['codec_model'])
# codec_model = torch.compile(codec_model)
codec_model.eval()

- # Preload and compile vocoders - Not using vocoder now
- # vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
- # vocal_decoder.to(device)
- # inst_decoder.to(device)
- # vocal_decoder = torch.compile(vocal_decoder)
- # inst_decoder = torch.compile(inst_decoder)
- # vocal_decoder.eval()
- # inst_decoder.eval()
+ # Preload and compile vocoders # removed vocoder
+ #vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
+ #vocal_decoder.to(device)
+ #inst_decoder.to(device)
+ #vocal_decoder = torch.compile(vocal_decoder)
+ #inst_decoder = torch.compile(inst_decoder)
+ #vocal_decoder.eval()
+ #inst_decoder.eval()
+

@spaces.GPU(duration=120)
def generate_music(
@@ -245,8 +246,8 @@ def generate_music(
        if len(soa_idx) != len(eoa_idx):
            raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')

-         vocals_codec_results = []
-         instrumentals_codec_results = []
+         vocals = []
+         instrumentals = []
        range_begin = 1 if use_audio_prompt else 0
        for i in range(range_begin, len(soa_idx)):
            codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
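
For readers following the hunk above: each start-of-audio/end-of-audio (soa/eoa) marker pair brackets one generated segment, which is why the counts must match before the tokens between each pair are sliced out. A minimal sketch of that pairing on a toy token array; the marker ids 9001 and 9002 are hypothetical stand-ins, not the real vocabulary ids from mmtokenizer:

# Toy illustration (not from app.py): how paired soa/eoa markers bracket segments.
import numpy as np

SOA, EOA = 9001, 9002  # hypothetical marker ids; the real ones come from the tokenizer
ids = np.array([SOA, 11, 12, 13, EOA, SOA, 14, 15, EOA])

soa_idx = np.where(ids == SOA)[0]
eoa_idx = np.where(ids == EOA)[0]
assert len(soa_idx) == len(eoa_idx)  # same check as the ValueError guard above

segments = [ids[s + 1:e] for s, e in zip(soa_idx, eoa_idx)]
print(segments)  # [array([11, 12, 13]), array([14, 15])]
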
@@ -254,63 +255,27 @@ def generate_music(
                codec_ids = codec_ids[1:]
            codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
            vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
-             vocals_codec_results.append(vocals_ids)
+             vocals.append(vocals_ids)
            instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
-             instrumentals_codec_results.append(instrumentals_ids)
-         vocals_codec_result = np.concatenate(vocals_codec_results, axis=1)
-         instrumentals_codec_result = np.concatenate(instrumentals_codec_results, axis=1)
-
-
-         print("Converting to Audio...")
-
-         # convert audio tokens to audio
-         def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
-             folder_path = os.path.dirname(path)
-             if not os.path.exists(folder_path):
-                 os.makedirs(folder_path)
-             limit = 0.99
-             max_val = wav.abs().max()
-             wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
-             torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
-
-         # reconstruct tracks
-         recons_output_dir = os.path.join(output_dir, "recons")
-         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
-         os.makedirs(recons_mix_dir, exist_ok=True)
-
-         # Decode vocals
-         with torch.no_grad():
-             decoded_vocals_waveform = codec_model.decode(
-                 torch.as_tensor(vocals_codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
-             decoded_vocals_waveform = decoded_vocals_waveform.cpu().squeeze(0)
+             instrumentals.append(instrumentals_ids)
+         vocals = np.concatenate(vocals, axis=1)
+         instrumentals = np.concatenate(instrumentals, axis=1)

-         # Decode instrumentals
+         # convert audio tokens to audio
        with torch.no_grad():
-             decoded_instrumentals_waveform = codec_model.decode(
-                 torch.as_tensor(instrumentals_codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
-             decoded_instrumentals_waveform = decoded_instrumentals_waveform.cpu().squeeze(0)
+             decoded_vocals = codec_model.decode(
+                 torch.as_tensor(vocals.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
+                     device))
+             decoded_instrumentals = codec_model.decode(
+                 torch.as_tensor(instrumentals.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
+                     device))

-         # Mix tracks
-         mixed_waveform = (decoded_vocals_waveform + decoded_instrumentals_waveform) / 1.0
+         decoded_vocals = decoded_vocals.cpu().squeeze(0)
+         decoded_instrumentals = decoded_instrumentals.cpu().squeeze(0)
+         mixed_audio = (decoded_vocals + decoded_instrumentals) / 2

-         vocal_sr = 16000
-         instrumental_sr = 16000
-         mixed_sr = 16000
-
-         # added scaling to the audio
-         limit = 0.99
-         max_val = np.max(np.abs(mixed_waveform))
-         mixed_waveform = mixed_waveform * min(limit / max_val, 1)
+         return (16000, mixed_audio.numpy()), (16000, decoded_vocals.numpy()), (16000, decoded_instrumentals.numpy())

-         max_val = np.max(np.abs(decoded_vocals_waveform))
-         decoded_vocals_waveform = decoded_vocals_waveform * min(limit/ max_val, 1)
-
-         max_val = np.max(np.abs(decoded_instrumentals_waveform))
-         decoded_instrumentals_waveform = decoded_instrumentals_waveform * min(limit/max_val,1)
-
-         print("All process Done")
-
-         return (mixed_sr, mixed_waveform.numpy()), (vocal_sr, decoded_vocals_waveform.numpy()), (instrumental_sr, decoded_instrumentals_waveform.numpy())

def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
    # Execute the command
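
A note on the de-interleaving used in the hunk above: the generated codec stream alternates vocal and instrumental tokens, and `rearrange(codec_ids, "(n b) -> b n", b=2)` separates the two tracks (row 0 gets the even positions, row 1 the odd ones). A self-contained sketch with invented token values:

# Toy illustration (not from app.py): splitting an interleaved vocal/instrumental stream.
import numpy as np
from einops import rearrange

codec_ids = np.array([10, 20, 11, 21, 12, 22])  # vocal, instrumental, vocal, instrumental, ...
tracks = rearrange(codec_ids, "(n b) -> b n", b=2)

vocals_ids = tracks[0]         # array([10, 11, 12]), even positions
instrumentals_ids = tracks[1]  # array([20, 21, 22]), odd positions
print(vocals_ids, instrumentals_ids)

On the mixing step: averaging the two decoded stems with `/ 2` keeps the mix within [-1, 1] whenever each stem is itself within [-1, 1], which is the role the removed peak-scaling block used to play for the summed signal.
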
@@ -351,11 +316,11 @@ with gr.Blocks() as demo:
                num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
                max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                submit_btn = gr.Button("Submit")
-                 music_out_mix = gr.Audio(label="Final Audio Result", interactive=False)
-                 with gr.Accordion(label="Vocal and Instrumental Result", open=False):
-                     music_out_vocals = gr.Audio(label="Vocal Audio Result", interactive=False)
-                     music_out_instrumental = gr.Audio(label="Instrumental Audio Result", interactive=False)

+                 music_out = gr.Audio(label="Mixed Audio Result")
+                 with gr.Accordion(label="Vocal and Instrumental Result", open=False):
+                     vocal_out = gr.Audio(label="Vocal Audio")
+                     instrumental_out = gr.Audio(label="Instrumental Audio")

        gr.Examples(
            examples=[
@@ -401,17 +366,16 @@ Living out my dreams with this mic and a deal
                ]
            ],
            inputs=[genre_txt, lyrics_txt],
-             outputs=[music_out_mix, music_out_vocals, music_out_instrumental],
+             outputs=[music_out, vocal_out, instrumental_out],
            cache_examples=True,
            cache_mode="eager",
            fn=infer
        )

-     gr.Markdown("## We are actively working on improving YuE, and welcome community contributions! Feel free to submit PRs to enhance the model and demo.")
-
    submit_btn.click(
        fn=infer,
        inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
-         outputs=[music_out_mix, music_out_vocals, music_out_instrumental]
+         outputs=[music_out, vocal_out, instrumental_out]
    )
+     gr.Markdown("## Call for Contributions\nIf you find this space interesting please feel free to contribute.")
demo.queue().launch(show_error=True)
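
The wiring above works because `gr.Audio` accepts a `(sample_rate, numpy_array)` tuple as an output value, which is exactly what the updated `generate_music` (via `infer`) returns for the mixed, vocal, and instrumental tracks. A minimal, self-contained sketch of the same pattern, with a sine-tone generator standing in for the model:

# Toy illustration (not from app.py): three (sample_rate, np.ndarray) outputs feeding gr.Audio.
import numpy as np
import gradio as gr

def fake_infer():
    sr = 16000
    t = np.linspace(0, 1, sr, endpoint=False)
    vocal = 0.3 * np.sin(2 * np.pi * 440 * t)  # stand-in "vocal"
    inst = 0.3 * np.sin(2 * np.pi * 220 * t)   # stand-in "instrumental"
    mix = (vocal + inst) / 2
    return (sr, mix), (sr, vocal), (sr, inst)

with gr.Blocks() as demo:
    btn = gr.Button("Generate")
    mix_out = gr.Audio(label="Mixed Audio Result")
    vocal_out = gr.Audio(label="Vocal Audio")
    inst_out = gr.Audio(label="Instrumental Audio")
    btn.click(fn=fake_infer, outputs=[mix_out, vocal_out, inst_out])

demo.queue().launch()
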
 