YuE-music-generator-demo-zero

Paused

App Files Community

KingNish committed on Jan 31

Commit

193bc92

verified ·

1 Parent(s): 0f34fab

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -113

app.py CHANGED Viewed

@@ -97,14 +97,14 @@ codec_model.load_state_dict(parameter_dict['codec_model'])
 # codec_model = torch.compile(codec_model)
 codec_model.eval()
-# Preload and compile vocoders
-vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
-vocal_decoder.to(device)
-inst_decoder.to(device)
 # vocal_decoder = torch.compile(vocal_decoder)
 # inst_decoder = torch.compile(inst_decoder)
-vocal_decoder.eval()
-inst_decoder.eval()
 def generate_music(
@@ -227,9 +227,7 @@ def generate_music(
                     pad_token_id=mmtokenizer.eoa,
                     logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
                     guidance_scale=guidance_scale,
-                    use_cache=True,
-                    # top_k=50,
-                    # num_beams=1
                 )
                 if output_seq[0][-1].item() != mmtokenizer.eoa:
                     tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
@@ -247,8 +245,8 @@ def generate_music(
         if len(soa_idx) != len(eoa_idx):
             raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
-        vocals = []
-        instrumentals = []
         range_begin = 1 if use_audio_prompt else 0
         for i in range(range_begin, len(soa_idx)):
             codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
@@ -256,18 +254,11 @@ def generate_music(
                 codec_ids = codec_ids[1:]
             codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
             vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
-            vocals.append(vocals_ids)
             instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
-            instrumentals.append(instrumentals_ids)
-        vocals = np.concatenate(vocals, axis=1)
-        instrumentals = np.concatenate(instrumentals, axis=1)
-        vocal_save_path = os.path.join(stage1_output_dir, f"vocal_{random_id}".replace('.', '@') + '.npy')
-        inst_save_path = os.path.join(stage1_output_dir, f"instrumental_{random_id}".replace('.', '@') + '.npy')
-        np.save(vocal_save_path, vocals)
-        np.save(inst_save_path, instrumentals)
-        stage1_output_set.append(vocal_save_path)
-        stage1_output_set.append(inst_save_path)
         print("Converting to Audio...")
@@ -286,103 +277,41 @@ def generate_music(
         recons_output_dir = os.path.join(output_dir, "recons")
         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
         os.makedirs(recons_mix_dir, exist_ok=True)
-        tracks = []
-        for npy in stage1_output_set:
-            codec_result = np.load(npy)
-            decodec_rlt = []
-            with torch.no_grad():
-                decoded_waveform = codec_model.decode(
-                    torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
-                        device))
-            decoded_waveform = decoded_waveform.cpu().squeeze(0)
-            decodec_rlt.append(torch.as_tensor(decoded_waveform))
-            decodec_rlt = torch.cat(decodec_rlt, dim=-1)
-            save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
-            tracks.append(save_path)
-            save_audio(decodec_rlt, save_path, 16000)
-        # mix tracks
-        for inst_path in tracks:
-            try:
-                if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
-                        and 'instrumental' in inst_path:
-                    # find pair
-                    vocal_path = inst_path.replace('instrumental', 'vocal')
-                    if not os.path.exists(vocal_path):
-                        continue
-                    # mix
-                    recons_mix = os.path.join(recons_mix_dir,
-                                              os.path.basename(inst_path).replace('instrumental', 'mixed'))
-                    vocal_stem, sr = sf.read(inst_path)
-                    instrumental_stem, _ = sf.read(vocal_path)
-                    mix_stem = (vocal_stem + instrumental_stem) / 1
-                    sf.write(recons_mix, mix_stem, sr)
-            except Exception as e:
-                print(e)
-        # vocoder to upsample audios
-        vocoder_output_dir = os.path.join(output_dir, 'vocoder')
-        vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
-        vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
-        os.makedirs(vocoder_mix_dir, exist_ok=True)
-        os.makedirs(vocoder_stems_dir, exist_ok=True)
-        instrumental_output = None
-        vocal_output = None
-        for npy in stage1_output_set:
-            if 'instrumental' in npy:
-                # Process instrumental
-                instrumental_output = process_audio(
-                    npy,
-                    os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
-                    rescale,
-                    argparse.Namespace(**locals()),  # Convert local variables to argparse.Namespace
-                    inst_decoder,
-                    codec_model
-                )
-            else:
-                # Process vocal
-                vocal_output = process_audio(
-                    npy,
-                    os.path.join(vocoder_stems_dir, 'vocal.mp3'),
-                    rescale,
-                    argparse.Namespace(**locals()),  # Convert local variables to argparse.Namespace
-                    vocal_decoder,
-                    codec_model
-                )
-        # mix tracks
-        try:
-            mix_output = instrumental_output + vocal_output
-            vocoder_mix = os.path.join(vocoder_mix_dir, os.path.basename(recons_mix))
-            save_audio(mix_output, vocoder_mix, 44100, rescale)
-            print(f"Created mix: {vocoder_mix}")
-        except RuntimeError as e:
-            print(e)
-            print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
-        # Post process
-        final_output_path = os.path.join(output_dir, os.path.basename(recons_mix))
-        replace_low_freq_with_energy_matched(
-            a_file=recons_mix,  # 16kHz
-            b_file=vocoder_mix,  # 48kHz
-            c_file=final_output_path,
-            cutoff_freq=5500.0
-        )
         print("All process Done")
-        # Load the final audio file and return the numpy array
-        final_audio, sr = torchaudio.load(final_output_path)
-        return (sr, final_audio.squeeze().numpy())
 @spaces.GPU(duration=120)
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
     # Execute the command
     try:
-        audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
                                cuda_idx=0, max_new_tokens=max_new_tokens)
-        return audio_data
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))
-        return None
     finally:
         print("Temporary files deleted.")
@@ -411,10 +340,13 @@ with gr.Blocks() as demo:
             with gr.Column():
                 num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
-                max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15,
-                                           interactive=True)
                 submit_btn = gr.Button("Submit")
-                music_out = gr.Audio(label="Audio Result")
         gr.Examples(
             examples=[
@@ -460,15 +392,17 @@ Living out my dreams with this mic and a deal
                 ]
             ],
             inputs=[genre_txt, lyrics_txt],
-            outputs=[music_out],
             cache_examples=True,
             cache_mode="eager",
             fn=infer
         )
     submit_btn.click(
         fn=infer,
         inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
-        outputs=[music_out]
     )
 demo.queue().launch(show_error=True)

 # codec_model = torch.compile(codec_model)
 codec_model.eval()
+# Preload and compile vocoders - Not using vocoder now
+# vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
+# vocal_decoder.to(device)
+# inst_decoder.to(device)
 # vocal_decoder = torch.compile(vocal_decoder)
 # inst_decoder = torch.compile(inst_decoder)
+# vocal_decoder.eval()
+# inst_decoder.eval()
 def generate_music(
                     pad_token_id=mmtokenizer.eoa,
                     logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
                     guidance_scale=guidance_scale,
+                    use_cache=True
                 )
                 if output_seq[0][-1].item() != mmtokenizer.eoa:
                     tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
         if len(soa_idx) != len(eoa_idx):
             raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')
+        vocals_codec_results = []
+        instrumentals_codec_results = []
         range_begin = 1 if use_audio_prompt else 0
         for i in range(range_begin, len(soa_idx)):
             codec_ids = ids[soa_idx[i] + 1:eoa_idx[i]]
                 codec_ids = codec_ids[1:]
             codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
             vocals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[0])
+            vocals_codec_results.append(vocals_ids)
             instrumentals_ids = codectool.ids2npy(rearrange(codec_ids, "(n b) -> b n", b=2)[1])
+            instrumentals_codec_results.append(instrumentals_ids)
+        vocals_codec_result = np.concatenate(vocals_codec_results, axis=1)
+        instrumentals_codec_result = np.concatenate(instrumentals_codec_results, axis=1)
         print("Converting to Audio...")
         recons_output_dir = os.path.join(output_dir, "recons")
         recons_mix_dir = os.path.join(recons_output_dir, 'mix')
         os.makedirs(recons_mix_dir, exist_ok=True)
+        # Decode vocals
+        with torch.no_grad():
+            decoded_vocals_waveform = codec_model.decode(
+                torch.as_tensor(vocals_codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
+        decoded_vocals_waveform = decoded_vocals_waveform.cpu().squeeze(0)
+        # Decode instrumentals
+        with torch.no_grad():
+            decoded_instrumentals_waveform = codec_model.decode(
+                torch.as_tensor(instrumentals_codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
+        decoded_instrumentals_waveform = decoded_instrumentals_waveform.cpu().squeeze(0)
+        # Mix tracks
+        mixed_waveform = (decoded_vocals_waveform + decoded_instrumentals_waveform) / 1.0
+        vocal_sr = 16000
+        instrumental_sr = 16000
+        mixed_sr = 16000
         print("All process Done")
+        return (mixed_sr, mixed_waveform.numpy()), (vocal_sr, decoded_vocals_waveform.numpy()), (instrumental_sr, decoded_instrumentals_waveform.numpy())
 @spaces.GPU(duration=120)
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
     # Execute the command
     try:
+        mixed_audio_data, vocal_audio_data, instrumental_audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
                                cuda_idx=0, max_new_tokens=max_new_tokens)
+        return mixed_audio_data, vocal_audio_data, instrumental_audio_data
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))
+        return None, None, None
     finally:
         print("Temporary files deleted.")
             with gr.Column():
                 num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
+                max_new_tokens = gr.Slider(label="Duration of song", minimum=1, maximum=30, step=1, value=15, interactive=True)
                 submit_btn = gr.Button("Submit")
+                music_out_mix = gr.Audio(label="Final Audio Result", interactive=False)
+                with gr.Accordion(label="Vocal and Instrumental Result", open=False):
+                    music_out_vocals = gr.Audio(label="Vocal Audio Result", interactive=False)
+                    music_out_instrumental = gr.Audio(label="Instrumental Audio Result", interactive=False)
         gr.Examples(
             examples=[
                 ]
             ],
             inputs=[genre_txt, lyrics_txt],
+            outputs=[music_out_mix, music_out_vocals, music_out_instrumental],
             cache_examples=True,
             cache_mode="eager",
             fn=infer
         )
+        gr.Markdown("## We are actively working on improving YuE, and welcome community contributions! Feel free to submit PRs to enhance the model and demo.")
     submit_btn.click(
         fn=infer,
         inputs=[genre_txt, lyrics_txt, num_segments, max_new_tokens],
+        outputs=[music_out_mix, music_out_vocals, music_out_instrumental]
     )
 demo.queue().launch(show_error=True)