KingNish committed
Commit 1bb807a · verified · 1 Parent(s): fdbe6f4

Update app.py

Files changed (1):
  1. app.py +55 -28
app.py CHANGED
@@ -261,34 +261,61 @@ def generate_music(
     vocals = np.concatenate(vocals, axis=1)
     instrumentals = np.concatenate(instrumentals, axis=1)
 
-    # convert audio tokens to audio
-    with torch.no_grad():
-        decoded_vocals = codec_model.decode(
-            torch.as_tensor(vocals.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
-                device))
-        decoded_instrumentals = codec_model.decode(
-            torch.as_tensor(instrumentals.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
-                device))
-
-    decoded_vocals = decoded_vocals.cpu().squeeze(0)
-    decoded_instrumentals = decoded_instrumentals.cpu().squeeze(0)
-    mixed_audio = (decoded_vocals + decoded_instrumentals) / 2
-
-    # Scale to be between -1 and 1 and convert to int16
-    limit = 0.99
-    max_val = mixed_audio.abs().max()
-    mixed_audio = mixed_audio * min(limit / max_val, 1) if rescale else mixed_audio.clamp(-limit, limit)
-    mixed_audio = (mixed_audio * 32767).to(torch.int16).numpy()
-
-    max_val = decoded_vocals.abs().max()
-    decoded_vocals = decoded_vocals * min(limit / max_val, 1) if rescale else decoded_vocals.clamp(-limit, limit)
-    decoded_vocals = (decoded_vocals * 32767).to(torch.int16).numpy()
-
-    max_val = decoded_instrumentals.abs().max()
-    decoded_instrumentals = decoded_instrumentals * min(limit / max_val, 1) if rescale else decoded_instrumentals.clamp(-limit, limit)
-    decoded_instrumentals = (decoded_instrumentals * 32767).to(torch.int16).numpy()
-
-    return (16000, mixed_audio), (16000, decoded_vocals), (16000, decoded_instrumentals)
+    vocal_save_path = os.path.join(stage1_output_dir, f"vocal_{random_id}".replace('.', '@') + '.npy')
+    inst_save_path = os.path.join(stage1_output_dir, f"instrumental_{random_id}".replace('.', '@') + '.npy')
+    np.save(vocal_save_path, vocals)
+    np.save(inst_save_path, instrumentals)
+    stage1_output_set.append(vocal_save_path)
+    stage1_output_set.append(inst_save_path)
+
+    print("Converting to Audio...")
+
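+    # helper reused for every stem below: peak-limit the waveform, then write 16-bit PCM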
+    # convert audio tokens to audio
+    def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
+        folder_path = os.path.dirname(path)
+        if not os.path.exists(folder_path):
+            os.makedirs(folder_path)
+        limit = 0.99
+        max_val = wav.abs().max()
+        wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
+        torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)
+
+    # reconstruct tracks
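+    # each stage-1 .npy holds codec token ids; decode them back to waveforms saved as .mp3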
+    recons_output_dir = os.path.join(output_dir, "recons")
+    recons_mix_dir = os.path.join(recons_output_dir, 'mix')
+    os.makedirs(recons_mix_dir, exist_ok=True)
+    tracks = []
+    for npy in stage1_output_set:
+        codec_result = np.load(npy)
+        decodec_rlt = []
+        with torch.no_grad():
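+            # unsqueeze + permute reshape the saved (codebooks, frames) token array
+            # into the (codebooks, batch=1, frames) layout the codec decoder expects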
+            decoded_waveform = codec_model.decode(
+                torch.as_tensor(codec_result.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(
+                    device))
+        decoded_waveform = decoded_waveform.cpu().squeeze(0)
+        decodec_rlt.append(torch.as_tensor(decoded_waveform))
+        decodec_rlt = torch.cat(decodec_rlt, dim=-1)
+        save_path = os.path.join(recons_output_dir, os.path.splitext(os.path.basename(npy))[0] + ".mp3")
+        tracks.append(save_path)
+        save_audio(decodec_rlt, save_path, 16000)
+    # mix tracks
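+    # stems pair by filename: each "instrumental_*" file has a "vocal_*" counterpart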
+    for inst_path in tracks:
+        try:
+            if (inst_path.endswith('.wav') or inst_path.endswith('.mp3')) \
+                    and 'instrumental' in inst_path:
+                # find pair
+                vocal_path = inst_path.replace('instrumental', 'vocal')
+                if not os.path.exists(vocal_path):
+                    continue
+                # mix
+                recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
+                vocal_stem, sr = sf.read(vocal_path)
+                instrumental_stem, _ = sf.read(inst_path)
+                mix_stem = (vocal_stem + instrumental_stem) / 1
+                return (16000, mix_stem), (16000, vocal_stem), (16000, instrumental_stem)
+        except Exception as e:
+            print(e)
+    return None, None, None
 
 
  def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=15):
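
The peak handling in the new save_audio helper keeps the behavior of the deleted inline code: with rescale=True the whole waveform is scaled down only when its peak exceeds 0.99, otherwise out-of-range samples are hard-clamped to ±0.99. A minimal standalone sketch of that expression (peak_limit is a hypothetical name and the sample values are made up):

    import torch

    def peak_limit(wav: torch.Tensor, rescale: bool = False, limit: float = 0.99) -> torch.Tensor:
        # Same expression as in save_audio: scale the whole signal so its peak
        # fits under `limit`, or clip anything outside [-limit, limit].
        max_val = wav.abs().max()
        return wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)

    x = torch.tensor([0.5, -1.5, 2.0])   # made-up samples with a 2.0 peak
    print(peak_limit(x, rescale=True))   # tensor([ 0.2475, -0.7425,  0.9900]) - dynamics preserved
    print(peak_limit(x, rescale=False))  # tensor([ 0.5000, -0.9900,  0.9900]) - peaks clipped

Rescaling keeps the relative dynamics of the track intact, while clamping (the default) only distorts samples that are already out of range. The (16000, ndarray) tuples returned by generate_music match the (sample_rate, data) format that Gradio's gr.Audio component accepts directly, assuming this app.py wires them to Audio outputs.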