KingNish committed
Commit 5c9769d
Parent: 9539c50

removed stage 2, just to test what happens

Files changed (2):
  1. app.py +1 -1
  2. inference/infer.py +6 -147
app.py CHANGED
@@ -124,7 +124,7 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=
     command = [
         "python", "infer.py",
         "--stage1_model", "m-a-p/YuE-s1-7B-anneal-en-cot",
-        "--stage2_model", "m-a-p/YuE-s2-1B-general",
+        # "--stage2_model", "m-a-p/YuE-s2-1B-general",
         "--genre_txt", f"{genre_txt_path}",
         "--lyrics_txt", f"{lyrics_txt_path}",
         "--run_n_segments", f"{num_segments}",
inference/infer.py CHANGED
@@ -30,10 +30,8 @@ import re
 parser = argparse.ArgumentParser()
 # Model Configuration:
 parser.add_argument("--stage1_model", type=str, default="m-a-p/YuE-s1-7B-anneal-en-cot", help="The model checkpoint path or identifier for the Stage 1 model.")
-parser.add_argument("--stage2_model", type=str, default="m-a-p/YuE-s2-1B-general", help="The model checkpoint path or identifier for the Stage 2 model.")
 parser.add_argument("--max_new_tokens", type=int, default=3000, help="The maximum number of new tokens to generate in one pass during text generation.")
 parser.add_argument("--run_n_segments", type=int, default=2, help="The number of segments to process during the generation.")
-parser.add_argument("--stage2_batch_size", type=int, default=4, help="The batch size used in Stage 2 inference.")
 # Prompt
 parser.add_argument("--genre_txt", type=str, required=True, help="The file path to a text file containing genre tags that describe the musical style or characteristics (e.g., instrumental, genre, mood, vocal timbre, vocal gender). This is used as part of the generation prompt.")
 parser.add_argument("--lyrics_txt", type=str, required=True, help="The file path to a text file containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process.")
@@ -59,13 +57,10 @@ args = parser.parse_args()
 if args.use_audio_prompt and not args.audio_prompt_path:
     raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
 stage1_model = args.stage1_model
-stage2_model = args.stage2_model
 cuda_idx = args.cuda_idx
 max_new_tokens = args.max_new_tokens
 stage1_output_dir = os.path.join(args.output_dir, f"stage1")
-stage2_output_dir = stage1_output_dir.replace('stage1', 'stage2')
 os.makedirs(stage1_output_dir, exist_ok=True)
-os.makedirs(stage2_output_dir, exist_ok=True)
 
 # load tokenizer and model
 device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
@@ -83,7 +78,6 @@ model.to(device)
 model.eval()
 
 codectool = CodecManipulator("xcodec", 0, 1)
-codectool_stage2 = CodecManipulator("xcodec", 0, 8)
 model_config = OmegaConf.load(args.basic_model_config)
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(args.resume_path, map_location='cpu')
@@ -237,145 +231,8 @@ if not args.disable_offload_model:
     del model
     torch.cuda.empty_cache()
 
-print("Stage 2 inference...")
-model_stage2 = AutoModelForCausalLM.from_pretrained(
-    stage2_model,
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2"
-)
-model_stage2.to(device)
-model_stage2.eval()
-
-def stage2_generate(model, prompt, batch_size=16):
-    codec_ids = codectool.unflatten(prompt, n_quantizer=1)
-    codec_ids = codectool.offset_tok_ids(
-        codec_ids,
-        global_offset=codectool.global_offset,
-        codebook_size=codectool.codebook_size,
-        num_codebooks=codectool.num_codebooks,
-    ).astype(np.int32)
-
-    # Prepare prompt_ids based on batch size or single input
-    if batch_size > 1:
-        codec_list = []
-        for i in range(batch_size):
-            idx_begin = i * 300
-            idx_end = (i + 1) * 300
-            codec_list.append(codec_ids[:, idx_begin:idx_end])
-
-        codec_ids = np.concatenate(codec_list, axis=0)
-        prompt_ids = np.concatenate(
-            [
-                np.tile([mmtokenizer.soa, mmtokenizer.stage_1], (batch_size, 1)),
-                codec_ids,
-                np.tile([mmtokenizer.stage_2], (batch_size, 1)),
-            ],
-            axis=1
-        )
-    else:
-        prompt_ids = np.concatenate([
-            np.array([mmtokenizer.soa, mmtokenizer.stage_1]),
-            codec_ids.flatten(),  # Flatten the 2D array to 1D
-            np.array([mmtokenizer.stage_2])
-        ]).astype(np.int32)
-        prompt_ids = prompt_ids[np.newaxis, ...]
-
-    codec_ids = torch.as_tensor(codec_ids).to(device)
-    prompt_ids = torch.as_tensor(prompt_ids).to(device)
-    len_prompt = prompt_ids.shape[-1]
-
-    block_list = LogitsProcessorList([BlockTokenRangeProcessor(0, 46358), BlockTokenRangeProcessor(53526, mmtokenizer.vocab_size)])
-
-    # Teacher forcing generate loop
-    for frames_idx in range(codec_ids.shape[1]):
-        cb0 = codec_ids[:, frames_idx:frames_idx+1]
-        prompt_ids = torch.cat([prompt_ids, cb0], dim=1)
-        input_ids = prompt_ids
-
-        with torch.no_grad():
-            stage2_output = model.generate(input_ids=input_ids,
-                min_new_tokens=7,
-                max_new_tokens=7,
-                eos_token_id=mmtokenizer.eoa,
-                pad_token_id=mmtokenizer.eoa,
-                logits_processor=block_list,
-            )
-
-        assert stage2_output.shape[1] - prompt_ids.shape[1] == 7, f"output new tokens={stage2_output.shape[1]-prompt_ids.shape[1]}"
-        prompt_ids = stage2_output
+print("Converting to Audio...")
 
-    # Return output based on batch size
-    if batch_size > 1:
-        output = prompt_ids.cpu().numpy()[:, len_prompt:]
-        output_list = [output[i] for i in range(batch_size)]
-        output = np.concatenate(output_list, axis=0)
-    else:
-        output = prompt_ids[0].cpu().numpy()[len_prompt:]
-
-    return output
-
-def stage2_inference(model, stage1_output_set, stage2_output_dir, batch_size=4):
-    stage2_result = []
-    for i in tqdm(range(len(stage1_output_set))):
-        output_filename = os.path.join(stage2_output_dir, os.path.basename(stage1_output_set[i]))
-
-        if os.path.exists(output_filename):
-            print(f'{output_filename} stage2 has done.')
-            continue
-
-        # Load the prompt
-        prompt = np.load(stage1_output_set[i]).astype(np.int32)
-
-        # Only accept 6s segments
-        output_duration = prompt.shape[-1] // 50 // 6 * 6
-        num_batch = output_duration // 6
-
-        if num_batch <= batch_size:
-            # If num_batch is less than or equal to batch_size, we can infer the entire prompt at once
-            output = stage2_generate(model, prompt[:, :output_duration*50], batch_size=num_batch)
-        else:
-            # If num_batch is greater than batch_size, process in chunks of batch_size
-            segments = []
-            num_segments = (num_batch // batch_size) + (1 if num_batch % batch_size != 0 else 0)
-
-            for seg in range(num_segments):
-                start_idx = seg * batch_size * 300
-                # Ensure the end_idx does not exceed the available length
-                end_idx = min((seg + 1) * batch_size * 300, output_duration*50)  # Adjust the last segment
-                current_batch_size = batch_size if seg != num_segments-1 or num_batch % batch_size == 0 else num_batch % batch_size
-                segment = stage2_generate(
-                    model,
-                    prompt[:, start_idx:end_idx],
-                    batch_size=current_batch_size
-                )
-                segments.append(segment)
-
-            # Concatenate all the segments
-            output = np.concatenate(segments, axis=0)
-
-        # Process the ending part of the prompt
-        if output_duration*50 != prompt.shape[-1]:
-            ending = stage2_generate(model, prompt[:, output_duration*50:], batch_size=1)
-            output = np.concatenate([output, ending], axis=0)
-        output = codectool_stage2.ids2npy(output)
-
-        # Fix invalid codes (a dirty solution, which may harm the quality of audio)
-        # We are trying to find better one
-        fixed_output = copy.deepcopy(output)
-        for i, line in enumerate(output):
-            for j, element in enumerate(line):
-                if element < 0 or element > 1023:
-                    counter = Counter(line)
-                    most_frequant = sorted(counter.items(), key=lambda x: x[1], reverse=True)[0][0]
-                    fixed_output[i, j] = most_frequant
-        # save output
-        np.save(output_filename, fixed_output)
-        stage2_result.append(output_filename)
-    return stage2_result
-
-stage2_result = stage2_inference(model_stage2, stage1_output_set, stage2_output_dir, batch_size=args.stage2_batch_size)
-print(stage2_result)
-print('Stage 2 DONE.\n')
 # convert audio tokens to audio
 def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
     folder_path = os.path.dirname(path)
@@ -390,7 +247,7 @@ recons_output_dir = os.path.join(args.output_dir, "recons")
 recons_mix_dir = os.path.join(recons_output_dir, 'mix')
 os.makedirs(recons_mix_dir, exist_ok=True)
 tracks = []
-for npy in stage2_result:
+for npy in stage1_output_set:
     codec_result = np.load(npy)
     decodec_rlt=[]
     with torch.no_grad():
@@ -426,7 +283,8 @@ vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
 vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
 os.makedirs(vocoder_mix_dir, exist_ok=True)
 os.makedirs(vocoder_stems_dir, exist_ok=True)
-for npy in stage2_result:
+
+for npy in stage1_output_set:
     if 'instrumental' in npy:
         # Process instrumental
         instrumental_output = process_audio(
@@ -463,4 +321,5 @@ replace_low_freq_with_energy_matched(
     b_file=vocoder_mix, # 48kHz
     c_file=os.path.join(args.output_dir, os.path.basename(recons_mix)),
     cutoff_freq=5500.0
-)
+)
+print("All process Done")