YuE-music-generator-demo-zero

Paused

App Files Community

KingNish committed on Jan 30

Commit

c3cdb06

verified ·

1 Parent(s): 75625eb

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -7

app.py CHANGED Viewed

@@ -90,15 +90,19 @@ mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model"
 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
-codec_model.to(device)
 codec_model.eval()
 vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
 vocal_decoder.to(device)
 inst_decoder.to(device)
 vocal_decoder.eval()
 inst_decoder.eval()
@@ -112,13 +116,14 @@ def generate_music(
         audio_prompt_path="",
         prompt_start_time=0.0,
         prompt_end_time=30.0,
-        cuda_idx=0,
         rescale=False,
 ):
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
-    cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
@@ -173,7 +178,17 @@ def generate_music(
         # Format text prompt
         run_n_segments = min(run_n_segments + 1, len(lyrics))
-        print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
         for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
@@ -210,7 +225,7 @@ def generate_music(
                 print(
                     f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
-            with torch.no_grad():
                 output_seq = model.generate(
                     input_ids=input_ids,
                     max_new_tokens=max_new_tokens,
@@ -374,8 +389,7 @@ def generate_music(
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=10):
     # Execute the command
     try:
-        audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
-                               cuda_idx=0, max_new_tokens=max_new_tokens)
         return audio_data
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))

 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
+# Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
+codec_model = torch.compile(codec_model)
 codec_model.eval()
+# Preload and compile vocoders
 vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
 vocal_decoder.to(device)
 inst_decoder.to(device)
+vocal_decoder = torch.compile(vocal_decoder)
+inst_decoder = torch.compile(inst_decoder)
 vocal_decoder.eval()
 inst_decoder.eval()
         audio_prompt_path="",
         prompt_start_time=0.0,
         prompt_end_time=30.0,
         rescale=False,
 ):
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
+    # Initial setup with memory-only processing
+    # ------------------------------------------
     max_new_tokens = max_new_tokens * 100
+    stage1_output_data = {}
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         # Format text prompt
         run_n_segments = min(run_n_segments + 1, len(lyrics))
+        generation_config = {
+            'top_p': 0.93,
+            'temperature': 1.0,
+            'repetition_penalty': 1.2,
+            'top_k': 50,  # Faster than top_p alone
+            'num_beams': 1,  # Disable beam search
+            'max_new_tokens': max_new_tokens,
+            'min_new_tokens': 100,
+            'do_sample': True,
+            'use_cache': True,
+            }
         for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
                 print(
                     f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
+            with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
                 output_seq = model.generate(
                     input_ids=input_ids,
                     max_new_tokens=max_new_tokens,
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=10):
     # Execute the command
     try:
+        audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments, max_new_tokens=max_new_tokens)
         return audio_data
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))