YuE-music-generator-demo-zero

Paused

App Files Community

KingNish committed on Jan 30

Commit

a02a3fd

verified ·

1 Parent(s): ab8cd62

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -23

app.py CHANGED Viewed

@@ -70,7 +70,7 @@ from models.soundstream_hubert_new import SoundStream
 from vocoder import build_codec_model, process_audio
 from post_process_audio import replace_low_freq_with_energy_matched
-device = "cuda"
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
@@ -90,23 +90,18 @@ mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model"
 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
-# Load codec model
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
-codec_model = torch.compile(codec_model)
 codec_model.eval()
-# Preload and compile vocoders
 vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
 vocal_decoder.to(device)
 inst_decoder.to(device)
-vocal_decoder = torch.compile(vocal_decoder)
-inst_decoder = torch.compile(inst_decoder)
 vocal_decoder.eval()
 inst_decoder.eval()
-cuda_idx = 0
 def generate_music(
         max_new_tokens=5,
@@ -117,14 +112,13 @@ def generate_music(
         audio_prompt_path="",
         prompt_start_time=0.0,
         prompt_end_time=30.0,
         rescale=False,
 ):
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
-    # Initial setup with memory-only processing
-    # ------------------------------------------
     max_new_tokens = max_new_tokens * 100
-    stage1_output_data = {}
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
@@ -179,17 +173,7 @@ def generate_music(
         # Format text prompt
         run_n_segments = min(run_n_segments + 1, len(lyrics))
-        generation_config = {
-            'top_p': 0.93,
-            'temperature': 1.0,
-            'repetition_penalty': 1.2,
-            'top_k': 50,  # Faster than top_p alone
-            'num_beams': 1,  # Disable beam search
-            'max_new_tokens': max_new_tokens,
-            'min_new_tokens': 100,
-            'do_sample': True,
-            'use_cache': True,
-            }
         for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
@@ -226,7 +210,7 @@ def generate_music(
                 print(
                     f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
-            with torch.inference_mode(), torch.autocast(device_type=device, dtype=torch.float16):
                 output_seq = model.generate(
                     input_ids=input_ids,
                     max_new_tokens=max_new_tokens,
@@ -390,7 +374,8 @@ def generate_music(
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=10):
     # Execute the command
     try:
-        audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments, max_new_tokens=max_new_tokens)
         return audio_data
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))

 from vocoder import build_codec_model, process_audio
 from post_process_audio import replace_low_freq_with_energy_matched
+device = "cuda:0"
 model = AutoModelForCausalLM.from_pretrained(
     "m-a-p/YuE-s1-7B-anneal-en-cot",
 codectool = CodecManipulator("xcodec", 0, 1)
 model_config = OmegaConf.load(basic_model_config)
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
+codec_model.to(device)
 codec_model.eval()
 vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
 vocal_decoder.to(device)
 inst_decoder.to(device)
 vocal_decoder.eval()
 inst_decoder.eval()
 def generate_music(
         max_new_tokens=5,
         audio_prompt_path="",
         prompt_start_time=0.0,
         prompt_end_time=30.0,
+        cuda_idx=0,
         rescale=False,
 ):
     if use_audio_prompt and not audio_prompt_path:
         raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")
+    cuda_idx = cuda_idx
     max_new_tokens = max_new_tokens * 100
     with tempfile.TemporaryDirectory() as output_dir:
         stage1_output_dir = os.path.join(output_dir, f"stage1")
         # Format text prompt
         run_n_segments = min(run_n_segments + 1, len(lyrics))
+        print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
         for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
             section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
                 print(
                     f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
                 input_ids = input_ids[:, -(max_context):]
+            with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
                 output_seq = model.generate(
                     input_ids=input_ids,
                     max_new_tokens=max_new_tokens,
 def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=10):
     # Execute the command
     try:
+        audio_data = generate_music(genre_txt=genre_txt_content, lyrics_txt=lyrics_txt_content, run_n_segments=num_segments,
+                               cuda_idx=0, max_new_tokens=max_new_tokens)
         return audio_data
     except Exception as e:
         gr.Warning("An Error Occured: " + str(e))