YuE-music-generator-demo-zero

Runtime error

App Files Files Community

KingNish committed on Feb 7

Commit

c11e52c

verified ·

1 Parent(s): 25dc2c1

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -71

app.py CHANGED Viewed

@@ -118,7 +118,6 @@ def split_lyrics(lyrics: str):
     structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
     return structured_lyrics
-@spaces.GPU(duration=178)
 def generate_music(
         genre_txt=None,
         lyrics_txt=None,
@@ -168,70 +167,69 @@ def generate_music(
         # Format text prompt
         run_n_segments = min(run_n_segments, len(lyrics)) + 1
-        print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
-        for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
-            print(str(i) +". " + str(p) + "\n\n")
-            section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-            guidance_scale = 1.5 if i <= 1 else 1.2  # Guidance scale adjusted based on segment index
-            if i == 0:
-                continue
-            if i == 1:
-                if use_audio_prompt:
-                    audio_prompt = load_audio_mono(audio_prompt_path)
-                    audio_prompt.unsqueeze_(0)
-                    with torch.no_grad():
-                        raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
-                    raw_codes = raw_codes.transpose(0, 1)
-                    raw_codes = raw_codes.cpu().numpy().astype(np.int16)
-                    # Format audio prompt
-                    code_ids = codectool.npy2ids(raw_codes[0])
-                    audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]  # 50 is tps of xcodec
-                    audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
-                        mmtokenizer.eoa]
-                    sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
-                        "[end_of_reference]")
-                    head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
                 else:
-                    head_id = mmtokenizer.tokenize(prompt_texts[0])
-                prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-            else:
-                prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-            prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
-            input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
-            # Use window slicing in case output sequence exceeds the context of model
-            max_context = 16384 - max_new_tokens - 1
-            if input_ids.shape[-1] > max_context:
-                print(
-                    f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
-                input_ids = input_ids[:, -(max_context):]
-            with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
-                output_seq = model.generate(
-                    input_ids=input_ids,
-                    max_new_tokens=max_new_tokens,
-                    min_new_tokens=100,
-                    do_sample=True,
-                    top_p=top_p,
-                    temperature=temperature,
-                    repetition_penalty=repetition_penalty,
-                    eos_token_id=mmtokenizer.eoa,
-                    pad_token_id=mmtokenizer.eoa,
-                    logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
-                    guidance_scale=guidance_scale,
-                    use_cache=True,
-                    num_beams=1
-                )
-                if output_seq[0][-1].item() != mmtokenizer.eoa:
-                    tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
-                    output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
-            if i > 1:
-                raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
-            else:
-                raw_output = output_seq
         # save raw output and check sanity
         ids = raw_output[0].cpu().numpy()
@@ -359,19 +357,50 @@ with gr.Blocks() as demo:
         # Examples updated to only include text inputs
         gr.Examples(
             examples=[
-                [
-                    "rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
-                    """[verse]
 Woke up in the morning, sun is shining bright
 Chasing all my dreams, gotta get my mind right
 City lights are fading, but my vision's clear
 Got my team beside me, no room for fear
-[chorus]
 Walking through the streets, beats inside my head
 Every step I take, closer to the bread
-People passing by, they don't understand
-Building up my future with my own two hands
                     """
                 ],
                 [

     structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
     return structured_lyrics
 def generate_music(
         genre_txt=None,
         lyrics_txt=None,
         # Format text prompt
         run_n_segments = min(run_n_segments, len(lyrics)) + 1
+        @spaces.GPU(duration=178)
+        def generator():
+            """Generate the song segment by segment and return the accumulated token ids.
+
+            Closure over the enclosing generate_music() locals (prompt_texts,
+            run_n_segments, model, mmtokenizer, codectool, sampling settings, ...).
+            It takes no arguments and returns ``raw_output``, the tensor built by
+            concatenating every segment's prompt ids and generated ids.
+
+            NOTE(review): ``raw_output`` is only bound once the loop reaches i == 1;
+            if prompt_texts[:run_n_segments] has fewer than two entries the return
+            below raises UnboundLocalError - confirm callers always pass lyrics.
+            NOTE(review): presumably spaces.GPU is the ZeroGPU decorator; verify it
+            supports decorating a nested closure and returning a tensor.
+            """
+            for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
+                section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
+                guidance_scale = 1.5 if i <= 1 else 1.2  # Guidance scale adjusted based on segment index
+                # prompt_texts[0] is not generated on its own; it is tokenized as the head of segment 1 below
+                if i == 0:
+                    continue
+                if i == 1:
+                    if use_audio_prompt:
+                        audio_prompt = load_audio_mono(audio_prompt_path)
+                        audio_prompt.unsqueeze_(0)
+                        with torch.no_grad():
+                            raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
+                        raw_codes = raw_codes.transpose(0, 1)
+                        raw_codes = raw_codes.cpu().numpy().astype(np.int16)
+                        # Format audio prompt
+                        code_ids = codectool.npy2ids(raw_codes[0])
+                        audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]  # 50 is tps of xcodec
+                        audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
+                            mmtokenizer.eoa]
+                        sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
+                            "[end_of_reference]")
+                        head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
+                    else:
+                        head_id = mmtokenizer.tokenize(prompt_texts[0])
+                    prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
                 else:
+                    prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
+                prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
+                # From the second generated segment on, condition on everything produced so far
+                input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
+                # Use window slicing in case output sequence exceeds the context of model
+                max_context = 16384 - max_new_tokens - 1
+                if input_ids.shape[-1] > max_context:
+                    print(
+                        f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
+                    input_ids = input_ids[:, -(max_context):]
+                with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
+                    output_seq = model.generate(
+                        input_ids=input_ids,
+                        max_new_tokens=max_new_tokens,
+                        min_new_tokens=100,
+                        do_sample=True,
+                        top_p=top_p,
+                        temperature=temperature,
+                        repetition_penalty=repetition_penalty,
+                        eos_token_id=mmtokenizer.eoa,
+                        pad_token_id=mmtokenizer.eoa,
+                        logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
+                        guidance_scale=guidance_scale,
+                        use_cache=True,
+                        num_beams=1
+                    )
+                    # Make sure every segment is terminated by the end-of-audio token
+                    if output_seq[0][-1].item() != mmtokenizer.eoa:
+                        tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
+                        output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
+                if i > 1:
+                    # Append only the newly generated tokens (input_ids may have been window-sliced)
+                    raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
+                else:
+                    raw_output = output_seq
+            return raw_output
+        # Run the GPU-decorated closure; the code below reads raw_output from this scope
+        raw_output = generator()
         # save raw output and check sanity
         ids = raw_output[0].cpu().numpy()
         # Examples updated to only include text inputs
         gr.Examples(
             examples=[
+                ["rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
+                """[verse]
 Woke up in the morning, sun is shining bright
 Chasing all my dreams, gotta get my mind right
 City lights are fading, but my vision's clear
 Got my team beside me, no room for fear
 Walking through the streets, beats inside my head
 Every step I take, closer to the bread
+[chorus]
+This is my life, and I'm aiming for the top
+Never gonna quit, no, I'm never gonna stop
+Through the highs and lows, I'mma keep it real
+Living out my dreams with this mic and a deal
+[verse]
+Late nights grinding, writing down these rhymes
+Clock is ticking fast, can't afford to waste time
+Haters gonna hate, but I brush it off
+Turn the negativity into something strong
+Mama working hard, wanna make her proud"""],
+                [
+                    "inspiring female uplifting pop airy vocal electronic bright vocal vocal",
+                    """[verse]
+Staring at the sunset, colors paint the sky
+Thoughts of you keep swirling, can't deny
+I know I let you down, I made mistakes
+But I'm here to mend the heart I didn't break
+[chorus]
+Every road you take, I'll be one step behind
+Every dream you chase, I'm reaching for the light
+You can't fight this feeling now
+I won't back down
+I'm the whisper in the wind, the shadow by your side
+The warmth you feel within when you can't hide
+You know you can't deny it now
+I won't back down
+[verse]
+They might say I'm foolish, chasing after you
+But they don't feel this love the way we do
+My heart beats only for you, can't you see?
+I won't let you slip away from me
                     """
                 ],
                 [