KingNish committed on
Commit
c11e52c
·
verified ·
1 Parent(s): 25dc2c1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -71
app.py CHANGED
@@ -118,7 +118,6 @@ def split_lyrics(lyrics: str):
118
  structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
119
  return structured_lyrics
120
 
121
- @spaces.GPU(duration=178)
122
  def generate_music(
123
  genre_txt=None,
124
  lyrics_txt=None,
@@ -168,70 +167,69 @@ def generate_music(
168
  # Format text prompt
169
  run_n_segments = min(run_n_segments, len(lyrics)) + 1
170
 
171
- print(list(enumerate(tqdm(prompt_texts[:run_n_segments]))))
172
-
173
- for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
174
- print(str(i) +". " + str(p) + "\n\n")
175
- section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
176
- guidance_scale = 1.5 if i <= 1 else 1.2 # Guidance scale adjusted based on segment index
177
- if i == 0:
178
- continue
179
- if i == 1:
180
- if use_audio_prompt:
181
- audio_prompt = load_audio_mono(audio_prompt_path)
182
- audio_prompt.unsqueeze_(0)
183
- with torch.no_grad():
184
- raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
185
- raw_codes = raw_codes.transpose(0, 1)
186
- raw_codes = raw_codes.cpu().numpy().astype(np.int16)
187
- # Format audio prompt
188
- code_ids = codectool.npy2ids(raw_codes[0])
189
- audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)] # 50 is tps of xcodec
190
- audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
191
- mmtokenizer.eoa]
192
- sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
193
- "[end_of_reference]")
194
- head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
 
 
195
  else:
196
- head_id = mmtokenizer.tokenize(prompt_texts[0])
197
- prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
198
- else:
199
- prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
200
-
201
- prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
202
- input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
203
-
204
- # Use window slicing in case output sequence exceeds the context of model
205
- max_context = 16384 - max_new_tokens - 1
206
- if input_ids.shape[-1] > max_context:
207
- print(
208
- f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
209
- input_ids = input_ids[:, -(max_context):]
210
-
211
- with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
212
- output_seq = model.generate(
213
- input_ids=input_ids,
214
- max_new_tokens=max_new_tokens,
215
- min_new_tokens=100,
216
- do_sample=True,
217
- top_p=top_p,
218
- temperature=temperature,
219
- repetition_penalty=repetition_penalty,
220
- eos_token_id=mmtokenizer.eoa,
221
- pad_token_id=mmtokenizer.eoa,
222
- logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
223
- guidance_scale=guidance_scale,
224
- use_cache=True,
225
- num_beams=1
226
- )
227
- if output_seq[0][-1].item() != mmtokenizer.eoa:
228
- tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
229
- output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
230
-
231
- if i > 1:
232
- raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
233
- else:
234
- raw_output = output_seq
235
 
236
  # save raw output and check sanity
237
  ids = raw_output[0].cpu().numpy()
@@ -359,19 +357,50 @@ with gr.Blocks() as demo:
359
  # Examples updated to only include text inputs
360
  gr.Examples(
361
  examples=[
362
- [
363
- "rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
364
- """[verse]
365
  Woke up in the morning, sun is shining bright
366
  Chasing all my dreams, gotta get my mind right
367
  City lights are fading, but my vision's clear
368
  Got my team beside me, no room for fear
369
-
370
- [chorus]
371
  Walking through the streets, beats inside my head
372
  Every step I take, closer to the bread
373
- People passing by, they don't understand
374
- Building up my future with my own two hands
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  """
376
  ],
377
  [
 
118
  structured_lyrics = [f"[{seg[0]}]\n{seg[1].strip()}\n\n" for seg in segments]
119
  return structured_lyrics
120
 
 
121
  def generate_music(
122
  genre_txt=None,
123
  lyrics_txt=None,
 
167
  # Format text prompt
168
  run_n_segments = min(run_n_segments, len(lyrics)) + 1
169
 
170
+ @spaces.GPU(duration=178)
171
+ def generator:
172
+ for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
173
+ section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
174
+ guidance_scale = 1.5 if i <= 1 else 1.2 # Guidance scale adjusted based on segment index
175
+ if i == 0:
176
+ continue
177
+ if i == 1:
178
+ if use_audio_prompt:
179
+ audio_prompt = load_audio_mono(audio_prompt_path)
180
+ audio_prompt.unsqueeze_(0)
181
+ with torch.no_grad():
182
+ raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
183
+ raw_codes = raw_codes.transpose(0, 1)
184
+ raw_codes = raw_codes.cpu().numpy().astype(np.int16)
185
+ # Format audio prompt
186
+ code_ids = codectool.npy2ids(raw_codes[0])
187
+ audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)] # 50 is tps of xcodec
188
+ audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
189
+ mmtokenizer.eoa]
190
+ sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
191
+ "[end_of_reference]")
192
+ head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
193
+ else:
194
+ head_id = mmtokenizer.tokenize(prompt_texts[0])
195
+ prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
196
  else:
197
+ prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
198
+
199
+ prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
200
+ input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
201
+
202
+ # Use window slicing in case output sequence exceeds the context of model
203
+ max_context = 16384 - max_new_tokens - 1
204
+ if input_ids.shape[-1] > max_context:
205
+ print(
206
+ f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
207
+ input_ids = input_ids[:, -(max_context):]
208
+
209
+ with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
210
+ output_seq = model.generate(
211
+ input_ids=input_ids,
212
+ max_new_tokens=max_new_tokens,
213
+ min_new_tokens=100,
214
+ do_sample=True,
215
+ top_p=top_p,
216
+ temperature=temperature,
217
+ repetition_penalty=repetition_penalty,
218
+ eos_token_id=mmtokenizer.eoa,
219
+ pad_token_id=mmtokenizer.eoa,
220
+ logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
221
+ guidance_scale=guidance_scale,
222
+ use_cache=True,
223
+ num_beams=1
224
+ )
225
+ if output_seq[0][-1].item() != mmtokenizer.eoa:
226
+ tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
227
+ output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
228
+
229
+ if i > 1:
230
+ raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
231
+ else:
232
+ raw_output = output_seq
 
 
 
233
 
234
  # save raw output and check sanity
235
  ids = raw_output[0].cpu().numpy()
 
357
  # Examples updated to only include text inputs
358
  gr.Examples(
359
  examples=[
360
+ ["rap piano street tough piercing vocal hip-hop synthesizer clear vocal male",
361
+ """[verse]
 
362
  Woke up in the morning, sun is shining bright
363
  Chasing all my dreams, gotta get my mind right
364
  City lights are fading, but my vision's clear
365
  Got my team beside me, no room for fear
 
 
366
  Walking through the streets, beats inside my head
367
  Every step I take, closer to the bread
368
+
369
+ [chorus]
370
+ This is my life, and I'm aiming for the top
371
+ Never gonna quit, no, I'm never gonna stop
372
+ Through the highs and lows, I'mma keep it real
373
+ Living out my dreams with this mic and a deal
374
+
375
+ [verse]
376
+ Late nights grinding, writing down these rhymes
377
+ Clock is ticking fast, can't afford to waste time
378
+ Haters gonna hate, but I brush it off
379
+ Turn the negativity into something strong
380
+ Mama working hard, wanna make her proud"""],
381
+ [
382
+ "inspiring female uplifting pop airy vocal electronic bright vocal vocal",
383
+ """[verse]
384
+ Staring at the sunset, colors paint the sky
385
+ Thoughts of you keep swirling, can't deny
386
+ I know I let you down, I made mistakes
387
+ But I'm here to mend the heart I didn't break
388
+
389
+ [chorus]
390
+ Every road you take, I'll be one step behind
391
+ Every dream you chase, I'm reaching for the light
392
+ You can't fight this feeling now
393
+ I won't back down
394
+ I'm the whisper in the wind, the shadow by your side
395
+ The warmth you feel within when you can't hide
396
+ You know you can't deny it now
397
+ I won't back down
398
+
399
+ [verse]
400
+ They might say I'm foolish, chasing after you
401
+ But they don't feel this love the way we do
402
+ My heart beats only for you, can't you see?
403
+ I won't let you slip away from me
404
  """
405
  ],
406
  [