waytan22 committed · Commit 3779445 · Parent(s): 4d12c78

update single track diffusion

app.py CHANGED
@@ -56,7 +56,7 @@ with open(op.join(APP_DIR, 'conf/vocab.yaml'), 'r', encoding='utf-8') as file:
 
 
 # Simulated song generation function
-def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=None, top_k=None, progress=gr.Progress(track_tqdm=True)):
+def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=None, top_k=None, gen_type="all", progress=gr.Progress(track_tqdm=True)):
     global MODEL
     global STRUCTS
     params = {'cfg_coef':cfg_coef, 'temperature':temperature, 'top_k':top_k}
@@ -105,7 +105,7 @@ def generate_song
     progress(0.0, "Start Generation")
     start = time.time()
 
-    audio_data = MODEL(lyric_norm, description, prompt_audio, genre, op.join(APP_DIR, "ckpt/prompt.pt"), params).cpu().permute(1, 0).float().numpy()
+    audio_data = MODEL(lyric_norm, description, prompt_audio, genre, op.join(APP_DIR, "ckpt/prompt.pt"), gen_type, params).cpu().permute(1, 0).float().numpy()
 
     end = time.time()
@@ -204,7 +204,9 @@ lyrics
                 interactive=True,
                 elem_id="top_k",
             )
-            generate_btn = gr.Button("Generate Song", variant="primary")
+            with gr.Row():
+                generate_btn = gr.Button("Generate Song", variant="primary")
+                generate_bgm_btn = gr.Button("Generate Pure Music", variant="primary")
 
         with gr.Column():
             output_audio = gr.Audio(label="Generated Song", type="numpy")
@@ -235,6 +237,11 @@ lyrics
         inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, top_k],
         outputs=[output_audio, output_json]
     )
+    generate_bgm_btn.click(
+        fn=generate_song,
+        inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, top_k, gr.State("bgm")],
+        outputs=[output_audio, output_json]
+    )
 
 
 # Launch the application
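The second button reuses the same callback: gr.State("bgm") acts as a hidden constant input, so generate_bgm_btn supplies gen_type="bgm" while generate_btn leaves it at the default "all". A minimal, self-contained sketch of that Gradio pattern (the report function and labels here are illustrative, not from the repo):

import gradio as gr

def report(mode):
    return f"gen_type = {mode}"

with gr.Blocks() as demo:
    out = gr.Textbox()
    # A gr.State created inline holds a fixed value that is passed to the
    # callback alongside the visible inputs, letting two buttons share one fn.
    gr.Button("Generate Song").click(fn=report, inputs=[gr.State("all")], outputs=out)
    gr.Button("Generate Pure Music").click(fn=report, inputs=[gr.State("bgm")], outputs=out)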
codeclm/models/codeclm.py CHANGED
@@ -271,13 +271,19 @@ class CodecLM:
         return gen_tokens
 
     @torch.no_grad()
-    def generate_audio(self, gen_tokens: torch.Tensor, prompt=None, vocal_prompt=None, bgm_prompt=None, chunked=False):
+    def generate_audio(self, gen_tokens: torch.Tensor, prompt=None, vocal_prompt=None, bgm_prompt=None, chunked=False, gen_type="all"):
         """Generate Audio from tokens"""
         assert gen_tokens.dim() == 3
         if self.seperate_tokenizer is not None:
             gen_tokens_song = gen_tokens[:, [0], :]
             gen_tokens_vocal = gen_tokens[:, [1], :]
             gen_tokens_bgm = gen_tokens[:, [2], :]
+            if gen_type == "bgm":
+                gen_tokens_vocal = torch.full_like(gen_tokens_vocal, 3142)
+                vocal_prompt = None
+            elif gen_type == "vocal":
+                gen_tokens_bgm = torch.full_like(gen_tokens_bgm, 9670)
+                bgm_prompt = None
             # gen_audio_song = self.audiotokenizer.decode(gen_tokens_song, prompt)
             gen_audio_seperate = self.seperate_tokenizer.decode([gen_tokens_vocal, gen_tokens_bgm], vocal_prompt, bgm_prompt, chunked=chunked)
             return gen_audio_seperate
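The single-track trick happens at decode time rather than in the language model: the three-codebook token tensor is generated as usual, and the unwanted stem's codebook row is overwritten with a constant fill token before the separate tokenizer decodes. A minimal sketch of the idea, assuming (the commit does not document this) that 3142 and 9670 are the fill/silence token IDs for the vocal and BGM codebooks respectively:

import torch

def mute_stem(gen_tokens: torch.Tensor, gen_type: str = "all") -> torch.Tensor:
    # gen_tokens: [B, 3, T] with rows (song, vocal, bgm), as in CodecLM.generate_audio.
    gen_tokens = gen_tokens.clone()
    if gen_type == "bgm":       # pure music: blank out the vocal row
        gen_tokens[:, 1, :] = 3142
    elif gen_type == "vocal":   # vocals only: blank out the bgm row
        gen_tokens[:, 2, :] = 9670
    return gen_tokens

Clearing vocal_prompt/bgm_prompt for the muted stem matches the method body above: a reference prompt for a track that is now all fill tokens would only mislead the decoder.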
generate.py CHANGED
@@ -70,6 +70,7 @@ if __name__ == "__main__":
     ckpt_path = sys.argv[1]
     input_jsonl = sys.argv[2]
     save_dir = sys.argv[3]
+    gen_type = sys.argv[4] if len(sys.argv) > 4 else "all"
     cfg_path = os.path.join(ckpt_path, 'config.yaml')
     ckpt_path = os.path.join(ckpt_path, 'model.pt')
     cfg = OmegaConf.load(cfg_path)
@@ -146,15 +147,15 @@ if __name__ == "__main__":
         with torch.autocast(device_type="cuda", dtype=torch.float16):
             tokens = model.generate(**generate_inp, return_tokens=True)
         mid_time = time.time()
-
+
         with torch.no_grad():
             if melody_is_wav:
-                wav_seperate = model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav)
+                wav_seperate = model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav, gen_type=gen_type)
             else:
-                wav_seperate = model.generate_audio(tokens)
+                wav_seperate = model.generate_audio(tokens, gen_type=gen_type)
         end_time = time.time()
         torchaudio.save(target_wav_name, wav_seperate[0].cpu().float(), cfg.sample_rate)
-        print(f"process{item['idx']}, lm cost {mid_time - start_time}s, diffusion cost {end_time - mid_time}")
+        print(f"process{item['idx']} {gen_type}, lm cost {mid_time - start_time}s, diffusion cost {end_time - mid_time}")
 
         item["idx"] = f"{item['idx']}"
         item["wav_path"] = target_wav_name
generate.sh CHANGED
@@ -7,4 +7,5 @@ export PYTHONPATH="$(pwd)/codeclm/tokenizer/":"$(pwd)":"$(pwd)/codeclm/tokenizer
 CKPT_PATH=$1
 JSONL=$2
 SAVE_DIR=$3
-python3 generate.py $CKPT_PATH $JSONL $SAVE_DIR
+GEN_TYPE=$4
+python3 generate.py $CKPT_PATH $JSONL $SAVE_DIR $GEN_TYPE
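Because $GEN_TYPE is unquoted, omitting the fourth argument expands to nothing and generate.py falls back to its "all" default, so existing three-argument invocations keep working; appending bgm or vocal (e.g. sh generate.sh $CKPT_PATH $JSONL $SAVE_DIR bgm) selects a single track.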
generate_lowmem.py CHANGED
@@ -71,6 +71,7 @@ if __name__ == "__main__":
     ckpt_path = sys.argv[1]
     input_jsonl = sys.argv[2]
     save_dir = sys.argv[3]
+    gen_type = sys.argv[4] if len(sys.argv) > 4 else "all"
     cfg_path = os.path.join(ckpt_path, 'config.yaml')
     ckpt_path = os.path.join(ckpt_path, 'model.pt')
     cfg = OmegaConf.load(cfg_path)
@@ -220,12 +221,12 @@ if __name__ == "__main__":
     for item in new_items:
         with torch.no_grad():
             if 'raw_pmt_wav' in item:
-                wav_seperate = model.generate_audio(item['tokens'], item['raw_pmt_wav'], item['raw_vocal_wav'], item['raw_bgm_wav'], chunked=True)
+                wav_seperate = model.generate_audio(item['tokens'], item['raw_pmt_wav'], item['raw_vocal_wav'], item['raw_bgm_wav'], chunked=True, gen_type=gen_type)
                 del item['raw_pmt_wav']
                 del item['raw_vocal_wav']
                 del item['raw_bgm_wav']
             else:
-                wav_seperate = model.generate_audio(item['tokens'], chunked=True)
+                wav_seperate = model.generate_audio(item['tokens'], chunked=True, gen_type=gen_type)
             torchaudio.save(item['wav_path'], wav_seperate[0].cpu().float(), cfg.sample_rate)
             del item['tokens']
             del item['pmt_wav']
generate_lowmem.sh CHANGED
@@ -7,4 +7,5 @@ export PYTHONPATH="$(pwd)/codeclm/tokenizer/":"$(pwd)":"$(pwd)/codeclm/tokenizer
 CKPT_PATH=$1
 JSONL=$2
 SAVE_DIR=$3
-python3 generate_lowmem.py $CKPT_PATH $JSONL $SAVE_DIR
+GEN_TYPE=$4
+python3 generate_lowmem.py $CKPT_PATH $JSONL $SAVE_DIR $GEN_TYPE
levo_inference.py CHANGED
@@ -67,7 +67,7 @@ class LeVoInference(torch.nn.Module):
 
         self.model.set_generation_params(**self.default_params)
 
-    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, params = dict()):
+    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, gen_type: str = "all", params = dict()):
         params = {**self.default_params, **params}
         self.model.set_generation_params(**params)
 
@@ -105,8 +105,8 @@ class LeVoInference(torch.nn.Module):
 
         with torch.no_grad():
             if melody_is_wav:
-                wav_seperate = self.model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav)
+                wav_seperate = self.model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav, gen_type=gen_type)
             else:
-                wav_seperate = self.model.generate_audio(tokens)
+                wav_seperate = self.model.generate_audio(tokens, gen_type=gen_type)
 
         return wav_seperate[0]
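Note that gen_type is inserted before the trailing params argument, which is why app.py can pass it positionally as the sixth argument in MODEL(lyric_norm, description, prompt_audio, genre, op.join(APP_DIR, "ckpt/prompt.pt"), gen_type, params).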
tools/gradio/app.py CHANGED
@@ -49,7 +49,7 @@ with open(op.join(APP_DIR, 'conf/vocab.yaml'), 'r', encoding='utf-8') as file:
     STRUCTS = yaml.safe_load(file)
 
 
-def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=None, top_k=None, progress=gr.Progress(track_tqdm=True)):
+def generate_song(lyric, description=None, prompt_audio=None, genre=None, cfg_coef=None, temperature=None, top_k=None, gen_type="all", progress=gr.Progress(track_tqdm=True)):
     global MODEL
     global STRUCTS
     params = {'cfg_coef':cfg_coef, 'temperature':temperature, 'top_k':top_k}
@@ -98,7 +98,7 @@ def generate_song
     progress(0.0, "Start Generation")
     start = time.time()
 
-    audio_data = MODEL(lyric_norm, description, prompt_audio, genre, op.join(APP_DIR, "ckpt/prompt.pt"), params).cpu().permute(1, 0).float().numpy()
+    audio_data = MODEL(lyric_norm, description, prompt_audio, genre, op.join(APP_DIR, "ckpt/prompt.pt"), gen_type, params).cpu().permute(1, 0).float().numpy()
 
     end = time.time()
@@ -119,7 +119,7 @@ def generate_song
 # Create the Gradio interface
 with gr.Blocks(title="SongGeneration Demo Space") as demo:
     gr.Markdown("# 🎵 SongGeneration Demo Space")
-    gr.Markdown("Demo interface for the song generation model. Provide lyrics, and optionally an audio or text prompt, to generate a custom song.")
+    gr.Markdown("Demo interface for the song generation model. Provide lyrics, and optionally an audio or text prompt, to generate a custom song. The code is available on [GitHub](https://github.com/tencent-ailab/SongGeneration).")
 
     with gr.Row():
         with gr.Column():
@@ -197,7 +197,9 @@ lyrics
                 interactive=True,
                 elem_id="top_k",
             )
-            generate_btn = gr.Button("Generate Song", variant="primary")
+            with gr.Row():
+                generate_btn = gr.Button("Generate Song", variant="primary")
+                generate_bgm_btn = gr.Button("Generate Pure Music", variant="primary")
 
         with gr.Column():
             output_audio = gr.Audio(label="Generated Song", type="numpy")
@@ -228,6 +230,11 @@ lyrics
         inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, top_k],
         outputs=[output_audio, output_json]
     )
+    generate_bgm_btn.click(
+        fn=generate_song,
+        inputs=[lyric, description, prompt_audio, genre, cfg_coef, temperature, top_k, gr.State("bgm")],
+        outputs=[output_audio, output_json]
+    )
 
 
 # Launch the application
tools/gradio/levo_inference.py CHANGED
@@ -62,7 +62,7 @@ class LeVoInference(torch.nn.Module):
 
         self.model.set_generation_params(**self.default_params)
 
-    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, params = dict()):
+    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, gen_type: str = "all", params = dict()):
         params = {**self.default_params, **params}
         self.model.set_generation_params(**params)
 
@@ -97,14 +97,11 @@ class LeVoInference(torch.nn.Module):
 
         with torch.autocast(device_type="cuda", dtype=torch.float16):
             tokens = self.model.generate(**generate_inp, return_tokens=True)
-
-        if tokens.shape[-1] > 3000:
-            tokens = tokens[..., :3000]
 
         with torch.no_grad():
             if melody_is_wav:
-                wav_seperate = self.model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav)
+                wav_seperate = self.model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav, gen_type=gen_type)
             else:
-                wav_seperate = self.model.generate_audio(tokens)
+                wav_seperate = self.model.generate_audio(tokens, gen_type=gen_type)
 
         return wav_seperate[0]
tools/gradio/levo_inference_lowmem.py CHANGED
@@ -40,7 +40,7 @@ class LeVoInference(torch.nn.Module):
         )
 
 
-    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, params = dict()):
+    def forward(self, lyric: str, description: str = None, prompt_audio_path: os.PathLike = None, genre: str = None, auto_prompt_path: os.PathLike = None, gen_type: str = "all", params = dict()):
         if prompt_audio_path is not None and os.path.exists(prompt_audio_path):
             separator = Separator()
             audio_tokenizer = builders.get_audio_tokenizer_model(self.cfg.audio_tokenizer_checkpoint, self.cfg)
@@ -112,15 +112,12 @@ class LeVoInference(torch.nn.Module):
             max_duration = self.max_duration,
             seperate_tokenizer = seperate_tokenizer,
         )
-
-        if tokens.shape[-1] > 3000:
-            tokens = tokens[..., :3000]
 
         with torch.no_grad():
             if melody_is_wav:
-                wav_seperate = model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav)
+                wav_seperate = self.model.generate_audio(tokens, pmt_wav, vocal_wav, bgm_wav, gen_type=gen_type)
             else:
-                wav_seperate = model.generate_audio(tokens)
+                wav_seperate = self.model.generate_audio(tokens, gen_type=gen_type)
 
         del seperate_tokenizer
         del model
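Besides threading gen_type through, both tools/gradio inference paths also drop the previous hard truncation of tokens to 3000 frames, so full-length token sequences now reach the diffusion decoder unclipped.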