Update app.py
app.py CHANGED
@@ -73,18 +73,15 @@ from models.soundstream_hubert_new import SoundStream
 device = "cuda:0"
 
 model = AutoModelForCausalLM.from_pretrained(
-    "m-a-p/YuE-s1-7B-anneal-en-cot",
+    "m-a-p/YuE-s1-7B-anneal-en-icl", # "m-a-p/YuE-s1-7B-anneal-en-cot",
     torch_dtype=torch.float16,
     attn_implementation="flash_attention_2",
-
+    low_cpu_mem_usage=True,
 ).to(device)
 model.eval()
 
 basic_model_config = './xcodec_mini_infer/final_ckpt/config.yaml'
 resume_path = './xcodec_mini_infer/final_ckpt/ckpt_00360000.pth'
-#config_path = './xcodec_mini_infer/decoders/config.yaml' # removed vocoder
-#vocal_decoder_path = './xcodec_mini_infer/decoders/decoder_131000.pth' # removed vocoder
-#inst_decoder_path = './xcodec_mini_infer/decoders/decoder_151000.pth' # removed vocoder
 
 mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
 
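This hunk swaps the stage-1 checkpoint from the chain-of-thought model (anneal-en-cot) to the in-context-learning model (anneal-en-icl) and adds low_cpu_mem_usage=True, which lets transformers build the model skeleton first and stream checkpoint shards into place instead of materializing a full copy of the weights in host RAM. A short annotated sketch of the resulting loading path, using only the ids and flags visible in the diff:

    import torch
    from transformers import AutoModelForCausalLM

    device = "cuda:0"
    model = AutoModelForCausalLM.from_pretrained(
        "m-a-p/YuE-s1-7B-anneal-en-icl",          # was: m-a-p/YuE-s1-7B-anneal-en-cot
        torch_dtype=torch.float16,                # fp16 weights halve GPU memory for the 7B model
        attn_implementation="flash_attention_2",  # requires the flash-attn package
        low_cpu_mem_usage=True,                   # load shards directly, no full CPU copy first
    ).to(device)
    model.eval()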
@@ -94,18 +91,8 @@ model_config = OmegaConf.load(basic_model_config)
 codec_model = eval(model_config.generator.name)(**model_config.generator.config).to(device)
 parameter_dict = torch.load(resume_path, map_location='cpu')
 codec_model.load_state_dict(parameter_dict['codec_model'])
-# codec_model = torch.compile(codec_model)
 codec_model.eval()
 
-# Preload and compile vocoders # removed vocoder
-#vocal_decoder, inst_decoder = build_codec_model(config_path, vocal_decoder_path, inst_decoder_path)
-#vocal_decoder.to(device)
-#inst_decoder.to(device)
-#vocal_decoder = torch.compile(vocal_decoder)
-#inst_decoder = torch.compile(inst_decoder)
-#vocal_decoder.eval()
-#inst_decoder.eval()
-
 
 @spaces.GPU(duration=120)
 def generate_music(
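Everything deleted in this hunk was already commented out (the optional torch.compile call and the vocoder preloading), so the change is pure cleanup. The instantiation that survives calls eval() on a class name read from the OmegaConf config; below is a sketch of an eval-free equivalent that uses an explicit registry, assuming generator.name resolves to the SoundStream class imported in the hunk header:

    import torch
    from models.soundstream_hubert_new import SoundStream

    # Explicit registry instead of eval(); extend if the config names other classes.
    GENERATORS = {"SoundStream": SoundStream}

    codec_model = GENERATORS[model_config.generator.name](
        **model_config.generator.config
    ).to(device)
    parameter_dict = torch.load(resume_path, map_location='cpu')
    codec_model.load_state_dict(parameter_dict['codec_model'])
    codec_model.eval()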
@@ -309,8 +296,8 @@ def generate_music(
                 continue
             # mix
             recons_mix = os.path.join(recons_mix_dir, os.path.basename(inst_path).replace('instrumental', 'mixed'))
-            vocal_stem, sr = sf.read(
-            instrumental_stem, _ = sf.read(
+            vocal_stem, sr = sf.read(vocal_path)
+            instrumental_stem, _ = sf.read(inst_path)
             mix_stem = (vocal_stem + instrumental_stem) / 1
             return (sr, (mix_stem * 32767).astype(np.int16)), (sr, (vocal_stem * 32767).astype(np.int16)), (sr, (instrumental_stem * 32767).astype(np.int16))
         except Exception as e:
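The removed lines were dangling calls (sf.read( with the argument never closed); the fix reads the vocal and instrumental stems back from disk and returns the mix to Gradio as (sample_rate, int16 array) tuples. Note that dividing the sum by 1 does not attenuate it, so the mix can leave [-1, 1] and overflow when scaled by 32767. The sketch below adds a clipping guard, which is an addition and not part of the diff:

    import numpy as np
    import soundfile as sf

    vocal_stem, sr = sf.read(vocal_path)        # float samples in [-1, 1]
    instrumental_stem, _ = sf.read(inst_path)   # stems share one sample rate

    mix_stem = vocal_stem + instrumental_stem   # summed stems can exceed [-1, 1]
    mix_stem = np.clip(mix_stem, -1.0, 1.0)     # guard added here; not in the diff
    mix_int16 = (mix_stem * 32767).astype(np.int16)  # Gradio expects (sr, int16 array)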