fix memory leak
tortoise/api.py (+32, -51)
@@ -243,28 +243,22 @@ class TextToSpeech:
         self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
                                            model_dim=1024,
                                            heads=16, number_text_tokens=255, start_text_token=255, checkpointing=False,
-                                           train_solo_embeddings=False).cpu().eval()
+                                           train_solo_embeddings=False).cuda().eval()
         self.autoregressive.load_state_dict(torch.load(get_model_path('autoregressive.pth', models_dir)), strict=False)
         self.autoregressive.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=kv_cache, half=self.half)
 
         self.diffusion = DiffusionTts(model_channels=1024, num_layers=10, in_channels=100, out_channels=200,
                                       in_latent_channels=1024, in_tokens=8193, dropout=0, use_fp16=False, num_heads=16,
-                                      layer_drop=0, unconditioned_percentage=0).cpu().eval()
+                                      layer_drop=0, unconditioned_percentage=0).cuda().eval()
         self.diffusion.load_state_dict(torch.load(get_model_path('diffusion_decoder.pth', models_dir)))
 
-        self.vocoder = UnivNetGenerator().cpu()
+        self.vocoder = UnivNetGenerator().cuda()
         self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
 
         # Random latent generators (RLGs) are loaded lazily.
         self.rlg_auto = None
         self.rlg_diffusion = None
-    @contextmanager
-    def temporary_cuda(self, model):
-        m = model.to(self.device)
-        yield m
-        m = model.cpu()
-
     def get_conditioning_latents(self, voice_samples, return_mels=False):
         """
         Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).

@@ -328,7 +322,6 @@ class TextToSpeech:
         # Presets are defined here.
         presets = {
             'ultra_fast': {'num_autoregressive_samples': 1, 'diffusion_iterations': 15},
-            # 'ultra_fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30},
             'fast': {'num_autoregressive_samples': 32, 'diffusion_iterations': 50},
             'standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
             'high_quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},

@@ -409,57 +402,45 @@ class TextToSpeech:
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
         with torch.no_grad():
-
-            stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83 # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
             if verbose:
                 print("Generating autoregressive samples..")
-            with self.temporary_cuda(self.autoregressive
-            ) as autoregressive:
-                codes = autoregressive.inference_speech(auto_conditioning, text_tokens,
-                                                        do_sample=True,
-                                                        top_p=top_p,
-                                                        temperature=temperature,
-                                                        num_return_sequences=num_autoregressive_samples,
-                                                        length_penalty=length_penalty,
-                                                        repetition_penalty=repetition_penalty,
-                                                        max_generate_length=max_mel_tokens,
-                                                        **hf_generate_kwargs)
+            codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
+                                                         do_sample=True,
+                                                         top_p=top_p,
+                                                         temperature=temperature,
+                                                         num_return_sequences=num_autoregressive_samples,
+                                                         length_penalty=length_penalty,
+                                                         repetition_penalty=repetition_penalty,
+                                                         max_generate_length=max_mel_tokens,
+                                                         **hf_generate_kwargs)
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            with self.temporary_cuda(
-                self.autoregressive
-            ) as autoregressive, torch.autocast(
-                device_type="cuda", dtype=torch.float16, enabled=self.half
-            ):
-                best_latents = autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
-                                              torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
-                                              torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
-                                              return_latent=True, clip_inputs=False)
-                del auto_conditioning
+            best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
+                                               torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
+                                               torch.tensor([codes.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
+                                               return_latent=True, clip_inputs=False)
+            del auto_conditioning
 
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            with self.temporary_cuda(
-                self.diffusion
-            ) as diffusion, self.temporary_cuda(self.vocoder) as vocoder:
-                latents = best_latents
-                # Find the first occurrence of the "calm" token and trim the codes to that.
-                ctokens = 0
-                for k in range(codes.shape[-1]):
-                    if codes[0, k] == calm_token:
-                        ctokens += 1
-                    else:
-                        ctokens = 0
-                    if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
-                        latents = latents[:, :k]
-                        break
-                mel = do_spectrogram_diffusion(diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
-                                               verbose=verbose)
-                wav = vocoder.inference(mel)
-                wav_candidates.append(wav.cpu())
+            latents = best_latents
+            # Find the first occurrence of the "calm" token and trim the codes to that.
+            ctokens = 0
+            for k in range(codes.shape[-1]):
+                if codes[0, k] == calm_token:
+                    ctokens += 1
+                else:
+                    ctokens = 0
+                if ctokens > 8: # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                    latents = latents[:, :k]
+                    break
+            mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning, temperature=diffusion_temperature,
+                                           verbose=verbose)
+            wav = self.vocoder.inference(mel)
+            wav_candidates.append(wav.cpu())
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
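The heart of the fix is visible in the first and last hunks: all three models are now created with .cuda().eval() and stay resident on the GPU, and the temporary_cuda helper that shuttled each model onto the GPU per call is removed. Below is a standalone re-statement of the removed pattern and one plausible way it leaks; the Linear model, the loop, and the prints are illustrative, not from the repo, and a CUDA device is assumed.

import torch
from contextlib import contextmanager

@contextmanager
def temporary_cuda(model):
    m = model.to("cuda")   # nn.Module.to() moves the parameters in place
    yield m
    m = model.cpu()        # no try/finally: if the body raises, this line
                           # never runs and the weights stay on the GPU

model = torch.nn.Linear(1024, 1024).eval()
with torch.no_grad():
    for _ in range(3):
        with temporary_cuda(model) as m:
            m(torch.randn(8, 1024, device="cuda"))
        # Watch the caching allocator across round trips; repeated shuttling
        # churns it, while a resident .cuda().eval() model allocates once.
        print(torch.cuda.memory_reserved() // 2**20, "MiB reserved")

With fixed residency each synthesis call reuses the same parameters; the trade-off is that all three models occupy GPU memory for the lifetime of the TextToSpeech object.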