|
import os |
|
import torch |
|
import torchaudio |
|
from TTS.tts.configs.xtts_config import XttsConfig |
|
from TTS.tts.models.xtts import Xtts |
|
from huggingface_hub import snapshot_download, hf_hub_download |
|
from vinorm import TTSnorm |
|
|
|
def _load_model(checkpoint_dir="model/", repo_id="capleaf/viXTTS", use_deepspeed=False):
    """Download (if missing) and load the viXTTS model, caching it across calls.

    The original code re-downloaded/re-loaded the full checkpoint on every
    synthesis call; the (model, config) pair is now loaded once and reused.

    Returns
    -------
    tuple
        ``(model, config)`` — the loaded ``Xtts`` model and its ``XttsConfig``.
    """
    if _load_model.cache is not None:
        return _load_model.cache

    os.makedirs(checkpoint_dir, exist_ok=True)

    # Fetch the checkpoint only when one of the required files is missing.
    required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
    files_in_dir = os.listdir(checkpoint_dir)
    if not all(file in files_in_dir for file in required_files):
        snapshot_download(
            repo_id=repo_id,
            repo_type="model",
            local_dir=checkpoint_dir,
        )
        # speakers_xtts.pth is not shipped in the viXTTS repo; pull it from
        # the upstream Coqui XTTS-v2 repo.
        hf_hub_download(
            repo_id="coqui/XTTS-v2",
            filename="speakers_xtts.pth",
            local_dir=checkpoint_dir,
        )

    config = XttsConfig()
    config.load_json(os.path.join(checkpoint_dir, "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed)

    if torch.cuda.is_available():
        model.cuda()
    model.eval()  # inference only — disable dropout/batch-norm training modes

    _load_model.cache = (model, config)
    return _load_model.cache


# Function-attribute cache so module import stays side-effect free.
_load_model.cache = None


def generate_speech(text, language="vi", speaker_wav=None):
    """Synthesize ``text`` to speech with viXTTS and write it to ``output.wav``.

    Parameters
    ----------
    text : str
        Text to synthesize; normalized for Vietnamese TTS via ``TTSnorm``.
    language : str
        XTTS language code (default ``"vi"``).
    speaker_wav : str
        Path to a reference WAV used for voice cloning. Required in
        practice — conditioning latents cannot be computed without it.

    Returns
    -------
    str
        Path of the written WAV file (``"output.wav"``).

    Raises
    ------
    ValueError
        If ``speaker_wav`` is not provided.
    """
    if speaker_wav is None:
        # Fail fast with a clear message instead of an obscure crash deep
        # inside get_conditioning_latents(audio_path=None).
        raise ValueError("speaker_wav (reference speaker audio path) is required")

    model, config = _load_model()

    normalized_text = TTSnorm(text)

    with torch.no_grad():
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav
        )
        out = model.inference(
            normalized_text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            temperature=0.7,
        )

    output_file = "output.wav"
    # XTTS-v2 generates audio at config.audio.output_sample_rate (24 kHz).
    # The previous hard-coded 22050 wrote a mismatched WAV header, making
    # playback slightly slow and pitch-shifted; read the rate from config.
    sample_rate = getattr(config.audio, "output_sample_rate", 24000)
    torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0), sample_rate)
    return output_file
|
|