# Source metadata (from extraction): file size 1,861 bytes, commit 4eeb5fe, 59 lines.
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from huggingface_hub import snapshot_download, hf_hub_download
from vinorm import TTSnorm
def _load_model(checkpoint_dir="model/", repo_id="capleaf/viXTTS", use_deepspeed=False):
    """Download (if missing) and load the viXTTS model, caching it across calls.

    The original code re-downloaded and re-loaded the checkpoint on every
    synthesis call; the loaded model is now memoized on the function object
    so subsequent calls reuse it.

    Args:
        checkpoint_dir: Local directory holding/receiving the checkpoint files.
        repo_id: Hugging Face repo for the viXTTS checkpoint.
        use_deepspeed: Forwarded to ``Xtts.load_checkpoint``.

    Returns:
        A ready-to-infer ``Xtts`` model (moved to CUDA when available).
    """
    cached = getattr(_load_model, "_model", None)
    if cached is not None:
        return cached

    os.makedirs(checkpoint_dir, exist_ok=True)
    required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
    files_in_dir = os.listdir(checkpoint_dir)
    if not all(name in files_in_dir for name in required_files):
        # Main viXTTS checkpoint.
        snapshot_download(
            repo_id=repo_id,
            repo_type="model",
            local_dir=checkpoint_dir,
        )
        # speakers_xtts.pth is not shipped with the viXTTS repo; fetch it
        # from the upstream XTTS-v2 repo.
        hf_hub_download(
            repo_id="coqui/XTTS-v2",
            filename="speakers_xtts.pth",
            local_dir=checkpoint_dir,
        )

    config = XttsConfig()
    config.load_json(os.path.join(checkpoint_dir, "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed)
    if torch.cuda.is_available():
        model.cuda()
    model.eval()  # inference only — disable dropout/batch-norm training behavior

    _load_model._model = model
    return model


def generate_speech(text, language="vi", speaker_wav=None, output_file="output.wav"):
    """Synthesize ``text`` to a WAV file using the viXTTS voice-cloning model.

    Args:
        text: Input text; normalized with ``vinorm.TTSnorm`` before synthesis.
        language: Language code passed to the model (default ``"vi"``).
        speaker_wav: Path to a reference audio clip of the target speaker.
            Required — the model conditions on this voice.
        output_file: Destination WAV path (default ``"output.wav"``,
            matching the previous hard-coded behavior).

    Returns:
        The path of the written WAV file.

    Raises:
        ValueError: If ``speaker_wav`` is not provided (previously this
            crashed inside the TTS library with an opaque error).
    """
    if speaker_wav is None:
        raise ValueError("speaker_wav is required: path to a reference speaker audio file")

    model = _load_model()

    # Vietnamese text normalization (numbers, abbreviations, etc.).
    normalized_text = TTSnorm(text)

    # Synthesize under no_grad — inference only, no autograd bookkeeping.
    with torch.no_grad():
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
        out = model.inference(
            normalized_text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            temperature=0.7,
        )

    # XTTS outputs mono audio at 22.05 kHz; add a channel dim for torchaudio.
    torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0), 22050)
    return output_file
|