import os

import torch
import torchaudio
from huggingface_hub import hf_hub_download, snapshot_download
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from vinorm import TTSnorm

CHECKPOINT_DIR = "model/"
REPO_ID = "capleaf/viXTTS"
REQUIRED_FILES = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
SAMPLE_RATE = 22050  # XTTS output sample rate used when writing the WAV

# Memoized model instance: downloading the checkpoint and loading it into
# (GPU) memory is expensive, so do it once per process instead of on every
# call to generate_speech().
_MODEL = None


def _download_model_files(checkpoint_dir):
    """Fetch the viXTTS checkpoint files into *checkpoint_dir* if any are missing."""
    os.makedirs(checkpoint_dir, exist_ok=True)
    files_in_dir = os.listdir(checkpoint_dir)
    if not all(name in files_in_dir for name in REQUIRED_FILES):
        snapshot_download(
            repo_id=REPO_ID,
            repo_type="model",
            local_dir=checkpoint_dir,
        )
        # speakers_xtts.pth is not shipped in the viXTTS repo; it comes from
        # the upstream coqui/XTTS-v2 repository.
        hf_hub_download(
            repo_id="coqui/XTTS-v2",
            filename="speakers_xtts.pth",
            local_dir=checkpoint_dir,
        )


def _load_model(checkpoint_dir, use_deepspeed=False):
    """Load the XTTS model from *checkpoint_dir*, caching it in ``_MODEL``.

    Moves the model to CUDA when a GPU is available.
    """
    global _MODEL
    if _MODEL is None:
        config = XttsConfig()
        config.load_json(os.path.join(checkpoint_dir, "config.json"))
        model = Xtts.init_from_config(config)
        model.load_checkpoint(
            config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed
        )
        if torch.cuda.is_available():
            model.cuda()
        _MODEL = model
    return _MODEL


def generate_speech(text, language="vi", speaker_wav=None):
    """Synthesize *text* as speech with viXTTS and write it to ``output.wav``.

    Parameters
    ----------
    text : str
        Text to synthesize; normalized for Vietnamese via ``TTSnorm`` first.
    language : str
        Language code passed to XTTS inference (default ``"vi"``).
    speaker_wav : str
        Path to a reference audio clip used for voice cloning. Required —
        conditioning latents cannot be computed without it.

    Returns
    -------
    str
        Path of the written WAV file (``"output.wav"``).

    Raises
    ------
    ValueError
        If *speaker_wav* is ``None``.
    """
    if speaker_wav is None:
        # Fail early with a clear message instead of an obscure crash inside
        # get_conditioning_latents(audio_path=None).
        raise ValueError("speaker_wav is required: pass a reference audio path")

    _download_model_files(CHECKPOINT_DIR)
    model = _load_model(CHECKPOINT_DIR)

    # Vietnamese text normalization (numbers, abbreviations, ...).
    normalized_text = TTSnorm(text)

    # Inference needs no gradients; no_grad keeps memory usage down.
    with torch.no_grad():
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav
        )
        out = model.inference(
            normalized_text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            temperature=0.7,
        )

    output_file = "output.wav"
    torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0), SAMPLE_RATE)
    return output_file