Spaces:

TDN-M
/

GV-a

Sleeping

App Files Files Community

TDN-M committed on Jan 24

Commit

93ff1fe

verified ·

1 Parent(s): f746763

Update tts.py

Browse files

Files changed (1) hide show

tts.py +99 -41

tts.py CHANGED Viewed

@@ -1,58 +1,116 @@
 import os
 import torch
 import torchaudio
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
-from huggingface_hub import snapshot_download, hf_hub_download
 from vinorm import TTSnorm
-def generate_speech(text, language="vi", speaker_wav=None):
-    # Tải mô hình nếu chưa được tải
-    checkpoint_dir = "model/"
-    repo_id = "capleaf/viXTTS"
-    use_deepspeed = False
-    os.makedirs(checkpoint_dir, exist_ok=True)
-    required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
-    files_in_dir = os.listdir(checkpoint_dir)
-    if not all(file in files_in_dir for file in required_files):
-        snapshot_download(
-            repo_id=repo_id,
-            repo_type="model",
-            local_dir=checkpoint_dir,
-        )
-        hf_hub_download(
-            repo_id="coqui/XTTS-v2",
-            filename="speakers_xtts.pth",
-            local_dir=checkpoint_dir,
-        )
-    # Cấu hình và tải mô hình
-    xtts_config = os.path.join(checkpoint_dir, "config.json")
-    config = XttsConfig()
-    config.load_json(xtts_config)
-    MODEL = Xtts.init_from_config(config)
-    MODEL.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed)
-    if torch.cuda.is_available():
-        MODEL.cuda()
-    # Chuẩn hóa văn bản
-    normalized_text = TTSnorm(text)
-    # Tạo giọng nói
-    with torch.no_grad():
-        gpt_cond_latent, speaker_embedding = MODEL.get_conditioning_latents(audio_path=speaker_wav)
         out = MODEL.inference(
-            normalized_text,
             language,
             gpt_cond_latent,
             speaker_embedding,
-            temperature=0.7,
         )
-    # Lưu file âm thanh
-    output_file = "output.wav"
-    torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0), 22050)
-    return output_file

 import os
+import re
 import torch
 import torchaudio
+from huggingface_hub import snapshot_download, hf_hub_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
# Local checkpoint directory and the Hugging Face repository it is filled from.
checkpoint_dir = "model/"
repo_id = "capleaf/viXTTS"
use_deepspeed = False

# Make sure the checkpoint directory exists before listing its contents.
os.makedirs(checkpoint_dir, exist_ok=True)

# Download the model files when at least one required file is missing locally.
required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
files_in_dir = os.listdir(checkpoint_dir)
if any(name not in files_in_dir for name in required_files):
    snapshot_download(repo_id=repo_id, repo_type="model", local_dir=checkpoint_dir)
    # speakers_xtts.pth is fetched separately from the coqui/XTTS-v2 repository.
    hf_hub_download(
        repo_id="coqui/XTTS-v2",
        filename="speakers_xtts.pth",
        local_dir=checkpoint_dir,
    )

# Read the configuration and build the model once, at import time.
xtts_config = os.path.join(checkpoint_dir, "config.json")
config = XttsConfig()
config.load_json(xtts_config)
MODEL = Xtts.init_from_config(config)
MODEL.load_checkpoint(config, checkpoint_dir=checkpoint_dir, use_deepspeed=use_deepspeed)

# Move the model to the GPU when CUDA is available.
if torch.cuda.is_available():
    MODEL.cuda()

# Languages accepted by generate_speech (Vietnamese and English only).
supported_languages = ["vi", "en"]
# Matches the acronym "AI" / "A.I" only when it stands alone as a token.
# A plain str.replace("AI", ...) would also rewrite upper-case words that
# merely contain those letters (e.g. "HAI", "MAIL").
_AI_ACRONYM = re.compile(r"(?<!\w)A\.?I(?!\w)")


def normalize_vietnamese_text(text):
    """Normalize Vietnamese text before it is passed to the TTS model.

    The text is first run through ``TTSnorm`` (called with ``unknown=False``,
    ``lower=False``, ``rule=True``), then a fixed set of punctuation clean-ups
    is applied, single and double quotes are removed, and the stand-alone
    acronym "AI" / "A.I" is spelled out as "Ây Ai".

    Args:
        text: The raw input text.

    Returns:
        The normalized text.
    """
    normalized = TTSnorm(text, unknown=False, lower=False, rule=True)

    # Order matters: each replacement runs on the result of the previous one.
    cleanups = (
        ("..", "."),
        ("!.", "!"),
        ("?.", "?"),
        (" .", "."),
        (" ,", ","),
        ('"', ""),
        ("'", ""),
    )
    for old, new in cleanups:
        normalized = normalized.replace(old, new)

    # Spell out the acronym so it is read letter by letter.
    return _AI_ACRONYM.sub("Ây Ai", normalized)
def generate_speech(text, language="vi", speaker_wav=None, normalize_text=True):
    """Synthesize speech for ``text`` and write it to ``output.wav``.

    Args:
        text: The text to speak; must contain at least two characters.
        language: Language code; must be one of ``supported_languages``.
        speaker_wav: Path to the reference voice recording, forwarded to
            ``MODEL.get_conditioning_latents`` as ``audio_path``.
            NOTE(review): the default ``None`` is passed through unchanged;
            presumably the model cannot condition on it -- confirm.
        normalize_text: When true and ``language`` is ``"vi"``, the text is
            passed through ``normalize_vietnamese_text`` first.

    Returns:
        The path of the written audio file (always ``"output.wav"``).

    Raises:
        ValueError: If the language is unsupported or the text is missing or
            shorter than two characters.
        RuntimeError: If normalization, inference or saving fails; the
            original exception is attached as ``__cause__``.
    """
    if language not in supported_languages:
        raise ValueError(f"Ngôn ngữ {language} không được hỗ trợ. Chỉ hỗ trợ tiếng Việt (vi) và tiếng Anh (en).")
    # Treat a missing text like an empty one instead of failing in len().
    if text is None or len(text) < 2:
        raise ValueError("Văn bản quá ngắn. Vui lòng nhập văn bản dài hơn.")

    try:
        # Normalization is only applied to Vietnamese input.
        if normalize_text and language == "vi":
            text = normalize_vietnamese_text(text)

        # Derive the conditioning latents and speaker embedding from the
        # reference recording.
        gpt_cond_latent, speaker_embedding = MODEL.get_conditioning_latents(
            audio_path=speaker_wav,
            gpt_cond_len=30,
            gpt_cond_chunk_len=4,
            max_ref_length=60,
        )

        # Run the synthesis.
        out = MODEL.inference(
            text,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
            enable_text_splitting=True,
        )

        # Save the waveform as a single-channel file at 24000 Hz.
        # NOTE: the file name is fixed, so each call overwrites the last one.
        output_file = "output.wav"
        torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0), 24000)
        return output_file
    except Exception as e:
        # Chain explicitly so the underlying failure stays visible.
        raise RuntimeError(f"Lỗi khi tạo giọng nói: {e}") from e
if __name__ == "__main__":
    # Usage example: synthesize one Vietnamese sentence with a sample voice.
    sample_text = "Xin chào, đây là một đoạn văn bản được chuyển thành giọng nói."
    # Path to the reference voice recording inside the voices/ directory.
    reference_voice = "voices/sample_voice.wav"
    result_path = generate_speech(sample_text, language="vi", speaker_wav=reference_voice)
    print(f"File âm thanh đã được tạo: {result_path}")