Spaces:

TDN-M
/

GV-a

Sleeping

App Files Community

TDN-M committed on Feb 12

Commit

3182b39

1 Parent(s): 7e90549

tts

Browse files

Files changed (1) hide show

tts.py +49 -25

tts.py CHANGED Viewed

@@ -6,7 +6,7 @@ from huggingface_hub import hf_hub_download
 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
-from torch.cuda.amp import autocast
 # Cấu hình đường dẫn và tải mô hình
 checkpoint_dir = "model/"
@@ -34,11 +34,14 @@ required_files = ["model.pth", "config.json", "vocab.json", "speakers_xtts.pth"]
 for file in required_files:
     file_path = os.path.join(checkpoint_dir, file)
     if not os.path.exists(file_path):
-        hf_hub_download(
-            repo_id=repo_id if file != "speakers_xtts.pth" else "coqui/XTTS-v2",
-            filename=file,
-            local_dir=checkpoint_dir,
-        )
 # Tải cấu hình và mô hình
 xtts_config = os.path.join(checkpoint_dir, "config.json")
@@ -52,30 +55,44 @@ MODEL.to(device)
 supported_languages = ["vi", "en"]
 def normalize_vietnamese_text(text):
-    text = (
-        TTSnorm(text, unknown=False, lower=False, rule=True)
-        .replace("..", ".")
-        .replace("!.", "!")
-        .replace("?.", "?")
-        .replace(" .", ".")
-        .replace(" ,", ",")
-        .replace('"', "")
-        .replace("'", "")
-        .replace("AI", "Ây Ai")
-        .replace("A.I", "Ây Ai")
-    )
-    return text
-def generate_speech(text, language="vi", speaker_wav=None, normalize_text=True):
     if language not in supported_languages:
-        raise ValueError(f"Ngôn ngữ {language} không được hỗ trợ.")
     if len(text) < 2:
         raise ValueError("Văn bản quá ngắn.")
     try:
         if normalize_text and language == "vi":
             text = normalize_vietnamese_text(text)
         with torch.no_grad():
-            with autocast(enabled=use_fp16):
                 gpt_cond_latent, speaker_embedding = MODEL.get_conditioning_latents(
                     audio_path=speaker_wav,
                     gpt_cond_len=30 if device == "cuda" else 15,
@@ -87,12 +104,19 @@ def generate_speech(text, language="vi", speaker_wav=None, normalize_text=True):
                     language,
                     gpt_cond_latent,
                     speaker_embedding,
-                    repetition_penalty=5.0,
-                    temperature=0.75,
                     enable_text_splitting=True,
                 )
-        output_file = f"output_{os.urandom(4).hex()}.wav"
         torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0).to("cpu"), 24000)
         return output_file
     except Exception as e:
         raise RuntimeError(f"Lỗi khi tạo giọng nói: {str(e)}")

 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts
 from vinorm import TTSnorm
+from torch.amp import autocast
 # Cấu hình đường dẫn và tải mô hình
 checkpoint_dir = "model/"
 for file in required_files:
     file_path = os.path.join(checkpoint_dir, file)
     if not os.path.exists(file_path):
+        try:
+            hf_hub_download(
+                repo_id=repo_id if file != "speakers_xtts.pth" else "coqui/XTTS-v2",
+                filename=file,
+                local_dir=checkpoint_dir,
+            )
+        except Exception as e:
+            raise RuntimeError(f"Không thể tải file {file} từ Hugging Face Hub: {str(e)}")
 # Tải cấu hình và mô hình
 xtts_config = os.path.join(checkpoint_dir, "config.json")
 supported_languages = ["vi", "en"]
 def normalize_vietnamese_text(text):
+    try:
+        text = (
+            TTSnorm(text, unknown=False, lower=False, rule=True)
+            .replace("..", ".")
+            .replace("!.", "!")
+            .replace("?.", "?")
+            .replace(" .", ".")
+            .replace(" ,", ",")
+            .replace('"', "")
+            .replace("'", "")
+            .replace("AI", "Ây Ai")
+            .replace("A.I", "Ây Ai")
+        )
+        return text
+    except Exception as e:
+        raise RuntimeError(f"Lỗi khi chuẩn hóa văn bản: {str(e)}")
+def generate_speech(
+    text,
+    language="vi",
+    speaker_wav=None,
+    normalize_text=True,
+    repetition_penalty=5.0,
+    temperature=0.75,
+):
     if language not in supported_languages:
+        raise ValueError(f"Ngôn ngữ {language} không được hỗ trợ. Các ngôn ngữ được hỗ trợ: {', '.join(supported_languages)}")
     if len(text) < 2:
         raise ValueError("Văn bản quá ngắn.")
+    if speaker_wav and not os.path.isfile(speaker_wav):
+        raise ValueError(f"File speaker_wav không tồn tại: {speaker_wav}")
     try:
         if normalize_text and language == "vi":
             text = normalize_vietnamese_text(text)
         with torch.no_grad():
+            with autocast(device_type='cuda', enabled=use_fp16):
                 gpt_cond_latent, speaker_embedding = MODEL.get_conditioning_latents(
                     audio_path=speaker_wav,
                     gpt_cond_len=30 if device == "cuda" else 15,
                     language,
                     gpt_cond_latent,
                     speaker_embedding,
+                    repetition_penalty=repetition_penalty,
+                    temperature=temperature,
                     enable_text_splitting=True,
                 )
+        output_dir = "outputs/"
+        os.makedirs(output_dir, exist_ok=True)
+        output_file = os.path.join(output_dir, f"output_{os.urandom(4).hex()}.wav")
         torchaudio.save(output_file, torch.tensor(out["wav"]).unsqueeze(0).to("cpu"), 24000)
+        if device == "cuda":
+            torch.cuda.empty_cache()
         return output_file
     except Exception as e:
         raise RuntimeError(f"Lỗi khi tạo giọng nói: {str(e)}")