import os
import torch
import torchaudio
import subprocess
# Environment setup: accept the Coqui model license non-interactively and force CPU-only usage
os.environ['COQUI_TOS_AGREED'] = '1'      # skip the interactive license prompt
os.environ['NUMBA_DISABLE_JIT'] = '1'     # avoid numba JIT compilation issues on CPU
os.environ['FORCE_CPU'] = 'true'
os.environ['CUDA_VISIBLE_DEVICES'] = ''   # hide all GPUs from PyTorch
# Fix PyTorch weights_only issue for XTTS
import torch.serialization
from TTS.tts.configs.xtts_config import XttsConfig
torch.serialization.add_safe_globals([XttsConfig])
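# NOTE: PyTorch 2.6+ defaults torch.load() to weights_only=True, which is why the XTTS
# config class is allow-listed above. Depending on the TTS version, additional classes
# may also need allow-listing, for example (assumption, adjust to your install):
# from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
# torch.serialization.add_safe_globals([XttsAudioConfig, XttsArgs])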
from TTS.tts.models.xtts import Xtts
print("Testing XTTS C3PO voice cloning...")
# C3PO model path
model_path = "XTTS-v2_C3PO/"
config_path = "XTTS-v2_C3PO/config.json"
# Check whether the model files exist; download them if missing
if not os.path.exists(config_path):
    print("C3PO model not found locally, downloading...")
    try:
        subprocess.run([
            "git", "clone",
            "https://huggingface.co/Borcherding/XTTS-v2_C3PO",
            "XTTS-v2_C3PO"
        ], check=True)
        print("C3PO model downloaded successfully")
    except subprocess.CalledProcessError as e:
        print(f"Failed to download C3PO model: {e}")
        exit(1)
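# NOTE: Hugging Face model repos typically store model.pth via Git LFS; without git-lfs
# installed, the clone above may fetch only pointer files instead of the actual weights.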
# Load configuration
config = XttsConfig()
config.load_json(config_path)
# Initialize and load model
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
)
device = "cpu" # Force CPU usage
print(f"C3PO model loaded on {device} (forced CPU mode)")
# Text to convert to speech
text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"
# Look for reference audio in the C3PO model directory
reference_audio_path = None
for file in os.listdir(model_path):
    if file.endswith(('.wav', '.mp3', '.m4a')):
        reference_audio_path = os.path.join(model_path, file)
        print(f"Found C3PO reference audio: {file}")
        break
# If no reference audio found, create a simple test reference
if reference_audio_path is None:
    print("No reference audio found in C3PO model, creating test reference...")
    reference_audio_path = "test_reference.wav"
    # Generate a simple sine wave as a placeholder
    import numpy as np
    sample_rate = 24000
    duration = 3  # seconds
    frequency = 440  # Hz
    t = np.linspace(0, duration, int(sample_rate * duration))
    audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
    # Save as WAV (cast to float32 for broad torchaudio backend compatibility)
    torchaudio.save(reference_audio_path, torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0), sample_rate)
    print(f"Test reference audio created: {reference_audio_path}")
try:
    # Generate conditioning latents from the reference audio
    print("Processing reference audio...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_audio_path,
        gpt_cond_len=30,
        gpt_cond_chunk_len=4,
        max_ref_length=60
    )
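    # The conditioning lengths above are in seconds of reference audio: up to
    # max_ref_length seconds are loaded, and gpt_cond_len seconds (processed in
    # gpt_cond_chunk_len-second chunks) are used to build the GPT conditioning latent.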
    # Generate speech
    print("Generating C3PO speech...")
    out = model.inference(
        text,
        "en",  # language
        gpt_cond_latent,
        speaker_embedding,
        repetition_penalty=5.0,
        temperature=0.75,
    )

    # Save output
    output_path = "c3po_test_output.wav"
    torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
    print(f"C3PO speech generated successfully! Saved as: {output_path}")
    # Test multilingual capabilities
    print("\nTesting multilingual C3PO...")
    multilingual_tests = [
        ("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
        ("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
        ("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
    ]
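    # XTTS-v2 ships with a fixed set of supported language codes (see config.languages);
    # the codes used above must be members of that list.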
    for lang, test_text in multilingual_tests:
        print(f"Generating {lang.upper()} speech...")
        out = model.inference(
            test_text,
            lang,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        output_path = f"c3po_test_{lang}.wav"
        torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
        print(f"C3PO {lang.upper()} speech saved as: {output_path}")
except Exception as e:
    print(f"Error during speech generation: {e}")
    import traceback
    traceback.print_exc()
print("XTTS C3PO test completed!")
print("\nGenerated files:")
for file in os.listdir("."):
if file.startswith("c3po_test") and file.endswith(".wav"):
print(f" - {file}") |