Spaces:
Sleeping
Sleeping
import torch | |
import torchaudio | |
import numpy as np | |
from pathlib import Path | |
from typing import List, Union | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
from speechbrain.pretrained import EncoderClassifier | |
import tempfile | |
import os | |
class VoiceCloneSystem:
    """Voice cloning system: synthesise input text in a target speaker's voice.

    The pipeline combines three pretrained models loaded in ``__init__``:
    a SpeechBrain x-vector speaker encoder, the SpeechT5 text-to-speech
    model, and the SpeechT5 HiFi-GAN vocoder.
    """

    # Sample rate (Hz) that reference audio is resampled to before it is
    # passed to the speaker encoder.
    SAMPLE_RATE = 16000

    def __init__(self, device: str = "cpu"):
        """Load the speaker encoder, the TTS model and the vocoder.

        Args:
            device: Device to run the models on, ``'cpu'`` or ``'cuda'``.
        """
        self.device = device
        print("正在加载模型...")

        # Speaker encoder (x-vector model).
        self.speaker_encoder = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-xvect-voxceleb",
            savedir="tmp/spkrec-xvect-voxceleb",
            run_opts={"device": device},
        )

        # Text-to-speech model and its text processor.
        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        self.tts_model = SpeechT5ForTextToSpeech.from_pretrained(
            "microsoft/speecht5_tts"
        ).to(device)

        # Vocoder.
        self.vocoder = SpeechT5HifiGan.from_pretrained(
            "microsoft/speecht5_hifigan"
        ).to(device)
        print("模型加载完成!")

    @staticmethod
    def _average_embeddings(embeddings: List[torch.Tensor]) -> torch.Tensor:
        """Combine per-clip speaker embeddings into one ``(1, dim)`` tensor.

        Each embedding is flattened to ``(N, dim)`` (dropping any extra
        singleton axes), L2-normalised along the feature axis so that every
        clip contributes equally, and then averaged.

        Args:
            embeddings: Non-empty list of tensors whose last axis is the
                feature dimension.

        Returns:
            The mean embedding with shape ``(1, dim)``.

        Raises:
            ValueError: If ``embeddings`` is empty.
        """
        if not embeddings:
            raise ValueError("at least one speaker embedding is required")
        rows = [
            torch.nn.functional.normalize(emb.reshape(-1, emb.shape[-1]), dim=-1)
            for emb in embeddings
        ]
        return torch.cat(rows, dim=0).mean(dim=0, keepdim=True)

    def extract_speaker_embedding(
        self,
        audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """Extract an averaged speaker embedding from reference audio files.

        Args:
            audio_paths: Paths of the reference audio files. A single path
                is also accepted and treated as a one-element list.

        Returns:
            Speaker embedding of shape ``(1, dim)``.

        Raises:
            ValueError: If no reference audio path is given.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(audio_paths, (str, Path)):
            audio_paths = [audio_paths]
        paths = list(audio_paths)
        if not paths:
            raise ValueError(
                "audio_paths must contain at least one reference audio file"
            )

        embeddings = []
        for audio_path in paths:
            waveform, sr = torchaudio.load(str(audio_path))
            # Downmix to mono first so that only one channel is resampled.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sr != self.SAMPLE_RATE:
                waveform = torchaudio.functional.resample(
                    waveform, sr, self.SAMPLE_RATE
                )
            with torch.no_grad():
                embeddings.append(
                    self.speaker_encoder.encode_batch(waveform.to(self.device))
                )

        # encode_batch output carries an extra axis; the helper flattens it
        # so the result can be fed directly to generate_speech.
        return self._average_embeddings(embeddings)

    def generate_speech(
        self,
        text: str,
        speaker_embedding: torch.Tensor
    ) -> torch.Tensor:
        """Synthesise speech for ``text`` conditioned on a speaker embedding.

        Args:
            text: Input text.
            speaker_embedding: Speaker embedding; any leading axes are
                collapsed so the model receives a 2-D ``(N, dim)`` tensor.

        Returns:
            The generated speech waveform.
        """
        inputs = self.processor(text=text, return_tensors="pt")
        # Tolerate embeddings with extra singleton axes, e.g. (1, 1, dim).
        embedding = speaker_embedding.reshape(-1, speaker_embedding.shape[-1])
        speech = self.tts_model.generate_speech(
            inputs["input_ids"].to(self.device),
            embedding.to(self.device),
            vocoder=self.vocoder,
        )
        return speech

    def clone_voice(
        self,
        text: str,
        reference_audio_paths: List[Union[str, Path]]
    ) -> torch.Tensor:
        """Main entry point: speak ``text`` in the reference speaker's voice.

        Args:
            text: Text to convert to speech.
            reference_audio_paths: Paths of the reference audio files.

        Returns:
            The generated speech waveform.
        """
        # 1. Extract the speaker embedding from the reference audio.
        speaker_embedding = self.extract_speaker_embedding(reference_audio_paths)
        # 2. Generate speech conditioned on that embedding.
        return self.generate_speech(text, speaker_embedding)

    def save_audio(
        self,
        waveform: torch.Tensor,
        output_path: Union[str, Path],
        sample_rate: int = 16000
    ):
        """Save a waveform to an audio file.

        Args:
            waveform: Audio waveform, either 1-D ``(time,)`` or 2-D
                ``(channels, time)``.
            output_path: Destination file path; missing parent directories
                are created.
            sample_rate: Sample rate of ``waveform`` in Hz.
        """
        # Make sure the output directory exists.
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        audio = waveform.detach().cpu()
        # Add a channel axis only when the waveform does not have one yet.
        if audio.dim() == 1:
            audio = audio.unsqueeze(0)
        torchaudio.save(str(output_path), audio, sample_rate)