|
import torch |
|
import torchaudio |
|
import gradio as gr |
|
|
|
from zonos.model import Zonos |
|
from zonos.conditioning import make_cond_dict |
|
|
|
|
|
# Load the pretrained hybrid Zonos TTS model and cast its weights to
# bfloat16 (halves memory; matches the bf16 speaker embedding passed in
# make_cond_dict below).  NOTE(review): device="cuda" is hard-coded —
# this script requires a CUDA-capable GPU; there is no CPU fallback.
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device="cuda")

model.bfloat16()
|
|
|
|
|
def tts(text, reference_audio):
    """Synthesize `text` as speech in the voice of `reference_audio`.

    Args:
        text: Text to synthesize.
        reference_audio: Value from a ``gr.Audio(type="numpy")`` input —
            a ``(sample_rate, data)`` tuple where ``data`` is an ndarray
            of shape ``(samples,)`` (mono) or ``(samples, channels)``.

    Returns:
        A ``(sample_rate, ndarray)`` tuple for the ``gr.Audio`` output.

    Raises:
        gr.Error: If no reference audio was provided.
    """
    if reference_audio is None:
        # Surface the problem in the Gradio UI.  Returning a bare string
        # to a gr.Audio output component would itself raise an error.
        raise gr.Error("No reference audio provided.")

    # Gradio's type="numpy" audio is (sample_rate, data) — rate FIRST.
    # The previous code unpacked these in the wrong order.
    sr, wav_np = reference_audio

    wav_torch = torch.from_numpy(wav_np)
    # Gradio typically delivers int16 PCM; scale integer samples into the
    # float range [-1, 1] that torch audio models conventionally expect.
    if not torch.is_floating_point(wav_torch):
        wav_torch = wav_torch.float() / torch.iinfo(wav_torch.dtype).max
    else:
        wav_torch = wav_torch.float()

    # Normalize to (channels, samples).  Mono arrives as (samples,);
    # multi-channel arrives as (samples, channels).  The previous code
    # unsqueezed first, which left stereo input as an unhandled 3-D tensor.
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)
    elif wav_torch.dim() == 2 and wav_torch.shape[0] > wav_torch.shape[1]:
        # Heuristic: more rows than columns => (samples, channels) layout.
        wav_torch = wav_torch.T

    # Speaker embedding from the reference clip (project-specific API).
    spk_embedding = model.embed_spk_audio(wav_torch, sr)

    cond_dict = make_cond_dict(
        text=text,
        # Model weights run in bfloat16 (see module top); match the dtype.
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
    conditioning = model.prepare_conditioning(cond_dict)

    with torch.no_grad():
        # Fixed seed so repeated generations of the same text are stable.
        torch.manual_seed(421)
        codes = model.generate(conditioning)

    # Decode the discrete codes back to a waveform on the CPU.
    wavs = model.autoencoder.decode(codes).cpu()
    out_audio = wavs[0].numpy()

    # gr.Audio output expects (sample_rate, data).
    return (model.autoencoder.sampling_rate, out_audio)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Wire the tts function into a simple two-input, one-output Gradio UI.
demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="Text to Synthesize"),
        # "upload" was already the default source, and the `source=` kwarg
        # was removed in Gradio 4 (replaced by `sources=[...]`); omitting
        # it keeps this demo working on both major versions.
        gr.Audio(type="numpy", label="Reference Audio (for speaker embedding)"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Zonos TTS Demo (Hybrid)",
    description=(
        "Provide a reference audio snippet for speaker embedding, "
        "enter text, and generate speech with Zonos TTS."
    ),
)
|
|
|
if __name__ == "__main__":
    # Start the Gradio server; debug=True keeps the process attached and
    # prints errors raised inside tts to the console.
    demo.launch(debug=True)
|
|