|
import gradio as gr |
|
import numpy as np |
|
import os |
|
from pathlib import Path |
|
from synthesizer.inference import Synthesizer |
|
from encoder import inference as encoder |
|
from vocoder import inference as vocoder |
|
from pydub import AudioSegment |
|
|
|
|
|
# Root of the cloned Real-Time-Voice-Cloning repository; all pretrained
# checkpoint paths below are resolved relative to this directory.
project_name = "Real-Time-Voice-Cloning"

# Load the three pipeline stages once at import time so every request reuses
# the same in-memory models:
#   encoder     — reference audio  -> fixed-size speaker embedding
#   synthesizer — (text, embedding) -> mel spectrogram
#   vocoder     — mel spectrogram  -> raw waveform
# NOTE(review): these paths assume the pretrained checkpoints were downloaded
# into the repo's saved_models directories — confirm before deploying.
encoder.load_model(Path(project_name) / "encoder/saved_models/pretrained.pt")

synthesizer = Synthesizer(Path(project_name) / "synthesizer/saved_models/pretrained/pretrained.pt")

vocoder.load_model(Path(project_name) / "vocoder/saved_models/pretrained/pretrained.pt")
|
|
|
def clone_voice(text, reference_audio):
    """Synthesize *text* in the voice of *reference_audio* and return a WAV path.

    Parameters
    ----------
    text : str
        Sentence to synthesize.
    reference_audio : str | os.PathLike | object
        The reference recording. Accepts a plain file path (what
        ``gr.Audio(type="filepath")`` provides), any object with an
        ``export(path, format=...)`` method (pydub ``AudioSegment``), or a
        tempfile-style object exposing ``.name`` (legacy Gradio file input).

    Returns
    -------
    str
        Path of the generated ``output.wav`` file.

    Raises
    ------
    TypeError
        If *reference_audio* is none of the supported kinds.
    """
    # --- Normalize the input to a readable file path on disk. ---------------
    # The original code unconditionally called .export(), which crashes for
    # the path / tempfile objects Gradio actually passes to the callback.
    if isinstance(reference_audio, (str, os.PathLike)):
        audio_path = str(reference_audio)
    elif hasattr(reference_audio, "export"):
        # pydub.AudioSegment-like object: write it out as WAV first.
        audio_path = "reference_audio.wav"
        reference_audio.export(audio_path, format="wav")
    elif hasattr(reference_audio, "name"):
        # tempfile-style object (legacy gr.Audio(type="file")).
        audio_path = reference_audio.name
    else:
        raise TypeError(f"Unsupported reference_audio type: {type(reference_audio)!r}")

    # --- Speaker embedding from the reference recording. --------------------
    wav = encoder.preprocess_wav(audio_path)
    embedding = encoder.embed_utterance(wav)

    # --- Text + embedding -> mel spectrogram -> waveform. -------------------
    specs = synthesizer.synthesize_spectrograms([text], [embedding])
    generated_wav = vocoder.infer_waveform(specs[0])

    # Append one second of silence so the tail of the utterance is not
    # clipped by players that truncate the final buffer.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    # The vocoder emits float audio in [-1, 1]; AudioSegment with
    # sample_width=2 expects raw 16-bit PCM bytes, so convert explicitly
    # (the original passed the float array directly, producing invalid audio).
    pcm = (np.clip(generated_wav, -1.0, 1.0) * 32767).astype(np.int16)

    output_path = "output.wav"
    AudioSegment(
        pcm.tobytes(),
        frame_rate=synthesizer.sample_rate,
        sample_width=2,
        channels=1,
    ).export(output_path, format="wav")
    return output_path
|
|
|
# Gradio UI: a text box for the sentence to speak plus an upload widget for
# the reference voice sample. type="filepath" hands the callback the path of
# the uploaded file; the original type="file" is not a valid value in
# current Gradio releases (valid choices are "filepath" and "numpy").
iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Audio(label="Reference Audio", type="filepath"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Real-Time Voice Cloning",
    description="Generate new speech using a reference audio sample and provided text.",
)
|
|
|
# Launch the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":

    iface.launch()
|
|