# baymax / app.py — Gradio demo for Real-Time Voice Cloning
# (Hugging Face Space by noahabebe, "Create app.py", commit 8dd00fc verified)
import gradio as gr
import numpy as np
import os
from pathlib import Path
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pydub import AudioSegment
# Load the pretrained models once at startup so each request only runs inference.
project_name = "Real-Time-Voice-Cloning"
_repo = Path(project_name)
encoder.load_model(_repo / "encoder" / "saved_models" / "pretrained.pt")
synthesizer = Synthesizer(_repo / "synthesizer" / "saved_models" / "pretrained" / "pretrained.pt")
vocoder.load_model(_repo / "vocoder" / "saved_models" / "pretrained" / "pretrained.pt")
def clone_voice(text, reference_audio):
    """Synthesize *text* in the voice of *reference_audio*.

    Args:
        text: The sentence to speak.
        reference_audio: The reference recording. Gradio's ``gr.Audio``
            supplies a filepath string (``type="filepath"``); older Gradio
            versions supply a tempfile wrapper with a ``.name`` attribute,
            and a pydub ``AudioSegment`` is also accepted for direct calls.

    Returns:
        Path to the generated WAV file.

    Raises:
        ValueError: If the reference audio input shape is unrecognized.
    """
    # Normalize the reference input to a WAV path on disk.
    if isinstance(reference_audio, str):
        audio_path = reference_audio
    elif hasattr(reference_audio, "export"):  # pydub AudioSegment
        audio_path = "reference_audio.wav"
        reference_audio.export(audio_path, format="wav")
    elif hasattr(reference_audio, "name"):  # tempfile wrapper (older Gradio)
        audio_path = reference_audio.name
    else:
        raise ValueError(f"Unsupported reference audio input: {type(reference_audio)!r}")

    # Extract the speaker embedding from the reference recording.
    wav = encoder.preprocess_wav(audio_path)
    embedding = encoder.embed_utterance(wav)

    # Synthesize a mel spectrogram for the text, then vocode it to a waveform.
    specs = synthesizer.synthesize_spectrograms([text], [embedding])
    generated_wav = vocoder.infer_waveform(specs[0])

    # Append one second of silence so playback isn't cut off abruptly.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    # The vocoder emits floats in [-1, 1]; pydub's raw-data constructor needs
    # 16-bit PCM bytes when sample_width=2, so clip, scale, and convert.
    pcm = (np.clip(generated_wav, -1.0, 1.0) * 32767).astype(np.int16)

    output_path = "output.wav"
    AudioSegment(
        pcm.tobytes(),
        frame_rate=synthesizer.sample_rate,
        sample_width=2,
        channels=1,
    ).export(output_path, format="wav")
    return output_path
# Build the web UI. type="filepath" hands clone_voice a path string;
# type="file" is no longer a valid value in current Gradio releases.
iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Audio(label="Reference Audio", type="filepath"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Real-Time Voice Cloning",
    description="Generate new speech using a reference audio sample and provided text.",
)

if __name__ == "__main__":
    iface.launch()