"""Gradio demo for Real-Time-Voice-Cloning.

Clones a voice from a short reference recording: the encoder extracts a
speaker embedding, the synthesizer produces a mel spectrogram for the given
text, and the vocoder renders it to a waveform.
"""
import os
from pathlib import Path

import gradio as gr
import numpy as np
from pydub import AudioSegment

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# Load the pretrained models once at startup; checkpoints live under the
# cloned project directory.
project_name = "Real-Time-Voice-Cloning"
encoder.load_model(Path(project_name) / "encoder/saved_models/pretrained.pt")
synthesizer = Synthesizer(Path(project_name) / "synthesizer/saved_models/pretrained/pretrained.pt")
vocoder.load_model(Path(project_name) / "vocoder/saved_models/pretrained/pretrained.pt")


def clone_voice(text, reference_audio):
    """Synthesize *text* in the voice of *reference_audio*.

    Args:
        text: The sentence to speak.
        reference_audio: Path to a reference recording (what Gradio's
            ``type="filepath"`` Audio input provides), or a pydub
            ``AudioSegment`` for direct callers.

    Returns:
        Path to the generated WAV file on disk.
    """
    # BUG FIX: with the Gradio Audio input the handler receives a file path
    # string, never an AudioSegment, so the original unconditional
    # reference_audio.export(...) raised AttributeError. Accept both.
    if isinstance(reference_audio, (str, os.PathLike)):
        audio_path = str(reference_audio)
    else:
        audio_path = "reference_audio.wav"
        reference_audio.export(audio_path, format="wav")

    # Extract the speaker embedding from the reference recording.
    wav = encoder.preprocess_wav(audio_path)
    embedding = encoder.embed_utterance(wav)

    # Synthesize the mel spectrogram, then vocode it to a waveform.
    specs = synthesizer.synthesize_spectrograms([text], [embedding])
    generated_wav = vocoder.infer_waveform(specs[0])

    # Pad with one second of silence so the tail of the utterance is not
    # clipped by playback.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    # BUG FIX: the vocoder yields float samples (presumably in [-1, 1] —
    # TODO confirm against vocoder.infer_waveform); AudioSegment with
    # sample_width=2 expects raw 16-bit PCM bytes, so the float array must
    # be clipped, scaled, and converted before wrapping.
    pcm = (np.clip(generated_wav, -1.0, 1.0) * 32767).astype(np.int16)
    output_path = "output.wav"
    AudioSegment(
        pcm.tobytes(),
        frame_rate=synthesizer.sample_rate,
        sample_width=2,
        channels=1,
    ).export(output_path, format="wav")
    return output_path


iface = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Textbox(label="Text"),
        # BUG FIX: type="filepath" hands clone_voice a path on disk, which
        # encoder.preprocess_wav accepts; the old type="file" value passed a
        # tempfile wrapper with no .export() (and is rejected by current
        # Gradio releases).
        gr.Audio(label="Reference Audio", type="filepath"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Real-Time Voice Cloning",
    description="Generate new speech using a reference audio sample and provided text.",
)

if __name__ == "__main__":
    iface.launch()