# !pip install TTS gradio numpy librosa torch
"""Gradio demo that clones a voice from a short sample using Coqui TTS.

The user supplies a reference recording and some text; the script trims the
recording, hands it to the ``your_tts`` model as the speaker reference, and
returns the path of the synthesized WAV file.
"""

import os
import tempfile
import wave

import gradio as gr
import librosa
import numpy as np
import torch
from TTS.api import TTS

# Check device availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize TTS model (loaded once at import time and shared by all requests)
model_name = "tts_models/multilingual/multi-dataset/your_tts"
tts = TTS(model_name=model_name).to(device)

# Sample rate the reference clip is resampled to before being given to the model.
REFERENCE_SAMPLE_RATE = 16000

INTRO_MARKDOWN = """
# 🎤 Voice Clone Text-to-Speech
1. Upload a short English voice sample (5-10 seconds)
2. Enter text you want to speak
3. Generate audio in your voice!
"""

EXAMPLES = [
    [
        "examples/sample_voice.wav",
        "Hello! Welcome to the future of voice cloning technology",
    ],
    [
        "examples/sample_voice2.wav",
        "This text is spoken in a completely cloned voice",
    ],
]


def process_audio(audio_path, max_duration=10):
    """Load an audio file as 16 kHz mono and keep at most ``max_duration`` seconds.

    Args:
        audio_path: Path of the audio file to read.
        max_duration: Maximum clip length in seconds.

    Returns:
        A ``(samples, sample_rate)`` tuple as returned by ``librosa.load``,
        with ``samples`` truncated to ``max_duration`` seconds.
    """
    # ``duration`` stops decoding early so long uploads are not read in full.
    y, sr = librosa.load(
        audio_path,
        sr=REFERENCE_SAMPLE_RATE,
        mono=True,
        duration=max_duration,
    )
    # Guard against a few extra samples left over after resampling.
    max_samples = int(max_duration * sr)
    return y[:max_samples], sr


def _write_wav(path, samples, sample_rate):
    """Write mono float samples in [-1, 1] to ``path`` as 16-bit PCM WAV."""
    # ``librosa.output.write_wav`` no longer exists in current librosa, so the
    # file is written with the standard-library ``wave`` module instead.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(int(sample_rate))
        wav_file.writeframes(pcm.tobytes())


def _make_temp_wav():
    """Create an empty, closed temporary ``.wav`` file and return its path."""
    fd, path = tempfile.mkstemp(suffix=".wav")
    # Close the handle right away so other code can reopen the path freely.
    os.close(fd)
    return path


def _remove_quietly(path):
    """Delete ``path`` if it is set and exists; ignore filesystem errors."""
    if not path:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def generate_speech(audio_file, text):
    """Synthesize ``text`` in the voice of the speaker heard in ``audio_file``.

    Args:
        audio_file: Path of the reference voice recording (may be ``None``
            when the user has not provided one).
        text: Text to be spoken.

    Returns:
        Path of the generated WAV file, or ``None`` when the inputs are
        missing or synthesis fails.
    """
    if not audio_file or not text or not text.strip():
        print("Error: a voice sample and non-empty text are required")
        return None

    ref_path = None
    out_path = None
    try:
        ref_path = _make_temp_wav()
        out_path = _make_temp_wav()

        # Process reference audio
        samples, sample_rate = process_audio(audio_file)
        _write_wav(ref_path, samples, sample_rate)

        # Generate speech
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_path,
            language="en",
            file_path=out_path,
        )
    except Exception as e:  # UI boundary: report the failure and keep serving.
        print(f"Error: {e}")
        # The output file is useless after a failure; do not leave it behind.
        _remove_quietly(out_path)
        return None
    finally:
        # The trimmed reference clip is only needed during synthesis.
        _remove_quietly(ref_path)
    return out_path


# Gradio interface
with gr.Blocks(title="Voice Clone TTS") as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Voice Sample",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4,
            )
            btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False,
            )
            error_output = gr.Textbox(label="Processing Info", visible=False)

    # Example inputs. Cached examples are run through ``generate_speech``, so
    # entries whose audio file is not present on disk are skipped.
    available_examples = [
        example for example in EXAMPLES if os.path.isfile(example[0])
    ]
    if available_examples:
        gr.Examples(
            examples=available_examples,
            inputs=[audio_input, text_input],
            outputs=audio_output,
            fn=generate_speech,
            cache_examples=True,
        )

    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch(server_port=7860, share=True)