File size: 3,038 Bytes
4760b00
 
f74edeb
4760b00
3220f5e
4760b00
 
ba92b2d
 
f8f4a26
4760b00
 
f8f4a26
4760b00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba92b2d
4760b00
 
 
ba92b2d
4760b00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3220f5e
4760b00
 
 
 
 
 
 
 
3220f5e
 
4760b00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3220f5e
4760b00
 
 
 
 
 
 
 
 
 
 
3220f5e
4760b00
f74edeb
 
 
 
3220f5e
4760b00
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# !pip install TTS gradio numpy librosa torch

from TTS.api import TTS
import gradio as gr
import numpy as np
import librosa
import torch
import tempfile
import os

# Pick the compute device once at import time: use the GPU when PyTorch
# reports CUDA as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate the TTS model a single time at module import and move it to the
# selected device. The resulting `tts` object is a module-level global that
# generate_speech() calls for every request.
# NOTE(review): constructing TTS here presumably downloads/loads model weights
# on first import, which would make importing this module slow -- confirm.
model_name = "tts_models/multilingual/multi-dataset/your_tts"
tts = TTS(model_name=model_name).to(device)

def process_audio(audio_path, max_duration=10):
    """Load an audio file as 16 kHz mono and keep at most its leading part.

    Args:
        audio_path: Path of the audio file; handed straight to
            ``librosa.load``.
        max_duration: Maximum length to keep, in seconds. May be fractional
            but must not be negative.

    Returns:
        A ``(samples, sample_rate)`` tuple. ``sample_rate`` is the rate
        requested from ``librosa.load`` (16000) and ``samples`` contains at
        most ``int(max_duration * sample_rate)`` values taken from the start
        of the recording.

    Raises:
        ValueError: If ``max_duration`` is negative.
    """
    # Validate before touching the file: a negative bound would turn the
    # slice below into "drop samples from the END", silently returning audio
    # of the wrong length instead of a prefix.
    if max_duration < 0:
        raise ValueError(
            f"max_duration must be non-negative, got {max_duration!r}"
        )

    y, sr = librosa.load(audio_path, sr=16000, mono=True)

    # Slicing past the end is a no-op, so no explicit length check is needed.
    max_samples = int(max_duration * sr)
    return y[:max_samples], sr

def _remove_quietly(path):
    """Delete *path* as best-effort cleanup, ignoring filesystem errors."""
    try:
        os.unlink(path)
    except OSError:
        # Cleanup must never mask the real result/exception of the caller.
        pass


def generate_speech(audio_file, text):
    """Synthesize *text* in the voice of the speaker heard in *audio_file*.

    The reference recording is loaded and trimmed with ``process_audio``,
    written to a temporary mono 16-bit PCM WAV file, and passed to the
    module-level ``tts`` model as ``speaker_wav``.

    Args:
        audio_file: Path of the reference voice recording.
        text: Text to be spoken.

    Returns:
        Path of the generated WAV file, or ``None`` if the TTS model raised
        (the error is printed). The caller owns the returned file.

    Raises:
        Exception: Anything raised while loading or writing the reference
            audio propagates to the caller; temporary files are removed
            first.
    """
    # Standard-library WAV writer. `librosa.output.write_wav` was removed in
    # librosa 0.8, so it cannot be relied on here. Imported locally so the
    # file's top-level import block does not need to change.
    import wave

    # Reserve two closed temp-file paths; delete=False keeps them on disk
    # after the handles are closed so they can be reopened by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
         tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        ref_path = ref_file.name
        out_path = out_file.name

    succeeded = False
    try:
        # Process reference audio
        y, sr = process_audio(audio_file)

        # Convert to little-endian 16-bit PCM. Clipping guards against
        # integer wrap-around for samples outside [-1, 1].
        # NOTE(review): assumes process_audio returns floating-point samples
        # nominally in [-1, 1] (librosa's usual output) -- confirm.
        pcm = (np.clip(y, -1.0, 1.0) * 32767.0).astype("<i2")
        with wave.open(ref_path, "wb") as ref_wav:
            ref_wav.setnchannels(1)
            ref_wav.setsampwidth(2)
            ref_wav.setframerate(int(sr))
            ref_wav.writeframes(pcm.tobytes())

        # Generate speech
        try:
            tts.tts_to_file(
                text=text,
                speaker_wav=ref_path,
                language="en",
                file_path=out_path
            )
        except Exception as e:
            # Keep the existing contract: report and signal failure with None.
            print(f"Error: {e}")
            return None

        succeeded = True
        return out_path
    finally:
        # The reference clip is never needed after synthesis; the output file
        # is only kept when it is actually being returned to the caller.
        _remove_quietly(ref_path)
        if not succeeded:
            _remove_quietly(out_path)

# Gradio interface
# Builds the whole UI inside one gr.Blocks context: an instruction header, a
# two-column row (inputs on the left, result on the right), a set of clickable
# examples, and the button wiring. Components appear in the page in the order
# they are instantiated, so statement order here is significant.
with gr.Blocks(title="Voice Clone TTS") as demo:
    # Static usage instructions shown at the top of the page.
    gr.Markdown("""
    # 🎤 Voice Clone Text-to-Speech
    1. Upload a short English voice sample (5-10 seconds)
    2. Enter text you want to speak
    3. Generate audio in your voice!
    """)
    
    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            # type="filepath" makes Gradio hand generate_speech a path string
            # rather than raw audio data.
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Voice Sample",
                interactive=True
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4
            )
            btn = gr.Button("Generate Speech", variant="primary")
        
        # Right column: generated result.
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False
            )
            # NOTE(review): this hidden textbox is not listed as an output of
            # any event in this block, so nothing ever writes to it.
            error_output = gr.Textbox(label="Processing Info", visible=False)
    
    # Example inputs
    # Each example is [reference audio path, text] matching `inputs` below.
    # NOTE(review): the example paths are relative; confirm the files exist in
    # the working directory. With cache_examples=True Gradio presumably runs
    # generate_speech on every example ahead of time -- confirm startup cost.
    gr.Examples(
        examples=[
            ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
            ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"]
        ],
        inputs=[audio_input, text_input],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True
    )
    
    # Clicking the button runs generate_speech(audio path, text) and routes
    # its return value (an output file path or None) to the audio player.
    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output
    )

# Start the web server only when executed as a script, not on import.
# NOTE(review): share=True presumably requests a publicly reachable Gradio
# share link in addition to the local server on port 7860 -- confirm that
# public exposure is intended.
if __name__ == "__main__":
    demo.launch(server_port=7860, share=True)