VocalForge-AI / app.py
shukdevdatta123's picture
Update app.py
4760b00 verified
raw
history blame
3.04 kB
# !pip install TTS gradio numpy librosa torch
import os
import tempfile
import wave

import gradio as gr
import librosa
import numpy as np
import torch
from TTS.api import TTS
# Pick the inference device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the TTS model once at import time and move it to the selected device.
# `tts` is the shared module-level handle used by the speech generation code.
model_name = "tts_models/multilingual/multi-dataset/your_tts"
tts = TTS(model_name=model_name)
tts = tts.to(device)
def process_audio(audio_path, max_duration=10):
    """Load an audio file and keep at most ``max_duration`` seconds of it.

    The file is read through ``librosa.load`` with ``sr=16000`` and
    ``mono=True``.

    Args:
        audio_path: Path of the audio file to read.
        max_duration: Maximum length to keep, in seconds.

    Returns:
        A ``(samples, sample_rate)`` tuple; ``samples`` is cut to the first
        ``max_duration`` seconds when the recording is longer than that.
    """
    clip, rate = librosa.load(audio_path, sr=16000, mono=True)
    limit = int(max_duration * rate)

    # Short recordings are returned untouched.
    if len(clip) <= limit:
        return clip, rate

    return clip[:limit], rate
def _write_wav_pcm16(path, samples, sample_rate):
    """Write mono float samples to ``path`` as a 16-bit PCM WAV file."""
    # Clip before scaling so out-of-range floats saturate instead of wrapping
    # around when cast to int16.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(int(sample_rate))
        wav_file.writeframes(pcm.tobytes())


def _remove_quietly(path):
    """Delete ``path``; a file that is already gone is not an error."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass


def generate_speech(audio_file, text):
    """Synthesize ``text`` using ``audio_file`` as the reference voice.

    The reference recording is trimmed with ``process_audio``, written to a
    temporary WAV file and passed to ``tts.tts_to_file`` as ``speaker_wav``.

    Args:
        audio_file: Path of the reference voice recording. ``None`` or an
            empty string (nothing uploaded) is rejected.
        text: Text to speak. ``None``, empty or whitespace-only text is
            rejected.

    Returns:
        Path of the generated WAV file, or ``None`` when the inputs are
        invalid or any processing step fails (the error is printed). The
        returned file is not deleted here; the caller owns it.
    """
    # Validate first so that missing input never reaches the audio loader or
    # the model, and no temporary files are created for a doomed request.
    if not audio_file or not text or not text.strip():
        print("Error: a reference audio file and non-empty text are required")
        return None

    # Reserve two temp paths; the handles are closed on leaving the `with`,
    # the files themselves stay on disk because of delete=False.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref_file, \
            tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out_file:
        ref_path = ref_file.name
        out_path = out_file.name

    succeeded = False
    try:
        # Process reference audio
        y, sr = process_audio(audio_file)
        if len(y) == 0:
            raise ValueError("reference audio contains no samples")
        # librosa.output.write_wav was removed in librosa 0.8, so the WAV is
        # written with the standard library instead.
        _write_wav_pcm16(ref_path, y, sr)

        # Generate speech
        tts.tts_to_file(
            text=text,
            speaker_wav=ref_path,
            language="en",
            file_path=out_path,
        )
        succeeded = True
        return out_path
    except Exception as e:
        # UI callback boundary: report the problem and signal failure with
        # None, as callers of this function already expect.
        print(f"Error: {e}")
        return None
    finally:
        # The reference copy is never needed after synthesis; the output file
        # is only kept when it is being handed back to the caller.
        _remove_quietly(ref_path)
        if not succeeded:
            _remove_quietly(out_path)
# Gradio interface: inputs and the trigger button in the left column, the
# generated audio in the right column, optional examples underneath.
with gr.Blocks(title="Voice Clone TTS") as demo:
    gr.Markdown(
        "\n"
        "# 🎀 Voice Clone Text-to-Speech\n"
        "1. Upload a short English voice sample (5-10 seconds)\n"
        "2. Enter text you want to speak\n"
        "3. Generate audio in your voice!\n"
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Upload Voice Sample",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                placeholder="Enter English text here...",
                lines=4,
            )
            btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                interactive=False,
            )
            # NOTE(review): hidden and not wired to any event in this file.
            error_output = gr.Textbox(label="Processing Info", visible=False)

    # Example inputs. With cache_examples=True the examples are run through
    # generate_speech ahead of time, so a missing sample file would break
    # startup; only rows whose audio file exists are registered.
    example_rows = [
        ["examples/sample_voice.wav", "Hello! Welcome to the future of voice cloning technology"],
        ["examples/sample_voice2.wav", "This text is spoken in a completely cloned voice"],
    ]
    available_examples = [row for row in example_rows if os.path.isfile(row[0])]
    if available_examples:
        gr.Examples(
            examples=available_examples,
            inputs=[audio_input, text_input],
            outputs=audio_output,
            fn=generate_speech,
            cache_examples=True,
        )
    else:
        print("No example audio files found; skipping the examples section")

    btn.click(
        fn=generate_speech,
        inputs=[audio_input, text_input],
        outputs=audio_output,
    )
if __name__ == "__main__":
    # Only start the server when run as a script: listen on port 7860 and
    # pass share=True to Gradio's launcher.
    launch_options = {"server_port": 7860, "share": True}
    demo.launch(**launch_options)