import gradio as gr
import torch
from outetts.v0_1.interface import InterfaceHF
import soundfile as sf
import tempfile
import os
from faster_whisper import WhisperModel
def initialize_models():
    """Initialize the OuteTTS and Faster-Whisper models."""
    tts_interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
    # Use the tiny model with the lowest compute settings for maximum speed
    asr_model = WhisperModel(
        "tiny",
        device="cpu",
        compute_type="int8",  # int8 quantization for efficiency
        num_workers=1,        # limit workers for the low-resource environment
        cpu_threads=1,        # limit CPU threads
    )
    return tts_interface, asr_model
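
# Optional variant (a sketch, not wired into the app above): if the Space has a GPU,
# faster-whisper can run on CUDA with float16, which is usually much faster than the
# int8 CPU path. The OuteTTS interface is left unchanged here; only the ASR model moves.
def initialize_models_gpu():
    """Like initialize_models(), but puts Faster-Whisper on a GPU when one is available."""
    tts_interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float16" if device == "cuda" else "int8"
    asr_model = WhisperModel("tiny", device=device, compute_type=compute_type)
    return tts_interface, asr_model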

# Initialize models globally to avoid reloading on every request
TTS_INTERFACE, ASR_MODEL = initialize_models()

def transcribe_audio(audio_path):
    """Transcribe audio using the Faster-Whisper tiny model."""
    try:
        # Transcribe with minimal settings for speed
        segments, _ = ASR_MODEL.transcribe(
            audio_path,
            beam_size=1,                       # smallest beam
            best_of=1,                         # don't generate alternatives
            temperature=0.0,                   # greedy decoding, no sampling
            condition_on_previous_text=False,  # don't condition on previous text
            compression_ratio_threshold=2.4,   # less strict threshold
            log_prob_threshold=-1.0,           # less strict threshold
            no_speech_threshold=0.6,           # less strict threshold
        )
        # Combine all segments into a single string
        text = " ".join(segment.text for segment in segments).strip()
        return text
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Process the reference audio and generate speech with the cloned voice."""
    try:
        # If no reference text was provided, transcribe the audio automatically
        if not reference_text.strip():
            reference_text = transcribe_audio(audio_path)
            if reference_text.startswith("Error"):
                return None, reference_text

        # Create a speaker profile from the reference audio
        speaker = TTS_INTERFACE.create_speaker(
            audio_path,
            reference_text
        )

        # Generate speech with the cloned voice
        output = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_lenght=4096  # kept as-is; the OuteTTS 0.1 interface appears to use this spelling
        )

        # Save to a temporary file and return its path
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        output.save(temp_file.name)
        return temp_file.name, f"Voice cloning successful!\nReference text used: {reference_text}"
    except Exception as e:
        return None, f"Error: {str(e)}"
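
# Example (a sketch for debugging outside the UI; "reference.wav" is a placeholder path,
# not a file shipped with this Space):
#
#     wav_path, status = process_audio_file(
#         audio_path="reference.wav",                    # short, clean reference clip
#         reference_text="",                             # empty -> auto-transcribed above
#         text_to_speak="Hello from the cloned voice.",
#     )
#     print(status, wav_path)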

# Create the Gradio interface
with gr.Blocks(title="Voice Cloning with OuteTTS") as demo:
    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS")
    gr.Markdown("""
    This app uses OuteTTS to clone voices. Upload a reference audio file, provide the text spoken in that audio
    (or leave it blank for automatic transcription), and enter the new text you want spoken in the cloned voice.

    Note: For best results, use clear audio with minimal background noise.
    """)

    with gr.Row():
        with gr.Column():
            # Input components
            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            reference_text = gr.Textbox(
                label="Reference Text (what is said in the audio; leave blank for auto-transcription)",
                placeholder="Leave empty to auto-transcribe, or enter the exact text from the reference audio"
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say)",
                placeholder="Enter the text you want the cloned voice to speak"
            )
            with gr.Row():
                temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1,
                                        label="Temperature (higher = more variation)")
                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.1,
                                               label="Repetition Penalty")

            # Submit button
            submit_btn = gr.Button("Generate Voice", variant="primary")

        with gr.Column():
            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", max_lines=3)

    # Handle submission
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message]
    )
gr.Markdown(""" | |
### Tips for best results: | |
1. Use high-quality reference audio (clear speech, minimal background noise) | |
2. If providing reference text manually, ensure it matches the audio exactly | |
3. If using auto-transcription, verify the transcribed text in the status message | |
4. Keep generated text relatively short for better quality | |
5. Adjust temperature and repetition penalty if needed: | |
- Lower temperature (0.1-0.3) for more consistent output | |
- Higher repetition penalty (1.1-1.3) to avoid repetition | |
""") | |

if __name__ == "__main__":
    demo.launch()
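
# Deployment note (an assumption, not part of the original app): on a shared CPU Space it
# can help to serialize requests so concurrent generations don't exhaust memory. Gradio
# supports this via its request queue, e.g.:
#
#     demo.queue(max_size=4)
#     demo.launch()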