Spaces:
Runtime error
Runtime error
import gradio as gr | |
import torch | |
from outetts.v0_1.interface import InterfaceGGUF | |
import soundfile as sf | |
import tempfile | |
import os | |
from faster_whisper import WhisperModel | |
import huggingface_hub | |
def download_model(
    repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
    filename="OuteTTS-0.1-350M-Q6_K.gguf",
):
    """Download a GGUF model file from the Hugging Face Hub.

    Args:
        repo_id: Hub repository that hosts the model file. Defaults to the
            OuteTTS 0.1 350M GGUF repository this app was written for.
        filename: Name of the file to fetch from ``repo_id``. Defaults to the
            Q6_K quantisation.

    Returns:
        Whatever ``huggingface_hub.hf_hub_download`` returns for the file
        (the value is passed straight to ``InterfaceGGUF`` by the caller).
    """
    return huggingface_hub.hf_hub_download(repo_id=repo_id, filename=filename)
def initialize_models():
    """Create the OuteTTS interface and the Faster-Whisper recogniser.

    Returns:
        A ``(tts_interface, asr_model)`` tuple.
    """
    # Settings for the "tiny" Whisper model, kept together for readability.
    whisper_options = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 1,
    }

    # The TTS side is built first: fetch the GGUF file, then wrap it.
    gguf_file = download_model()
    synthesiser = InterfaceGGUF(gguf_file)

    recogniser = WhisperModel("tiny", **whisper_options)
    return synthesiser, recogniser
# Load both models once at import time so request handlers can reuse them.
# A failure is reported on stdout and then re-raised, so the app does not
# start without its models.
try:
    TTS_INTERFACE, ASR_MODEL = initialize_models()
except Exception as init_error:
    print(f"Error initializing models: {init_error}")
    raise
def process_audio_file(audio_path, reference_text, text_to_speak, temperature=0.1, repetition_penalty=1.1):
    """Generate speech for ``text_to_speak`` in the voice of a reference recording.

    Args:
        audio_path: Path of the reference recording.
        reference_text: Transcript of the reference recording. When blank or
            ``None`` the recording is transcribed with ``transcribe_audio``.
            Only the first 2000 characters are used.
        text_to_speak: Text to synthesise. Only the first 300 characters are
            used.
        temperature: Forwarded to ``TTS_INTERFACE.generate``.
        repetition_penalty: Forwarded to ``TTS_INTERFACE.generate``.

    Returns:
        ``(wav_path, status_message)``. On any failure ``wav_path`` is ``None``
        and the message starts with ``"Error"``; exceptions are reported
        through the message rather than raised.
    """
    # Reject unusable input before any model work is started.
    if not audio_path:
        return None, "Error: Please upload a reference audio file first."
    text_to_speak = (text_to_speak or "").strip()
    if not text_to_speak:
        return None, "Error: Please enter the text you want spoken."
    reference_text = (reference_text or "").strip()

    try:
        # If no reference text was provided, transcribe the audio.
        if not reference_text:
            gr.Info("Transcribing audio...")
            reference_text = (transcribe_audio(audio_path) or "").strip()
            # Match the exact failure prefix transcribe_audio emits, so a real
            # transcript that merely begins with "Error" is not rejected.
            if reference_text.startswith("Error transcribing audio:"):
                return None, reference_text
            if not reference_text:
                return None, (
                    "Error: No speech could be transcribed from the reference "
                    "audio. Please enter the reference text manually."
                )
            gr.Info(f"Using reference text: {reference_text}")

        # Limit text lengths to prevent context overflow.
        reference_text = reference_text[:2000]
        text_to_speak = text_to_speak[:300]

        # Create the speaker from the reference audio and its transcript.
        speaker = TTS_INTERFACE.create_speaker(audio_path, reference_text)

        # Generate speech with the cloned voice.
        output = TTS_INTERFACE.generate(
            text=text_to_speak,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # The spelling 'lenght' is deliberate: the original author notes it
            # is the keyword used by the library docs. Verify before renaming.
            max_lenght=2048,
        )

        # Reserve a temp file name and close the handle right away so it is
        # not leaked; the audio is then written to that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        output.save(output_path)

        # Only mention truncation when the preview really was truncated.
        preview = reference_text[:300]
        if len(reference_text) > 300:
            preview += "...\n(Showing first 300 characters of reference text)"
        return output_path, f"Processing complete!\nReference text: {preview}"
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of crashing.
        return None, f"Error: {str(e)}"
# Status strings returned by transcribe_audio in place of a transcript.
_NO_AUDIO_MESSAGE = "Please upload audio first."
_TRANSCRIBE_ERROR_PREFIX = "Error transcribing audio: "

_INTRO_MARKDOWN = """
This app uses the GGUF version of OuteTTS optimized for CPU performance. Upload a reference audio file,
provide the text being spoken in that audio (or leave blank for automatic transcription),
and enter the new text you want to be spoken in the cloned voice.

Note:

- For best results, use clear audio with minimal background noise
- Reference text is limited to 2000 characters
- Output text is limited to 300 characters
- Short inputs work best for quality results
"""

_TIPS_MARKDOWN = """
### Tips for best results:

1. Use clear, short audio samples (5-15 seconds is ideal)
2. Keep both reference and output text concise
3. Use lower temperature (0.1-0.2) for more stable output
4. Start with short phrases to test the voice
5. If generation fails, try:
   - Using shorter text
   - Reducing temperature
   - Using clearer audio
   - Simplifying the text
"""


def transcribe_audio(audio_path):
    """Transcribe audio using Faster-Whisper tiny.

    Args:
        audio_path: Path of the recording to transcribe.

    Returns:
        The transcript, limited to 2000 characters. Instead of raising, the
        function returns "Please upload audio first." when ``audio_path`` is
        empty, or a message starting with "Error transcribing audio: " when
        transcription fails.
    """
    if not audio_path:
        return _NO_AUDIO_MESSAGE
    try:
        segments, _ = ASR_MODEL.transcribe(
            audio_path,
            beam_size=1,
            best_of=1,
            # NOTE(review): a non-zero temperature presumably makes decoding
            # sample, so repeated runs may differ - confirm this is intended.
            temperature=1.0,
            condition_on_previous_text=False,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
        )
        # Strip each segment before joining: segment texts usually carry their
        # own leading space, which otherwise produces double spaces. The
        # segments are consumed inside the try so decoding errors are caught.
        pieces = (segment.text.strip() for segment in segments)
        text = " ".join(piece for piece in pieces if piece)
        return text[:2000]  # Limit transcription length
    except Exception as e:
        return f"{_TRANSCRIBE_ERROR_PREFIX}{e}"


def _transcribe_for_textbox(audio_path):
    """Button handler: put a transcript, never a status message, in the text box.

    Failure messages are shown as a warning toast and the box is cleared, so a
    later "Generate" does not use the message as the reference transcript.
    """
    result = transcribe_audio(audio_path)
    if result == _NO_AUDIO_MESSAGE or result.startswith(_TRANSCRIBE_ERROR_PREFIX):
        gr.Warning(result)
        return ""
    return result


# Create Gradio interface
# NOTE(review): the nesting of rows/columns below was reconstructed from a copy
# whose indentation had been flattened - confirm against the intended layout.
# NOTE(review): the emoji in the labels were mis-encoded in that copy and have
# been restored by best guess; the one on the Transcribe button could not be
# recovered at all and is a stand-in - confirm.
with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        with gr.Column():
            # Input components
            audio_input = gr.Audio(
                label="Upload Reference Audio",
                type="filepath",
                max_length=30,  # Limit audio length to 30 seconds
            )
            with gr.Row():
                transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
                lines=3,
                max_lines=5,
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say, max 300 characters)",
                placeholder="Enter the text you want the cloned voice to speak (keep it short for best results)",
                lines=3,
                max_lines=5,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=0.5,  # Reduced maximum temperature
                    value=0.1,
                    step=0.05,
                    label="Temperature (keep low for stability)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=1.3,  # Reduced maximum
                    value=1.1,
                    step=0.05,
                    label="Repetition Penalty",
                )
            # Submit button
            submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")

        with gr.Column():
            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", lines=4)
            # Warn about processing time
            gr.Markdown("⚠️ Note: Initial processing may take a few moments. Please be patient.")

    # Handle transcription button
    transcribe_btn.click(
        fn=_transcribe_for_textbox,
        inputs=[audio_input],
        outputs=[reference_text],
    )

    # Handle main generation
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message],
    )

    gr.Markdown(_TIPS_MARKDOWN)
def main():
    """Launch the Gradio app."""
    demo.launch()


# Start the server only when the file is run as a script.
if __name__ == "__main__":
    main()