# Spaces: Runtime error
# (page status text captured together with the source; not part of the program)
import os
import tempfile

import gradio as gr
import huggingface_hub
import soundfile as sf
import torch
from faster_whisper import WhisperModel
from outetts.v0_1.interface import InterfaceGGUF
def download_model():
    """Fetch the quantized OuteTTS GGUF weights from the Hugging Face Hub.

    Returns:
        The value returned by ``huggingface_hub.hf_hub_download`` for
        ``OuteTTS-0.1-350M-Q6_K.gguf`` in the ``OuteAI/OuteTTS-0.1-350M-GGUF``
        repository (used by the caller as the model file path).
    """
    return huggingface_hub.hf_hub_download(
        repo_id="OuteAI/OuteTTS-0.1-350M-GGUF",
        filename="OuteTTS-0.1-350M-Q6_K.gguf",
    )
def initialize_models():
    """Build the text-to-speech interface and the speech-recognition model.

    Returns:
        A ``(tts_interface, asr_model)`` tuple: an ``InterfaceGGUF`` created
        from the downloaded GGUF file, and a "tiny" ``WhisperModel``
        configured for int8 computation on the CPU with one worker and one
        CPU thread.
    """
    # Keyword settings for the Whisper model, kept together for readability.
    whisper_options = {
        "device": "cpu",
        "compute_type": "int8",
        "num_workers": 1,
        "cpu_threads": 1,
    }

    # The GGUF file is fetched first, then handed to the TTS interface.
    gguf_path = download_model()
    return InterfaceGGUF(gguf_path), WhisperModel("tiny", **whisper_options)
# Load both models once at import time so that every request handled by the
# app reuses the same instances instead of reloading them.
(TTS_INTERFACE, ASR_MODEL) = initialize_models()
def transcribe_audio(audio_path):
    """Return the text that the Faster-Whisper model recognizes in a file.

    The texts of all returned segments are joined with single spaces and the
    result is stripped of surrounding whitespace.

    Args:
        audio_path: Audio input forwarded unchanged to ``ASR_MODEL.transcribe``.

    Returns:
        The transcript, or a string starting with
        ``"Error transcribing audio: "`` when any exception occurs (errors are
        reported through the return value, never raised).
    """
    # NOTE(review): temperature=1.0 together with beam_size=1 / best_of=1
    # looks like it makes decoding sample instead of taking the most likely
    # token - confirm this is intended.
    decode_options = {
        "beam_size": 1,
        "best_of": 1,
        "temperature": 1.0,
        "condition_on_previous_text": False,
        "compression_ratio_threshold": 2.4,
        "log_prob_threshold": -1.0,
        "no_speech_threshold": 0.6,
    }
    try:
        segments, _info = ASR_MODEL.transcribe(audio_path, **decode_options)
        # Joining stays inside the try block: iterating the segments is part
        # of the work that may fail.
        joined = " ".join(segment.text for segment in segments)
        return joined.strip()
    except Exception as exc:
        return f"Error transcribing audio: {exc}"
def process_audio_file(
    audio_path,
    reference_text,
    text_to_speak,
    temperature=0.1,
    repetition_penalty=1.1,
):
    """Clone the voice in a reference recording and speak new text with it.

    Args:
        audio_path: Path to the reference audio file; falsy when nothing has
            been uploaded.
        reference_text: Transcript of the reference audio. When it is empty,
            whitespace-only or ``None`` the transcript is obtained with
            ``transcribe_audio``. Only the first 4000 characters are used.
        text_to_speak: Text to synthesize; only the first 500 characters are
            used.
        temperature: Passed through to ``TTS_INTERFACE.generate``.
        repetition_penalty: Passed through to ``TTS_INTERFACE.generate``.

    Returns:
        ``(wav_path, status_message)`` on success, where ``wav_path`` names a
        temporary ``.wav`` file that is not deleted automatically, or
        ``(None, message)`` when an input is missing or any step fails.
        Exceptions are reported through the message, never raised.
    """
    # Validate inputs up front so the user gets an actionable message instead
    # of whatever exception the models would raise on missing data.
    if not audio_path:
        return None, "Error: Please upload a reference audio file first."
    if not text_to_speak or not text_to_speak.strip():
        return None, "Error: Please enter the text to speak."

    try:
        # Tolerate None from the caller; treat it like an empty text box.
        reference_text = reference_text or ""

        # If no reference text was provided, transcribe the audio.
        if not reference_text.strip():
            gr.Info("Transcribing audio...")
            reference_text = transcribe_audio(audio_path)
            # transcribe_audio reports failures as a string with this exact
            # prefix. Matching the full prefix (not just "Error") keeps a
            # genuine transcript that starts with "Error" from being
            # mistaken for a failure.
            if reference_text.startswith("Error transcribing audio:"):
                return None, reference_text
            if not reference_text:
                return None, (
                    "Error: No speech could be transcribed from the reference "
                    "audio. Please enter the reference text manually."
                )
            gr.Info(f"Using reference text: {reference_text}")

        # Create the speaker profile from the reference audio.
        speaker = TTS_INTERFACE.create_speaker(
            audio_path,
            reference_text[:4000],  # Limit reference text length
        )

        # Generate speech with the cloned voice.
        output = TTS_INTERFACE.generate(
            text=text_to_speak[:500],  # Limit output text length
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            # NOTE(review): "max_lenght" is left exactly as the original call
            # spelled it; it appears to match the keyword of the outetts v0.1
            # API - confirm before renaming. Reduced from 4096 to avoid errors.
            max_lenght=2048,
        )

        # Reserve a temporary file name and close the handle before saving,
        # so no file descriptor is leaked and the audio writer is not asked
        # to write to a file that is still open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        output.save(output_path)

        status = (
            "Processing complete!\n"
            f"Reference text: {reference_text[:500]}...\n"
            "(Showing first 500 characters of reference text)"
        )
        return output_path, status
    except Exception as e:
        # UI boundary: report every failure in the status box instead of
        # crashing the request.
        return None, f"Error: {e}"
# Create the Gradio interface: inputs in the left column, results in the
# right column, with usage tips underneath.
with gr.Blocks(title="Voice Cloning with OuteTTS (GGUF)") as demo:
    gr.Markdown("# 🎙️ Voice Cloning with OuteTTS (GGUF)")
    gr.Markdown("""
    This app uses the GGUF version of OuteTTS for optimized CPU performance. Upload a reference audio file,
    provide the text being spoken in that audio (or leave blank for automatic transcription),
    and enter the new text you want to be spoken in the cloned voice.
    Note:
    - For best results, use clear audio with minimal background noise
    - Reference text is limited to 4000 characters
    - Output text is limited to 500 characters
    """)

    with gr.Row():
        with gr.Column():
            # Input components
            audio_input = gr.Audio(label="Upload Reference Audio", type="filepath")
            with gr.Row():
                transcribe_btn = gr.Button("📝 Transcribe Audio", variant="secondary")
            reference_text = gr.Textbox(
                label="Reference Text (what is being said in the audio, leave blank for auto-transcription)",
                placeholder="Click 'Transcribe Audio' or enter the exact text from the reference audio",
                lines=3,
            )
            text_to_speak = gr.Textbox(
                label="Text to Speak (what you want the cloned voice to say, max 500 characters)",
                placeholder="Enter the text you want the cloned voice to speak",
                lines=3,
                max_lines=5,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.1,
                    step=0.1,
                    label="Temperature (higher = more variation)",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty",
                )
            # Submit button
            submit_btn = gr.Button("🎙️ Generate Voice", variant="primary")
        with gr.Column():
            # Output components
            output_audio = gr.Audio(label="Generated Speech")
            output_message = gr.Textbox(label="Status", lines=4)

    # Handle the transcription button
    def transcribe_button(audio):
        """Transcribe the uploaded reference audio for the reference-text box.

        Problems are raised as ``gr.Error`` instead of being returned, so a
        status or error message is never written into the reference-text
        field, where it would be used as the transcript on the next
        generation.

        Args:
            audio: File path supplied by the audio input; falsy when nothing
                has been uploaded.

        Returns:
            The transcript produced by ``transcribe_audio``.

        Raises:
            gr.Error: If no audio was uploaded or transcription failed.
        """
        if not audio:
            raise gr.Error("Please upload audio first.")
        transcript = transcribe_audio(audio)
        # transcribe_audio reports failures as a string with this prefix.
        if transcript.startswith("Error transcribing audio:"):
            raise gr.Error(transcript)
        return transcript

    transcribe_btn.click(
        fn=transcribe_button,
        inputs=[audio_input],
        outputs=[reference_text],
    )

    # Handle main generation
    submit_btn.click(
        fn=process_audio_file,
        inputs=[audio_input, reference_text, text_to_speak, temperature, repetition_penalty],
        outputs=[output_audio, output_message],
    )

    gr.Markdown("""
    ### Tips for best results:
    1. Use high-quality reference audio (clear speech, minimal background noise)
    2. Try to keep reference audio under 30 seconds
    3. If auto-transcription isn't accurate, you can manually correct the text
    4. Keep generated text short for better quality
    5. Adjust temperature and repetition penalty if needed:
    - Lower temperature (0.1-0.3) for more consistent output
    - Higher repetition penalty (1.1-1.3) to avoid repetition
    """)
# Start the Gradio server only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    demo.launch()