Spaces:
Running
Running
| import os | |
| import gradio as gr | |
| import outetts | |
| from outetts.version.v2.interface import _DEFAULT_SPEAKERS | |
| import torch | |
| import spaces | |
def get_available_speakers():
    """Return the names of the bundled default speakers as a list.

    The names are the keys of ``_DEFAULT_SPEAKERS`` imported from the
    OuteTTS v2 interface module.
    """
    return list(_DEFAULT_SPEAKERS.keys())
# Interfaces already built in this process, keyed by model id, so the
# checkpoint is loaded once instead of on every request.
_INTERFACE_CACHE = {}


def _get_interface():
    """Return the shared OuteTTS interface, building it on first use."""
    model_id = "OuteAI/OuteTTS-0.3-1B"
    if model_id not in _INTERFACE_CACHE:
        model_config = outetts.HFModelConfig_v2(
            model_path=model_id,
            tokenizer_path=model_id,
            dtype=torch.bfloat16,
            device="cuda",
        )
        _INTERFACE_CACHE[model_id] = outetts.InterfaceHF(
            model_version="0.3", cfg=model_config
        )
    return _INTERFACE_CACHE[model_id]


def generate_tts(
    text, temperature, repetition_penalty,
    speaker_selection, reference_audio
):
    """Synthesize ``text`` to a WAV file and report any failure as a message.

    Args:
        text: The text to synthesize. Empty or whitespace-only text is
            rejected without touching the model.
        temperature: Sampling temperature forwarded to the generation config.
        repetition_penalty: Repetition penalty forwarded to the generation
            config.
        speaker_selection: Name of a default speaker, or a falsy value /
            the string ``"None"`` for no default speaker.
        reference_audio: Reference audio for a custom speaker; when truthy
            it takes priority over ``speaker_selection``.

    Returns:
        ``(output_path, None)`` on success, or ``(None, error_message)`` when
        validation, model loading, or generation fails.
    """
    if not isinstance(text, str) or not text.strip():
        return None, "Please enter some text to synthesize."

    try:
        # Loading happens inside the try so load failures are reported the
        # same way as generation failures instead of escaping to the caller.
        interface = _get_interface()

        if reference_audio:
            # A custom speaker overrides the selected default speaker.
            speaker = interface.create_speaker(reference_audio)
        elif speaker_selection and speaker_selection != "None":
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            # No speaker given: generate without a speaker profile.
            speaker = None

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=speaker,
        )
        output = interface.generate(config=gen_cfg)

        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        output_path = "output.wav"
        output.save(output_path)
        return output_path, None
    except Exception as e:
        # Some exceptions stringify to "", which would leave the caller with
        # no visible message; fall back to the exception type name.
        return None, str(e) or type(e).__name__
def _error_box_visibility(message):
    """Return an update that shows the error box only when it has content."""
    return gr.update(visible=bool(message))


# Build the demo UI: inputs in the left column, generated audio on the right.
with gr.Blocks() as demo:
    gr.Markdown("# OuteTTS-0.3-1B Text-to-Speech Demo")

    # Hidden until a generation attempt produces an error message.
    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        with gr.Column():
            # Speaker selection
            speaker_dropdown = gr.Dropdown(
                choices=get_available_speakers(),
                value="en_male_1",
                label="Speaker Selection",
            )
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                label="Temperature (lower = more stable tone, higher = more expressive)",
            )
            repetition_penalty = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                label="Repetition Penalty",
            )
            gr.Markdown("""
            ### Voice Cloning Guidelines:
            - Use around 7-10 seconds of clear, noise-free audio
            - For transcription interface will use Whisper turbo to transcribe the audio file
            - Longer audio clips will reduce maximum output length
            - Custom speaker overrides speaker selection
            """)
            reference_audio = gr.Audio(
                label="Reference Audio (for voice cloning)",
                type="filepath",
            )
            submit_button = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath",
            )

    # Run generation first, then toggle the error box based on what the
    # generation step wrote into it.
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_dropdown,
            reference_audio,
        ],
        outputs=[audio_output, error_box],
    ).then(
        fn=_error_box_visibility,
        inputs=[error_box],
        outputs=[error_box],
    )

# Only start the server when run as a script, so importing this module
# (for example from tooling or tests) does not launch the app.
if __name__ == "__main__":
    demo.launch()