edwko committed on
Commit
9e95c6f
·
verified ·
1 Parent(s): 3525f81

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -0
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import outetts
4
+ from outetts.version.v2.interface import _DEFAULT_SPEAKERS
5
+
6
# The model weights and the tokenizer are loaded from the same repository id.
_MODEL_REPO = "OuteAI/OuteTTS-0.3-1B"

model_config = outetts.HFModelConfig_v2(
    model_path=_MODEL_REPO,
    tokenizer_path=_MODEL_REPO,
)

# Single shared interface used by every generation request in this app.
interface = outetts.InterfaceHF(model_version="0.3", cfg=model_config)
11
+
12
def get_available_speakers():
    """Return the names of the bundled default speakers as a list."""
    return [*_DEFAULT_SPEAKERS.keys()]
15
+
16
def generate_tts(
    text, temperature, repetition_penalty,
    speaker_selection, reference_audio
):
    """Synthesize speech for ``text`` and report the outcome to the UI.

    The speaker is chosen in priority order: a speaker created from
    ``reference_audio`` when one is supplied, otherwise the default speaker
    named by ``speaker_selection``, otherwise no speaker at all.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected
            before the model is invoked.
        temperature: Sampling temperature forwarded to the generation config.
        repetition_penalty: Repetition penalty forwarded to the generation
            config.
        speaker_selection: Name of a default speaker. A falsy value or the
            literal string ``"None"`` means no default speaker.
        reference_audio: Reference recording used to create a custom speaker,
            or a falsy value when none was provided.

    Returns:
        An ``(audio_path, error_message)`` pair. On success ``audio_path`` is
        the path of the saved WAV file and ``error_message`` is ``None``. On
        any failure ``audio_path`` is ``None`` and ``error_message`` is the
        text of the exception that was raised.
    """
    try:
        # Reject blank input up front so the model is never run on an empty
        # prompt; the message reaches the user through the normal error path.
        if not text or not text.strip():
            raise ValueError("Please enter some text to synthesize.")

        if reference_audio:
            # A custom reference recording takes precedence over the dropdown.
            speaker = interface.create_speaker(reference_audio)
        elif speaker_selection and speaker_selection != "None":
            # Use the selected default speaker.
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            # No speaker: per the original author's note, the voice then has
            # random characteristics.
            speaker = None

        gen_cfg = outetts.GenerationConfig(
            text=text,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
            speaker=speaker,
        )
        output = interface.generate(config=gen_cfg)

        # Verify output
        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # NOTE(review): the file name is fixed and relative to the working
        # directory, so overlapping requests would overwrite each other's
        # result -- confirm that requests are serialized by the server.
        output_path = "output.wav"
        output.save(output_path)
        return output_path, None

    except Exception as e:
        # UI boundary: surface every failure as a message instead of letting
        # the request crash.
        return None, str(e)
54
+
55
def _error_box_visibility(message):
    """Return an update that shows the error box only for a non-empty message."""
    return gr.update(visible=bool(message))


with gr.Blocks() as demo:
    gr.Markdown("# OuteTTS-0.3-1B Text-to-Speech Demo")

    # Starts hidden; its visibility is recomputed after every generation.
    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        # First column: every input that feeds the generation request.
        with gr.Column():
            speaker_dropdown = gr.Dropdown(
                label="Speaker Selection",
                choices=get_available_speakers(),
                value="en_male_1",
            )

            text_input = gr.Textbox(
                label="Text to Synthesize", placeholder="Enter text here..."
            )

            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                label="Temperature (lower = more stable tone, higher = more expressive)",
            )

            repetition_penalty = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                label="Repetition Penalty",
            )

            gr.Markdown("""
            ### Voice Cloning Guidelines:
            - Use around 7-10 seconds of clear, noise-free audio
            - For transcription interface will use Whisper turbo to transcribe the audio file
            - Longer audio clips will reduce maximum output length
            - Custom speaker overrides speaker selection
            """)

            # Delivered to the callback as a file path.
            reference_audio = gr.Audio(
                type="filepath", label="Reference Audio (for voice cloning)"
            )

            submit_button = gr.Button("Generate Speech")

        # Second column: the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(type="filepath", label="Generated Audio")

    # Step 1: run the synthesis and fill both the audio player and error box.
    generation = submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            speaker_dropdown,
            reference_audio,
        ],
        outputs=[audio_output, error_box],
    )
    # Step 2: reveal the error box only if step 1 put a message into it.
    generation.then(
        fn=_error_box_visibility,
        inputs=[error_box],
        outputs=[error_box],
    )

demo.launch()