shukdevdatta123 committed on
Commit
8d0ba45
·
verified ·
1 Parent(s): d98f3c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -110
app.py CHANGED
@@ -4,20 +4,27 @@ import gradio as gr
4
  import os
5
  import random
6
  import torch
 
7
 
 
8
  IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
9
  CHAR_LIMIT = None if IS_DUPLICATE else 5000
10
 
 
11
  CUDA_AVAILABLE = torch.cuda.is_available()
 
 
12
  models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
13
  pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'abefhijpz'}
14
  pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
15
  pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
16
 
 
17
  @spaces.GPU(duration=10)
18
  def forward_gpu(ps, ref_s, speed):
19
  return models[True](ps, ref_s, speed)
20
 
 
21
  def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
22
  text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
23
  pipeline = pipelines[voice[0]]
@@ -40,36 +47,12 @@ def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
40
  return (24000, audio.numpy()), ps
41
  return None, ''
42
 
43
- # Arena API
44
- def predict(text, voice='af_heart', speed=1):
45
- return generate_first(text, voice, speed, use_gpu=False)[0]
46
-
47
  def tokenize_first(text, voice='af_heart'):
48
- # Split the input text into words and return as a list of words (fix applied here)
49
  words = text.split() # This splits the text into words based on spaces
50
  return words # Return a list of words
51
 
52
- def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
53
- text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
54
- pipeline = pipelines[voice[0]]
55
- pack = pipeline.load_voice(voice)
56
- use_gpu = use_gpu and CUDA_AVAILABLE
57
- for _, ps, _ in pipeline(text, voice, speed):
58
- ref_s = pack[len(ps)-1]
59
- try:
60
- if use_gpu:
61
- audio = forward_gpu(ps, ref_s, speed)
62
- else:
63
- audio = models[False](ps, ref_s, speed)
64
- except gr.exceptions.Error as e:
65
- if use_gpu:
66
- gr.Warning(str(e))
67
- gr.Info('Switching to CPU')
68
- audio = models[False](ps, ref_s, speed)
69
- else:
70
- raise gr.Error(e)
71
- yield 24000, audio.numpy()
72
-
73
  random_texts = {}
74
  for lang in ['en']:
75
  with open(f'{lang}.txt', 'r') as r:
@@ -79,81 +62,104 @@ def get_random_text(voice):
79
  lang = dict(a='en', b='en')[voice[0]]
80
  return random.choice(random_texts[lang])
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  CHOICES = {
83
- '🇺🇸 🚺 Heart ❤️': 'af_heart',
84
- '🇺🇸 🚺 Bella 🔥': 'af_bella',
85
- '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
86
- '🇺🇸 🚺 Aoede': 'af_aoede',
87
- '🇺🇸 🚺 Kore': 'af_kore',
88
- '🇺🇸 🚺 Sarah': 'af_sarah',
89
- '🇺🇸 🚺 Nova': 'af_nova',
90
- '🇺🇸 🚺 Sky': 'af_sky',
91
- '🇺🇸 🚺 Alloy': 'af_alloy',
92
- '🇺🇸 🚺 Jessica': 'af_jessica',
93
- '🇺🇸 🚺 River': 'af_river',
94
-
95
- '🇺🇸 🚹 Michael': 'am_michael',
96
- '🇺🇸 🚹 Fenrir': 'am_fenrir',
97
- '🇺🇸 🚹 Puck': 'am_puck',
98
- '🇺🇸 🚹 Echo': 'am_echo',
99
- '🇺🇸 🚹 Eric': 'am_eric',
100
- '🇺🇸 🚹 Liam': 'am_liam',
101
- '🇺🇸 🚹 Onyx': 'am_onyx',
102
- '🇺🇸 🚹 Santa': 'am_santa',
103
- '🇺🇸 🚹 Adam': 'am_adam',
104
-
105
- '🇬🇧 🚺 Emma': 'bf_emma',
106
- '🇬🇧 🚺 Isabella': 'bf_isabella',
107
- '🇬🇧 🚺 Alice': 'bf_alice',
108
- '🇬🇧 🚺 Lily': 'bf_lily',
109
-
110
- '🇬🇧 🚹 George': 'bm_george',
111
- '🇬🇧 🚹 Fable': 'bm_fable',
112
- '🇬🇧 🚹 Lewis': 'bm_lewis',
113
- '🇬🇧 🚹 Daniel': 'bm_daniel',
114
-
115
- '🇪🇸 🚺 Dora': 'ef_dora',
116
-
117
- '🇪🇸 🚹 Alex': 'em_alex',
118
- '🇪🇸 🚹 Santa': 'em_santa',
119
-
120
- '🇫🇷 🚺 Siwis': 'ff_siwis',
121
-
122
- '🇮🇳 🚹 Alpha': 'hf_alpha',
123
- '🇮🇳 🚹 Beta': 'hf_beta',
124
-
125
- '🇮🇳 🚹 Omega': 'hm_omega',
126
- '🇮🇳 🚹 Psi': 'hm_psi',
127
-
128
- '🇮🇹 🚺 Sara': 'if_sara',
129
-
130
- '🇮🇹 🚺 Nicola': 'im_nicola',
131
-
132
- '🇯🇵 🚹 Alpha': 'jf_alpha',
133
- '🇯🇵 🚹 Gongitsune': 'jf_gongitsune',
134
- '🇯🇵 🚹 Nezumi': 'jf_nezumi',
135
- '🇯🇵 🚹 Tebukuro': 'jf_tebukuro',
136
-
137
- '🇯🇵 🚹 Kumo': 'jm_kumo',
138
-
139
- '🇧🇷 🚺 Dora': 'pf_dora',
140
-
141
- '🇧🇷 🚹 Alex': 'pm_alex',
142
- '🇧🇷 🚹 Santa': 'pm_santa',
143
-
144
- '🇨🇳 🚺 Xiaobei': 'zf_xiaobei',
145
- '🇨🇳 🚺 Xiaoni': 'zf_xiaoni',
146
- '🇨🇳 🚺 Xiaoxiao': 'zf_xiaoxiao',
147
- '🇨🇳 🚺 Xiaoyi': 'zf_xiaoyi',
148
-
149
- '🇨🇳 🚹 Yunjian': 'zm_yunjian',
150
- '🇨🇳 🚹 Yunxi': 'zm_yunxi',
151
- '🇨🇳 🚹 Yunxia': 'zm_yunxia',
152
- '🇨🇳 🚹 Yunyang': 'zm_yunyang',
153
  }
 
 
154
  for v in CHOICES.values():
155
  pipelines[v[0]].load_voice(v)
156
 
 
157
  with gr.Blocks() as generate_tab:
158
  out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
159
  generate_btn = gr.Button('Generate', variant='primary')
@@ -162,17 +168,21 @@ with gr.Blocks() as generate_tab:
162
  tokenize_btn = gr.Button('Tokenize', variant='secondary')
163
  predict_btn = gr.Button('Predict', variant='secondary', visible=False)
164
 
165
- BANNER_TEXT = '''
166
- [***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)
167
- As of January 31st, 2025, Kokoro was the most-liked [**TTS model**](https://huggingface.co/models?pipeline_tag=text-to-speech&sort=likes) and the most-liked [**TTS space**](https://huggingface.co/spaces?sort=likes&search=tts) on Hugging Face.
168
- This demo only showcases English, but you can directly use the model to access other languages.
169
- '''
 
 
170
 
171
- API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
172
- API_NAME = None if API_OPEN else False
173
  with gr.Blocks() as app:
174
  with gr.Row():
175
- gr.Markdown(BANNER_TEXT, container=True)
 
 
 
176
  with gr.Row():
177
  with gr.Column():
178
  text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream")
@@ -188,11 +198,12 @@ with gr.Blocks() as app:
188
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
189
  random_btn = gr.Button('Random Text', variant='secondary')
190
  with gr.Column():
191
- gr.TabbedInterface([generate_tab], ['Generate'])
192
- random_btn.click(fn=get_random_text, inputs=[voice], outputs=[text], api_name=API_NAME)
193
- generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps], api_name=API_NAME)
194
- tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps], api_name=API_NAME)
195
- predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio], api_name=API_NAME)
 
196
 
197
  if __name__ == '__main__':
198
- app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)
 
4
  import os
5
  import random
6
  import torch
7
+ import openai
8
 
9
+ # Check if running in a duplicate space
10
  IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
11
  CHAR_LIMIT = None if IS_DUPLICATE else 5000
12
 
13
+ # Check if CUDA is available
14
  CUDA_AVAILABLE = torch.cuda.is_available()
15
+
16
+ # Load the models (GPU and CPU versions)
17
  models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
18
  pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'abefhijpz'}
19
  pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
20
  pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
21
 
22
+ # GPU function to generate audio
23
  @spaces.GPU(duration=10)
24
  def forward_gpu(ps, ref_s, speed):
25
  return models[True](ps, ref_s, speed)
26
 
27
+ # Function to generate first output
28
  def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
29
  text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
30
  pipeline = pipelines[voice[0]]
 
47
  return (24000, audio.numpy()), ps
48
  return None, ''
49
 
50
+ # Function to tokenize first
 
 
 
51
  def tokenize_first(text, voice='af_heart'):
 
52
  words = text.split() # This splits the text into words based on spaces
53
  return words # Return a list of words
54
 
55
+ # Function to get random text for the "Random Text" button
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  random_texts = {}
57
  for lang in ['en']:
58
  with open(f'{lang}.txt', 'r') as r:
 
62
  lang = dict(a='en', b='en')[voice[0]]
63
  return random.choice(random_texts[lang])
64
 
65
+ # OpenAI GPT-4 translation function
66
def translate_to_english(text, model="gpt-4"):
    """Translate ``text`` to English using an OpenAI chat model.

    Args:
        text: Source text in any language.
        model: Name of the OpenAI chat model to query.

    Returns:
        The English translation with surrounding whitespace removed, or an
        empty string when ``text`` is empty or whitespace-only (no request
        is sent in that case).

    Raises:
        gr.Error: If the request fails. The failure is raised instead of
            being returned as a string so the error message is never passed
            on to speech synthesis as if it were the translation.
    """
    if not text or not text.strip():
        return ''

    messages = [
        {'role': 'system', 'content': 'Translate the following text to English. Reply with the translation only.'},
        {'role': 'user', 'content': text},
    ]
    try:
        # "gpt-4" is a chat model, so it must go through the chat completions
        # endpoint rather than the legacy text completions endpoint.
        # openai>=1.0 exposes it as `openai.chat.completions`; older releases
        # expose it as `openai.ChatCompletion`.
        if hasattr(openai, 'chat'):
            create = openai.chat.completions.create
        else:
            create = openai.ChatCompletion.create
        response = create(
            model=model,
            messages=messages,
            temperature=0.5,
            max_tokens=500,
        )
    except Exception as e:
        raise gr.Error(f'Translation failed: {e}') from e

    # The message content can be None; treat that as an empty translation.
    return (response.choices[0].message.content or '').strip()
77
+
78
+ # Function to handle generation for translated text
79
def translate_and_generate(text, voice, speed):
    """Translate ``text`` to English, then synthesize the translation.

    The text is passed through ``translate_to_english`` and the result is
    handed to ``generate_first`` with the given ``voice`` and ``speed``;
    GPU use is requested whenever ``CUDA_AVAILABLE`` is true.

    Returns:
        A 3-tuple: the two values returned by ``generate_first`` followed
        by the translated text.
    """
    english = translate_to_english(text)
    generated = generate_first(english, voice, speed, use_gpu=CUDA_AVAILABLE)
    audio_out, tokens_out = generated
    return audio_out, tokens_out, english
83
+
84
+ # Predefined voices for the dropdown menu
85
  CHOICES = {
86
+ '🇺🇸 🚺 Heart ❤️': 'af_heart',
87
+ '🇺🇸 🚺 Bella 🔥': 'af_bella',
88
+ '🇺🇸 🚺 Nicole 🎧': 'af_nicole',
89
+ '🇺🇸 🚺 Aoede': 'af_aoede',
90
+ '🇺🇸 🚺 Kore': 'af_kore',
91
+ '🇺🇸 🚺 Sarah': 'af_sarah',
92
+ '🇺🇸 🚺 Nova': 'af_nova',
93
+ '🇺🇸 🚺 Sky': 'af_sky',
94
+ '🇺🇸 🚺 Alloy': 'af_alloy',
95
+ '🇺🇸 🚺 Jessica': 'af_jessica',
96
+ '🇺🇸 🚺 River': 'af_river',
97
+
98
+ '🇺🇸 🚹 Michael': 'am_michael',
99
+ '🇺🇸 🚹 Fenrir': 'am_fenrir',
100
+ '🇺🇸 🚹 Puck': 'am_puck',
101
+ '🇺🇸 🚹 Echo': 'am_echo',
102
+ '🇺🇸 🚹 Eric': 'am_eric',
103
+ '🇺🇸 🚹 Liam': 'am_liam',
104
+ '🇺🇸 🚹 Onyx': 'am_onyx',
105
+ '🇺🇸 🚹 Santa': 'am_santa',
106
+ '🇺🇸 🚹 Adam': 'am_adam',
107
+
108
+ '🇬🇧 🚺 Emma': 'bf_emma',
109
+ '🇬🇧 🚺 Isabella': 'bf_isabella',
110
+ '🇬🇧 🚺 Alice': 'bf_alice',
111
+ '🇬🇧 🚺 Lily': 'bf_lily',
112
+
113
+ '🇬🇧 🚹 George': 'bm_george',
114
+ '🇬🇧 🚹 Fable': 'bm_fable',
115
+ '🇬🇧 🚹 Lewis': 'bm_lewis',
116
+ '🇬🇧 🚹 Daniel': 'bm_daniel',
117
+
118
+ '🇪🇸 🚺 Dora': 'ef_dora',
119
+
120
+ '🇪🇸 🚹 Alex': 'em_alex',
121
+ '🇪🇸 🚹 Santa': 'em_santa',
122
+
123
+ '🇫🇷 🚺 Siwis': 'ff_siwis',
124
+
125
+ '🇮🇳 🚹 Alpha': 'hf_alpha',
126
+ '🇮🇳 🚹 Beta': 'hf_beta',
127
+
128
+ '🇮🇳 🚹 Omega': 'hm_omega',
129
+ '🇮🇳 🚹 Psi': 'hm_psi',
130
+
131
+ '🇮🇹 🚺 Sara': 'if_sara',
132
+
133
+ '🇮🇹 🚺 Nicola': 'im_nicola',
134
+
135
+ '🇯🇵 🚹 Alpha': 'jf_alpha',
136
+ '🇯🇵 🚹 Gongitsune': 'jf_gongitsune',
137
+ '🇯🇵 🚹 Nezumi': 'jf_nezumi',
138
+ '🇯🇵 🚹 Tebukuro': 'jf_tebukuro',
139
+
140
+ '🇯🇵 🚹 Kumo': 'jm_kumo',
141
+
142
+ '🇧🇷 🚺 Dora': 'pf_dora',
143
+
144
+ '🇧🇷 🚹 Alex': 'pm_alex',
145
+ '🇧🇷 🚹 Santa': 'pm_santa',
146
+
147
+ '🇨🇳 🚺 Xiaobei': 'zf_xiaobei',
148
+ '🇨🇳 🚺 Xiaoni': 'zf_xiaoni',
149
+ '🇨🇳 🚺 Xiaoxiao': 'zf_xiaoxiao',
150
+ '🇨🇳 🚺 Xiaoyi': 'zf_xiaoyi',
151
+
152
+ '🇨🇳 🚹 Yunjian': 'zm_yunjian',
153
+ '🇨🇳 🚹 Yunxi': 'zm_yunxi',
154
+ '🇨🇳 🚹 Yunxia': 'zm_yunxia',
155
+ '🇨🇳 🚹 Yunyang': 'zm_yunyang',
156
  }
157
+
158
+ # Load voices
159
  for v in CHOICES.values():
160
  pipelines[v[0]].load_voice(v)
161
 
162
+ # Build the interface
163
  with gr.Blocks() as generate_tab:
164
  out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
165
  generate_btn = gr.Button('Generate', variant='primary')
 
168
  tokenize_btn = gr.Button('Tokenize', variant='secondary')
169
  predict_btn = gr.Button('Predict', variant='secondary', visible=False)
170
 
171
+ # Translator Tab
172
with gr.Blocks() as translator_tab:
    # This tab owns its input widgets. The main interface's `text`, `voice`
    # and `speed` components are only created further down in the file, so
    # referencing them here would raise NameError at import time.
    trans_in_text = gr.Textbox(label='Text to Translate')
    trans_voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice')
    trans_speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
    translate_btn = gr.Button('Translate & Generate Audio', variant='primary')

    trans_out_text = gr.Textbox(label='English Translation', interactive=False)
    trans_out_audio = gr.Audio(label='Translated Audio Output', interactive=False, streaming=False, autoplay=True)
    trans_out_tokens = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the translated audio')

    # Output order matches translate_and_generate's return value:
    # (audio, tokens, translated text).
    translate_btn.click(
        fn=translate_and_generate,
        inputs=[trans_in_text, trans_voice, trans_speed],
        outputs=[trans_out_audio, trans_out_tokens, trans_out_text],
        api_name=None,
    )
178
 
179
+ # Main Interface
 
180
  with gr.Blocks() as app:
181
  with gr.Row():
182
+ gr.Markdown('''[***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)
183
+ As of January 31st, 2025, Kokoro was the most-liked [**TTS model**](https://huggingface.co/models?pipeline_tag=text-to-speech&sort=likes) and the most-liked [**TTS space**](https://huggingface.co/spaces?sort=likes&search=tts) on Hugging Face.
184
+ This demo only showcases English, but you can directly use the model to access other languages.''', container=True)
185
+
186
  with gr.Row():
187
  with gr.Column():
188
  text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream")
 
198
  speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
199
  random_btn = gr.Button('Random Text', variant='secondary')
200
  with gr.Column():
201
+ gr.TabbedInterface([generate_tab, translator_tab], ['Generate', 'Translator'])
202
+
203
+ random_btn.click(fn=get_random_text, inputs=[voice], outputs=[text])
204
+ generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps])
205
+ tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
206
+ predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
207
 
208
  if __name__ == '__main__':
209
+ app.queue().launch(show_api=True)