shukdevdatta123 committed on
Commit
19f7938
·
verified ·
1 Parent(s): 8d0ba45

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -44
app.py CHANGED
@@ -6,25 +6,23 @@ import random
6
  import torch
7
  import openai
8
 
9
- # Check if running in a duplicate space
10
  IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
11
  CHAR_LIMIT = None if IS_DUPLICATE else 5000
12
 
13
- # Check if CUDA is available
14
  CUDA_AVAILABLE = torch.cuda.is_available()
15
-
16
- # Load the models (GPU and CPU versions)
17
  models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
18
  pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'abefhijpz'}
19
  pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
20
  pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
21
 
22
- # GPU function to generate audio
 
 
 
23
  @spaces.GPU(duration=10)
24
  def forward_gpu(ps, ref_s, speed):
25
  return models[True](ps, ref_s, speed)
26
 
27
- # Function to generate first output
28
  def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
29
  text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
30
  pipeline = pipelines[voice[0]]
@@ -47,17 +45,10 @@ def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
47
  return (24000, audio.numpy()), ps
48
  return None, ''
49
 
50
- # Function to tokenize first
51
  def tokenize_first(text, voice='af_heart'):
52
  words = text.split() # This splits the text into words based on spaces
53
  return words # Return a list of words
54
 
55
- # Function to get random text for the "Random Text" button
56
- random_texts = {}
57
- for lang in ['en']:
58
- with open(f'{lang}.txt', 'r') as r:
59
- random_texts[lang] = [line.strip() for line in r]
60
-
61
  def get_random_text(voice):
62
  lang = dict(a='en', b='en')[voice[0]]
63
  return random.choice(random_texts[lang])
@@ -81,7 +72,12 @@ def translate_and_generate(text, voice, speed):
81
  audio, tokens = generate_first(translated_text, voice, speed, use_gpu=CUDA_AVAILABLE)
82
  return audio, tokens, translated_text
83
 
84
- # Predefined voices for the dropdown menu
 
 
 
 
 
85
  CHOICES = {
86
  '🇺🇸 🚺 Heart ❤️': 'af_heart',
87
  '🇺🇸 🚺 Bella 🔥': 'af_bella',
@@ -94,7 +90,7 @@ CHOICES = {
94
  '🇺🇸 🚺 Alloy': 'af_alloy',
95
  '🇺🇸 🚺 Jessica': 'af_jessica',
96
  '🇺🇸 🚺 River': 'af_river',
97
-
98
  '🇺🇸 🚹 Michael': 'am_michael',
99
  '🇺🇸 🚹 Fenrir': 'am_fenrir',
100
  '🇺🇸 🚹 Puck': 'am_puck',
@@ -104,62 +100,62 @@ CHOICES = {
104
  '🇺🇸 🚹 Onyx': 'am_onyx',
105
  '🇺🇸 🚹 Santa': 'am_santa',
106
  '🇺🇸 🚹 Adam': 'am_adam',
107
-
108
  '🇬🇧 🚺 Emma': 'bf_emma',
109
  '🇬🇧 🚺 Isabella': 'bf_isabella',
110
  '🇬🇧 🚺 Alice': 'bf_alice',
111
  '🇬🇧 🚺 Lily': 'bf_lily',
112
-
113
  '🇬🇧 🚹 George': 'bm_george',
114
  '🇬🇧 🚹 Fable': 'bm_fable',
115
  '🇬🇧 🚹 Lewis': 'bm_lewis',
116
  '🇬🇧 🚹 Daniel': 'bm_daniel',
117
-
118
  '🇪🇸 🚺 Dora': 'ef_dora',
119
-
120
  '🇪🇸 🚹 Alex': 'em_alex',
121
  '🇪🇸 🚹 Santa': 'em_santa',
122
-
123
  '🇫🇷 🚺 Siwis': 'ff_siwis',
124
-
125
  '🇮🇳 🚹 Alpha': 'hf_alpha',
126
  '🇮🇳 🚹 Beta': 'hf_beta',
127
-
128
  '🇮🇳 🚹 Omega': 'hm_omega',
129
  '🇮🇳 🚹 Psi': 'hm_psi',
130
-
131
  '🇮🇹 🚺 Sara': 'if_sara',
132
-
133
  '🇮🇹 🚺 Nicola': 'im_nicola',
134
-
135
  '🇯🇵 🚹 Alpha': 'jf_alpha',
136
  '🇯🇵 🚹 Gongitsune': 'jf_gongitsune',
137
  '🇯🇵 🚹 Nezumi': 'jf_nezumi',
138
  '🇯🇵 🚹 Tebukuro': 'jf_tebukuro',
139
-
140
  '🇯🇵 🚹 Kumo': 'jm_kumo',
141
-
142
  '🇧🇷 🚺 Dora': 'pf_dora',
143
-
144
  '🇧🇷 🚹 Alex': 'pm_alex',
145
  '🇧🇷 🚹 Santa': 'pm_santa',
146
-
147
  '🇨🇳 🚺 Xiaobei': 'zf_xiaobei',
148
  '🇨🇳 🚺 Xiaoni': 'zf_xiaoni',
149
  '🇨🇳 🚺 Xiaoxiao': 'zf_xiaoxiao',
150
  '🇨🇳 🚺 Xiaoyi': 'zf_xiaoyi',
151
-
152
  '🇨🇳 🚹 Yunjian': 'zm_yunjian',
153
  '🇨🇳 🚹 Yunxi': 'zm_yunxi',
154
  '🇨🇳 🚹 Yunxia': 'zm_yunxia',
155
  '🇨🇳 🚹 Yunyang': 'zm_yunyang',
 
156
  }
157
 
158
- # Load voices
159
  for v in CHOICES.values():
160
  pipelines[v[0]].load_voice(v)
161
 
162
- # Build the interface
163
  with gr.Blocks() as generate_tab:
164
  out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
165
  generate_btn = gr.Button('Generate', variant='primary')
@@ -168,21 +164,22 @@ with gr.Blocks() as generate_tab:
168
  tokenize_btn = gr.Button('Tokenize', variant='secondary')
169
  predict_btn = gr.Button('Predict', variant='secondary', visible=False)
170
 
171
- # Translator Tab
172
  with gr.Blocks() as translator_tab:
173
  trans_out_audio = gr.Audio(label='Translated Audio Output', interactive=False, streaming=False, autoplay=True)
174
  trans_out_tokens = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the translated audio')
175
  translate_btn = gr.Button('Translate & Generate Audio', variant='primary')
176
 
177
- translate_btn.click(fn=translate_and_generate, inputs=[text, voice, speed], outputs=[trans_out_audio, trans_out_tokens, text], api_name=None)
 
 
 
 
 
 
178
 
179
- # Main Interface
180
  with gr.Blocks() as app:
181
  with gr.Row():
182
- gr.Markdown('''[***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)
183
- As of January 31st, 2025, Kokoro was the most-liked [**TTS model**](https://huggingface.co/models?pipeline_tag=text-to-speech&sort=likes) and the most-liked [**TTS space**](https://huggingface.co/spaces?sort=likes&search=tts) on Hugging Face.
184
- This demo only showcases English, but you can directly use the model to access other languages.''', container=True)
185
-
186
  with gr.Row():
187
  with gr.Column():
188
  text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream")
@@ -200,10 +197,10 @@ with gr.Blocks() as app:
200
  with gr.Column():
201
  gr.TabbedInterface([generate_tab, translator_tab], ['Generate', 'Translator'])
202
 
203
- random_btn.click(fn=get_random_text, inputs=[voice], outputs=[text])
204
- generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps])
205
- tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
206
- predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio])
207
 
208
  if __name__ == '__main__':
209
- app.queue().launch(show_api=True)
 
6
  import torch
7
  import openai
8
 
 
9
  IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
10
  CHAR_LIMIT = None if IS_DUPLICATE else 5000
11
 
 
12
  CUDA_AVAILABLE = torch.cuda.is_available()
 
 
13
  models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
14
  pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'abefhijpz'}
15
  pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
16
  pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
17
 
18
+ # Check API status
19
+ API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
20
+ API_NAME = None if API_OPEN else False
21
+
22
  @spaces.GPU(duration=10)
23
  def forward_gpu(ps, ref_s, speed):
24
  return models[True](ps, ref_s, speed)
25
 
 
26
  def generate_first(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
27
  text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
28
  pipeline = pipelines[voice[0]]
 
45
  return (24000, audio.numpy()), ps
46
  return None, ''
47
 
 
48
  def tokenize_first(text, voice='af_heart'):
49
  words = text.split() # This splits the text into words based on spaces
50
  return words # Return a list of words
51
 
 
 
 
 
 
 
52
  def get_random_text(voice):
53
  lang = dict(a='en', b='en')[voice[0]]
54
  return random.choice(random_texts[lang])
 
72
  audio, tokens = generate_first(translated_text, voice, speed, use_gpu=CUDA_AVAILABLE)
73
  return audio, tokens, translated_text
74
 
75
+ # Load random text for "Random Text" button
76
+ random_texts = {}
77
+ for lang in ['en']:
78
+ with open(f'{lang}.txt', 'r') as r:
79
+ random_texts[lang] = [line.strip() for line in r]
80
+
81
  CHOICES = {
82
  '🇺🇸 🚺 Heart ❤️': 'af_heart',
83
  '🇺🇸 🚺 Bella 🔥': 'af_bella',
 
90
  '🇺🇸 🚺 Alloy': 'af_alloy',
91
  '🇺🇸 🚺 Jessica': 'af_jessica',
92
  '🇺🇸 🚺 River': 'af_river',
93
+
94
  '🇺🇸 🚹 Michael': 'am_michael',
95
  '🇺🇸 🚹 Fenrir': 'am_fenrir',
96
  '🇺🇸 🚹 Puck': 'am_puck',
 
100
  '🇺🇸 🚹 Onyx': 'am_onyx',
101
  '🇺🇸 🚹 Santa': 'am_santa',
102
  '🇺🇸 🚹 Adam': 'am_adam',
103
+
104
  '🇬🇧 🚺 Emma': 'bf_emma',
105
  '🇬🇧 🚺 Isabella': 'bf_isabella',
106
  '🇬🇧 🚺 Alice': 'bf_alice',
107
  '🇬🇧 🚺 Lily': 'bf_lily',
108
+
109
  '🇬🇧 🚹 George': 'bm_george',
110
  '🇬🇧 🚹 Fable': 'bm_fable',
111
  '🇬🇧 🚹 Lewis': 'bm_lewis',
112
  '🇬🇧 🚹 Daniel': 'bm_daniel',
113
+
114
  '🇪🇸 🚺 Dora': 'ef_dora',
115
+
116
  '🇪🇸 🚹 Alex': 'em_alex',
117
  '🇪🇸 🚹 Santa': 'em_santa',
118
+
119
  '🇫🇷 🚺 Siwis': 'ff_siwis',
120
+
121
  '🇮🇳 🚹 Alpha': 'hf_alpha',
122
  '🇮🇳 🚹 Beta': 'hf_beta',
123
+
124
  '🇮🇳 🚹 Omega': 'hm_omega',
125
  '🇮🇳 🚹 Psi': 'hm_psi',
126
+
127
  '🇮🇹 🚺 Sara': 'if_sara',
128
+
129
  '🇮🇹 🚺 Nicola': 'im_nicola',
130
+
131
  '🇯🇵 🚹 Alpha': 'jf_alpha',
132
  '🇯🇵 🚹 Gongitsune': 'jf_gongitsune',
133
  '🇯🇵 🚹 Nezumi': 'jf_nezumi',
134
  '🇯🇵 🚹 Tebukuro': 'jf_tebukuro',
135
+
136
  '🇯🇵 🚹 Kumo': 'jm_kumo',
137
+
138
  '🇧🇷 🚺 Dora': 'pf_dora',
139
+
140
  '🇧🇷 🚹 Alex': 'pm_alex',
141
  '🇧🇷 🚹 Santa': 'pm_santa',
142
+
143
  '🇨🇳 🚺 Xiaobei': 'zf_xiaobei',
144
  '🇨🇳 🚺 Xiaoni': 'zf_xiaoni',
145
  '🇨🇳 🚺 Xiaoxiao': 'zf_xiaoxiao',
146
  '🇨🇳 🚺 Xiaoyi': 'zf_xiaoyi',
147
+
148
  '🇨🇳 🚹 Yunjian': 'zm_yunjian',
149
  '🇨🇳 🚹 Yunxi': 'zm_yunxi',
150
  '🇨🇳 🚹 Yunxia': 'zm_yunxia',
151
  '🇨🇳 🚹 Yunyang': 'zm_yunyang',
152
+ # (All the voice options here... same as before)
153
  }
154
 
 
155
  for v in CHOICES.values():
156
  pipelines[v[0]].load_voice(v)
157
 
158
+ # Tabs for generation and translation
159
  with gr.Blocks() as generate_tab:
160
  out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
161
  generate_btn = gr.Button('Generate', variant='primary')
 
164
  tokenize_btn = gr.Button('Tokenize', variant='secondary')
165
  predict_btn = gr.Button('Predict', variant='secondary', visible=False)
166
 
 
167
  with gr.Blocks() as translator_tab:
168
  trans_out_audio = gr.Audio(label='Translated Audio Output', interactive=False, streaming=False, autoplay=True)
169
  trans_out_tokens = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the translated audio')
170
  translate_btn = gr.Button('Translate & Generate Audio', variant='primary')
171
 
172
+ translate_btn.click(fn=translate_and_generate, inputs=[text, voice, speed], outputs=[trans_out_audio, trans_out_tokens, text], api_name=API_NAME)
173
+
174
+ BANNER_TEXT = '''
175
+ [***Kokoro*** **is an open-weight TTS model with 82 million parameters.**](https://huggingface.co/hexgrad/Kokoro-82M)
176
+ As of January 31st, 2025, Kokoro was the most-liked [**TTS model**](https://huggingface.co/models?pipeline_tag=text-to-speech&sort=likes) and the most-liked [**TTS space**](https://huggingface.co/spaces?sort=likes&search=tts) on Hugging Face.
177
+ This demo only showcases English, but you can directly use the model to access other languages.
178
+ '''
179
 
 
180
  with gr.Blocks() as app:
181
  with gr.Row():
182
+ gr.Markdown(BANNER_TEXT, container=True)
 
 
 
183
  with gr.Row():
184
  with gr.Column():
185
  text = gr.Textbox(label='Input Text', info=f"Up to ~500 characters per Generate, or {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} characters per Stream")
 
197
  with gr.Column():
198
  gr.TabbedInterface([generate_tab, translator_tab], ['Generate', 'Translator'])
199
 
200
+ random_btn.click(fn=get_random_text, inputs=[voice], outputs=[text], api_name=API_NAME)
201
+ generate_btn.click(fn=generate_first, inputs=[text, voice, speed, use_gpu], outputs=[out_audio, out_ps], api_name=API_NAME)
202
+ tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps], api_name=API_NAME)
203
+ predict_btn.click(fn=predict, inputs=[text, voice, speed], outputs=[out_audio], api_name=API_NAME)
204
 
205
  if __name__ == '__main__':
206
+ app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)