rbgo committed on
Commit
21a3273
·
verified ·
1 Parent(s): 7743176

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -35
app.py CHANGED
@@ -208,10 +208,9 @@ def create_interface():
208
  gr.HTML("""
209
  <div id="intro-section">
210
  <h3>🔬 Our Exciting Quest</h3>
211
- <p>We're on a thrilling journey to help developers discover the perfect TTS models for their innovative audio projects!
212
- We've put these 12 cutting-edge models using the test prompts.</p>
213
 
214
- <p><strong>Featured TTS Engines:</strong></p>
215
  <ul>
216
  <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
217
  <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
@@ -226,20 +225,19 @@ def create_interface():
226
  <ol>
227
  <li><strong>Outstanding Speech Quality</strong><br>
228
  Several models—namely <strong>Kokoro-82M</strong>, <strong>csm-1b</strong>, <strong>Spark-TTS-0.5B</strong>,
229
- <strong>Orpheus-3b-0.1-ft</strong>, <strong>F5-TTS</strong>, and <strong>Llasa-3B</strong>—delivered exceptionally
230
  natural, clear, and realistic synthesized speech. Among these, <strong>csm-1b</strong> and <strong>F5-TTS</strong>
231
- stood out as the most well-rounded: they combined top-tier naturalness and intelligibility with solid controllability.
232
  </li>
233
  <li><strong>Superior Controllability</strong><br>
234
- <strong>Zonos-v0.1-transformer</strong> emerged as the leader in fine-grained control: it offers detailed
235
  adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise
236
  voice modulation.
237
  </li>
238
  <li><strong>Performance vs. Footprint Trade-off</strong><br>
239
- Smaller models (e.g., <strong>Kokoro-82M</strong> at 82 million parameters) can still achieve “Good” or
240
- “Excellent” ratings in many scenarios, especially when efficient inference or low VRAM usage is critical.
241
  Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual
242
- synthesis, zero-shot voice cloning, and multi-speaker generationbut require heavier compute resources.
243
  </li>
244
  <li><strong>Special Notes on Multilingual & Cloning Capabilities</strong><br>
245
  <strong>Spark-TTS-0.5B</strong> and <strong>XTTS-v2</strong> excel at cross-lingual and zero-shot voice
@@ -368,39 +366,39 @@ def create_interface():
368
  )
369
 
370
  # Methodology Section
371
- with gr.Accordion("📋 Detailed Evaluation Methodology", open=False):
372
- gr.Markdown("""
373
- ### Test Prompt
374
 
375
- `Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!`
376
 
377
 
378
- ### Model Evaluation Criteria:
379
 
380
- 🎭 **Naturalness (Human-like Quality)**
381
- - Prosody and rhythm patterns
382
- - Emotional expression capability
383
- - Voice texture and warmth
384
- - Natural breathing and pauses
385
 
386
- 🗣️ **Intelligibility (Clarity & Accuracy)**
387
- - Word pronunciation precision
388
- - Consonant and vowel clarity
389
- - Sentence comprehensibility
390
- - Technical term handling
391
 
392
- 🎛️ **Controllability (Flexibility)**
393
- - Parameter responsiveness
394
- - Tone modification capability
395
- - Speed and pitch control
396
- - Customization potential
397
 
398
- ### Key Insights:
399
- - Smaller models (82M-500M) can excel in specific scenarios
400
- - Larger models (1B-3B+) offer more versatility but require more resources
401
- - Architecture matters as much as parameter count
402
- - Training data quality significantly impacts output quality
403
- """)
404
 
405
  # Footer
406
  # gr.HTML("""
 
208
  gr.HTML("""
209
  <div id="intro-section">
210
  <h3>🔬 Our Exciting Quest</h3>
211
+ <p>We’re on a mission to help developers quickly find and compare the best open-source TTS models for their audio projects. In this gallery, you’ll find 12 state-of-the-art TTS models, each evaluated using a consistent test prompt to assess their synthesized speech.</p>
 
212
 
213
+ <p><strong>Featured TTS Models:</strong></p>
214
  <ul>
215
  <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
216
  <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
 
225
  <ol>
226
  <li><strong>Outstanding Speech Quality</strong><br>
227
  Several models—namely <strong>Kokoro-82M</strong>, <strong>csm-1b</strong>, <strong>Spark-TTS-0.5B</strong>,
228
+ <strong>Orpheus-3b-0.1-ft</strong>, <strong>F5-TTS</strong>, and <strong>Llasa-3B</strong>—delivered exceptionally
229
  natural, clear, and realistic synthesized speech. Among these, <strong>csm-1b</strong> and <strong>F5-TTS</strong>
230
+ stood out as the most well-rounded models, as they combined good synthesized speech with solid controllability.
231
  </li>
232
  <li><strong>Superior Controllability</strong><br>
233
+ <strong>Zonos-v0.1-transformer</strong> emerged as the best in fine-grained control: it offers detailed
234
  adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise
235
  voice modulation.
236
  </li>
237
  <li><strong>Performance vs. Footprint Trade-off</strong><br>
238
+ Smaller models (e.g., <strong>Kokoro-82M</strong> at 82 million parameters) can still excel in many scenarios, especially when efficient inference or low VRAM usage is critical.
 
239
  Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual
240
+ synthesis, zero-shot voice cloning, and multi-speaker generation—but require heavier compute resources.
241
  </li>
242
  <li><strong>Special Notes on Multilingual & Cloning Capabilities</strong><br>
243
  <strong>Spark-TTS-0.5B</strong> and <strong>XTTS-v2</strong> excel at cross-lingual and zero-shot voice
 
366
  )
367
 
368
  # Methodology Section
369
+ # with gr.Accordion("📋 Detailed Evaluation Methodology", open=False):
370
+ # gr.Markdown("""
371
+ # ### Test Prompt
372
 
373
+ # `Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!`
374
 
375
 
376
+ # ### Model Evaluation Criteria:
377
 
378
+ # 🎭 **Naturalness (Human-like Quality)**
379
+ # - Prosody and rhythm patterns
380
+ # - Emotional expression capability
381
+ # - Voice texture and warmth
382
+ # - Natural breathing and pauses
383
 
384
+ # 🗣️ **Intelligibility (Clarity & Accuracy)**
385
+ # - Word pronunciation precision
386
+ # - Consonant and vowel clarity
387
+ # - Sentence comprehensibility
388
+ # - Technical term handling
389
 
390
+ # 🎛️ **Controllability (Flexibility)**
391
+ # - Parameter responsiveness
392
+ # - Tone modification capability
393
+ # - Speed and pitch control
394
+ # - Customization potential
395
 
396
+ # ### Key Insights:
397
+ # - Smaller models (82M-500M) can excel in specific scenarios
398
+ # - Larger models (1B-3B+) offer more versatility but require more resources
399
+ # - Architecture matters as much as parameter count
400
+ # - Training data quality significantly impacts output quality
401
+ # """)
402
 
403
  # Footer
404
  # gr.HTML("""