# NOTE: removed non-Python residue ("Spaces: / Running / Running" — a Hugging
# Face Spaces status banner captured when this file was scraped); it was not
# valid Python and prevented the module from parsing.
# ---------------------------------------------------------------
# app.py – "TTS Showcase" (Gradio Implementation)
# ---------------------------------------------------------------
import os
from typing import Optional

import gradio as gr
# ---------- 1. Demo metadata ----------
# Maps Hugging Face repo id -> short display name shown in the UI.
MODELS = {
    "nari-labs/Dia-1.6B": "Dia-1.6B",
    "hexgrad/Kokoro-82M": "Kokoro-82M",
    "sesame/csm-1b": "csm-1b",
    "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B",
    "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft",
    "SWivid/F5-TTS": "F5-TTS",
    "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer",
    "coqui/XTTS-v2": "XTTS-v2",
    "HKUSTAudio/Llasa-3B": "Llasa-3B",
    "amphion/MaskGCT": "MaskGCT",
    "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B",
    "ByteDance/MegaTTS3": "MegaTTS3",
}
# Performance ratings for each model.
# Each entry rates three dimensions on the scale: "Excellent" / "Good" / "Moderate".
MODEL_RATINGS = {
    "nari-labs/Dia-1.6B": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Good"},
    "hexgrad/Kokoro-82M": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "sesame/csm-1b": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "SparkAudio/Spark-TTS-0.5B": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "canopylabs/orpheus-3b-0.1-ft": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
    "SWivid/F5-TTS": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
    "Zyphra/Zonos-v0.1-transformer": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Excellent"},
    "coqui/XTTS-v2": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "HKUSTAudio/Llasa-3B": {"naturalness": "Excellent", "intelligibility": "Good", "controllability": "Moderate"},
    "amphion/MaskGCT": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
    "OuteAI/Llama-OuteTTS-1.0-1B": {"naturalness": "Moderate", "intelligibility": "Moderate", "controllability": "Moderate"},
    "ByteDance/MegaTTS3": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"},
}
# One-line marketing blurb per model, rendered on its gallery card.
MODEL_DESCRIPTIONS = {
    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
    "amphion/MaskGCT": "Masked generative modeling approach",
    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
    "ByteDance/MegaTTS3": "Industrial-grade TTS solution",
}
# Folder that contains one subfolder per model with its audio clip.
SAMPLES_DIR = "samples"
# File name of the generated clip inside each model's subfolder.
CLIP_NAME = "generated-audio.wav"

# Single prompt synthesized by every model so the samples are comparable.
TEST_PROMPT = (
    "Hello, this is a universal test sentence. Can the advanced Zylophonic "
    "system clearly articulate this and express a hint of excitement? "
    "The quick brown fox certainly hopes so!"
)
def repo_to_slug(repo: str) -> str:
    """Convert a repo id like ``org/model`` to ``org_model`` for folder naming."""
    return repo.replace("/", "_")
def get_rating_emoji(rating: str) -> str:
    """Map a rating label to a traffic-light emoji.

    "Excellent" -> green, "Good" -> yellow, anything else (e.g. "Moderate")
    falls through to orange.
    """
    if rating == "Excellent":
        return "🟢"
    if rating == "Good":
        return "🟡"
    return "🟠"
def get_audio_path(repo: str) -> Optional[str]:
    """Return the path to *repo*'s sample clip, or ``None`` if the file is missing.

    Looks for ``SAMPLES_DIR/<slugged repo>/CLIP_NAME`` on disk. The annotation
    is ``Optional[str]`` (the original claimed ``str`` but returned ``None``
    for missing files).
    """
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    return audio_path if os.path.isfile(audio_path) else None
def filter_models(search_term: str):
    """Return the repo ids whose id or display name contains *search_term*.

    Matching is case-insensitive; a blank/whitespace-only term returns every
    model.
    """
    if not search_term.strip():
        return list(MODELS.keys())
    needle = search_term.lower().strip()
    return [
        repo
        for repo, name in MODELS.items()
        if needle in repo.lower() or needle in name.lower()
    ]
def create_model_card(repo: str) -> str:
    """Build the HTML card for *repo*: title, description, and emoji ratings.

    The original computed ``description`` and ``ratings`` but never rendered
    them, leaving the card with only a title; they are now included so the
    card matches its stated purpose.
    """
    display_name = MODELS[repo]
    description = MODEL_DESCRIPTIONS.get(repo, "High-quality TTS model")
    ratings = MODEL_RATINGS.get(repo, {})
    # Render each rating dimension as "emoji Label: value" on one line.
    rating_items = " &nbsp; ".join(
        f"{get_rating_emoji(value)} <strong>{dimension.capitalize()}:</strong> {value}"
        for dimension, value in ratings.items()
    )
    card_html = f"""
    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
        <h3 style="color: #2c3e50; margin-top: 0;">🎤 {display_name}</h3>
        <p style="color: #34495e;">{description}</p>
        <p style="color: #2c3e50;">{rating_items}</p>
    </div>
    """
    return card_html
# ---------- 2. Custom CSS ----------
custom_css = """
#title {
    text-align: center;
    background: rgb(203, 255, 77);
    /* dark text: the original white was unreadable on the light-green banner */
    color: #2c3e50;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
}
#intro-section {
    background: #f8f9fa;
    color: #2c3e50;
    padding: 1.5rem;
    border-radius: 10px;
    margin: 1rem 0;
    border-left: 4px solid rgb(0, 72, 10);
}
#intro-section h2,
#intro-section h3 {
    color: #2c3e50;
}
#intro-section p {
    color: #34495e;
}
#intro-section ul li {
    color: #34495e;
}
#intro-section .mission-text {
    color: #667eea !important;
    font-weight: bold;
    text-align: center;
}
#intro-section strong {
    color: #2c3e50 !important;
}
#intro-section em {
    color: #2c3e50 !important;
}
#intro-section .mission-text strong {
    color: #667eea !important;
}
#test-prompt {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 10px;
    text-align: center;
    margin: 1rem 0;
}
.model-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 1rem;
    margin: 1rem 0;
}
#footer {
    text-align: center;
    padding: 2rem;
    color: #666;
    border-top: 1px solid #eee;
    margin-top: 2rem;
}
/* make all the text in our white-background cards dark */
.model-grid .gr-html * {
    color: #2c3e50 !important;
}
.model-card {
    background: white;
    color: #2c3e50 !important;
    border: 1px solid #ddd;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
}
"""
# ---------- 3. Main Gradio Interface ----------
def create_interface():
    """Assemble and return the Gradio Blocks demo for the TTS model gallery.

    Layout: header banner, introduction/key-findings section, a search row,
    then one card + audio player per model. Typing in the search box hides
    the groups whose model does not match; "Clear" resets the filter.
    (Large commented-out prototype sections from the original were removed.)
    """
    with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:
        # Header Section
        gr.HTML("""
        <div id="title">
            <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)

        # Introduction Section
        gr.HTML("""
        <div id="intro-section">
            <h3>🔬 Our Exciting Quest</h3>
            <p>We’re on a mission to help developers quickly find and compare the best open-source TTS models for their audio projects. In this gallery, you’ll find 12 state-of-the-art TTS models, each evaluated using a consistent test prompt to assess their synthesized speech.</p>
            <p><strong>Featured TTS Models:</strong></p>
            <ul>
                <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
                <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
                <li>🎨 <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
                <li>🎵 <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
                <li>🎼 <strong>MaskGCT</strong> - Masked generative modeling</li>
                <li>🎤 <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
                <li><em>...and 6 more incredible models!</em></li>
            </ul>
            <h3>🔑 Key Findings</h3>
            <ol>
                <li><strong>Outstanding Speech Quality</strong><br>
                    Several models—namely <strong>Kokoro-82M</strong>, <strong>csm-1b</strong>, <strong>Spark-TTS-0.5B</strong>,
                    <strong>Orpheus-3b-0.1-ft</strong>, <strong>F5-TTS</strong>, and <strong>Llasa-3B</strong> delivered exceptionally
                    natural, clear, and realistic synthesized speech. Among these, <strong>csm-1b</strong> and <strong>F5-TTS</strong>
                    stood out as the most well-rounded model as they combined good synthesized speech with solid controllability.
                </li>
                <li><strong>Superior Controllability</strong><br>
                    <strong>Zonos-v0.1-transformer</strong> emerged as the best in fine-grained control: it offers detailed
                    adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise
                    voice modulation.
                </li>
                <li><strong>Performance vs. Footprint Trade-off</strong><br>
                    Smaller models (e.g., <strong>Kokoro-82M</strong> at 82 million parameters) can still excel in many scenarios, especially when efficient inference or low VRAM usage is critical.
                    Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual
                    synthesis, zero-shot voice cloning, and multi-speaker generation but require heavier compute resources.
                </li>
                <li><strong>Special Notes on Multilingual & Cloning Capabilities</strong><br>
                    <strong>Spark-TTS-0.5B</strong> and <strong>XTTS-v2</strong> excel at cross-lingual and zero-shot voice
                    cloning, making them strong candidates for projects that need multi-language support or short-clip cloning.
                    <strong>Llama-OuteTTS-1.0-1B</strong> and <strong>MegaTTS3</strong> also offer multilingual input handling,
                    though they may require careful sampling parameter tuning to achieve optimal results.
                </li>
            </ol>
        </div>
        """)

        # Search and Filter Section
        with gr.Row():
            search_box = gr.Textbox(
                label="🔍 Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3,
            )
            clear_btn = gr.Button("Clear", scale=1)

        # Model Gallery Section
        gr.Markdown("## 🎧 Model Gallery")

        # One Group per model. We keep a handle on the Group itself so the
        # search filter can hide the whole box — the original toggled only
        # the inner components, leaving empty Group frames on screen.
        model_groups = []
        for repo, display_name in MODELS.items():
            with gr.Group() as model_group:
                # Model information card
                gr.HTML(create_model_card(repo))
                # Audio player (or a red notice when the clip is missing)
                audio_path = get_audio_path(repo)
                if audio_path:
                    gr.Audio(
                        value=audio_path,
                        label=f"🎵 {display_name} Audio Sample",
                        interactive=False,
                    )
                else:
                    gr.HTML(f"<p style='color: red;'>🤷♂️ Audio sample not found for {display_name}</p>")
            model_groups.append((repo, model_group))

        # Search functionality
        def update_visibility(search_term):
            """Show only the groups whose repo survives the search filter."""
            visible_repos = set(filter_models(search_term))
            return [gr.update(visible=repo in visible_repos) for repo, _ in model_groups]

        # Connect search functionality
        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[group for _, group in model_groups],
        )
        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box],
        )

    return demo
# ---------- 4. Launch the application ----------
if __name__ == "__main__":
    demo = create_interface()
    # share=True creates a public tunnel URL; inbrowser opens a local tab.
    demo.launch(
        share=True,
        inbrowser=True,
        show_error=True,
    )