|
|
|
|
|
|
|
import os
from typing import List, Optional

import gradio as gr
|
|
|
|
|
# Hugging Face repository id -> short display name shown in the UI.
# Insertion order is the order in which the gallery lists the models.
MODELS = {
    "nari-labs/Dia-1.6B": "Dia-1.6B",
    "hexgrad/Kokoro-82M": "Kokoro-82M",
    "sesame/csm-1b": "csm-1b",
    "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B",
    "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft",
    "SWivid/F5-TTS": "F5-TTS",
    "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer",
    "coqui/XTTS-v2": "XTTS-v2",
    "HKUSTAudio/Llasa-3B": "Llasa-3B",
    "amphion/MaskGCT": "MaskGCT",
    "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B",
    "ByteDance/MegaTTS3": "MegaTTS3",
}
|
|
|
|
|
# Qualitative scores per repository id. Each row lists the scores in the
# order (naturalness, intelligibility, controllability); the comprehension
# expands every row into a dict keyed by those axis names.
MODEL_RATINGS = {
    repo: dict(zip(("naturalness", "intelligibility", "controllability"), scores))
    for repo, scores in (
        ("nari-labs/Dia-1.6B", ("Good", "Moderate", "Good")),
        ("hexgrad/Kokoro-82M", ("Good", "Excellent", "Moderate")),
        ("sesame/csm-1b", ("Excellent", "Excellent", "Good")),
        ("SparkAudio/Spark-TTS-0.5B", ("Excellent", "Excellent", "Moderate")),
        ("canopylabs/orpheus-3b-0.1-ft", ("Excellent", "Excellent", "Moderate")),
        ("SWivid/F5-TTS", ("Excellent", "Excellent", "Good")),
        ("Zyphra/Zonos-v0.1-transformer", ("Good", "Moderate", "Excellent")),
        ("coqui/XTTS-v2", ("Good", "Excellent", "Moderate")),
        ("HKUSTAudio/Llasa-3B", ("Excellent", "Good", "Moderate")),
        ("amphion/MaskGCT", ("Good", "Excellent", "Moderate")),
        ("OuteAI/Llama-OuteTTS-1.0-1B", ("Moderate", "Moderate", "Moderate")),
        ("ByteDance/MegaTTS3", ("Good", "Good", "Moderate")),
    )
}
|
|
|
|
|
# One-line blurb per repository id.
MODEL_DESCRIPTIONS = {
    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
    "amphion/MaskGCT": "Masked generative modeling approach",
    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
    "ByteDance/MegaTTS3": "Industrial-grade TTS solution",
}
|
|
|
|
|
# Audio clips are looked up at <SAMPLES_DIR>/<repo slug>/<CLIP_NAME>.
SAMPLES_DIR = "samples"
CLIP_NAME = "generated-audio.wav"

# The sentence every model was asked to speak.
TEST_PROMPT = (
    "Hello, this is a universal test sentence. "
    "Can the advanced Zylophonic system clearly articulate this "
    "and express a hint of excitement? "
    "The quick brown fox certainly hopes so!"
)
|
|
|
def repo_to_slug(repo: str) -> str:
    """Turn a repository id such as ``owner/name`` into ``owner_name``.

    Every ``/`` is replaced by ``_`` so the result can be used as a single
    folder name.
    """
    parts = repo.split("/")
    return "_".join(parts)
|
|
|
def get_rating_emoji(rating: str) -> str:
    """Return the coloured-circle emoji that represents *rating*.

    The emoji are written as ``\\U`` escapes so the source file stays pure
    ASCII here; the previous literals had been damaged by a bad
    encoding round-trip.

    Args:
        rating: Rating label. The comparison is exact and case-sensitive.

    Returns:
        A green circle for ``"Excellent"``, a yellow circle for ``"Good"``
        and an orange circle for any other value.
    """
    if rating == "Excellent":
        return "\U0001F7E2"  # large green circle
    if rating == "Good":
        return "\U0001F7E1"  # large yellow circle
    # Anything else (including "Moderate" and unknown labels) is orange.
    return "\U0001F7E0"  # large orange circle
|
|
|
def get_audio_path(repo: str) -> Optional[str]:
    """Locate the pre-generated audio clip for a repository.

    The clip is expected at ``SAMPLES_DIR/<repo slug>/CLIP_NAME``, where the
    slug comes from ``repo_to_slug``.

    Args:
        repo: Repository id, e.g. ``"owner/name"``.

    Returns:
        The path to the clip if it exists as a regular file, otherwise
        ``None``.
    """
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    if os.path.isfile(audio_path):
        return audio_path
    return None
|
|
|
def filter_models(search_term: Optional[str]) -> List[str]:
    """Return the repository ids that match a search query.

    A model matches when the query is a case-insensitive substring of its
    repository id or of its display name in ``MODELS``. Whitespace around
    the query is ignored.

    Args:
        search_term: Raw query text. ``None``, an empty string or a
            whitespace-only string selects every model.

    Returns:
        Matching repository ids, in ``MODELS`` order.
    """
    # Normalise once; tolerate None so a cleared textbox cannot raise.
    needle = (search_term or "").strip().lower()
    if not needle:
        return list(MODELS)

    return [
        repo
        for repo, name in MODELS.items()
        if needle in repo.lower() or needle in name.lower()
    ]
|
|
|
def create_model_card(repo: str) -> str:
    """Build the HTML card for one model: display name, ratings and repo id.

    Ratings come from ``MODEL_RATINGS``; a missing model or a missing axis
    is shown as ``"Moderate"``.

    Args:
        repo: Repository id; must be a key of ``MODELS``.

    Returns:
        An HTML fragment (a single ``<div class="model-card">``).

    Raises:
        KeyError: If *repo* is not a key of ``MODELS``.
    """
    display_name = MODELS[repo]
    ratings = MODEL_RATINGS.get(repo, {})
    # NOTE(review): a MODEL_DESCRIPTIONS lookup used to sit here but its
    # result was never placed in the card, so the dead lookup was dropped.
    # Add a <p> to the template below if descriptions should be shown.

    # One badge per evaluation axis; the dict key is the lower-cased label.
    badges = []
    for label in ("Naturalness", "Intelligibility", "Controllability"):
        rating = ratings.get(label.lower(), "Moderate")
        badges.append(
            f'<span style="color: #888;"><strong style="color: #888;">{label}:</strong> '
            f"{get_rating_emoji(rating)} {rating}</span>"
        )
    badges_html = "\n            ".join(badges)

    return f"""
    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
        <h3 style="color: #2c3e50; margin-top: 0;">π€ {display_name}</h3>
        <div style="display: flex; gap: 15px; margin: 15px 0;">
            {badges_html}
        </div>

        <p style="font-size: 0.9em; color: #888; margin: 5px 0;">Repository: <code style="color: #888;">{repo}</code></p>
    </div>
    """
|
|
|
|
|
custom_css = """ |
|
#title { |
|
text-align: center; |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; |
|
padding: 2rem; |
|
border-radius: 15px; |
|
margin-bottom: 2rem; |
|
} |
|
|
|
#intro-section { |
|
background: #f8f9fa; |
|
color: #2c3e50; |
|
padding: 1.5rem; |
|
border-radius: 10px; |
|
margin: 1rem 0; |
|
border-left: 4px solid #667eea; |
|
} |
|
|
|
#intro-section h2, |
|
#intro-section h3 { |
|
color: #2c3e50; |
|
} |
|
|
|
#intro-section p { |
|
color: #34495e; |
|
} |
|
|
|
#intro-section ul li { |
|
color: #34495e; |
|
} |
|
|
|
#intro-section .mission-text { |
|
color: #667eea !important; |
|
font-weight: bold; |
|
text-align: center; |
|
} |
|
|
|
#intro-section strong { |
|
color: #2c3e50 !important; |
|
} |
|
|
|
#intro-section em { |
|
color: #2c3e50 !important; |
|
} |
|
|
|
#intro-section .mission-text strong { |
|
color: #667eea !important; |
|
} |
|
|
|
#test-prompt { |
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
|
color: white; |
|
padding: 1.5rem; |
|
border-radius: 10px; |
|
text-align: center; |
|
margin: 1rem 0; |
|
} |
|
|
|
.model-grid { |
|
display: grid; |
|
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); |
|
gap: 1rem; |
|
margin: 1rem 0; |
|
} |
|
|
|
#footer { |
|
text-align: center; |
|
padding: 2rem; |
|
color: #666; |
|
border-top: 1px solid #eee; |
|
margin-top: 2rem; |
|
} |
|
|
|
/* make all the text in our whiteβbackground cards dark */ |
|
.model-grid .gr-html * { |
|
color: #2c3e50 !important; |
|
} |
|
|
|
.model-card { |
|
background: white; |
|
color: #2c3e50 !important; |
|
border: 1px solid #ddd; |
|
border-radius: 12px; |
|
padding: 20px; |
|
margin: 10px 0; |
|
} |
|
|
|
""" |
|
|
|
|
|
def create_interface():
    """Assemble the Gradio Blocks app for the TTS model gallery.

    The page consists of a banner, an introduction, three evaluation
    criteria cards, a search box with a clear button, one group per entry
    of ``MODELS`` (model card plus audio player or a "not found" notice),
    and a collapsed methodology section.

    Typing in the search box shows or hides each model's card and audio
    component according to ``filter_models``.

    Returns:
        The constructed ``gr.Blocks`` instance. It is not launched here.
    """
    with gr.Blocks(css=custom_css, title="ποΈ TTS Model Gallery", theme=gr.themes.Soft()) as demo:

        # --- Banner -------------------------------------------------------
        gr.HTML("""
        <div id="title">
            <h1>ποΈ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)

        # --- Introduction -------------------------------------------------
        gr.HTML("""
        <div id="intro-section">
            <h3>π¬ Our Exciting Quest</h3>
            <p>We're on a thrilling journey to help developers discover the perfect TTS models for their innovative audio projects!
            We've put these 12 cutting-edge models using the test prompts.</p>

            <p><strong>Featured TTS Engines:</strong></p>
            <ul>
                <li>π <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
                <li>πͺ <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
                <li>π¨ <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
                <li>π΅ <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
                <li>πΌ <strong>MaskGCT</strong> - Masked generative modeling</li>
                <li>π€ <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
                <li><em>...and 6 more incredible models!</em></li>
            </ul>

        </div>
        """)

        # --- Evaluation criteria cards -------------------------------------
        # The three cards share one template; only icon, title and blurb vary.
        criteria = (
            ("π", "Naturalness", "Human-like quality & emotional expression"),
            ("π£οΈ", "Intelligibility", "Clarity & pronunciation accuracy"),
            ("ποΈ", "Controllability", "Tone, pace & parameter flexibility"),
        )
        with gr.Row():
            for icon, heading, blurb in criteria:
                with gr.Column():
                    gr.HTML(f"""
                    <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
                        <div style="font-size: 2rem;">{icon}</div>
                        <strong>{heading}</strong><br>
                        <small>{blurb}</small>
                    </div>
                    """)

        gr.Markdown("---")

        # --- Search controls ------------------------------------------------
        with gr.Row():
            search_box = gr.Textbox(
                label="π Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3,
            )
            clear_btn = gr.Button("Clear", scale=1)

        gr.Markdown("## π§ Model Gallery")

        # --- Gallery ----------------------------------------------------------
        # One (repo, card, player) triple per model, in MODELS order. The
        # same list drives both the event outputs and the returned updates,
        # so the two always line up.
        model_components = []

        for repo, display_name in MODELS.items():
            with gr.Group():
                model_info = gr.HTML(create_model_card(repo))

                audio_path = get_audio_path(repo)
                if audio_path:
                    audio_player = gr.Audio(
                        value=audio_path,
                        label=f"π΅ {display_name} Audio Sample",
                        interactive=False,
                    )
                else:
                    # No clip on disk: show a notice in place of the player.
                    audio_player = gr.HTML(f"<p style='color: red;'>π€·ββοΈ Audio sample not found for {display_name}</p>")

            model_components.append((repo, model_info, audio_player))

        def update_visibility(search_term):
            """Return one visibility update per gallery component."""
            # The textbox may hand over None; treat that as an empty query.
            # A set keeps the per-model membership test constant-time.
            matching = set(filter_models(search_term or ""))

            updates = []
            for repo, _card, _player in model_components:
                visible = repo in matching
                # Two updates per model: the card, then the audio/notice.
                updates.extend((gr.update(visible=visible), gr.update(visible=visible)))
            return updates

        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[
                component
                for _repo, card, player in model_components
                for component in (card, player)
            ],
        )

        # Emptying the textbox is enough; no gallery outputs are wired here.
        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box],
        )

        # --- Methodology ------------------------------------------------------
        with gr.Accordion("π Detailed Evaluation Methodology", open=False):
            # TEST_PROMPT is interpolated so the text shown here cannot drift
            # from the module-level constant.
            gr.Markdown(f"""
            ### Test Prompt

            `{TEST_PROMPT}`


            ### Model Evaluation Criteria:

            π **Naturalness (Human-like Quality)**
            - Prosody and rhythm patterns
            - Emotional expression capability
            - Voice texture and warmth
            - Natural breathing and pauses

            π£οΈ **Intelligibility (Clarity & Accuracy)**
            - Word pronunciation precision
            - Consonant and vowel clarity
            - Sentence comprehensibility
            - Technical term handling

            ποΈ **Controllability (Flexibility)**
            - Parameter responsiveness
            - Tone modification capability
            - Speed and pitch control
            - Customization potential

            ### Key Insights:
            - Smaller models (82M-500M) can excel in specific scenarios
            - Larger models (1B-3B+) offer more versatility but require more resources
            - Architecture matters as much as parameter count
            - Training data quality significantly impacts output quality
            """)

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    # The options are forwarded unchanged to Blocks.launch(); see Gradio's
    # launch() documentation for what each flag does.
    launch_options = {
        "share": True,
        "inbrowser": True,
        "show_error": True,
    }
    demo = create_interface()
    demo.launch(**launch_options)