# ---------------------------------------------------------------
# app.py – "TTS Showcase" (Gradio Implementation)
# ---------------------------------------------------------------
import os
from typing import Optional

import gradio as gr
# ---------- 1. Demo metadata ----------
MODELS = {
"nari-labs/Dia-1.6B": "Dia-1.6B",
"hexgrad/Kokoro-82M": "Kokoro-82M",
"sesame/csm-1b": "csm-1b",
"SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B",
"canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft",
"SWivid/F5-TTS": "F5-TTS",
"Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer",
"coqui/XTTS-v2": "XTTS-v2",
"HKUSTAudio/Llasa-3B": "Llasa-3B",
"amphion/MaskGCT": "MaskGCT",
"OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B",
"ByteDance/MegaTTS3": "MegaTTS3"
}
# Performance ratings for each model
MODEL_RATINGS = {
"nari-labs/Dia-1.6B": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Good"},
"hexgrad/Kokoro-82M": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
"sesame/csm-1b": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
"SparkAudio/Spark-TTS-0.5B": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
"canopylabs/orpheus-3b-0.1-ft": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Moderate"},
"SWivid/F5-TTS": {"naturalness": "Excellent", "intelligibility": "Excellent", "controllability": "Good"},
"Zyphra/Zonos-v0.1-transformer": {"naturalness": "Good", "intelligibility": "Moderate", "controllability": "Excellent"},
"coqui/XTTS-v2": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
"HKUSTAudio/Llasa-3B": {"naturalness": "Excellent", "intelligibility": "Good", "controllability": "Moderate"},
"amphion/MaskGCT": {"naturalness": "Good", "intelligibility": "Excellent", "controllability": "Moderate"},
"OuteAI/Llama-OuteTTS-1.0-1B": {"naturalness": "Moderate", "intelligibility": "Moderate", "controllability": "Moderate"},
"ByteDance/MegaTTS3": {"naturalness": "Good", "intelligibility": "Good", "controllability": "Moderate"}
}
# Model descriptions for better understanding
MODEL_DESCRIPTIONS = {
"nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
"hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
"sesame/csm-1b": "High-quality synthesis with excellent naturalness",
"SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
"canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
"SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
"Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
"coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
"HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
"amphion/MaskGCT": "Masked generative modeling approach",
"OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
"ByteDance/MegaTTS3": "Industrial-grade TTS solution"
}
# Folder that contains subfolders with the audio clips
SAMPLES_DIR = "samples"
CLIP_NAME = "generated-audio.wav"
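# Expected on-disk layout (derived from the two constants above and repo_to_slug below):
#   samples/<owner>_<model-name>/generated-audio.wav
#   e.g. samples/hexgrad_Kokoro-82M/generated-audio.wav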
# Test prompt used for evaluation
TEST_PROMPT = "Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!"
def repo_to_slug(repo: str) -> str:
    """Convert an owner/model repo ID to owner_model for folder naming."""
    return repo.replace("/", "_")
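# Example: repo_to_slug("hexgrad/Kokoro-82M") -> "hexgrad_Kokoro-82M"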
def get_rating_emoji(rating: str) -> str:
    """Convert a rating to a colored status emoji."""
    if rating == "Excellent":
        return "🟒"
    elif rating == "Good":
        return "🟑"
    else:
        return "🟠"
def get_audio_path(repo: str) -> Optional[str]:
    """Return the path to a model's audio sample, or None if the file is missing."""
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    return audio_path if os.path.isfile(audio_path) else None
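# Example: get_audio_path("sesame/csm-1b") -> "samples/sesame_csm-1b/generated-audio.wav",
# or None when that clip has not been added to the samples folder.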
def filter_models(search_term: str):
    """Return repo IDs whose repo ID or display name contains the search term (case-insensitive)."""
    if not search_term.strip():
        return list(MODELS.keys())
    search_lower = search_term.lower().strip()
    return [
        repo for repo, name in MODELS.items()
        if search_lower in repo.lower() or search_lower in name.lower()
    ]
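# Example: filter_models("kokoro") -> ["hexgrad/Kokoro-82M"]; an empty or
# whitespace-only query returns every repo ID.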
def create_model_card(repo: str) -> str:
    """Create a formatted HTML model card with ratings and description."""
    display_name = MODELS[repo]
    description = MODEL_DESCRIPTIONS.get(repo, "High-quality TTS model")
    ratings = MODEL_RATINGS.get(repo, {})
    card_html = f"""
    <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
        <h3 style="color: #2c3e50; margin-top: 0;">🎀 {display_name}</h3>
        <p style="color: #555; font-style: italic; margin: 5px 0;">{description}</p>
        <div style="display: flex; gap: 15px; margin: 15px 0;">
            <span style="color: #888;"><strong style="color: #888;">Naturalness:</strong> {get_rating_emoji(ratings.get('naturalness', 'Moderate'))} {ratings.get('naturalness', 'Moderate')}</span>
            <span style="color: #888;"><strong style="color: #888;">Intelligibility:</strong> {get_rating_emoji(ratings.get('intelligibility', 'Moderate'))} {ratings.get('intelligibility', 'Moderate')}</span>
            <span style="color: #888;"><strong style="color: #888;">Controllability:</strong> {get_rating_emoji(ratings.get('controllability', 'Moderate'))} {ratings.get('controllability', 'Moderate')}</span>
        </div>
        <p style="font-size: 0.9em; color: #888; margin: 5px 0;">Repository: <code style="color: #888;">{repo}</code></p>
    </div>
    """
    return card_html
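# Each card produced here is rendered with gr.HTML() per model in create_interface() below.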
# ---------- 2. Custom CSS ----------
custom_css = """
#title {
text-align: center;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 2rem;
border-radius: 15px;
margin-bottom: 2rem;
}
#intro-section {
background: #f8f9fa;
color: #2c3e50;
padding: 1.5rem;
border-radius: 10px;
margin: 1rem 0;
border-left: 4px solid #667eea;
}
#intro-section h2,
#intro-section h3 {
color: #2c3e50;
}
#intro-section p {
color: #34495e;
}
#intro-section ul li {
color: #34495e;
}
#intro-section .mission-text {
color: #667eea !important;
font-weight: bold;
text-align: center;
}
#intro-section strong {
color: #2c3e50 !important;
}
#intro-section em {
color: #2c3e50 !important;
}
#intro-section .mission-text strong {
color: #667eea !important;
}
#test-prompt {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 1.5rem;
border-radius: 10px;
text-align: center;
margin: 1rem 0;
}
.model-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
gap: 1rem;
margin: 1rem 0;
}
#footer {
text-align: center;
padding: 2rem;
color: #666;
border-top: 1px solid #eee;
margin-top: 2rem;
}
/* make all the text in our white-background cards dark */
.model-grid .gr-html * {
color: #2c3e50 !important;
}
.model-card {
background: white;
color: #2c3e50 !important;
border: 1px solid #ddd;
border-radius: 12px;
padding: 20px;
margin: 10px 0;
}
"""
# ---------- 3. Main Gradio Interface ----------
def create_interface():
    with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:
        # Header Section
        gr.HTML("""
        <div id="title">
            <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)
        # Introduction Section
        gr.HTML("""
        <div id="intro-section">
            <h3>🔬 Our Exciting Quest</h3>
            <p>We're on a journey to help developers find the right TTS model for their audio projects!
            We've put these 12 open-source models through their paces using the same universal test prompt.</p>
            <p><strong>Featured TTS Engines:</strong></p>
            <ul>
                <li>🎭 <strong>Dia-1.6B</strong> - Expressive conversational voice</li>
                <li>🎪 <strong>Kokoro-82M</strong> - Lightweight powerhouse</li>
                <li>🎨 <strong>F5-TTS</strong> - Advanced flow-based synthesis</li>
                <li>🎡 <strong>XTTS-v2</strong> - Multi-lingual excellence</li>
                <li>🎼 <strong>MaskGCT</strong> - Masked generative modeling</li>
                <li>🎀 <strong>Llasa-3B</strong> - Large-scale audio synthesis</li>
                <li><em>...and 6 more incredible models!</em></li>
            </ul>
        </div>
        """)
        # Test Prompt Section (currently disabled)
        # gr.HTML(f"""
        # <div id="test-prompt">
        #     <h3>🎯 Universal Test Prompt</h3>
        #     <p style="font-style: italic; font-size: 1.1em;">"{TEST_PROMPT}"</p>
        #     <p style="font-size: 0.9em; opacity: 0.9;">
        #         Carefully crafted to test naturalness, intelligibility, and technical pronunciation across all models
        #     </p>
        # </div>
        # """)
        # Evaluation Criteria
        with gr.Row():
            with gr.Column():
                gr.HTML("""
                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
                    <div style="font-size: 2rem;">🎭</div>
                    <strong>Naturalness</strong><br>
                    <small>Human-like quality & emotional expression</small>
                </div>
                """)
            with gr.Column():
                gr.HTML("""
                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
                    <div style="font-size: 2rem;">🗣️</div>
                    <strong>Intelligibility</strong><br>
                    <small>Clarity & pronunciation accuracy</small>
                </div>
                """)
            with gr.Column():
                gr.HTML("""
                <div style="text-align: center; padding: 1rem; background: rgba(102, 126, 234, 0.1); border-radius: 8px;">
                    <div style="font-size: 2rem;">🎛️</div>
                    <strong>Controllability</strong><br>
                    <small>Tone, pace & parameter flexibility</small>
                </div>
                """)
        gr.Markdown("---")
        # Search and Filter Section
        with gr.Row():
            search_box = gr.Textbox(
                label="🔍 Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3
            )
            clear_btn = gr.Button("Clear", scale=1)
        # Model Gallery Section
        gr.Markdown("## 🎧 Model Gallery")
        # Create model cards and audio players
        model_components = []
        for repo, display_name in MODELS.items():
            with gr.Group():
                # Model information card
                model_info = gr.HTML(create_model_card(repo))
                # Audio player
                audio_path = get_audio_path(repo)
                if audio_path:
                    audio_player = gr.Audio(
                        value=audio_path,
                        label=f"🎡 {display_name} Audio Sample",
                        interactive=False
                    )
                else:
                    audio_player = gr.HTML(f"<p style='color: red;'>🤷‍♂️ Audio sample not found for {display_name}</p>")
            model_components.append((repo, model_info, audio_player))
        # Search functionality
        def update_visibility(search_term):
            filtered_repos = filter_models(search_term)
            updates = []
            for repo, model_info, audio_player in model_components:
                visible = repo in filtered_repos
                updates.extend([
                    gr.update(visible=visible),  # model_info
                    gr.update(visible=visible)   # audio_player
                ])
            return updates
        # Connect search functionality
        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[comp for repo, model_info, audio_player in model_components for comp in [model_info, audio_player]]
        )
        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box]
        )
        # Methodology Section
        with gr.Accordion("📋 Detailed Evaluation Methodology", open=False):
            gr.Markdown("""
            ### Test Prompt
            `Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!`

            ### Model Evaluation Criteria:

            🎭 **Naturalness (Human-like Quality)**
            - Prosody and rhythm patterns
            - Emotional expression capability
            - Voice texture and warmth
            - Natural breathing and pauses

            🗣️ **Intelligibility (Clarity & Accuracy)**
            - Word pronunciation precision
            - Consonant and vowel clarity
            - Sentence comprehensibility
            - Technical term handling

            🎛️ **Controllability (Flexibility)**
            - Parameter responsiveness
            - Tone modification capability
            - Speed and pitch control
            - Customization potential

            ### Key Insights:
            - Smaller models (82M-500M) can excel in specific scenarios
            - Larger models (1B-3B+) offer more versatility but require more resources
            - Architecture matters as much as parameter count
            - Training data quality significantly impacts output quality
            """)
        # Footer (currently disabled)
        # gr.HTML("""
        # <div id="footer">
        #     <p><strong>🚀 Ready to deploy your own TTS model?</strong></p>
        #     <p>This demo showcases the power of open-source TTS technology. Each model offers unique strengths for different applications.</p>
        #     <p><em>Built with ❤️ using Gradio • All models are open-source and available on Hugging Face</em></p>
        #     <p>⚡ Powered by Inferless</p>
        # </div>
        # """)
    return demo
# ---------- 4. Launch the application ----------
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        share=True,
        inbrowser=True,
        show_error=True
    )
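# ---------------------------------------------------------------
# Adding another model (illustrative sketch; the repo below is hypothetical):
#   1. Register it in MODELS, MODEL_RATINGS and MODEL_DESCRIPTIONS, e.g.
#        MODELS["example-org/NewTTS-1B"] = "NewTTS-1B"
#   2. Drop its clip at samples/example-org_NewTTS-1B/generated-audio.wav
#      so get_audio_path() picks it up on the next launch.
# ---------------------------------------------------------------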