# ---------------------------------------------------------------
# app.py – "TTS Showcase" (Gradio Implementation)
# ---------------------------------------------------------------
import os

import gradio as gr

# ---------- 1. Demo metadata ----------
# Hugging Face repo id -> short display name shown in the UI.
# Insertion order is the order in which the gallery is rendered.
MODELS = {
    "nari-labs/Dia-1.6B": "Dia-1.6B",
    "hexgrad/Kokoro-82M": "Kokoro-82M",
    "sesame/csm-1b": "csm-1b",
    "SparkAudio/Spark-TTS-0.5B": "Spark-TTS-0.5B",
    "canopylabs/orpheus-3b-0.1-ft": "Orpheus-3b-0.1-ft",
    "SWivid/F5-TTS": "F5-TTS",
    "Zyphra/Zonos-v0.1-transformer": "Zonos-v0.1-transformer",
    "coqui/XTTS-v2": "XTTS-v2",
    "HKUSTAudio/Llasa-3B": "Llasa-3B",
    "amphion/MaskGCT": "MaskGCT",
    "OuteAI/Llama-OuteTTS-1.0-1B": "Llama-OuteTTS-1.0-1B",
    "ByteDance/MegaTTS3": "MegaTTS3",
}

# Performance ratings for each model, keyed by the same repo ids as MODELS.
# Each value maps a criterion name to one of "Excellent" / "Good" / "Moderate".
MODEL_RATINGS = {
    "nari-labs/Dia-1.6B": {
        "naturalness": "Good",
        "intelligibility": "Moderate",
        "controllability": "Good",
    },
    "hexgrad/Kokoro-82M": {
        "naturalness": "Good",
        "intelligibility": "Excellent",
        "controllability": "Moderate",
    },
    "sesame/csm-1b": {
        "naturalness": "Excellent",
        "intelligibility": "Excellent",
        "controllability": "Good",
    },
    "SparkAudio/Spark-TTS-0.5B": {
        "naturalness": "Excellent",
        "intelligibility": "Excellent",
        "controllability": "Moderate",
    },
    "canopylabs/orpheus-3b-0.1-ft": {
        "naturalness": "Excellent",
        "intelligibility": "Excellent",
        "controllability": "Moderate",
    },
    "SWivid/F5-TTS": {
        "naturalness": "Excellent",
        "intelligibility": "Excellent",
        "controllability": "Good",
    },
    "Zyphra/Zonos-v0.1-transformer": {
        "naturalness": "Good",
        "intelligibility": "Moderate",
        "controllability": "Excellent",
    },
    "coqui/XTTS-v2": {
        "naturalness": "Good",
        "intelligibility": "Excellent",
        "controllability": "Moderate",
    },
    "HKUSTAudio/Llasa-3B": {
        "naturalness": "Excellent",
        "intelligibility": "Good",
        "controllability": "Moderate",
    },
    "amphion/MaskGCT": {
        "naturalness": "Good",
        "intelligibility": "Excellent",
        "controllability": "Moderate",
    },
    "OuteAI/Llama-OuteTTS-1.0-1B": {
        "naturalness": "Moderate",
        "intelligibility": "Moderate",
        "controllability": "Moderate",
    },
    "ByteDance/MegaTTS3": {
        "naturalness": "Good",
        "intelligibility": "Good",
        "controllability": "Moderate",
    },
}

# Model descriptions for better understanding (one-line blurb per repo id).
MODEL_DESCRIPTIONS = {
    "nari-labs/Dia-1.6B": "Expressive conversational voice with moderate quality",
    "hexgrad/Kokoro-82M": "Lightweight powerhouse with excellent clarity",
    "sesame/csm-1b": "High-quality synthesis with excellent naturalness",
    "SparkAudio/Spark-TTS-0.5B": "Efficient model with excellent performance",
    "canopylabs/orpheus-3b-0.1-ft": "Fine-tuned large model with superior quality",
    "SWivid/F5-TTS": "Advanced flow-based synthesis with top ratings",
    "Zyphra/Zonos-v0.1-transformer": "Highly controllable transformer-based model",
    "coqui/XTTS-v2": "Multi-lingual excellence with proven performance",
    "HKUSTAudio/Llasa-3B": "Large-scale audio synthesis model",
    "amphion/MaskGCT": "Masked generative modeling approach",
    "OuteAI/Llama-OuteTTS-1.0-1B": "LLM-based TTS with moderate performance",
    "ByteDance/MegaTTS3": "Industrial-grade TTS solution",
}

# Folder that contains subfolders with the audio clips
# (one subfolder per model; each holds a file named CLIP_NAME).
SAMPLES_DIR = "samples"
CLIP_NAME = "generated-audio.wav"

# Test prompt used for evaluation
TEST_PROMPT = (
    "Hello, this is a universal test sentence. "
    "Can the advanced Zylophonic system clearly articulate this "
    "and express a hint of excitement? "
    "The quick brown fox certainly hopes so!"
)
def repo_to_slug(repo: str) -> str:
    """Convert huggingface/xxx to huggingface_xxx for folder naming.

    Every ``/`` in *repo* is replaced by ``_``; all other characters are
    returned unchanged.
    """
    return repo.replace("/", "_")


def get_rating_emoji(rating: str) -> str:
    """Convert a rating label to a coloured-dot emoji.

    ``"Excellent"`` maps to green, ``"Good"`` to yellow, and any other
    value (including ``"Moderate"``) to orange.
    """
    if rating == "Excellent":
        return "🟢"
    if rating == "Good":
        return "🟡"
    return "🟠"


def get_audio_path(repo: str) -> "str | None":
    """Return the sample clip path for *repo*, or ``None`` if it is missing.

    The path is ``SAMPLES_DIR/<slug>/CLIP_NAME`` where ``<slug>`` comes from
    :func:`repo_to_slug`. The path is resolved the way ``os.path`` resolves
    any relative path, i.e. against the current working directory.
    """
    audio_path = os.path.join(SAMPLES_DIR, repo_to_slug(repo), CLIP_NAME)
    return audio_path if os.path.isfile(audio_path) else None


def filter_models(search_term: str) -> "list[str]":
    """Return the repo ids whose repo id or display name contains *search_term*.

    Matching is case-insensitive and ignores surrounding whitespace. An empty,
    whitespace-only or ``None`` search term returns every repo id in
    ``MODELS`` order.
    """
    # ``or ""`` guards against None, which has no .strip().
    query = (search_term or "").strip().lower()
    if not query:
        return list(MODELS)
    return [
        repo
        for repo, name in MODELS.items()
        if query in repo.lower() or query in name.lower()
    ]


def create_model_card(repo: str) -> str:
    """Create a formatted model card with ratings and description.

    Args:
        repo: A repo id that must be a key of ``MODELS``.

    Returns:
        An HTML fragment wrapped in ``<div class="model-card">`` containing
        the display name, the description (or a generic fallback) and one
        list item per rating criterion, each prefixed with its emoji.

    Raises:
        KeyError: If *repo* is not a key of ``MODELS``.
    """
    display_name = MODELS[repo]
    description = MODEL_DESCRIPTIONS.get(repo, "High-quality TTS model")
    ratings = MODEL_RATINGS.get(repo, {})

    rating_items = "".join(
        f"<li>{get_rating_emoji(level)} {criterion.capitalize()}: {level}</li>"
        for criterion, level in ratings.items()
    )
    # Skip the list entirely when a model has no ratings recorded.
    ratings_html = f"<ul>{rating_items}</ul>" if rating_items else ""

    return (
        '<div class="model-card">'
        f"<h3>🎤 {display_name}</h3>"
        f"<p>{description}</p>"
        f"{ratings_html}"
        "</div>"
    )


# ---------- 2. Custom CSS ----------
# The ids/classes below are matched by the HTML emitted in create_model_card
# and create_interface (#title, #intro-section, .model-card).
custom_css = """
#title {
    text-align: center;
    background: rgb(203, 255, 77);
    color: white;
    padding: 2rem;
    border-radius: 15px;
    margin-bottom: 2rem;
}
#intro-section {
    background: #f8f9fa;
    color: #2c3e50;
    padding: 1.5rem;
    border-radius: 10px;
    margin: 1rem 0;
    border-left: 4px solid rgb(0, 72, 10);
}
#intro-section h2, #intro-section h3 {
    color: #2c3e50;
}
#intro-section p {
    color: #34495e;
}
#intro-section ul li {
    color: #34495e;
}
#intro-section .mission-text {
    color: #667eea !important;
    font-weight: bold;
    text-align: center;
}
#intro-section strong {
    color: #2c3e50 !important;
}
#intro-section em {
    color: #2c3e50 !important;
}
#intro-section .mission-text strong {
    color: #667eea !important;
}
#test-prompt {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 1.5rem;
    border-radius: 10px;
    text-align: center;
    margin: 1rem 0;
}
.model-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(400px, 1fr));
    gap: 1rem;
    margin: 1rem 0;
}
#footer {
    text-align: center;
    padding: 2rem;
    color: #666;
    border-top: 1px solid #eee;
    margin-top: 2rem;
}
/* make all the text in our white‐background cards dark */
.model-grid .gr-html * {
    color: #2c3e50 !important;
}
.model-card {
    background: white;
    color: #2c3e50 !important;
    border: 1px solid #ddd;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
}
"""


# ---------- 3. Main Gradio Interface ----------
def create_interface():
    """Build and return the Gradio ``Blocks`` app for the TTS gallery.

    The page consists of a title banner, an introduction with key findings,
    a search box with a "Clear" button, and one card plus audio player (or a
    "not found" notice) per entry in ``MODELS``. Typing in the search box
    shows only the models returned by :func:`filter_models`.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:
        # Header Section
        gr.HTML("""
        <div id="title">
            <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
        </div>
        """)

        # Introduction Section
        # The featured list and the model count are derived from MODELS so
        # they cannot drift out of sync with the gallery below.
        featured_items = "".join(f"<li>{name}</li>" for name in MODELS.values())
        gr.HTML(f"""
        <div id="intro-section">
            <h2>🔬 Our Exciting Quest</h2>
            <p>We’re on a mission to help developers quickly find and compare the best open-source TTS models for their audio projects. In this gallery, you’ll find {len(MODELS)} state-of-the-art TTS models, each evaluated using a consistent test prompt to assess their synthesized speech.</p>
            <h3>Featured TTS Models:</h3>
            <ul>{featured_items}</ul>
            <h3>🔑 Key Findings</h3>
            <ol>
                <li><strong>Outstanding Speech Quality</strong><br>
                Several models—namely Kokoro-82M, csm-1b, Spark-TTS-0.5B, Orpheus-3b-0.1-ft, F5-TTS, and Llasa-3B delivered exceptionally natural, clear, and realistic synthesized speech. Among these, csm-1b and F5-TTS stood out as the most well-rounded model as they combined good synthesized speech with solid controllability.</li>
                <li><strong>Superior Controllability</strong><br>
                Zonos-v0.1-transformer emerged as the best in fine-grained control: it offers detailed adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise voice modulation.</li>
                <li><strong>Performance vs. Footprint Trade-off</strong><br>
                Smaller models (e.g., Kokoro-82M at 82 million parameters) can still excel in many scenarios, especially when efficient inference or low VRAM usage is critical. Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual synthesis, zero-shot voice cloning, and multi-speaker generation but require heavier compute resources.</li>
                <li><strong>Special Notes on Multilingual & Cloning Capabilities</strong><br>
                Spark-TTS-0.5B and XTTS-v2 excel at cross-lingual and zero-shot voice cloning, making them strong candidates for projects that need multi-language support or short-clip cloning. Llama-OuteTTS-1.0-1B and MegaTTS3 also offer multilingual input handling, though they may require careful sampling parameter tuning to achieve optimal results.</li>
            </ol>
        </div>
        """)

        # ----- Disabled sections (kept as comments, not executed) -----
        # Test Prompt Section
        # gr.HTML(f"""
        #
        # 🎯 Universal Test Prompt
        #
        # "{TEST_PROMPT}"
        #
        # Carefully crafted to test naturalness, intelligibility, and technical pronunciation across all models
        #
        # """)

        # Evaluation Criteria
        # with gr.Row():
        #     with gr.Column():
        #         gr.HTML("""
        #         🎭
        #         Naturalness
        #         Human-like quality & emotional expression
        #         """)
        #     with gr.Column():
        #         gr.HTML("""
        #         🗣️
        #         Intelligibility
        #         Clarity & pronunciation accuracy
        #         """)
        #     with gr.Column():
        #         gr.HTML("""
        #         🎛️
        #         Controllability
        #         Tone, pace & parameter flexibility
        #         """)
        # gr.Markdown("---")
        # gr.Markdown("""
        # ## 🔑 Key Findings
        # 1. **Outstanding Speech Quality**
        #    Several models—namely **Kokoro-82M**, **csm-1b**, **Spark-TTS-0.5B**, **Orpheus-3b-0.1-ft**, **F5-TTS**, and **Llasa-3B**—delivered exceptionally natural, clear, and realistic synthesized speech. Among these, **csm-1b** and **F5-TTS** stood out as the most well-rounded: they combined top-tier naturalness and intelligibility with solid controllability.
        # 2. **Superior Controllability**
        #    **Zonos-v0.1-transformer** emerged as the leader in fine-grained control: it offers detailed adjustments for prosody, emotion, and audio quality, making it ideal for use cases that demand precise voice modulation.
        # 3. **Performance vs. Footprint Trade-off**
        #    Smaller models (e.g., **Kokoro-82M** at 82 million parameters) can still achieve “Good” or “Excellent” ratings in many scenarios, especially when efficient inference or low VRAM usage is critical. Larger models (1 billion–3 billion+ parameters) generally offer more versatility—handling multilingual synthesis, zero-shot voice cloning, and multi-speaker generation—but require heavier compute resources.
        # 4. **Special Notes on Multilingual & Cloning Capabilities**
        #    **Spark-TTS-0.5B** and **XTTS-v2** excel at cross-lingual and zero-shot voice cloning, making them strong candidates for projects that need multi-language support or short-clip cloning. **Llama-OuteTTS-1.0-1B** and **MegaTTS3** also offer multilingual input handling, though they may require careful sampling parameter tuning to achieve optimal results.
        # """)

        # Search and Filter Section
        with gr.Row():
            search_box = gr.Textbox(
                label="🔍 Search Models",
                placeholder="Filter by name or family (e.g., 'F5', 'TTS', '3B')",
                value="",
                scale=3,
            )
            clear_btn = gr.Button("Clear", scale=1)

        # Model Gallery Section
        gr.Markdown("## 🎧 Model Gallery")

        # Create model cards and audio players.
        # Each entry is (repo, card component, player-or-notice component).
        model_components = []
        for repo, display_name in MODELS.items():
            with gr.Group():
                # Model information card
                model_info = gr.HTML(create_model_card(repo))

                # Audio player, or a notice when the clip file is absent
                audio_path = get_audio_path(repo)
                if audio_path:
                    audio_player = gr.Audio(
                        value=audio_path,
                        label=f"🎵 {display_name} Audio Sample",
                        interactive=False,
                    )
                else:
                    audio_player = gr.HTML(
                        f"<p>🤷‍♂️ Audio sample not found for {display_name}</p>"
                    )

            model_components.append((repo, model_info, audio_player))

        # Search functionality
        def update_visibility(search_term):
            """Return one visibility update per output component.

            The order mirrors the ``outputs`` list passed to
            ``search_box.change``: (card, player) for each model in turn.
            """
            # Set membership keeps the per-model check O(1).
            matching = set(filter_models(search_term))
            updates = []
            for repo, _card, _player in model_components:
                visible = repo in matching
                updates.append(gr.update(visible=visible))  # model_info
                updates.append(gr.update(visible=visible))  # audio_player
            return updates

        # Connect search functionality
        search_box.change(
            fn=update_visibility,
            inputs=[search_box],
            outputs=[
                component
                for _repo, card, player in model_components
                for component in (card, player)
            ],
        )

        # Clearing writes an empty string back into the search box.
        clear_btn.click(
            fn=lambda: "",
            outputs=[search_box],
        )

        # ----- Disabled sections (kept as comments, not executed) -----
        # Methodology Section
        # with gr.Accordion("📋 Detailed Evaluation Methodology", open=False):
        #     gr.Markdown("""
        #     ### Test Prompt
        #     `Hello, this is a universal test sentence. Can the advanced Zylophonic system clearly articulate this and express a hint of excitement? The quick brown fox certainly hopes so!`
        #     ### Model Evaluation Criteria:
        #     🎭 **Naturalness (Human-like Quality)**
        #     - Prosody and rhythm patterns
        #     - Emotional expression capability
        #     - Voice texture and warmth
        #     - Natural breathing and pauses
        #     🗣️ **Intelligibility (Clarity & Accuracy)**
        #     - Word pronunciation precision
        #     - Consonant and vowel clarity
        #     - Sentence comprehensibility
        #     - Technical term handling
        #     🎛️ **Controllability (Flexibility)**
        #     - Parameter responsiveness
        #     - Tone modification capability
        #     - Speed and pitch control
        #     - Customization potential
        #     ### Key Insights:
        #     - Smaller models (82M-500M) can excel in specific scenarios
        #     - Larger models (1B-3B+) offer more versatility but require more resources
        #     - Architecture matters as much as parameter count
        #     - Training data quality significantly impacts output quality
        #     """)

        # Footer
        # gr.HTML("""
        #
        # """)

    return demo


# ---------- 4. Launch the application ----------
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        share=True,
        inbrowser=True,
        show_error=True,
    )