Lisa Dunlap committed on
Commit
1af0726
·
1 Parent(s): 224c2b6

initial commit

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/**/*.jsonl filter=lfs diff=lfs merge=lfs -text
+ data/**/*.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
+ # Python cache
+ __pycache__/
+ *.py[cod]
+
+ # Local gradio artifacts
+ .gradio/
app.py ADDED
@@ -0,0 +1,26 @@
+ import os
+ from pathlib import Path
+
+ import gradio as gr
+
+ # Hard-disable the Run Pipeline tab for this app entrypoint
+ os.environ["ENABLE_RUN_PIPELINE_TAB"] = "0"
+
+ from stringsight.dashboard import state
+ from stringsight.dashboard.app import create_app
+
+
+ # Point the dashboard to the repository's ./data directory
+ state.BASE_RESULTS_DIR = str((Path(__file__).parent / "data").resolve())
+
+ # Build the Gradio application; expose for Spaces
+ demo: gr.Blocks = create_app()
+ app: gr.Blocks = demo
+
+
+ if __name__ == "__main__":
+     # Local launch settings; Spaces will auto-serve the "demo/app" object
+     port = int(os.environ.get("PORT", 7860))
+     demo.launch(server_name="0.0.0.0", server_port=port)
+
+
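As a sanity check, the entrypoint above can be exercised locally (a minimal sketch; importing `app` runs the module-level setup and builds `demo` without launching, since the launch call is guarded by `__main__`):

```python
# Hypothetical local smoke test for the Space entrypoint above.
# Importing app disables the Run Pipeline tab, points state.BASE_RESULTS_DIR
# at ./data, and builds the `demo` Blocks object without launching it.
import app

app.demo.launch(server_name="127.0.0.1", server_port=8000)
```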
data/taubench_airline/cluster_scores.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fb852010abe92b482d5880f6ef89859545e00609cf5999a37319b7de89aee81
+ size 74398946
data/taubench_airline/cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd541fd6c20c51c6d6c2061bcae12fd979aa563ade3cebe7a93cf7364da044ca
+ size 58028003
data/taubench_airline/clustered_results_lightweight.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14f3aece9c76950a4cd31fa33e02ada38cb7e38e1a0f544d650c9031cba08869
+ size 121263508
data/taubench_airline/model_cluster_scores.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:101a4f1137ca6ae230fa27b6bb3309d5d5e31ce5c41e6686ce6291c344cb4fa3
+ size 76420820
data/taubench_airline/model_cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90176fb8107e8c31fdb4a62e7076adbc573889bd8e4bcad41fb2eb5d67d1e6f9
+ size 58100860
data/taubench_airline/model_scores.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef23296a9c714064f0363155f558267a4c8679b0b2dd4f828f1d77552ae0b0db
+ size 74327684
data/taubench_airline/model_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8aee6668d6abbc9e5ce1db3927b65f8c5e10da1fa1f066a2dc73822401af06ad
+ size 57978938
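Each `data/taubench_airline/*` file above is committed as a Git LFS pointer (the three `version`/`oid`/`size` fields) rather than the data itself. A minimal sketch of reading those fields back, assuming the three-line layout shown in the hunks above:

```python
# Minimal sketch: parse a Git LFS pointer file into a dict of its fields.
# Assumes the "version / oid / size" layout committed above.
from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        if key:
            fields[key] = value
    return fields

# e.g. read_lfs_pointer("data/taubench_airline/model_scores.json")
# -> {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:...", "size": "74327684"}
```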
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==5.31.0
+ pandas>=2.0.0
+ numpy>=1.24.0
+ plotly>=5.15.0
+ scikit-learn>=1.3.0
+ plotly-express>=0.4.1
+ markdown
+ pygments
+ pyflakes
stringsight/dashboard/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """Dashboard visualization for StringSight pipeline results.
+
+ This module provides a Gradio interface for exploring model performance,
+ cluster analysis, and detailed examples from pipeline output.
+
+ Usage:
+     from stringsight.dashboard import launch_app
+     launch_app(results_dir="path/to/results")
+ """
+
+ from .app import launch_app, create_app
+
+ __all__ = ["launch_app", "create_app"]
stringsight/dashboard/app.py ADDED
@@ -0,0 +1,1312 @@
+ """
+ Main Gradio application for LMM-Vibes pipeline results visualization.
+
+ This module creates a comprehensive Gradio interface for exploring model performance,
+ cluster analysis, and detailed examples from pipeline output.
+ """
+
+ import gradio as gr
+ from gradio.themes import Soft
+ import pandas as pd
+ import numpy as np
+ import plotly.graph_objects as go
+ from pathlib import Path
+ from typing import Dict, List, Any, Optional, Tuple
+ import os
+
+ from .data_loader import (
+     load_pipeline_results,
+     load_property_examples,
+     scan_for_result_subfolders,
+     validate_results_directory,
+     get_available_models
+ )
+ from .metrics_adapter import get_all_models
+ from .utils import (
+     compute_model_rankings,
+     create_model_summary_card,
+     format_cluster_dataframe,
+
+     search_clusters_by_text,
+     get_top_clusters_for_model,
+     create_interactive_cluster_viewer,
+     get_cluster_statistics,
+     get_unique_values_for_dropdowns,
+     get_example_data,
+     format_examples_display,
+     get_total_clusters_count
+ )
+
+ # ---------------------------------------------------------------------------
+ # NEW: centralised state + logic split into per-tab modules
+ # ---------------------------------------------------------------------------
+ from .state import app_state
+
+ # Tab-specific logic (moved out of this file)
+ from .load_data_tab import (
+     load_data,
+     get_available_experiments,
+     get_experiment_choices,
+     refresh_experiment_dropdown,
+     load_experiment_data,
+ )
+ from .overview_tab import create_overview, create_model_quality_plot, create_model_quality_table, get_available_model_quality_metrics
+ from .clusters_tab import view_clusters_interactive, view_clusters_table
+ from .examples_tab import (
+     get_dropdown_choices,
+     update_example_dropdowns,
+     view_examples,
+ )
+ from .plots_tab import create_plots_tab, create_plot_with_toggle, update_quality_metric_visibility, update_cluster_selection, get_available_quality_metrics
+
+ # app_state now comes from dashboard.state
+
+ # Feature flag: enable or disable the Run Pipeline tab (default: disabled)
+ ENABLE_RUN_PIPELINE_TAB = os.environ.get("ENABLE_RUN_PIPELINE_TAB", "0") not in ("0", "false", "False", "off", "")
+
+
+ def update_top_n_slider_maximum():
+     """Update the top N slider maximum based on total clusters in loaded data."""
+     from .state import app_state
+
+     if not app_state.get("metrics"):
+         return gr.Slider(minimum=1, maximum=10, value=3, step=1)
+
+     total_clusters = get_total_clusters_count(app_state["metrics"])
+     max_value = max(10, total_clusters)  # At least 10, or total clusters if more
+
+     return gr.Slider(
+         label="Top N Clusters per Model",
+         minimum=1,
+         maximum=max_value,
+         value=min(3, max_value),
+         step=1,
+         info=f"Number of top clusters to show per model (max: {total_clusters})"
+     )
+
+
+ def clear_search_bars():
+     """Clear all search bars when new data is loaded."""
+     return "", ""  # Returns empty strings for search_clusters and search_examples
+
+
+ def create_app() -> gr.Blocks:
+     """Create the main Gradio application."""
+
+     # Custom CSS for minimal margins and better sidebar layout + polished header/tabs
+     custom_css = """
+     /* Ensure the app itself spans the full page width (inside shadow root) */
+     :host {
+         width: 100% !important;
+         max-width: 100% !important;
+         margin: 0 !important;
+         padding: 0 !important;
+         /* Override Gradio's layout max width if present */
+         --layout-max-width: 100% !important;
+     }
+     /* Base font stack for broad compatibility */
+     body, .gradio-container {
+         font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", "Liberation Sans", sans-serif;
+     }
+     /* Ensure Examples tab inherits same font (avoid code blocks) */
+     #examples-container, #examples-container *:not(code):not(pre) {
+         font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", "Liberation Sans", sans-serif !important;
+     }
+
+     /* Universal reset for all elements */
+     * {
+         box-sizing: border-box !important;
+     }
+
+     .main-container {
+         width: 100% !important;
+         max-width: 100% !important;
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     .gradio-container {
+         width: 100% !important;
+         max-width: none !important;
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     /* --- Polished sticky header --- */
+     #app-header {
+         position: sticky;
+         top: 0;
+         z-index: 50;
+         backdrop-filter: saturate(180%) blur(8px);
+         -webkit-backdrop-filter: saturate(180%) blur(8px);
+         background: rgba(255,255,255,.85);
+         border-bottom: 1px solid rgba(15,23,42,.06);
+         padding: 12px 16px;
+         margin: 0 0 8px 0 !important;
+         display: flex;
+         align-items: center;
+         justify-content: space-between;
+         width: 100%;
+     }
+     .brand { display:flex; align-items:center; gap:10px; font-weight:600; font-size:18px; color:#0f172a; }
+     .brand small { font-weight:500; color:#64748b; }
+     .header-right { display:flex; gap:8px; align-items:center; margin-left:auto; }
+     /* Ensure the right group actually sticks to the right */
+     #app-header > *:last-child { margin-left: auto !important; }
+     #app-header .header-right { margin-left: auto !important; justify-content: flex-end !important; }
+     #app-header .header-right > * { margin-left: 0 !important; }
+     .header-badge { background:#eef2ff; color:#3730a3; border-radius:9999px; padding:2px 8px; font-size:12px; border:1px solid #c7d2fe; }
+     /* Round the tab buttons into pills with clear active state */
+     .tabs .tab-nav button { border-radius:9999px !important; padding:6px 12px !important; }
+     .tabs .tab-nav button.selected { background:#eef2ff !important; color:#3730a3 !important; }
+     /* Tone down color for model selection group (Gradio renders as pill labels) */
+     #selected-models label { background: #f8fafc !important; color: #111827 !important; border: 1px solid #e2e8f0 !important; }
+     #selected-models label:hover { background: #f1f5f9 !important; }
+     #selected-models .selected, #selected-models [data-selected="true"],
+     #selected-models label[aria-pressed="true"],
+     #selected-models label:has(input:checked) { background: #f1f5f9 !important; border-color: #e2e8f0 !important; color: #111827 !important; }
+     #selected-models input[type="checkbox"] { accent-color: #94a3b8 !important; }
+     /* Help panel card */
+     #help-panel { margin: 8px 12px; padding: 12px; background: #ffffff; border: 1px solid #e5e7eb; border-radius: 8px; }
+     #help-panel .gr-prose, #help-panel .prose, #help-panel .markdown, #help-panel p, #help-panel div { background: #ffffff !important; }
+     /* Style the Close button with a light tint */
+     #help-close-btn button { background: #eef2ff !important; color: #3730a3 !important; border: 1px solid #c7d2fe !important; }
+     #help-close-btn button:hover { background: #e0e7ff !important; }
+     /* Compact Help button */
+     #help-btn { flex: 0 0 auto !important; width: auto !important; display: inline-flex !important; }
+     #help-btn button { padding: 2px 8px !important; min-width: unset !important; width: auto !important; }
+
+     .tabs {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     .tab-nav {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     .tab-content {
+         margin: 0 !important;
+         padding: 5px 0 2px 8px !important;
+     }
+     .sidebar {
+         border-left: 1px solid #e0e0e0;
+         background-color: #f8f9fa;
+         padding: 8px !important;
+         order: 2;
+     }
+     .main-content {
+         padding: 5px 0 2px 8px !important;
+         order: 1;
+     }
+     /* Additional selectors to override Gradio's default margins */
+     .block {
+         margin: 0 !important;
+         padding: 2px 0 2px 8px !important;
+     }
+     .form {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     body {
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     .app {
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     /* Target specific Gradio container classes */
+     .gradio-row {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     .gradio-column {
+         margin: 0 !important;
+         padding: 0 0 0 8px !important;
+     }
+     /* Override any container padding */
+     .container {
+         width: 100% !important;
+         max-width: none !important;
+         padding: 5px 0 0 8px !important;
+         margin: 0 !important;
+     }
+     /* Target the root element */
+     #root {
+         padding: 5px 0 0 8px !important;
+         margin: 0 !important;
+     }
+     /* Make sure no right padding on wrapper elements */
+     .wrap {
+         width: 100% !important;
+         max-width: none !important;
+         padding: 0 !important;
+         margin: 0 !important;
+     }
+     /* Aggressive targeting of common Gradio elements */
+     div[class*="gradio"] {
+         padding-right: 0 !important;
+         margin-right: 0 !important;
+     }
+     /* Target any div that might have padding */
+     .gradio-blocks > div,
+     .gradio-blocks div[style*="padding"] {
+         padding-right: 0 !important;
+         margin-right: 0 !important;
+     }
+     /* Ensure content fills width */
+     .gradio-blocks {
+         width: 100% !important;
+         max-width: none !important;
+         padding: 5px 0 0 8px !important;
+         margin: 0 !important;
+     }
+
+     /* Catch-all: remove max-width and auto-centering from any container-like nodes */
+     [class*="container"], [class*="Container"], [class*="main"], [class*="Main"], [class*="block"], [class*="Block"] {
+         max-width: none !important;
+         margin-left: 0 !important;
+         margin-right: 0 !important;
+     }
+
+     /* Slight right margin for overall app */
+     .gradio-container {
+         margin-right: 12px !important;
+     }
+
+     /* Ensure slight right padding inside the app content */
+     .main-container,
+     .gradio-blocks,
+     .tab-content,
+     .main-content,
+     .container,
+     #root,
+     .app,
+     .wrap,
+     .gradio-column {
+         padding-right: 12px !important;
+     }
+
+     /* Final override: ensure host has slight right padding so it's always visible */
+     :host {
+         padding-right: 12px !important;
+     }
+     """
+
+     # Modern theme setup (Inter font, neutral slate, indigo primary)
+     theme = Soft(
+         primary_hue="indigo",
+         neutral_hue="slate",
+     )
+
+     with gr.Blocks(title="LMM-Vibes Pipeline Results Explorer", theme=theme, css=custom_css, fill_width=True) as app:
+         # Header helpers
+         def _current_experiment_name() -> str:
+             from .state import app_state
+             from . import state
+             path = app_state.get("current_results_dir") or state.BASE_RESULTS_DIR or ""
+             if not path:
+                 return "No experiment loaded"
+             try:
+                 return Path(path).name
+             except Exception:
+                 return str(path)
+
+         def _render_badge_html() -> str:
+             exp = _current_experiment_name()
+             return f"<span class=\"header-badge\">{exp}</span>"
+
+         # Polished sticky header
+         with gr.Row(elem_id="app-header"):
+             with gr.Row(elem_classes=["header-left"]):
+                 gr.HTML(
+                     value=(
+                         "<div class=\"brand\">🧡 StringSight <small>Evaluation Console</small></div>"
+                     )
+                 )
+             # Move experiment selection to the header when a base directory is provided
+             from . import state
+             if state.BASE_RESULTS_DIR:
+                 experiment_dropdown = gr.Dropdown(
+                     label="Select Experiment",
+                     choices=get_experiment_choices(),
+                     value="Select an experiment...",
+                     show_label=False,
+                     interactive=True,
+                 )
+             with gr.Row(elem_classes=["header-right"]):
+                 help_btn = gr.Button("Help", variant="secondary", elem_id="help-btn")
+                 # Separate badge element we can update after data loads
+                 current_experiment_badge = gr.HTML(value=_render_badge_html(), visible=False)
+
+         # Contextual Help panel (hidden by default)
+         with gr.Group(visible=False, elem_id="help-panel") as help_panel:
+             help_md = gr.Markdown(
+                 """
+                 **Overview**: Compare model quality metrics and view model cards with top behavior clusters. Use Filter Controls to refine and switch between Plot/Table.
+
+                 **View Clusters**: Explore clusters interactively. Use the search field in this tab to filter cluster labels; optional tag filter appears when available.
+
+                 **View Examples**: Inspect individual examples with rich conversation rendering. Filter by prompt/model/cluster; adjust max examples and formatting options.
+                 """
+             )
+             help_close_btn = gr.Button("Close", variant="secondary", elem_id="help-close-btn")
+
+         with gr.Row():
+             # Sidebar for data loading and model selection
+             with gr.Column(scale=1, min_width=180, elem_classes=["sidebar"]):
+                 from . import state
+                 if state.BASE_RESULTS_DIR:
+                     gr.Markdown(f"Base Results Directory: `{state.BASE_RESULTS_DIR}`")
+                 else:
+                     gr.Markdown("Provide the path to your pipeline results directory containing either:")
+                     gr.Markdown("• **Legacy format**: `model_stats.json` + `clustered_results.jsonl`")
+                     gr.Markdown("• **Functional format**: `model_cluster_scores.json` + `cluster_scores.json` + `model_scores.json` + `clustered_results.jsonl`")
+                     gr.Markdown("*The app will automatically detect which format you're using.*")
+
+                 if not state.BASE_RESULTS_DIR:
+                     results_dir_input = gr.Textbox(
+                         label="Results Directory Path",
+                         placeholder="/path/to/your/results/directory",
+                         info="Directory containing pipeline results (legacy or functional format)"
+                     )
+
+                 data_status = gr.Markdown("")
+                 models_info = gr.Markdown("", visible=False)
+
+                 # Model selection (will be updated after loading)
+                 selected_models = gr.CheckboxGroup(
+                     label="Select Models for Analysis",
+                     show_label=False,
+                     choices=["all"],  # Provide default to prevent errors
+                     value=[],
+                     info="Choose which models to include in comparisons",
+                     elem_id="selected-models"
+                 )
+                 # Consolidated Tag selection (hidden until data provides tags)
+                 selected_tags = gr.CheckboxGroup(
+                     label="Filter by Tags",
+                     show_label=False,
+                     choices=[],
+                     value=[],
+                     info="Filter clusters/examples/plots by tags (derived from metadata)",
+                     visible=False,
+                 )
+
+             # Main content area with reduced margins
+             with gr.Column(scale=6, elem_classes=["main-content"]):
+                 with gr.Tabs(selected=1) as main_tabs:
+
+                     # Tab 0: Run Pipeline (conditionally enabled)
+                     if ENABLE_RUN_PIPELINE_TAB:
+                         from .run_pipeline_tab import create_run_pipeline_tab
+                         with gr.TabItem("🚀 Run Pipeline", id=0) as pipeline_tab:
+                             pipeline_components = create_run_pipeline_tab()
+                             # Store pipeline components for later event handler setup
+
+                     # Tab 1: Overview
+                     with gr.TabItem("📊 Overview", id=1) as overview_tab:
+                         # Accordion for Filter Controls
+                         with gr.Accordion("Filter Controls", open=False, visible=True) as filter_controls_acc:
+                             with gr.Row():
+                                 min_cluster_size = gr.Slider(
+                                     label="Minimum Cluster Size",
+                                     minimum=1, maximum=50, value=5, step=1,
+                                     # info="Hide clusters with fewer than this many examples"
+                                 )
+                                 score_significant_only = gr.Checkbox(
+                                     label="Show Only Frequency Significant Clusters",
+                                     value=False,
+                                     info="Only show clusters where the distinctiveness score is statistically significant"
+                                 )
+                                 quality_significant_only = gr.Checkbox(
+                                     label="Show Only Quality Significant Clusters",
+                                     value=False,
+                                     info="Only show clusters where the quality score is statistically significant"
+                                 )
+
+                             with gr.Row():
+                                 sort_by = gr.Dropdown(
+                                     label="Sort Clusters By",
+                                     choices=[
+                                         ("Relative Frequency (Descending)", "salience_desc"),
+                                         ("Relative Frequency (Ascending)", "salience_asc"),
+                                         ("Quality (Ascending)", "quality_asc"),
+                                         ("Quality (Descending)", "quality_desc"),
+                                         ("Frequency (Descending)", "frequency_desc"),
+                                         ("Frequency (Ascending)", "frequency_asc")
+                                     ],
+                                     value="salience_desc",
+                                     # info="How to sort clusters within each model card"
+                                 )
+                                 top_n_overview = gr.Slider(
+                                     label="Top N Clusters per Model",
+                                     minimum=1, maximum=10, value=3, step=1,
+                                     # info="Number of top clusters to show per model"
+                                 )
+
+                         # Accordion for Quality Plot
+                         with gr.Accordion("Benchmark Metrics", open=True, visible=True) as metrics_acc:
+                             with gr.Row():
+                                 quality_metric_overview = gr.Dropdown(
+                                     label="Quality Metric",
+                                     show_label=False,
+                                     choices=["helpfulness", "accuracy", "harmlessness", "honesty"],
+                                     value="accuracy",
+                                     # info="Select quality metric to display"
+                                 )
+                                 quality_view_type = gr.Dropdown(
+                                     label="View Type",
+                                     show_label=False,
+                                     choices=["Plot", "Table"],
+                                     value="Table",
+                                     # info="Choose between plot or table view"
+                                 )
+
+                             quality_plot_display = gr.Plot(
+                                 label="Model Quality Comparison",
+                                 show_label=False,
+                                 elem_id="quality-plot",
+                                 visible=True
+                             )
+
+                             quality_table_display = gr.HTML(
+                                 label="Model Quality Table",
+                                 visible=True,
+                                 value="<div style='color:#666;padding:8px;'>Switch view to Table or Plot as desired.</div>"
+                             )
+                         overview_display = gr.HTML(
+                             label="Model Overview",
+                             value="<p style='color: #666; padding: 20px;'>Select your experiment to begin.</p>",
+                             visible=True
+                         )
+
+                         refresh_overview_btn = gr.Button("Refresh Overview", visible=True)
+
+                     # Tab 2: View Clusters
+                     with gr.TabItem("📋 View Clusters", id=2) as clusters_tab:
+                         # gr.Markdown("### Interactive Cluster Viewer")
+
+                         with gr.Row():
+                             search_clusters = gr.Textbox(
+                                 label="Search Properties",
+                                 show_label=False,
+                                 placeholder="Search in property clusters...",
+                                 info="Search for specific terms in property clusters"
+                             )
+
+                         clusters_display = gr.HTML(
+                             label="Interactive Cluster Viewer",
+                             value="<p style='color: #666; padding: 20px;'>Load data and select models to view clusters</p>"
+                         )
+
+                         refresh_clusters_btn = gr.Button("Refresh Clusters")
+
+                     # Tab 3: View Examples
+                     with gr.TabItem("🔍 View Examples", id=3) as examples_tab:
+                         # gr.Markdown("### Individual Example Viewer")
+                         # gr.Markdown("Explore individual examples with full prompts, model responses, and property information. Click on examples to expand and view full details.")
+                         with gr.Row():
+                             search_examples = gr.Textbox(
+                                 label="Search Properties",
+                                 show_label=False,
+                                 placeholder="Search clusters or property descriptions...",
+                                 info="Search for specific terms in cluster names or property descriptions to filter examples"
+                             )
+
+                         with gr.Accordion("Search & Filter Options", open=False):
+
+                             with gr.Row():
+                                 with gr.Column(scale=1):
+                                     example_prompt_dropdown = gr.Dropdown(
+                                         label="Select Prompt",
+                                         show_label=False,
+                                         choices=["All Prompts"],
+                                         value="All Prompts",
+                                         info="Choose a specific prompt or 'All Prompts'"
+                                     )
+                                 with gr.Column(scale=1):
+                                     example_model_dropdown = gr.Dropdown(
+                                         label="Select Model",
+                                         show_label=False,
+                                         choices=["All Models"],
+                                         value="All Models",
+                                         info="Choose a specific model or 'All Models'"
+                                     )
+                                 with gr.Column(scale=1):
+                                     example_property_dropdown = gr.Dropdown(
+                                         label="Select Cluster",
+                                         show_label=False,
+                                         choices=["All Clusters"],
+                                         value="All Clusters",
+                                         info="Choose a specific cluster or 'All Clusters'"
+                                     )
+                                 # Tags are consolidated in the sidebar
+
+                             with gr.Row():
+                                 max_examples_slider = gr.Slider(
+                                     label="Max Examples",
+                                     show_label=False,
+                                     minimum=1, maximum=20, value=5, step=1,
+                                     info="Maximum number of examples to display"
+                                 )
+                                 use_accordion_checkbox = gr.Checkbox(
+                                     label="Use Accordion for System/Info Messages",
+                                     value=True,
+                                     info="Group system and info messages in collapsible sections"
+                                 )
+                                 pretty_print_checkbox = gr.Checkbox(
+                                     label="Pretty-print dictionaries",
+                                     value=False,
+                                     info="Format embedded dictionaries for readability"
+                                 )
+                                 show_unexpected_behavior_checkbox = gr.Checkbox(
+                                     label="Show Unexpected Behavior Only",
+                                     value=False,
+                                     info="Filter to show only examples with unexpected behavior"
+                                 )
+                             view_examples_btn = gr.Button("View Examples", variant="primary")
+
+                         examples_display = gr.HTML(
+                             label="Examples",
+                             value="<p style='color: #666; padding: 20px;'>Load data and select filters to view examples</p>",
+                             elem_id="examples-container",
+                         )
+
+                     # Tab 4: Plots
+                     with gr.TabItem("📊 Plots", id=4) as plots_tab:
+                         plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown, cluster_selector = create_plots_tab()
+                         # Internal state to carry a valid metric during chained updates
+                         quality_metric_state = gr.State(value=None)
+
+         # Define helper functions for event handlers
+         def show_overview_controls():
+             return (
+                 gr.update(visible=True),  # filter_controls_acc
+                 gr.update(visible=True),  # metrics_acc
+                 gr.update(visible=True),  # refresh_overview_btn
+             )
+
+         def compute_plots_quality_metric(plot_type: str, dropdown_value: str | None):
+             # Ensure we always pass a valid metric to the plot function during chained updates
+             if plot_type != "quality":
+                 return None
+             metrics = get_available_quality_metrics()
+             if not metrics:
+                 return None
+             if dropdown_value in metrics:
+                 return dropdown_value
+             return metrics[0]
+
+         def update_quality_metric_dropdown():
+             available_metrics = get_available_model_quality_metrics()
+             # Ensure value is valid for the updated choices
+             return gr.update(choices=available_metrics, value=(available_metrics[0] if available_metrics else None))
+
+         def update_quality_plot(selected_models, quality_metric):
+             return create_model_quality_plot(selected_models, quality_metric)
+
+         def _placeholder_plot(text: str = "Switch to the Plot view to see a chart"):
+             fig = go.Figure()
+             fig.update_layout(
+                 xaxis=dict(visible=False),
+                 yaxis=dict(visible=False),
+                 annotations=[dict(text=text, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper")],
+                 height=320,
+                 margin=dict(l=20, r=20, t=20, b=20)
+             )
+             return fig
+
+         def update_quality_display(selected_models, quality_metric, view_type):
+             # Hide the non-selected view to avoid showing placeholders
+             if view_type == "Plot":
+                 plot_val = create_model_quality_plot(selected_models, quality_metric) or _placeholder_plot("No data available for selected models")
+                 return (
+                     gr.update(value=plot_val, visible=True),
+                     gr.update(visible=False),
+                 )
+             else:  # Table
+                 table_val = create_model_quality_table(selected_models, quality_metric)
+                 return (
+                     gr.update(visible=False),
+                     gr.update(value=table_val, visible=True),
+                 )
+
+         def update_experiment_badge():
+             return _render_badge_html()
+
+         def safe_update_quality_display(selected_models, quality_metric, view_type):
+             # Simplified: always update directly
+             return update_quality_display(selected_models, quality_metric, view_type)
+
+         def update_overview_content_only(selected_models, top_n, score_sig, quality_sig, sort_by_val, min_cluster_sz, selected_tags_sidebar):
+             """Update only the overview model cards content, without affecting UI state or controls."""
+             if not app_state.get("metrics"):
+                 return "<p style='color: #666; padding: 20px;'>Please load data first.</p>"
+
+             # Just build and return the overview HTML
+             overview_html = create_overview(
+                 selected_models,
+                 top_n,
+                 score_sig,
+                 quality_sig,
+                 sort_by_val,
+                 min_cluster_sz,
+                 selected_tags=selected_tags_sidebar,
+             )
+             return overview_html
+
+         def update_sidebar_tags(selected_models_current: Optional[List[str]] = None):
+             # Populate sidebar tag checkboxes from clustered_df (respect selected models if provided)
+             if app_state.get("clustered_df") is None:
+                 return gr.update(choices=[], value=[], visible=False)
+             df = app_state["clustered_df"]
+             if selected_models_current:
+                 concrete = [m for m in selected_models_current if m != "all"]
+                 if concrete:
+                     df = df[df["model"].isin(concrete)]
+             choices = get_unique_values_for_dropdowns(df)
+             tags = choices.get("tags", []) or []
+             # Default select all tags (no filter)
+             return gr.update(choices=tags, value=tags, visible=bool(tags))
+
+
+         def create_overview_page(selected_models,
+                                  top_n,
+                                  score_sig,
+                                  quality_sig,
+                                  sort_by_val,
+                                  min_cluster_sz,
+                                  quality_metric,
+                                  view_type,
+                                  selected_tags_sidebar,
+                                  progress: gr.Progress = None):
+             # Simplified: no loading gate or build flag
+             if not app_state.get("metrics"):
+                 landing_html = "<p style='color: #666; padding: 20px;'>Select your experiment to begin.</p>"
+                 # Respect current view type: show only the chosen view
+                 if view_type == "Plot":
+                     return (
+                         gr.update(),
+                         gr.update(),
+                         gr.update(),
+                         gr.update(value=_placeholder_plot("Load data to view model quality."), visible=True),
+                         gr.update(visible=False),
+                         gr.update(value=landing_html),
+                     )
+                 else:
+                     return (
+                         gr.update(),
+                         gr.update(),
+                         gr.update(),
+                         gr.update(visible=False),
+                         gr.update(value="<div style='color:#666;padding:8px;'>Load data to view the quality table.</div>", visible=True),
+                         gr.update(value=landing_html),
+                     )
+
+             # Pre-compute ALL content before making any UI updates to ensure simultaneous display
+             if progress:
+                 progress(0.1, "Preparing benchmark metrics...")
+
+             # Prepare quality display; hide the non-selected view
+             if view_type == "Plot":
+                 plot_val = create_model_quality_plot(selected_models, quality_metric) or _placeholder_plot("No data available for selected models")
+                 table_val = None
+             else:
+                 table_val = create_model_quality_table(selected_models, quality_metric)
+                 plot_val = None
+
+             if progress:
+                 progress(0.5, "Building model overview cards...")
+
+             # Build overview cards
+             overview_html = create_overview(
+                 selected_models,
+                 top_n,
+                 score_sig,
+                 quality_sig,
+                 sort_by_val,
+                 min_cluster_sz,
+                 selected_tags=selected_tags_sidebar,
+             )
+
+             if progress:
+                 progress(0.9, "Finalizing display...")
+
+             # Do not toggle control visibility to avoid layout flicker
+             filter_controls_update = gr.update()
+             metrics_controls_update = gr.update()
+             refresh_btn_update = gr.update()
+
+             if progress:
+                 progress(1.0, "Overview ready")
+
+             return (
+                 filter_controls_update,
+                 metrics_controls_update,
+                 refresh_btn_update,
+                 (gr.update(value=plot_val, visible=True) if view_type == "Plot" else gr.update(visible=False)),
+                 (gr.update(value=table_val, visible=True) if view_type == "Table" else gr.update(visible=False)),
+                 gr.update(value=overview_html),
+             )
+
+
+         # Enhanced pipeline handler with tab switching and dropdown refresh
+         def enhanced_pipeline_handler(*args):
+             """Enhanced pipeline handler with tab switching and dropdown refresh."""
+             from .run_pipeline_tab import run_pipeline_handler
+             from .load_data_tab import get_experiment_choices
+
+             # Call the original pipeline handler
+             status_html, results_preview_html = run_pipeline_handler(*args)
+
+             # Check if pipeline completed successfully
+             pipeline_success = "<!-- SUCCESS -->" in status_html
+
+             if pipeline_success:
+                 # Clean up the success indicator from HTML
+                 status_html = status_html.replace("<!-- SUCCESS -->", "")
+
+                 # Refresh experiment dropdown choices
+                 experiment_choices = get_experiment_choices()
+
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(selected=1),  # Switch to Overview tab
+                     gr.update(choices=experiment_choices, value=experiment_choices[1] if len(experiment_choices) > 1 else None) if experiment_choices else gr.update()
+                 )
+             else:
+                 # Pipeline failed or still running - no changes
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(),  # No tab change
+                     gr.update()  # No dropdown change
+                 )
+
+         # Enhanced labeling handler with tab switching and dropdown refresh
+         def enhanced_label_handler(*args):
+             from .run_pipeline_tab import run_label_pipeline_handler
+             from .load_data_tab import get_experiment_choices
+             status_html, results_preview_html = run_label_pipeline_handler(*args)
+             pipeline_success = "<!-- SUCCESS -->" in status_html
+             if pipeline_success:
+                 status_html = status_html.replace("<!-- SUCCESS -->", "")
+                 experiment_choices = get_experiment_choices()
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(selected=1),
+                     gr.update(choices=experiment_choices, value=experiment_choices[1] if len(experiment_choices) > 1 else None) if experiment_choices else gr.update()
+                 )
+             else:
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(),
+                     gr.update()
+                 )
+
+         # Event handlers
+         from . import state
+         if state.BASE_RESULTS_DIR:
+             # Use dropdown for experiment selection
+             if 'experiment_dropdown' in locals():
+                 (experiment_dropdown.change(
+                     fn=load_experiment_data,
+                     inputs=[experiment_dropdown],
+                     outputs=[data_status, models_info, selected_models]
+                 ).then(
+                     fn=update_experiment_badge,
+                     outputs=[current_experiment_badge]
+                 ).then(
+                     fn=update_example_dropdowns,
+                     inputs=[selected_models],
+                     outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
+                 ).then(
+                     fn=update_sidebar_tags,
+                     inputs=[selected_models],
+                     outputs=[selected_tags]
+                 ).then(
+                     fn=update_quality_metric_dropdown,
+                     outputs=[quality_metric_overview]
+                 ).then(
+                     fn=view_examples,
+                     inputs=[
+                         example_prompt_dropdown,
+                         example_model_dropdown,
+                         example_property_dropdown,
+                         max_examples_slider,
+                         use_accordion_checkbox,
+                         pretty_print_checkbox,
+                         search_examples,
+                         show_unexpected_behavior_checkbox,
+                         selected_models,
+                         selected_tags,
+                     ],
+                     outputs=[examples_display]
+                 ).then(
+                     fn=update_top_n_slider_maximum,
+                     outputs=[top_n_overview]
+                 ).then(
+                     fn=clear_search_bars,
+                     outputs=[search_clusters, search_examples]
+                 ).then(
+                     fn=view_clusters_interactive,
+                     inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
+                     outputs=[clusters_display]
+                 ).then(
+                     fn=create_overview_page,
+                     inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
+                     outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
+                 ).then(
+                     fn=update_cluster_selection,
+                     inputs=[selected_models, selected_tags],
+                     outputs=[cluster_selector]
+                 ).then(
+                     fn=update_quality_metric_visibility,
+                     inputs=[plot_type_dropdown],
+                     outputs=[quality_metric_dropdown]
+                 ).then(
+                     fn=compute_plots_quality_metric,
+                     inputs=[plot_type_dropdown, quality_metric_dropdown],
+                     outputs=[quality_metric_state]
+                 ).then(
+                     fn=create_plot_with_toggle,
+                     inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
+                     outputs=[plot_display, plot_info]
+                 ))
+         else:
+             # Use textbox for manual path entry
+             if 'results_dir_input' in locals():
+                 (results_dir_input.submit(
+                     fn=load_data,
+                     inputs=[results_dir_input],
+                     outputs=[data_status, models_info, selected_models]
+                 ).then(
+                     fn=update_experiment_badge,
+                     outputs=[current_experiment_badge]
+                 ).then(
+                     fn=update_example_dropdowns,
+                     inputs=[selected_models],
+                     outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
+                 ).then(
+                     fn=update_sidebar_tags,
+                     inputs=[selected_models],
+                     outputs=[selected_tags]
+                 ).then(
+                     fn=update_quality_metric_dropdown,
+                     outputs=[quality_metric_overview]
+                 ).then(
+                     fn=view_examples,
+                     inputs=[
+                         example_prompt_dropdown,
+                         example_model_dropdown,
+                         example_property_dropdown,
+                         max_examples_slider,
+                         use_accordion_checkbox,
+                         pretty_print_checkbox,
+                         search_examples,
+                         show_unexpected_behavior_checkbox,
+                         selected_models,
+                         selected_tags,
+                     ],
+                     outputs=[examples_display]
+                 ).then(
+                     fn=update_top_n_slider_maximum,
+                     outputs=[top_n_overview]
+                 ).then(
+                     fn=clear_search_bars,
+                     outputs=[search_clusters, search_examples]
+                 ).then(
+                     fn=view_clusters_interactive,
+                     inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
+                     outputs=[clusters_display]
+                 ).then(
+                     fn=create_overview_page,
+                     inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
+                     outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
+                 ).then(
+                     fn=update_cluster_selection,
+                     inputs=[selected_models, selected_tags],
+                     outputs=[cluster_selector]
+                 ).then(
+                     fn=update_quality_metric_visibility,
+                     inputs=[plot_type_dropdown],
+                     outputs=[quality_metric_dropdown]
+                 ).then(
+                     fn=compute_plots_quality_metric,
+                     inputs=[plot_type_dropdown, quality_metric_dropdown],
+                     outputs=[quality_metric_state]
+                 ).then(
+                     fn=create_plot_with_toggle,
+                     inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
+                     outputs=[plot_display, plot_info]
+                 ))
942
+
943
+ # Tab switching should not trigger any updates - content should persist
944
+
945
+ refresh_overview_btn.click(
946
+ fn=create_overview_page,
947
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
948
+ outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
949
+ )
950
+
951
+ # Help button show/hide
952
+ help_btn.click(
953
+ fn=lambda: gr.update(visible=True),
954
+ outputs=[help_panel]
955
+ )
956
+ help_close_btn.click(
957
+ fn=lambda: gr.update(visible=False),
958
+ outputs=[help_panel]
959
+ )
960
+
961
+ # Quality plot interactions
962
+ # Update quality display when controls change
963
+ quality_metric_overview.change(
964
+ fn=update_quality_display,
965
+ inputs=[selected_models, quality_metric_overview, quality_view_type],
966
+ outputs=[quality_plot_display, quality_table_display]
967
+ )
968
+
969
+ quality_view_type.change(
970
+ fn=update_quality_display,
971
+ inputs=[selected_models, quality_metric_overview, quality_view_type],
972
+ outputs=[quality_plot_display, quality_table_display]
973
+ )
974
+
975
+ # Update quality display when selected models change
976
+ selected_models.change(
977
+ fn=update_quality_display,
978
+ inputs=[selected_models, quality_metric_overview, quality_view_type],
979
+ outputs=[quality_plot_display, quality_table_display]
980
+ )
981
+
982
+ refresh_clusters_btn.click(
983
+ fn=view_clusters_interactive,
984
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
985
+ outputs=[clusters_display]
986
+ )
987
+
988
+ # View Examples handlers
989
+ view_examples_btn.click(
990
+ fn=view_examples,
991
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
992
+ outputs=[examples_display]
993
+ )
994
+
995
+ # Auto-refresh examples when dropdowns change
996
+ example_prompt_dropdown.change(
997
+ fn=view_examples,
998
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
999
+ outputs=[examples_display]
1000
+ )
1001
+
1002
+ example_model_dropdown.change(
1003
+ fn=view_examples,
1004
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1005
+ outputs=[examples_display]
1006
+ )
1007
+
1008
+ example_property_dropdown.change(
1009
+ fn=view_examples,
1010
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1011
+ outputs=[examples_display]
1012
+ )
1013
+
1014
+ # Removed per-tab tag dropdown; using sidebar tags
1015
+
1016
+ # Auto-refresh examples when search term changes
1017
+ search_examples.change(
1018
+ fn=view_examples,
1019
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1020
+ outputs=[examples_display]
1021
+ )
1022
+
1023
+ # Auto-refresh examples when unexpected behavior checkbox changes
1024
+ show_unexpected_behavior_checkbox.change(
1025
+ fn=view_examples,
1026
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1027
+ outputs=[examples_display]
1028
+ )
1029
+
1030
+
1031
+
1032
+ # (Search Examples tab removed – no search_btn handler required)
1033
+
1034
+ # Plots Tab Handlers
1035
+ show_ci_checkbox.change(
1036
+ fn=create_plot_with_toggle,
1037
+ inputs=[plot_type_dropdown, quality_metric_dropdown, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1038
+ outputs=[plot_display, plot_info]
1039
+ )
1040
+
1041
+ # Quality metric dropdown handlers (only for quality plots)
1042
+ quality_metric_dropdown.change(
1043
+ fn=create_plot_with_toggle,
1044
+ inputs=[plot_type_dropdown, quality_metric_dropdown, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1045
+ outputs=[plot_display, plot_info]
1046
+ )
1047
+
1048
+ # Cluster selector change updates the plot and mapping text
1049
+ cluster_selector.change(
1050
+ fn=create_plot_with_toggle,
1051
+ inputs=[plot_type_dropdown, quality_metric_dropdown, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1052
+ outputs=[plot_display, plot_info]
1053
+ )
1054
+
1055
+ # Update quality metric visibility and plot based on plot type
1056
+ plot_type_dropdown.change(
1057
+ fn=update_quality_metric_visibility,
1058
+ inputs=[plot_type_dropdown],
1059
+ outputs=[quality_metric_dropdown]
1060
+ ).then(
1061
+ fn=compute_plots_quality_metric,
1062
+ inputs=[plot_type_dropdown, quality_metric_dropdown],
1063
+ outputs=[quality_metric_state]
1064
+ ).then(
1065
+ fn=create_plot_with_toggle,
1066
+ inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1067
+ outputs=[plot_display, plot_info]
1068
+ )
1069
+
1070
+ # Remove duplicate Overview rebuild on model selection; quality plot and clusters still update below
1071
+
1072
+ # Auto-refresh on significance filter changes - only update model cards content
1073
+ score_significant_only.change(
1074
+ fn=update_overview_content_only,
1075
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1076
+ outputs=[overview_display]
1077
+ )
1078
+
1079
+ quality_significant_only.change(
1080
+ fn=update_overview_content_only,
1081
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1082
+ outputs=[overview_display]
1083
+ )
1084
+
1085
+ # Auto-refresh on sort dropdown change - only update model cards content
1086
+ sort_by.change(
1087
+ fn=update_overview_content_only,
1088
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1089
+ outputs=[overview_display]
1090
+ )
1091
+
1092
+ # Auto-refresh on top N change - only update model cards content
1093
+ top_n_overview.change(
1094
+ fn=update_overview_content_only,
1095
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1096
+ outputs=[overview_display]
1097
+ )
1098
+
1099
+ # Auto-refresh on minimum cluster size change - only update model cards content
1100
+ min_cluster_size.change(
1101
+ fn=update_overview_content_only,
1102
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1103
+ outputs=[overview_display]
1104
+ )
1105
+
1106
+ # Update overview content and clusters when selected models change
1107
+ selected_models.change(
1108
+ fn=update_overview_content_only,
1109
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1110
+ outputs=[overview_display]
1111
+ ).then(
1112
+ fn=view_clusters_interactive,
1113
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
1114
+ outputs=[clusters_display]
1115
+ ).then(
1116
+ fn=update_example_dropdowns,
1117
+ inputs=[selected_models],
1118
+ outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
1119
+ ).then(
1120
+ fn=view_examples,
1121
+ inputs=[
1122
+ example_prompt_dropdown,
1123
+ example_model_dropdown,
1124
+ example_property_dropdown,
1125
+ max_examples_slider,
1126
+ use_accordion_checkbox,
1127
+ pretty_print_checkbox,
1128
+ search_examples,
1129
+ show_unexpected_behavior_checkbox,
1130
+ selected_models,
1131
+ selected_tags,
1132
+ ],
1133
+ outputs=[examples_display]
1134
+ ).then(
1135
+ fn=update_cluster_selection,
1136
+ inputs=[selected_models],
1137
+ outputs=[cluster_selector]
1138
+ ).then(
1139
+ fn=compute_plots_quality_metric,
1140
+ inputs=[plot_type_dropdown, quality_metric_dropdown],
1141
+ outputs=[quality_metric_state]
1142
+ ).then(
1143
+ fn=create_plot_with_toggle,
1144
+ inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1145
+ outputs=[plot_display, plot_info]
1146
+ )
1147
+
1148
+ # Auto-refresh clusters when search term changes (with debouncing)
1149
+ search_clusters.change(
1150
+ fn=view_clusters_interactive,
1151
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
1152
+ outputs=[clusters_display]
1153
+ )
1154
+
1155
+ # Sidebar tags: update clusters, overview, plots, and examples
1156
+ selected_tags.change(
1157
+ fn=view_clusters_interactive,
1158
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
1159
+ outputs=[clusters_display]
1160
+ ).then(
1161
+ fn=create_overview_page,
1162
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
1163
+ outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
1164
+ ).then(
1165
+ fn=update_cluster_selection,
1166
+ inputs=[selected_models, selected_tags],
1167
+ outputs=[cluster_selector]
1168
+ ).then(
1169
+ fn=create_plot_with_toggle,
1170
+ inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1171
+ outputs=[plot_display, plot_info]
1172
+ ).then(
1173
+ fn=view_examples,
1174
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1175
+ outputs=[examples_display]
1176
+ )
1177
+
1178
+ # (No global header search)
1179
+
1180
+ # Wire up enhanced handlers for Explain and Label with tab switching
1181
+ if ENABLE_RUN_PIPELINE_TAB:
1182
+ pipeline_components["run_button_explain"].click(
1183
+ fn=enhanced_pipeline_handler,
1184
+ inputs=pipeline_components["inputs_explain"],
1185
+ outputs=[
1186
+ pipeline_components["status_display"],
1187
+ pipeline_components["results_preview"],
1188
+ main_tabs,
1189
+ experiment_dropdown if 'experiment_dropdown' in locals() else results_dir_input
1190
+ ],
1191
+ show_progress="full"
1192
+ )
1193
+ pipeline_components["run_button_label"].click(
1194
+ fn=enhanced_label_handler,
1195
+ inputs=pipeline_components["inputs_label"],
1196
+ outputs=[
1197
+ pipeline_components["status_display"],
1198
+ pipeline_components["results_preview"],
1199
+ main_tabs,
1200
+ experiment_dropdown if 'experiment_dropdown' in locals() else results_dir_input
1201
+ ],
1202
+ show_progress="full"
1203
+ )
1204
+
1205
+ return app
1206
+
1207
+
1208
+ def launch_app(results_dir: Optional[str] = None,
1209
+ share: bool = False,
1210
+ server_name: str = "127.0.0.1",
1211
+ server_port: int = 7860,
1212
+ **kwargs) -> None:
1213
+ """Launch the Gradio application.
1214
+
1215
+ Args:
1216
+ results_dir: Optional path to base results directory containing experiment subfolders
1217
+ share: Whether to create a public link
1218
+ server_name: Server address
1219
+ server_port: Server port
1220
+ **kwargs: Additional arguments for gr.Blocks.launch()
1221
+ """
1222
+ # Set the base results directory in state BEFORE creating the app
1223
+ from . import state
1224
+ if results_dir:
1225
+ state.BASE_RESULTS_DIR = results_dir
1226
+ print(f"πŸ“ Base results directory set to: {results_dir}")
1227
+
1228
+ # Check if it's a valid directory
1229
+ if not os.path.exists(results_dir):
1230
+ print(f"⚠️ Warning: Base results directory does not exist: {results_dir}")
1231
+ state.BASE_RESULTS_DIR = None
1232
+ else:
1233
+ # Scan for available experiments
1234
+ experiments = get_available_experiments(results_dir)
1235
+ print(f"πŸ” Found {len(experiments)} experiments: {experiments}")
1236
+
1237
+ app = create_app()
1238
+
1239
+ # Auto-load data if BASE_RESULTS_DIR is set - automatically load the most recent experiment
1240
+ if state.BASE_RESULTS_DIR and os.path.exists(state.BASE_RESULTS_DIR):
1241
+ experiments = get_available_experiments(state.BASE_RESULTS_DIR)
1242
+ if len(experiments) >= 1:
1243
+ # Auto-load the most recent experiment (first in the sorted list)
1244
+ most_recent_experiment = experiments[0]
1245
+ experiment_path = os.path.join(state.BASE_RESULTS_DIR, most_recent_experiment)
1246
+ try:
1247
+ clustered_df, model_stats, model_cluster_df, results_path = load_pipeline_results(experiment_path)
1248
+ app_state['clustered_df'] = clustered_df
1249
+ app_state['model_stats'] = model_stats
1250
+ app_state['metrics'] = model_stats # Ensure metrics is also populated
1251
+ app_state['model_cluster_df'] = model_cluster_df
1252
+ app_state['results_path'] = results_path
1253
+ available_models = get_all_models(model_stats)
1254
+ app_state['available_models'] = available_models
1255
+ app_state['current_results_dir'] = experiment_path
1256
+ print(f"βœ… Auto-loaded most recent experiment: {most_recent_experiment}")
1257
+ print(f"πŸ“‹ Available models: {available_models}")
1258
+ if len(experiments) > 1:
1259
+ print(f"πŸ“‹ Found {len(experiments)} experiments. Loaded the most recent: {most_recent_experiment}")
1260
+ except Exception as e:
1261
+ print(f"❌ Failed to auto-load data: {e}")
1262
+ else:
1263
+ print(f"πŸ“‹ No valid experiments found in {state.BASE_RESULTS_DIR}")
1264
+
1265
+ print(f"πŸš€ Launching Gradio app on {server_name}:{server_port}")
1266
+ print(f"Share mode: {share}")
1267
+ print(f"πŸ”§ Additional kwargs: {kwargs}")
1268
+
1269
+ try:
1270
+ app.launch(
1271
+ share=share,
1272
+ server_name=server_name,
1273
+ server_port=server_port,
1274
+ show_error=True, # Show detailed error messages
1275
+ quiet=False, # Show more verbose output
1276
+ **kwargs
1277
+ )
1278
+ except Exception as e:
1279
+ print(f"❌ Failed to launch on port {server_port}: {e}")
1280
+ print("πŸ”„ Trying alternative port configuration...")
1281
+
1282
+ # Try with a port range instead of port 0
1283
+ try:
1284
+ # Try ports in a reasonable range
1285
+ for alt_port in [8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8088, 8089]:
1286
+ try:
1287
+ print(f"πŸ”„ Trying port {alt_port}...")
1288
+ app.launch(
1289
+ share=share,
1290
+ server_name=server_name,
1291
+ server_port=alt_port,
1292
+ show_error=True,
1293
+ quiet=False,
1294
+ **kwargs
1295
+ )
1296
+ break # If successful, break out of the loop
1297
+ except Exception as port_error:
1298
+ if "Cannot find empty port" in str(port_error):
1299
+ print(f" Port {alt_port} is busy, trying next...")
1300
+ continue
1301
+ else:
1302
+ raise port_error
1303
+ else:
1304
+ # If we get here, all ports in our range were busy
1305
+ raise Exception("All attempted ports (8080-8089) are busy")
1306
+
1307
+ except Exception as e2:
1308
+ print(f"❌ Failed to launch with alternative ports: {e2}")
1309
+ print("πŸ’‘ Try specifying a different port manually:")
1310
+ print(f" python -m stringsight.dashboard.launcher --port 9000")
1311
+ print(f" python -m stringsight.dashboard.launcher --auto_port")
1312
+ raise e2
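For reference, the `launch_app` entry point above can also be driven directly from Python rather than through a launcher CLI. A minimal sketch, assuming the package is importable; the results path is a placeholder:

```python
# Minimal usage sketch for launch_app (the results path is a placeholder).
from stringsight.dashboard import launch_app

launch_app(
    results_dir="results/",   # hypothetical base dir containing experiment subfolders
    share=False,              # set True to create a public Gradio link
    server_name="127.0.0.1",
    server_port=7860,
)
```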
stringsight/dashboard/clusters_tab.py ADDED
@@ -0,0 +1,234 @@
1
+ """Helpers for the **View Clusters** tab – both the interactive HTML and
2
+ fallback dataframe view."""
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+ import ast
7
+
8
+ from .state import app_state
9
+ from .utils import (
10
+ search_clusters_by_text,
11
+ search_clusters_only,
12
+ create_interactive_cluster_viewer,
13
+ get_cluster_statistics,
14
+ format_cluster_dataframe,
15
+ extract_allowed_tag,
16
+ )
17
+
18
+ __all__ = ["view_clusters_interactive", "view_clusters_table"]
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Interactive HTML view
23
+ # ---------------------------------------------------------------------------
24
+
25
+ def view_clusters_interactive(
26
+ selected_models: List[str],
27
+ cluster_level: str,
28
+ search_term: str = "",
29
+ selected_tags: List[str] | None = None,
30
+ ) -> str:
31
+ if app_state["clustered_df"] is None:
32
+ return (
33
+ "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
34
+ "using the 'Load Data' tab</p>"
35
+ )
36
+
37
+ df = app_state["clustered_df"].dropna(subset=["property_description"]).copy()
38
+
39
+ # Apply search filter first
40
+ if search_term and search_term.strip():
41
+ df = search_clusters_only(df, search_term.strip(), cluster_level)
42
+
43
+ # Optional tags filter – only keep rows whose meta resolves to an allowed tag in selected_tags
44
+ if selected_tags and len(selected_tags) > 0 and 'meta' in df.columns:
45
+ def _first_allowed_tag(obj):
46
+ return extract_allowed_tag(obj)
47
+
48
+ # Check if all meta are empty dicts (means no tags)
49
+ def _parse_try(obj):
50
+ if isinstance(obj, str):
51
+ try:
52
+ return ast.literal_eval(obj)
53
+ except Exception:
54
+ return obj
55
+ return obj
56
+
57
+ parsed_meta = df['meta'].apply(_parse_try)
58
+ non_null_parsed = [m for m in parsed_meta.tolist() if m is not None]
59
+ all_empty_dicts = (
60
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
61
+ )
62
+ if not all_empty_dicts:
63
+ allowed = set(map(str, selected_tags))
64
+ df = df[df['meta'].apply(_first_allowed_tag).astype(str).isin(allowed)]
65
+
66
+ # Build interactive viewer
67
+ cluster_html = create_interactive_cluster_viewer(df, selected_models, cluster_level)
68
+
69
+ # Statistics summary at the top
70
+ stats = get_cluster_statistics(df, selected_models)
71
+ if not stats:
72
+ return (
73
+ "<p style='color: #e74c3c; padding: 20px;'>❌ No cluster data available</p>"
74
+ )
75
+
76
+ # Get additional metrics from cluster_scores
77
+ cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})
78
+
79
+ # Calculate average quality scores and frequency
80
+ total_frequency = 0
81
+ quality_scores_list = []
82
+ metric_names = set()
83
+
84
+ for cluster_name, cluster_data in cluster_scores.items():
85
+ total_frequency += cluster_data.get("proportion", 0) * 100
86
+ quality_scores = cluster_data.get("quality", {})
87
+ if quality_scores:
88
+ quality_scores_list.extend(quality_scores.values())
89
+ metric_names.update(quality_scores.keys())
90
+
91
+ avg_quality = sum(quality_scores_list) / len(quality_scores_list) if quality_scores_list else 0
92
+ metrics_suffix = f" ({', '.join(sorted(metric_names))})" if metric_names else ""
93
+
94
+ stats_html = f"""
95
+ <div style="
96
+ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
97
+ color: white;
98
+ padding: 20px;
99
+ border-radius: 8px;
100
+ margin-bottom: 20px;
101
+ box-shadow: 0 4px 6px rgba(0,0,0,0.1);
102
+ ">
103
+ <h3 style="margin: 0 0 15px 0;">Cluster Statistics</h3>
104
+ <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 8px;">
105
+ <div>
106
+ <div style="font-size: 24px; font-weight: bold;">{stats['total_properties']:,}</div>
107
+ <div style="opacity: 0.9;">Total Properties</div>
108
+ </div>
109
+ <div>
110
+ <div style="font-size: 24px; font-weight: bold;">{stats['total_models']}</div>
111
+ <div style="opacity: 0.9;">Models</div>
112
+ </div>
113
+ """
114
+
115
+ if cluster_level == "fine" and "fine_clusters" in stats:
116
+ stats_html += f"""
117
+ <div>
118
+ <div style="font-size: 24px; font-weight: bold;">{stats['fine_clusters']}</div>
119
+ <div style="opacity: 0.9;">Fine Clusters</div>
120
+ </div>
121
+ <div>
122
+ <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_fine_cluster']:.1f}</div>
123
+ <div style="opacity: 0.9;">Avg Properties/Cluster</div>
124
+ </div>
125
+ """
126
+ elif cluster_level == "coarse" and "coarse_clusters" in stats:
127
+ stats_html += f"""
128
+ <div>
129
+ <div style="font-size: 24px; font-weight: bold;">{stats['coarse_clusters']}</div>
130
+ <div style="opacity: 0.9;">Coarse Clusters</div>
131
+ </div>
132
+ <div>
133
+ <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_coarse_cluster']:.1f}</div>
134
+ <div style="opacity: 0.9;">Avg Properties/Cluster</div>
135
+ </div>
136
+ """
137
+
138
+ stats_html += """
139
+ </div>
140
+ </div>
141
+ """
142
+
143
+ # Add a note if coarse clusters were requested but not available
144
+ if cluster_level == "coarse" and "coarse_clusters" not in stats and "fine_clusters" in stats:
145
+ stats_html += """
146
+ <div style="
147
+ background: #fff3cd;
148
+ border-left: 4px solid #ffc107;
149
+ padding: 10px 15px;
150
+ margin-bottom: 15px;
151
+ border-radius: 4px;
152
+ ">
153
+ ⚠️ <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
154
+ </div>
155
+ """
156
+
157
+ # Additional filter chips
158
+ filter_info = ""
159
+ if search_term and search_term.strip():
160
+ filter_info += f"""
161
+ <div style="
162
+ background: #e3f2fd;
163
+ border-left: 4px solid #2196f3;
164
+ padding: 10px 15px;
165
+ margin-bottom: 15px;
166
+ border-radius: 4px;
167
+ ">
168
+ 🔍 <strong>Search Filter:</strong> "{search_term}"
169
+ </div>
170
+ """
171
+
172
+ if selected_models:
173
+ filter_info += f"""
174
+ <div style="
175
+ background: #f3e5f5;
176
+ border-left: 4px solid #9c27b0;
177
+ padding: 10px 15px;
178
+ margin-bottom: 15px;
179
+ border-radius: 4px;
180
+ ">
181
+ 🎯 <strong>Selected Models:</strong> {', '.join(selected_models)}
182
+ </div>
183
+ """
184
+
185
+ if selected_tags and len(selected_tags) > 0:
186
+ filter_info += f"""
187
+ <div style="
188
+ background: #e8f5e9;
189
+ border-left: 4px solid #4caf50;
190
+ padding: 10px 15px;
191
+ margin-bottom: 15px;
192
+ border-radius: 4px;
193
+ ">
194
+ 🏷️ <strong>Tag Filter:</strong> {', '.join(selected_tags)}
195
+ </div>
196
+ """
197
+
198
+ return stats_html + filter_info + cluster_html
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Dataframe fallback view
203
+ # ---------------------------------------------------------------------------
204
+
205
+ def view_clusters_table(
206
+ selected_models: List[str],
207
+ cluster_level: str,
208
+ search_term: str = "",
209
+ ) -> pd.DataFrame:
210
+ if app_state["clustered_df"] is None:
211
+ return pd.DataFrame({"Message": ["Please load data first using the 'Load Data' tab"]})
212
+
213
+ df = app_state["clustered_df"].copy()
214
+
215
+ if search_term and search_term.strip():
216
+ df = search_clusters_only(df, search_term.strip(), cluster_level)
217
+
218
+ formatted_df = format_cluster_dataframe(df, selected_models, cluster_level)
219
+
220
+ if formatted_df.empty:
221
+ if search_term and search_term.strip():
222
+ return pd.DataFrame({"Message": [f"No results found for search term '{search_term}'. Try a different search term."]})
223
+ elif selected_models:
224
+ available_models = df["model"].unique().tolist() if "model" in df.columns else []
225
+ return pd.DataFrame({"Message": [
226
+ f"No data found for selected models: {', '.join(selected_models)}. "
227
+ f"Available models: {', '.join(available_models)}"
228
+ ]})
229
+ else:
230
+ return pd.DataFrame({"Message": [
231
+ "No data available. Please check your data files and try reloading."
232
+ ]})
233
+
234
+ return formatted_df
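As a usage note, `view_clusters_table` can be exercised outside the UI once `app_state` is populated. A sketch that points at the results bundled under this repo's `data/` directory; the model name and search term are hypothetical:

```python
# Sketch: load results into app_state, then query the fallback table view.
from stringsight.dashboard.state import app_state
from stringsight.dashboard.data_loader import load_pipeline_results
from stringsight.dashboard.clusters_tab import view_clusters_table

clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(
    "data/taubench_airline"
)
app_state["clustered_df"] = clustered_df
app_state["metrics"] = metrics

table = view_clusters_table(
    selected_models=["gpt-4o"],  # hypothetical model name
    cluster_level="fine",
    search_term="baggage",       # hypothetical search term
)
print(table.head())
```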
stringsight/dashboard/conversation_display.py ADDED
@@ -0,0 +1,674 @@
1
+ from __future__ import annotations
2
+
3
+ """Conversation display helpers for dashboard.
4
+
5
+ This module encapsulates everything related to:
6
+ • safely parsing model responses (lists / dicts / JSON strings)
7
+ • pretty-printing embedded dictionaries for readability
8
+ • converting multiple conversation formats to the OpenAI chat list format
9
+ • rendering that list as HTML (including accordion grouping + raw JSON viewer).
10
+
11
+ Moving this logic out of utils.py keeps the latter lean and focused on general
12
+ analytics utilities.
13
+ """
14
+
15
+ from typing import List, Dict, Any
16
+ import ast
17
+ import json
18
+ import html
19
+ import markdown
20
+ import re
21
+
22
+ __all__: List[str] = [
23
+ "convert_to_openai_format",
24
+ "display_openai_conversation_html",
25
+ "pretty_print_embedded_dicts",
26
+ ]
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Pretty-printing helpers
30
+ # ---------------------------------------------------------------------------
31
+
32
+ def _find_balanced_spans(text: str):
33
+ """Return (start, end) spans of balanced {...} or [...] regions in *text*."""
34
+ spans, stack = [], []
35
+ for i, ch in enumerate(text):
36
+ if ch in "{[":
37
+ stack.append((ch, i))
38
+ elif ch in "]}" and stack:
39
+ opener, start = stack.pop()
40
+ if (opener, ch) in {("{", "}"), ("[", "]")} and not stack:
41
+ spans.append((start, i + 1))
42
+ return spans
43
+
44
+
45
+ def _try_parse_slice(slice_: str):
46
+ """Attempt to parse *slice_* into a Python object; return None on failure."""
47
+ try:
48
+ return ast.literal_eval(slice_)
49
+ except Exception:
50
+ try:
51
+ return json.loads(slice_)
52
+ except Exception:
53
+ return None
54
+
55
+
56
+ def _find_code_spans(text: str) -> List[tuple]:
57
+ """Return spans for markdown code regions to be preserved as-is.
58
+
59
+ Includes:
60
+ - fenced code blocks delimited by ``` ... ```
61
+ - inline code segments delimited by `...`
62
+ """
63
+ spans: List[tuple] = []
64
+
65
+ # Fenced blocks ``` ... ``` (language spec allowed after opening fence)
66
+ idx = 0
67
+ while True:
68
+ start = text.find("```", idx)
69
+ if start == -1:
70
+ break
71
+ # Find the end fence
72
+ end = text.find("```", start + 3)
73
+ if end == -1:
74
+ # Unclosed fence: treat rest of string as code
75
+ spans.append((start, len(text)))
76
+ break
77
+ spans.append((start, end + 3))
78
+ idx = end + 3
79
+
80
+ # Inline code `...`
81
+ for m in re.finditer(r"`[^`]*`", text, flags=re.DOTALL):
82
+ spans.append((m.start(), m.end()))
83
+
84
+ # Sort and merge overlapping spans
85
+ spans.sort()
86
+ merged: List[tuple] = []
87
+ for s, e in spans:
88
+ if not merged or s > merged[-1][1]:
89
+ merged.append((s, e))
90
+ else:
91
+ merged[-1] = (merged[-1][0], max(merged[-1][1], e))
92
+ return merged
93
+
94
+
95
+ def _is_inside_any_span(start: int, end: int, spans: List[tuple]) -> bool:
96
+ for s, e in spans:
97
+ if start >= s and end <= e:
98
+ return True
99
+ return False
100
+
101
+
102
+ def pretty_print_embedded_dicts(text: str) -> str:
103
+ """Replace dicts, lists, or other complex structures with pretty-printed JSON, except inside code.
104
+
105
+ Dict-like regions that fall within markdown code spans (inline backticks
106
+ or fenced code blocks) are left untouched so code examples render verbatim.
107
+ """
108
+ if not text:
109
+ return text
110
+
111
+ code_spans = _find_code_spans(text)
112
+
113
+ def _to_json_safe(obj: Any):
114
+ """Recursively convert Python objects to JSON-serializable equivalents.
115
+
116
+ - Ellipsis (…) or ... becomes "..."
117
+ - Unsupported objects become str(obj)
118
+ """
119
+ if obj is ... or isinstance(obj, type(Ellipsis)):
120
+ return "..."
121
+ if isinstance(obj, dict):
122
+ return {str(k): _to_json_safe(v) for k, v in obj.items()}
123
+ if isinstance(obj, list):
124
+ return [_to_json_safe(v) for v in obj]
125
+ if isinstance(obj, tuple):
126
+ return [_to_json_safe(v) for v in obj]
127
+ if isinstance(obj, (str, int, float, bool)) or obj is None:
128
+ return obj
129
+ return str(obj)
130
+
131
+ def _is_complex_structure(obj):
132
+ """Check if object is worth pretty-printing (not just a simple value)"""
133
+ if isinstance(obj, dict):
134
+ return len(obj) > 0
135
+ elif isinstance(obj, list):
136
+ return len(obj) > 0 and any(isinstance(item, (dict, list)) for item in obj)
137
+ return False
138
+
139
+ def _format_with_preserved_spacing(json_str):
140
+ """Convert JSON string to HTML with preserved indentation and wrapping.
141
+
142
+ Use a <pre> block with white-space: pre-wrap so that long tokens can wrap
143
+ while preserving indentation and newlines without converting spaces to
144
+ non-breaking spaces (which prevents wrapping).
145
+ """
146
+ formatted = html.escape(json_str, quote=False)
147
+ return (
148
+ "<pre style=\"font-family: monospace; line-height: 1.4; font-size: 14px; "
149
+ "white-space: pre-wrap !important; word-break: break-word; overflow-wrap: anywhere; "
150
+ "background: #ffffff; padding: 10px; border-radius: 4px; margin: 0;\">"
151
+ f"{formatted}"
152
+ "</pre>"
153
+ )
154
+
155
+ new_parts, last_idx = [], 0
156
+ for start, end in _find_balanced_spans(text):
157
+ candidate = text[start:end]
158
+ parsed = _try_parse_slice(candidate)
159
+
160
+ if _is_complex_structure(parsed) and not _is_inside_any_span(start, end, code_spans):
161
+ new_parts.append(html.escape(text[last_idx:start], quote=False))
162
+ pretty = json.dumps(_to_json_safe(parsed), indent=2, ensure_ascii=False)
163
+ new_parts.append(_format_with_preserved_spacing(pretty))
164
+ last_idx = end
165
+ new_parts.append(html.escape(text[last_idx:], quote=False))
166
+ return "".join(new_parts)
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # Format conversion
170
+ # ---------------------------------------------------------------------------
171
+
172
+ def convert_to_openai_format(response_data: Any):
173
+ """Convert various response payloads into the OpenAI chat format list."""
174
+ if isinstance(response_data, list):
175
+ return response_data
176
+ if isinstance(response_data, dict):
177
+ # If it already looks like an OpenAI-style message, wrap it in a list
178
+ if "role" in response_data and "content" in response_data:
179
+ return [response_data]
180
+ # Otherwise treat dict as assistant content (preserve structure for tool_calls)
181
+ return [{"role": "assistant", "content": response_data}]
182
+ if isinstance(response_data, str):
183
+ # Try Python literal first (handles single quotes)
184
+ try:
185
+ parsed = ast.literal_eval(response_data)
186
+ if isinstance(parsed, list):
187
+ return parsed
188
+ except (ValueError, SyntaxError):
189
+ pass
190
+ # Try JSON
191
+ try:
192
+ parsed = json.loads(response_data)
193
+ if isinstance(parsed, list):
194
+ return parsed
195
+ except json.JSONDecodeError:
196
+ pass
197
+ # Fallback plain-text assistant message
198
+ return [{"role": "assistant", "content": response_data}]
199
+ # Fallback for any other type
200
+ return [{"role": "assistant", "content": str(response_data)}]
201
+
202
+ # ---------------------------------------------------------------------------
203
+ # HTML rendering
204
+ # ---------------------------------------------------------------------------
205
+
206
+ def _markdown(text: str, *, pretty_print_dicts: bool = True) -> str:
207
+ """Render markdown, optionally pretty-printing any embedded dicts."""
208
+ processed = pretty_print_embedded_dicts(text) if pretty_print_dicts else html.escape(text, quote=False)
209
+
210
+ # Configure extensions for proper code block handling
211
+ extensions = ["fenced_code"]
212
+ extension_configs = {}
213
+
214
+ try:
215
+ import pygments
216
+ extensions.append("codehilite")
217
+ extension_configs['codehilite'] = {
218
+ 'css_class': 'highlight',
219
+ 'use_pygments': True,
220
+ 'guess_lang': True,
221
+ 'linenums': False
222
+ }
223
+ except ImportError:
224
+ pass
225
+
226
+ # Convert newlines to <br> only outside of code blocks
227
+ # Process fenced code blocks first, then handle line breaks
228
+ result = markdown.markdown(processed, extensions=extensions, extension_configs=extension_configs)
229
+
230
+ # IMPORTANT: Avoid injecting <br> tags when lists are present, as this can
231
+ # introduce empty bullets or odd spacing in nested lists.
232
+ # "re" is already imported at module scope
233
+ if re.search(r'<(ul|ol)\b', result):
234
+ return result
235
+
236
+ # Otherwise, add line breaks for non-code content only
237
+ code_block_pattern = r'(<pre[^>]*>.*?</pre>|<code[^>]*>.*?</code>)'
238
+ parts = re.split(code_block_pattern, result, flags=re.DOTALL)
239
+
240
+ for i in range(0, len(parts), 2): # Process non-code parts only
241
+ if i < len(parts):
242
+ parts[i] = re.sub(r'(?<!\n)\n(?!\n)', '<br>\n', parts[i])
243
+
244
+ return ''.join(parts)
245
+
246
+
247
+ def display_openai_conversation_html(conversation_data: List[Dict[str, Any]], *, use_accordion: bool = True, pretty_print_dicts: bool = True, evidence: Any = None) -> str:
248
+ """Convert an OpenAI-style conversation list into styled HTML for Gradio."""
249
+ from .examples_helpers import annotate_text_with_evidence_placeholders, HIGHLIGHT_START, HIGHLIGHT_END
250
+ if not conversation_data:
251
+ return "<p>No conversation data available</p>"
252
+
253
+ # Collapsed raw JSON section for debugging
254
+ raw_json = json.dumps(conversation_data, indent=2, ensure_ascii=False)
255
+ html_out = f"""
256
+ <details style="margin: 8px 0;">
257
+ <summary style="cursor: pointer; font-weight: 600;">
258
+ Click to see raw response ({len(conversation_data)})
259
+ </summary>
260
+ <div style="padding: 8px 15px;">
261
+ <pre style="white-space: pre-wrap; word-wrap: break-word; overflow-wrap: anywhere; background: #ffffff; padding: 10px; border-radius: 4px;">{html.escape(raw_json, quote=False)}</pre>
262
+ </div>
263
+ </details>
264
+ """
265
+
266
+ role_colors = {
267
+ "system": "#ff6b6b",
268
+ "info": "#4ecdc4",
269
+ "assistant": "#45b7d1",
270
+ "tool": "#96ceb4",
271
+ "user": "#feca57",
272
+ }
273
+
274
+ def _maybe_annotate(content_str: str) -> str:
275
+ if evidence is None or not isinstance(content_str, str) or not content_str.strip():
276
+ return content_str
277
+ return annotate_text_with_evidence_placeholders(content_str, evidence)
278
+
279
+ def _replace_placeholders_with_mark(html_str: str) -> str:
280
+ if not html_str:
281
+ return html_str
282
+ return (
283
+ html_str
284
+ .replace(HIGHLIGHT_START, "<mark class=\"evidence-highlight\">")
285
+ .replace(HIGHLIGHT_END, "</mark>")
286
+ )
287
+
288
+ def _format_tool_calls(content: Dict[str, Any]) -> str:
289
+ """Format tool calls in a more readable way."""
290
+ if not isinstance(content, dict) or "tool_calls" not in content:
291
+ return f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
292
+
293
+ tool_calls = content["tool_calls"]
294
+ if not isinstance(tool_calls, list):
295
+ return f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
296
+
297
+ html_parts = []
298
+
299
+ for i, tool_call in enumerate(tool_calls, 1):
300
+ if not isinstance(tool_call, dict):
301
+ continue
302
+
303
+ # Extract tool call information
304
+ name = tool_call.get("name", "Unknown tool")
305
+ arguments = tool_call.get("arguments", "")
306
+ tool_id = tool_call.get("id", tool_call.get("tool_call_id", ""))
307
+ # Coerce call type to a safe uppercase string
308
+ raw_call_type = tool_call.get("type", "function")
309
+ call_type = str(raw_call_type or "function")
310
+
311
+ # Parse arguments if they're a JSON string
312
+ formatted_args = arguments
313
+ if isinstance(arguments, str) and arguments.strip():
314
+ try:
315
+ parsed_args = json.loads(arguments)
316
+ formatted_args = json.dumps(parsed_args, indent=2, ensure_ascii=False)
317
+ except json.JSONDecodeError:
318
+ formatted_args = arguments
319
+ elif isinstance(arguments, (dict, list, tuple, int, float, bool)) or arguments is None:
320
+ # Stringify any non-string argument type
321
+ try:
322
+ formatted_args = json.dumps(arguments, indent=2, ensure_ascii=False)
323
+ except Exception:
324
+ formatted_args = str(arguments)
325
+
326
+ # Format with preserved spacing for proper indentation
327
+ if formatted_args and isinstance(formatted_args, str) and ('\n' in formatted_args or ' ' in formatted_args):
328
+ escaped_args = html.escape(formatted_args, quote=False)
329
+ formatted_args = (
330
+ "<pre style=\"font-family: monospace; line-height: 1.4; font-size: 14px; "
331
+ "white-space: pre-wrap !important; word-break: break-word; overflow-wrap: anywhere; "
332
+ "background: #ffffff; padding: 10px; border-radius: 4px; margin: 0;\">"
333
+ f"{escaped_args}"
334
+ "</pre>"
335
+ )
336
+ else:
337
+ formatted_args = html.escape(str(formatted_args), quote=False)
338
+
339
+ # Create the tool call display
340
+ tool_html = f"""
341
+ <div style="border: 1px solid #ff7f00; border-radius: 8px; margin: 8px 0; padding: 12px; background: #fff8f0;">
342
+ <div style="display: flex; align-items: center; margin-bottom: 8px;">
343
+ <span style="background: #ff7f00; color: white; padding: 2px 6px; border-radius: 4px; font-size: 11px; font-weight: bold; margin-right: 8px;">
344
+ {call_type.upper()}
345
+ </span>
346
+ <span style="font-weight: 600; color: #d2691e; font-size: 14px;">{html.escape(name)}</span>
347
+ {f'<span style="margin-left: auto; font-size: 11px; color: #666;">ID: {html.escape(tool_id)}</span>' if tool_id else ''}
348
+ </div>
349
+
350
+ {f'''<div style="margin-top: 8px;">
351
+ <div style="font-weight: 600; color: #666; margin-bottom: 4px; font-size: 12px;">Arguments:</div>
352
+ <div style="font-size: 12px; line-height: 1.4; color: #333;">{formatted_args}</div>
353
+ </div>''' if formatted_args else ''}
354
+ </div>
355
+ """
356
+
357
+ html_parts.append(tool_html)
358
+
359
+ if len(tool_calls) > 1:
360
+ return f"""
361
+ <div style="border-left: 3px solid #ff7f00; padding-left: 12px; margin: 8px 0;">
362
+ <div style="font-weight: 600; color: #d2691e; margin-bottom: 8px; font-size: 14px;">
363
+ {len(tool_calls)} tool call{'s' if len(tool_calls) != 1 else ''}:
364
+ </div>
365
+ {''.join(html_parts)}
366
+ </div>
367
+ """
368
+ else:
369
+ return ''.join(html_parts)
370
+
371
+ def _format_msg(role: str, content: Any) -> str:
372
+ # Check if this is a tool call by examining the content
373
+ is_tool_call = False
374
+ if isinstance(content, dict) and "tool_calls" in content:
375
+ is_tool_call = True
376
+
377
+ if isinstance(content, dict) or (isinstance(content, list) and content and all(isinstance(d, dict) for d in content)):
378
+ if is_tool_call:
379
+ # Render assistant text (if provided) plus styled tool calls
380
+ text_html = ""
381
+ if isinstance(content, dict) and isinstance(content.get("text"), str) and content.get("text").strip():
382
+ annotated = _maybe_annotate(content.get("text", ""))
383
+ text_html = _markdown(annotated, pretty_print_dicts=pretty_print_dicts)
384
+ text_html = _replace_placeholders_with_mark(text_html)
385
+ content_html = text_html + _format_tool_calls(content)
386
+ elif pretty_print_dicts:
387
+ def _to_json_safe_inline(obj: Any):
388
+ if obj is ... or isinstance(obj, type(Ellipsis)):
389
+ return "..."
390
+ if isinstance(obj, dict):
391
+ return {str(k): _to_json_safe_inline(v) for k, v in obj.items()}
392
+ if isinstance(obj, list):
393
+ return [_to_json_safe_inline(v) for v in obj]
394
+ if isinstance(obj, tuple):
395
+ return [_to_json_safe_inline(v) for v in obj]
396
+ if isinstance(obj, (str, int, float, bool)) or obj is None:
397
+ return obj
398
+ return str(obj)
399
+
400
+ safe_json = html.escape(json.dumps(_to_json_safe_inline(content), indent=2, ensure_ascii=False), quote=False)
401
+ content_html = (
402
+ f"<pre style='background: #ffffff; padding: 10px; border-radius: 4px; "
403
+ f"white-space: pre-wrap !important; word-break: break-word; overflow-wrap: anywhere;'>{safe_json}</pre>"
404
+ )
405
+ else:
406
+ content_html = f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
407
+ elif isinstance(content, str):
408
+ # Insert highlight placeholders before markdown so offsets make sense in plain text
409
+ annotated = _maybe_annotate(content)
410
+ content_html = _markdown(annotated, pretty_print_dicts=pretty_print_dicts)
411
+ # Convert placeholders to <mark> after markdown
412
+ content_html = _replace_placeholders_with_mark(content_html)
413
+ elif content is None:
414
+ content_html = "<em>(No content)</em>"
415
+ else:
416
+ content_html = str(content)
417
+
418
+ # Determine role display text and color
419
+ if is_tool_call:
420
+ # Keep assistant styling; tool blocks are styled within
421
+ role_display = "assistant"
422
+ color = role_colors.get("assistant", "#95a5a6")
423
+ else:
424
+ role_display = role
425
+ color = role_colors.get(role.lower(), "#95a5a6")
426
+
427
+ return (
428
+ f"<div style='border-left: 4px solid {color}; margin: 8px 0; background-color: #ffffff; padding: 12px; border-radius: 0 8px 8px 0;'>"
429
+ f"<div style='font-weight: 600; color: {color}; margin-bottom: 8px; text-transform: capitalize; font-size: 16px;'>{role_display}</div>"
430
+ f"<div style='color: #333; line-height: 1.6; font-family: inherit; font-size: 15px;'>{content_html}</div>"
431
+ "</div>"
432
+ )
433
+
434
+ if use_accordion:
435
+ system_msgs, info_msgs, other_msgs = [], [], []
436
+ for m in conversation_data:
437
+ if not isinstance(m, dict):
438
+ continue
439
+ role = m.get("role", "unknown").lower()
440
+ content = m.get("content", "")
441
+ if isinstance(content, dict) and "text" in content and "tool_calls" not in content:
442
+ content = content["text"]
443
+ if role == "system":
444
+ system_msgs.append((role, content))
445
+ elif role == "info":
446
+ info_msgs.append((role, content))
447
+ else:
448
+ other_msgs.append((role, content))
449
+
450
+ def _accordion(title: str, items: List):
451
+ if not items:
452
+ return ""
453
+ inner = "".join(_format_msg(r, c) for r, c in items)
454
+ return (
455
+ f"<details style='margin: 8px 0;'>"
456
+ f"<summary style='cursor: pointer; font-weight: 600;'>"
457
+ f"{html.escape(title)} ({len(items)})" # e.g. "Click to see system messages (3)"
458
+ f"</summary>"
459
+ f"<div style='padding: 8px 15px;'>{inner}</div>"
460
+ "</details>"
461
+ )
462
+
463
+ html_out += _accordion("Click to see system messages", system_msgs)
464
+ html_out += _accordion("Click to see info messages", info_msgs)
465
+ for r, c in other_msgs:
466
+ html_out += _format_msg(r, c)
467
+ else:
468
+ # No accordion: just render everything
469
+ for m in conversation_data:
470
+ if not isinstance(m, dict):
471
+ continue
472
+ role = m.get("role", "unknown").lower()
473
+ content = m.get("content", "")
474
+ if isinstance(content, dict) and "text" in content and "tool_calls" not in content:
475
+ content = content["text"]
476
+ html_out += _format_msg(role, content)
477
+
478
+ # CSS for proper code block styling and summary hover effects
479
+ css_styles = """
480
+ <style>
481
+ .evidence-highlight { background: #ffff8b; padding: 0 2px; }
482
+ :root {
483
+ /* Code block color palette - GitHub Light inspired */
484
+ --code-bg: transparent; /* make JSON/code wrapper background transparent */
485
+ --code-text: #24292f;
486
+ --code-comment: #6a737d;
487
+ --code-keyword: #d73a49;
488
+ --code-string: #032f62;
489
+ --code-number: #005cc5;
490
+ --code-operator: #24292f;
491
+ --code-function: #6f42c1;
492
+ --code-border: #d0d7de;
493
+
494
+ /* Inline code colors - same light theme */
495
+ --inline-code-bg: #f3f4f6;
496
+ --inline-code-text: #24292f;
497
+ --inline-code-border: #d1d5db;
498
+
499
+ /* Code block structure */
500
+ --code-border-radius: 8px;
501
+ --code-padding: 16px;
502
+ --code-font-size: 14px;
503
+ --code-line-height: 1.5;
504
+ --code-font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', 'SF Mono', Consolas, 'Liberation Mono', Menlo, Courier, monospace;
505
+ }
506
+
507
+ /* Base code styling */
508
+ pre, code {
509
+ font-family: var(--code-font-family) !important;
510
+ font-size: var(--code-font-size) !important;
511
+ line-height: var(--code-line-height) !important;
512
+ font-variant-ligatures: normal !important;
513
+ -webkit-font-smoothing: antialiased !important;
514
+ -moz-osx-font-smoothing: grayscale !important;
515
+ }
516
+
517
+ /* Fenced code blocks - light theme */
518
+ .highlight, .codehilite, pre.highlight, pre.codehilite,
519
+ .language-python, .language-text, .language-bash {
520
+ background: var(--code-bg) !important;
521
+ color: var(--code-text) !important;
522
+ border: 1px solid var(--code-border) !important;
523
+ border-radius: var(--code-border-radius) !important;
524
+ padding: var(--code-padding) !important;
525
+ margin: 12px 0 !important;
526
+ overflow-x: auto !important;
527
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05) !important;
528
+ position: relative !important;
529
+ white-space: pre !important;
530
+ display: block !important;
531
+ }
532
+
533
+ .highlight pre, .codehilite pre {
534
+ background: transparent !important;
535
+ color: inherit !important;
536
+ margin: 0 !important;
537
+ padding: 0 !important;
538
+ border: none !important;
539
+ border-radius: 0 !important;
540
+ overflow: visible !important;
541
+ white-space: pre !important;
542
+ display: block !important;
543
+ }
544
+
545
+ /* Ensure code blocks preserve formatting */
546
+ .highlight code, .codehilite code {
547
+ white-space: pre !important;
548
+ display: block !important;
549
+ padding: 0 !important;
550
+ margin: 0 !important;
551
+ background: transparent !important;
552
+ border: none !important;
553
+ font-size: inherit !important;
554
+ line-height: inherit !important;
555
+ }
556
+
557
+ /* Add a language label to fenced blocks (hardcoded to 'python' for now) */
558
+ .highlight::before, .codehilite::before {
559
+ content: 'python';
560
+ position: absolute;
561
+ top: 8px;
562
+ right: 12px;
563
+ background: rgba(0, 0, 0, 0.05);
564
+ color: #586069;
565
+ padding: 2px 8px;
566
+ border-radius: 4px;
567
+ font-size: 11px;
568
+ font-weight: 500;
569
+ text-transform: uppercase;
570
+ letter-spacing: 0.5px;
571
+ }
572
+
573
+ /* Syntax highlighting for Python - Light theme */
574
+ .highlight .k, .codehilite .k, /* keywords */
575
+ .highlight .kn, .codehilite .kn, /* keyword.namespace */
576
+ .highlight .kp, .codehilite .kp, /* keyword.pseudo */
577
+ .highlight .kr, .codehilite .kr, /* keyword.reserved */
578
+ .highlight .kt, .codehilite .kt /* keyword.type */
579
+ {
580
+ color: var(--code-keyword) !important;
581
+ font-weight: 600 !important;
582
+ }
583
+
584
+ .highlight .s, .codehilite .s, /* strings */
585
+ .highlight .s1, .codehilite .s1, /* string.single */
586
+ .highlight .s2, .codehilite .s2, /* string.double */
587
+ .highlight .se, .codehilite .se /* string.escape */
588
+ {
589
+ color: var(--code-string) !important;
590
+ }
591
+
592
+ .highlight .c, .codehilite .c, /* comments */
593
+ .highlight .c1, .codehilite .c1, /* comment.single */
594
+ .highlight .cm, .codehilite .cm /* comment.multiline */
595
+ {
596
+ color: var(--code-comment) !important;
597
+ font-style: italic !important;
598
+ }
599
+
600
+ .highlight .m, .codehilite .m, /* numbers */
601
+ .highlight .mi, .codehilite .mi, /* number.integer */
602
+ .highlight .mf, .codehilite .mf, /* number.float */
603
+ .highlight .mo, .codehilite .mo /* number.octal */
604
+ {
605
+ color: var(--code-number) !important;
606
+ font-weight: 600 !important;
607
+ }
608
+
609
+ .highlight .nf, .codehilite .nf, /* function names */
610
+ .highlight .fm, .codehilite .fm /* function.magic */
611
+ {
612
+ color: var(--code-function) !important;
613
+ font-weight: 600 !important;
614
+ }
615
+
616
+ .highlight .o, .codehilite .o, /* operators */
617
+ .highlight .ow, .codehilite .ow /* operator.word */
618
+ {
619
+ color: var(--code-operator) !important;
620
+ }
621
+
622
+ /* Inline code - light theme */
623
+ p code, li code, div code, span code,
624
+ h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
625
+ background: var(--inline-code-bg) !important;
626
+ color: var(--inline-code-text) !important;
627
+ border: 1px solid var(--inline-code-border) !important;
628
+ padding: 2px 6px !important;
629
+ border-radius: 4px !important;
630
+ font-size: 0.9em !important;
631
+ font-weight: 600 !important;
632
+ white-space: nowrap !important;
633
+ box-shadow: none !important;
634
+ display: inline !important;
635
+ }
636
+
637
+ /* Code blocks inside paragraphs should not be treated as inline */
638
+ p pre, li pre, div pre {
639
+ background: var(--code-bg) !important;
640
+ color: var(--code-text) !important;
641
+ border: 1px solid var(--code-border) !important;
642
+ border-radius: var(--code-border-radius) !important;
643
+ padding: var(--code-padding) !important;
644
+ margin: 8px 0 !important;
645
+ white-space: pre !important;
646
+ overflow-x: auto !important;
647
+ display: block !important;
648
+ }
649
+
650
+ /* Scrollbar styling for code blocks - light theme */
651
+ .highlight::-webkit-scrollbar, .codehilite::-webkit-scrollbar,
652
+ pre::-webkit-scrollbar {
653
+ height: 8px !important;
654
+ background: #f1f3f4 !important;
655
+ border-radius: 4px !important;
656
+ }
657
+
658
+ .highlight::-webkit-scrollbar-thumb, .codehilite::-webkit-scrollbar-thumb,
659
+ pre::-webkit-scrollbar-thumb {
660
+ background: #c1c8cd !important;
661
+ border-radius: 4px !important;
662
+ }
663
+
664
+ .highlight::-webkit-scrollbar-thumb:hover, .codehilite::-webkit-scrollbar-thumb:hover,
665
+ pre::-webkit-scrollbar-thumb:hover {
666
+ background: #a8b3ba !important;
667
+ }
668
+ </style>
669
+ """
670
+
671
+ css_styles += "</style>"
672
+ html_out = css_styles + html_out
673
+
674
+ return html_out
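To make the rendering path concrete, here is a small sketch that normalizes a raw JSON payload with `convert_to_openai_format` and renders it with `display_openai_conversation_html` (it assumes the sibling `examples_helpers` module imported above is available):

```python
# Sketch: raw string -> OpenAI-style message list -> styled HTML fragment.
from stringsight.dashboard.conversation_display import (
    convert_to_openai_format,
    display_openai_conversation_html,
)

raw = '[{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "**Hello!**"}]'
conversation = convert_to_openai_format(raw)  # JSON string parsed into a list
fragment = display_openai_conversation_html(conversation, use_accordion=False)
print(fragment[:300])  # HTML suitable for a Gradio HTML component
```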
stringsight/dashboard/data_loader.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ Data loading functionality for the StringSight Gradio app.
3
+
4
+ This module handles loading pipeline results and converting them to formats
5
+ suitable for the Gradio interface.
6
+ """
7
+
8
+ import json
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Tuple, Optional
12
+ import os
13
+
14
+ from .state import app_state
15
+ from .plotting import create_model_cluster_dataframe
16
+
17
+
18
+ class DataCache:
19
+ """Simple cache for loaded data to avoid re-loading."""
20
+ _cache = {}
21
+
22
+ @classmethod
23
+ def get(cls, key: str):
24
+ return cls._cache.get(key)
25
+
26
+ @classmethod
27
+ def set(cls, key: str, value: Any):
28
+ cls._cache[key] = value
29
+
30
+ @classmethod
31
+ def clear(cls):
32
+ cls._cache.clear()
33
+
34
+
35
+ def scan_for_result_subfolders(base_dir: str) -> List[str]:
36
+ """Scan for subfolders that might contain pipeline results."""
37
+ base_path = Path(base_dir)
38
+ if not base_path.exists():
39
+ return []
40
+
41
+ # Look for subfolders that contain the required files
42
+ subfolders = []
43
+ for item in base_path.iterdir():
44
+ if item.is_dir():
45
+ # Check if this subfolder contains pipeline results
46
+ required_files = [
47
+ "model_cluster_scores.json",
48
+ "cluster_scores.json",
49
+ "model_scores.json",
50
+ "clustered_results_lightweight.jsonl"
51
+ ]
52
+ if all((item / f).exists() for f in required_files):
53
+ subfolders.append(item.name)
54
+
55
+ return subfolders
56
+
57
+
58
+ def validate_results_directory(results_dir: str) -> Tuple[bool, str]:
59
+ """Validate that the results directory contains the expected files."""
60
+ results_path = Path(results_dir)
61
+
62
+ if not results_path.exists():
63
+ return False, f"Directory does not exist: {results_dir}"
64
+
65
+ if not results_path.is_dir():
66
+ return False, f"Path is not a directory: {results_dir}"
67
+
68
+ # Check for FunctionalMetrics format files
69
+ required_files = [
70
+ "model_cluster_scores.json",
71
+ "cluster_scores.json",
72
+ "model_scores.json",
73
+ ]
74
+
75
+ missing_files = []
76
+ for filename in required_files:
77
+ if not (results_path / filename).exists():
78
+ missing_files.append(filename)
79
+
80
+ # Check for clustered results
81
+ if not (results_path / "clustered_results_lightweight.jsonl").exists():
82
+ missing_files.append("clustered_results_lightweight.jsonl")
83
+
84
+ if missing_files:
85
+ return False, f"Missing required files: {', '.join(missing_files)}"
86
+
87
+ return True, ""
88
+
89
+
90
+ def get_available_models(metrics: Dict[str, Any]) -> List[str]:
91
+ """Extract available models from metrics data."""
92
+ model_cluster_scores = metrics.get("model_cluster_scores", {})
93
+ return list(model_cluster_scores.keys())
94
+
95
+
96
+ def get_all_models(metrics: Dict[str, Any]) -> List[str]:
97
+ """Get all available models from metrics data."""
98
+ return get_available_models(metrics)
99
+
100
+
101
+ def load_pipeline_results(results_dir: str) -> Tuple[pd.DataFrame, Dict[str, Any], pd.DataFrame, Path]:
102
+ """Load pipeline outputs (FunctionalMetrics format only).
103
+ Returns:
104
+ clustered_df: DataFrame of per-conversation data loaded from clustered_results_lightweight.jsonl
105
+ metrics: Dict containing the three FunctionalMetrics score dictionaries
106
+ model_cluster_df: DataFrame created from model_cluster_scores for plotting/analysis
107
+ results_path: Path to the results directory
108
+ """
109
+ cache_key = f"pipeline_results_{results_dir}"
110
+ cached = DataCache.get(cache_key)
111
+ if cached:
112
+ return cached
113
+
114
+ results_path = Path(results_dir)
115
+ if not results_path.exists():
116
+ raise FileNotFoundError(f"Results directory does not exist: {results_dir}")
117
+
118
+ # ------------------------------------------------------------------
119
+ # 1. Load FunctionalMetrics score files (must ALL be present)
120
+ # ------------------------------------------------------------------
121
+ required_files = [
122
+ "model_cluster_scores.json",
123
+ "cluster_scores.json",
124
+ "model_scores.json",
125
+ ]
126
+ missing = [f for f in required_files if not (results_path / f).exists()]
127
+ if missing:
128
+ raise FileNotFoundError(
129
+ f"Missing required metrics files in {results_dir}: {', '.join(missing)}"
130
+ )
131
+
132
+ with open(results_path / "model_cluster_scores.json") as f:
133
+ model_cluster_scores = json.load(f)
134
+ with open(results_path / "cluster_scores.json") as f:
135
+ cluster_scores = json.load(f)
136
+ with open(results_path / "model_scores.json") as f:
137
+ model_scores = json.load(f)
138
+
139
+ metrics = {
140
+ "model_cluster_scores": model_cluster_scores,
141
+ "cluster_scores": cluster_scores,
142
+ "model_scores": model_scores,
143
+ }
144
+
145
+ # ------------------------------------------------------------------
146
+ # 2. Load clustered conversation data (JSON-Lines)
147
+ # ------------------------------------------------------------------
148
+ clustered_path = results_path / "clustered_results_lightweight.jsonl"
149
+ if not clustered_path.exists():
150
+ raise FileNotFoundError(f"clustered_results_lightweight.jsonl not found in {results_dir}")
151
+
152
+ try:
153
+ clustered_df = pd.read_json(clustered_path, lines=True)
154
+ except Exception as e:
155
+ raise ValueError(f"Could not load clustered results: {e}")
156
+
157
+ # ------------------------------------------------------------------
158
+ # 3. Create model_cluster_df from metrics for plotting/analysis
159
+ # ------------------------------------------------------------------
160
+ model_cluster_df = create_model_cluster_dataframe(model_cluster_scores)
161
+
162
+ result = (clustered_df, metrics, model_cluster_df, results_path)
163
+ DataCache.set(cache_key, result)
164
+ return result
165
+
166
+
167
+ def load_property_examples(results_path: Path, property_ids: List[str]) -> pd.DataFrame:
168
+ """Load specific property examples on-demand"""
169
+ if not property_ids:
170
+ return pd.DataFrame()
171
+
172
+ cache_key = f"examples_{results_path}_{hash(tuple(sorted(property_ids)))}"
173
+ cached = DataCache.get(cache_key)
174
+ if cached is not None:
175
+ return cached
176
+
177
+ # Load full dataset to get prompt/response details
178
+ clustered_path = results_path / "clustered_results_lightweight.jsonl"
179
+
180
+ if not clustered_path.exists():
181
+ raise FileNotFoundError("Could not load example data - clustered_results_lightweight.jsonl not found")
182
+
183
+ try:
184
+ full_df = pd.read_json(clustered_path, lines=True)
185
+ result = full_df[full_df['id'].isin(property_ids)]
186
+ DataCache.set(cache_key, result)
187
+ return result
188
+ except Exception as e:
189
+ raise ValueError(f"Failed to load examples: {e}")
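A minimal loading sketch, pointed at one of the experiment folders bundled under this repo's `data/` directory:

```python
# Sketch: validate a results folder, then load (and cache) its contents.
from stringsight.dashboard.data_loader import (
    validate_results_directory,
    load_pipeline_results,
)

results_dir = "data/taubench_airline"
ok, err = validate_results_directory(results_dir)
if ok:
    clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(results_dir)
    print(len(clustered_df), "conversations; metrics keys:", list(metrics.keys()))
else:
    print("Invalid results dir:", err)
```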
stringsight/dashboard/demo.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ Demo script showing different ways to use the StringSight Gradio visualization.
3
+
4
+ This demonstrates the Python API for launching the Gradio app.
5
+ """
6
+
7
+ import argparse
8
+ from pathlib import Path
9
+ from stringsight.dashboard import launch_app, create_app
10
+
11
+
12
+ def demo_basic_launch():
13
+ """Demo: Basic launch without pre-loading data."""
14
+ print("πŸš€ Demo: Basic launch - data can be loaded through the UI")
15
+ launch_app()
16
+
17
+
18
+ def demo_preload_data(results_dir: str):
19
+ """Demo: Launch with pre-loaded data."""
20
+ print(f"πŸš€ Demo: Launch with pre-loaded data from {results_dir}")
21
+ launch_app(results_dir=results_dir)
22
+
23
+
24
+ def demo_custom_settings(results_dir: str = None):
25
+ """Demo: Launch with custom settings."""
26
+ print("πŸš€ Demo: Launch with custom settings")
27
+ launch_app(
28
+ results_dir=results_dir,
29
+ share=True, # Create public shareable link
30
+ server_name="0.0.0.0", # Allow access from other machines
31
+ server_port=8080, # Custom port
32
+ )
33
+
34
+
35
+ def demo_programmatic_access():
36
+ """Demo: Create app object for programmatic access."""
37
+ print("πŸš€ Demo: Programmatic app creation")
38
+
39
+ # Create the app object without launching
40
+ app = create_app()
41
+
42
+ # You could modify the app here if needed
43
+ # app.title = "My Custom Title"
44
+
45
+ # Launch when ready
46
+ print("Launching app...")
47
+ app.launch(share=False, server_port=7861)
48
+
49
+
50
+ def main():
51
+ parser = argparse.ArgumentParser(description="StringSight Gradio Visualization Demo")
52
+ parser.add_argument("--results_dir", help="Path to results directory for demos")
53
+ parser.add_argument("--demo", choices=[
54
+ "basic", "preload", "custom", "programmatic"
55
+ ], default="basic", help="Which demo to run")
56
+
57
+ args = parser.parse_args()
58
+
59
+ if args.demo == "basic":
60
+ demo_basic_launch()
61
+ elif args.demo == "preload":
62
+ if not args.results_dir:
63
+ print("❌ Error: --results_dir required for preload demo")
64
+ return
65
+ demo_preload_data(args.results_dir)
66
+ elif args.demo == "custom":
67
+ demo_custom_settings(args.results_dir)
68
+ elif args.demo == "programmatic":
69
+ demo_programmatic_access()
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
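The demos can also be invoked programmatically instead of via the argparse flags above; a sketch reusing the repo's bundled results folder:

```python
# Sketch: call a demo function directly instead of going through main().
from stringsight.dashboard.demo import demo_preload_data

demo_preload_data("data/taubench_airline")  # launches the app with data pre-loaded
```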
stringsight/dashboard/demo_examples.py ADDED
@@ -0,0 +1,86 @@
1
+ """Predefined demo example configurations for the Gradio launcher.
2
+
3
+ Each demo contains:
4
+ - data_path: absolute path to the dataset file
5
+ - explain: parameters for the Explain pipeline (aligned with exposed UI controls)
6
+ - label: parameters for the Label pipeline (aligned with exposed UI controls)
7
+ - advanced: shared advanced parameters (optional)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Dict, Any, List
13
+
14
+
15
+ # Single initial example using the existing project demo file and default params
16
+ EXAMPLES: Dict[str, Dict[str, Any]] = {
17
+ "Summarizing IT Support Calls": {
18
+ "data_path": "data/demo_data/call_center.jsonl",
19
+ "explain": {
20
+ "method": "single_model",
21
+ "system_prompt": "single_model_system_prompt",
22
+ "clusterer": "hdbscan",
23
+ "min_cluster_size": 8,
24
+ "max_coarse_clusters": 12,
25
+ "hierarchical": False,
26
+ "assign_outliers": False,
27
+ "groupby_column": "behavior_type",
28
+ },
29
+ "label": {
30
+ "taxonomy": {
31
+ "incorrectly states resolution": "The model incorrectly says how/if the issue was resolved.",
32
+ "fabricates information": "The model fabricates information about the transcript.",
33
+ "missing important information": "The model does not include important details about the user, the problem, or the resolution in its summary.",
34
+ },
35
+ "label_model_name": "gpt-5-mini",
36
+ },
37
+ "advanced": {
38
+ "sample_size": None,
39
+ "max_workers": 64,
40
+ "use_wandb": False,
41
+ "verbose": True,
42
+ },
43
+ },
44
+ "Airline Agent Customer Service": {
45
+ "data_path": "data/demo_data/taubench_airline.jsonl",
46
+ "explain": {
47
+ "method": "single_model",
48
+ "system_prompt": "agent_system_prompt",
49
+ "clusterer": "hdbscan",
50
+ "min_cluster_size": 8,
51
+ "max_coarse_clusters": 12,
52
+ "hierarchical": False,
53
+ "assign_outliers": False,
54
+ "groupby_column": "behavior_type",
55
+ },
56
+ "label": {
57
+ "taxonomy": {
58
+ "Disobey Task Specification": "This error occurs when an agent or system fails to adhere to specified constraints, guidelines, or requirements associated with a particular task. Non-compliance can result from unclear, incomplete, or ambiguous instructions provided by the user, system prompts, or task descriptions. It may also arise from an agent's inadequate ability to interpret or apply constraints effectively. Consequences of poor task constraint compliance include incorrect, suboptimal, or irrelevant outputs, reduced system performance and increased resource consumption.",
59
+ "Disobey Role Specification": "Failure to adhere to the defined responsibilities and constraints of an assigned role, potentially leading to an agent behaving like another.",
60
+ "Step Repetition": "Step repetition occurs when an agent or system unnecessarily repeats a phase, a task, a stage that have already been completed. Such redundancy can arise from inadequate state or context tracking, inefficient workflow management, unclear or ambiguous instructions, or failure to recognize completed tasks.\n\nAn example of step repetition is in the following Hyperagent trace where the Planner repeated exactly the same thought twice. \n\n[...]\nHyperAgent_matplotlib__matplotlib-23563 - INFO - Planner's Response: Thought: To address this issue, we need to understand the root cause of the 'Line3D' object not having the '_verts3d' attribute. This error typically occurs when the data passed to the plot function is invalid or when there's an issue with the 3D rendering. Let's start by examining the code and reproducing the issue.\n[...]\nHyperAgent_matplotlib__matplotlib-23563 - INFO - Planner's Response: Thought: To address this issue, we need to understand the root cause of the 'Line3D' object not having the '_verts3d' attribute. This error typically occurs when the data passed to the plot function is invalid or when there's an issue with the 3D rendering. Let's start by examining the code and reproducing the issue.\n[...]\nIntern Name: Codebase Editor\nSubgoal: Please prepare a response for the user with the following content:\n1. Explain the identified issue and recommend updating matplotlib to version 3.5.1 or later.\n2. Provide a modified version of their code that attempts to work around the issue.\n3. Suggest an alternative plotting method using scatter plots instead of lines.\n[...]",
61
+ "Loss of Conversation History": "Unexpected context truncation, disregarding recent interaction history and reverting to an antecedent conversational state.",
62
+ "Unaware of Termination Conditions": "This error occurs when an agent or system fails to adhere to criteria designed to trigger the termination of an interaction, conversation, phase, or task. Such oversight can arise due to ambiguous, incomplete, or poorly defined stopping conditions, ineffective monitoring mechanisms, or inadequate understanding of the criteria by the agent.\n\nConsequences of being unaware of stopping conditions include unnecessary conversation turns, inefficient use of resources, or potential harm to the correctness of the system due to performing actions beyond what is necessary.\n\nAn example of \"Unaware of Stopping Conditions\" involving two agents attempting to collaboratively solve a math problem is the following:\n{\n \"content\": [\n \"In the current problem, I don't have enough information to solve it. If I were to ask you for the relevant missing information now, it would be:\",\n \"1. What is the total length of the ribbon that Monica has?\",\n \"or\",\n \"2. How many inches of ribbon does Monica use for each gift bow?\",\n \"\",\n \"Could you please provide either piece of information so that I can solve the problem? Without either piece, the problem remains unsolvable.\"\n ],\n \"role\": \"user\",\n \"name\": \"assistant\"\n},\n{\n \"content\": [\n \"Continue. Please keep solving the problem until you need to query. (If you get to the answer, put it in \\\\boxed{}.)\"\n ],\n \"role\": \"assistant\",\n \"name\": \"mathproxyagent\"\n},\n{\n \"content\": [\n \"I'm sorry, but I really need more information to solve this problem. It can't be solved with the current data.\"\n ],\n \"role\": \"user\",\n \"name\": \"assistant\"\n},\n{\n \"content\": [\n \"Continue. Please keep solving the problem until you need to query. (If you get to the answer, put it in \\\\boxed{}.)\"\n ],\n \"role\": \"assistant\",\n \"name\": \"mathproxyagent\"\n}\n\nIn this example, the mathproxyagent is poorly designed as it does not recognize that the assistant agent requires additional information to continue. The mathproxyagent either needs to provide the necessary information or acknowledge that it does not have it, thereby appropriately terminating the interaction rather than repeating instructions unnecessarily.",
63
+ "Conversation Reset": "Unexpected or unwarranted restarting of a dialogue, potentially losing context and progress made in the interaction.",
64
+ "Fail to Ask for Clarification": "Inability to request additional information between agent when faced with unclear or incomplete data, potentially resulting in incorrect actions.",
65
+ "Task Derailment": "Deviation from the intended objective or focus of a given task, potentially resulting in irrelevant or unproductive actions.",
66
+ "Information Withholding": "This error occurs when an agent or group of agents possesses critical information but fails to share it promptly or effectively with other agents or system components that rely upon this information for their operations. The failure to disseminate relevant information may arise from ineffective or insufficient communication protocols, erroneous assumptions regarding the relevance or priority of the information, inadequate system coordination mechanisms, or deliberate withholding stemming from overly restrictive privacy policies or security constraints. Consequences of withholding relevant information can be severe, potentially leading to reduced operational efficiency, increased latency in task completion, unnecessary redundant processing, incorrect or suboptimal decision-making, and even complete system failures. Additionally, this error can significantly impair collaborative effectiveness, leading to misunderstandings, mistrust, or inefficiencies within the multi-agent environment. Furthermore, initial failures due to withheld information can trigger cascading errors, amplifying the negative impact on overall system performance and reliability. For instance, consider a scenario where a bug localization agent identifies a software defect, accurately determining the affected file and specific line number. The intended process requires this agent to immediately report such detailed bug information to a coding or repair agent responsible for addressing and resolving the issue. However, if the bug localization agent instead attempts to fix the bug independently without sharing the vital bug identification details with the coding agent, this withholding of relevant information could lead to duplicated effort, delayed resolution, incorrect fixes, or further system instability.",
67
+ "Ignored Other Agent's Input": "Not properly considering input or recommendations provided by other agents in the system (ignore their suggestions), potentially leading to bad decisions, stalled progress, or missed opportunities for solving the task.",
68
+ "Action-Reasoning Mismatch": "This error occurs when there is a discrepancy or mismatch between agents' logical discussion conclusion or a single agent's internal decision-making processes and the actual actions or outputs the system produces. Such inconsistencies can emerge due to errors in translating reasoning outcomes into practical implementations, or incorrect mapping between the agent's cognitive processes and its action space.\n\nThe consequences of this inconsistency can include unexpected, unintended, or counterproductive behaviors, reduced reliability, and diminished user trust. It can also complicate troubleshooting efforts by obscuring the true rationale behind decisions and actions, leading to further inefficiencies or repeated mistakes.\n\nFor example, in the trace below the agent states that `_add_prefix_for_feature_names_out` method is not explicitly shown in the code snippet, but only mentioned in the context of the `_iter` method. This conclusion is not coherent with the previosu step where the agent showed the implementation of this method. \n\n[...]\n ```python\ndef _add_prefix_for_feature_names_out(self, feature_names_out):\n \"\"\"\n Add prefix to feature names.\",\n \"\"\"\n if self.verbose_feature_names_out:\n[...]\nNote that the `_add_prefix_for_feature_names_out` method is not explicitly shown in the code snippet, but it is mentioned in the context of the `_iter` method.\n[...]",
69
+ "Premature Termination": "Ending a dialogue, interaction or task before all necessary information has been exchanged or objectives have been met. Necessary information constitutes verification of outputs, key data (e.g. api tokens) etc. that are necessary for the success of the task, and agents could have obtained if they tried more or already obtained but failed to communicate to other agents before termination.",
70
+ "Weak Verification": "Weak verification refers to situations where verification mechanisms (agent or step) exist within the system but fail to comprehensively cover all essential aspects of the design necessary for generating robust and reliable outputs. While verification steps are present, they may be incomplete, superficial, or insufficiently rigorous, thereby overlooking critical system attributes or interactions.\n\nConsequences of weak verification include partial validation that allows subtle errors, inconsistencies, or vulnerabilities to remain undetected, potentially compromising overall system reliability and effectiveness. This inadequacy can result in suboptimal system performance, unforeseen failures, cascade to final output if occur during substeps.\n\n\"You are a Code Reviewer. We are both working at ChatDev. We share a common interest in collaborating to successfully complete a task assigned by a new customer. You can help programmers assess source code for software troubleshooting, fix bugs to enhance code quality and robustness, and propose improvements to the source code. Here is a new customer's task: {task}. To complete the task, you must write a response that appropriately solves the requested instruction based on your expertise and the customer's needs.\"\n\nHowever, when asked to review generated code for a Sudoku game, the reviewer failed to recognize that standard Sudoku puzzles typically come pre-filled with numbers for the player to solve, an element absent in the generated implementation. Numerous Sudoku implementations and specifications are readily available online, which the verification agent could easily consult to ensure robustness and completeness.\n\nAnother example occurred with a TicTacToe implementation. While the game was functional and playable, the system incorrectly announced the winning player at the game's conclusion, despite employing the same ChatDev code reviewer prompt.",
71
+ "No or Incorrect Verification": "Omission of proper checking or confirmation of task outcomes or system outputs, potentially allowing errors or inconsistencies to propagate undetected. So, either no verification or verification is designed to exist in MAS, but verifier fail to complete what was exactly prompted to do. Eg: make sure the code compiles, but the code doesn't even compile.\nVerification is particularly critical in cases where tasks or outputs are readily verifiable by the system itself without human intervention.\n\nConsequences of inadequate or absent verification include the propagation of undetected errors, system inconsistencies, reduced reliability, and failure in the generated output.\n\nA few examples are as follows:\n1. In ChatDev, when prompted by a user to generate a game (e.g., \"textBasedSpaceInvaders\"), verification steps failed despite multiple review stages. Although the code was reportedly verified, compilation errors persisted, leading to runtime failures:\nyes Error: The file 'ship.bmp' was not found in the directory /Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911.\nTraceback (most recent call last):\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/main.py\", line 31, in <module>\n run_game()\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/main.py\", line 22, in run_game\n gf.create_fleet(ai_settings, screen, aliens)\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/game_functions.py\", line 64, in create_fleet\n alien = Alien(ai_settings, screen)\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/alien.py\", line 13, in __init__\n self.image = pygame.image.load('alien.bmp')\nFileNotFoundError: No file 'alien.bmp' found in working directory '/Users/*/Documents/*/ChatDev'."
72
+ },
73
+ "label_model_name": "gpt-5",
74
+ },
75
+ }
76
+ }
77
+
78
+
79
+ def get_demo_names() -> List[str]:
80
+ return list(EXAMPLES.keys())
81
+
82
+
83
+ def get_demo_config(name: str) -> Dict[str, Any] | None:
84
+ return EXAMPLES.get(name)
85
+
86
+
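A minimal usage sketch for the two demo helpers above (standalone and illustrative; the module that defines EXAMPLES is only partially shown in this excerpt):

    names = get_demo_names()
    config = get_demo_config(names[0]) if names else None  # None if the name is unknown
    if config is not None:
        print(names[0], "->", sorted(config.keys()))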
stringsight/dashboard/examples_helpers.py ADDED
@@ -0,0 +1,238 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Tuple, Iterable, Optional, Dict, Any
4
+ import re
5
+
6
+ # We use private-use unicode placeholders so they survive html.escape/markdown
7
+ HIGHLIGHT_START = "\uE000"
8
+ HIGHLIGHT_END = "\uE001"
9
+
10
+ __all__ = [
11
+ "extract_quoted_fragments",
12
+ "find_exact_matches",
13
+ "compute_best_ngram_window",
14
+ "merge_intervals",
15
+ "compute_highlight_spans",
16
+ "insert_highlight_placeholders",
17
+ "annotate_text_with_evidence_placeholders",
18
+ ]
19
+
20
+
21
+ def extract_quoted_fragments(evidence: Any) -> Dict[str, List[str]]:
22
+ """Extract quoted fragments from evidence.
23
+
24
+ Returns a dict with keys:
25
+ - "quoted": list of quoted strings
26
+ - "unquoted": list of unquoted fragments (may be empty)
27
+
28
+ Evidence may be a string (possibly containing quotes) or a list of strings.
29
+ We treat double quotes (") and single quotes (').
30
+ """
31
+ quoted: List[str] = []
32
+ unquoted: List[str] = []
33
+
34
+ def _from_str(s: str) -> None:
35
+ # Capture content inside matching quotes
36
+ # Handles multiple quoted segments, keeps inner text only
37
+ q = re.findall(r'"([^"]+)"|\'([^\']+)\'', s)
38
+ if q:
39
+ for g1, g2 in q:
40
+ frag = g1 or g2
41
+ frag = frag.strip()
42
+ if frag:
43
+ # Split on ellipses (ASCII ... or Unicode …) and contiguous sequences thereof
44
+ parts = re.split(r'(?:\.{3}|…)+', frag)
45
+ for p in parts:
46
+ p = re.sub(r"\s+", " ", p).strip()
47
+ if p:
48
+ quoted.append(p)
49
+ # Remove the quoted parts from the string to detect remaining unquoted
50
+ s_wo = re.sub(r'"[^"]+"|\'[^\']+\'', " ", s)
51
+ residue = s_wo.strip()
52
+ if residue:
53
+ unquoted.append(residue)
54
+ else:
55
+ s = s.strip()
56
+ if s:
57
+ unquoted.append(s)
58
+
59
+ if isinstance(evidence, list):
60
+ for item in evidence:
61
+ if isinstance(item, str):
62
+ _from_str(item)
63
+ else:
64
+ # Non-string items are ignored; caller can decide how to handle
65
+ continue
66
+ elif isinstance(evidence, str):
67
+ _from_str(evidence)
68
+ else:
69
+ # Unknown evidence type β†’ nothing to extract
70
+ pass
71
+
72
+ return {"quoted": quoted, "unquoted": unquoted}
73
+
74
+
75
+ def _tokenize_words_with_offsets(text: str) -> List[Tuple[str, int, int]]:
76
+ """Tokenize into word tokens with their (start, end) character offsets.
77
+
78
+ We treat word characters (\w) as tokens and ignore pure whitespace. Punctuation
79
+ is not included as tokens for n-gram matching.
80
+ """
81
+ tokens: List[Tuple[str, int, int]] = []
82
+ for m in re.finditer(r"\w+", text):
83
+ tokens.append((m.group(0).lower(), m.start(), m.end()))
84
+ return tokens
85
+
86
+
87
+ def find_exact_matches(text: str, phrase: str) -> List[Tuple[int, int]]:
88
+ """Case-insensitive exact matches of phrase in text with word-boundary guards.
89
+
90
+ Matches must not start or end inside a word (avoid partial-word highlights).
91
+ Returns a list of (start, end) character indices.
92
+ """
93
+ if not phrase:
94
+ return []
95
+ # Build a boundary-safe pattern. We escape the phrase and require non-word boundaries at ends.
96
+ # Use lookaround to avoid consuming boundary characters.
97
+ pattern = r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"
98
+ matches: List[Tuple[int, int]] = []
99
+ for m in re.finditer(pattern, text, flags=re.IGNORECASE):
100
+ matches.append((m.start(), m.end()))
101
+ return matches
102
+
103
+
104
+ def compute_best_ngram_window(text: str, target: str, n: int = 3, overlap_threshold: float = 0.5) -> Optional[Tuple[int, int]]:
105
+ """Find a window in `text` that maximizes n-gram overlap with `target`.
106
+
107
+ - Tokenization is word-based (\w+). Case-insensitive.
108
+ - If target has fewer than n tokens, no window is returned (too short to highlight).
109
+ - Returns (start_char, end_char) of best window if overlap >= threshold, else None.
110
+ """
111
+ text_toks = _tokenize_words_with_offsets(text)
112
+ target_toks = [t for t, _, _ in _tokenize_words_with_offsets(target)]
113
+
114
+ if not text_toks or not target_toks:
115
+ return None
116
+
117
+ # Enforce minimum n-gram size. If the target is too short, do not highlight.
118
+ if n < 1:
119
+ n = 1
120
+ if len(target_toks) < n:
121
+ return None
122
+
123
+ def _ngrams(tokens: List[str], k: int) -> List[Tuple[str, ...]]:
124
+ return [tuple(tokens[i:i+k]) for i in range(0, len(tokens) - k + 1)] if len(tokens) >= k else []
125
+
126
+ target_ngrams = set(_ngrams(target_toks, n))
127
+ if not target_ngrams:
128
+ return None
129
+
130
+ best_score = 0.0
131
+ best_span: Optional[Tuple[int, int]] = None
132
+
133
+ # Sliding windows over the text tokens with the same token length as the target
134
+ window_len = max(len(target_toks), n) # ensure at least n
135
+ for i in range(0, len(text_toks) - window_len + 1):
136
+ window_tokens = [tok for tok, _, _ in text_toks[i:i+window_len]]
137
+ window_ngrams = set(_ngrams(window_tokens, n))
138
+ overlap = len(window_ngrams & target_ngrams)
139
+ denom = max(1, len(target_ngrams))
140
+ score = overlap / denom
141
+ if score > best_score:
142
+ # Character span across the window
143
+ start_char = text_toks[i][1]
144
+ end_char = text_toks[i+window_len-1][2]
145
+ best_score = score
146
+ best_span = (start_char, end_char)
147
+
148
+ if best_span and best_score >= overlap_threshold:
149
+ return best_span
150
+ return None
151
+
152
+
153
+ def merge_intervals(spans: Iterable[Tuple[int, int]]) -> List[Tuple[int, int]]:
154
+ """Merge overlapping or touching intervals."""
155
+ s = sorted(spans)
156
+ if not s:
157
+ return []
158
+ merged = [list(s[0])]
159
+ for a, b in s[1:]:
160
+ if a <= merged[-1][1]:
161
+ merged[-1][1] = max(merged[-1][1], b)
162
+ else:
163
+ merged.append([a, b])
164
+ return [(a, b) for a, b in merged]
165
+
166
+
167
+ def compute_highlight_spans(text: str, evidence: Any, n: int = 3, overlap_threshold: float = 0.5) -> List[Tuple[int, int]]:
168
+ """Compute character spans to highlight in `text` using `evidence`.
169
+
170
+ Strategy:
171
+ - For each fragment (quoted and unquoted), first try exact case-insensitive matching (all occurrences).
172
+ - If a specific fragment has no exact matches, use n-gram overlap to find the best-matching window
173
+ and highlight if above threshold.
174
+ - If evidence is a list, treat each element independently (quoted detection applied per element).
175
+ """
176
+ parts = extract_quoted_fragments(evidence)
177
+ spans: List[Tuple[int, int]] = []
178
+
179
+ # Evaluate each fragment independently: try exact match first, otherwise fall back to n-gram.
180
+ # This ensures that when multiple quoted fragments are present and only some match exactly,
181
+ # we still localize the others approximately.
182
+ candidates: List[str] = []
183
+ candidates.extend(parts.get("quoted", []))
184
+ candidates.extend(parts.get("unquoted", []))
185
+
186
+ # Helper: count word tokens
187
+ def _num_word_tokens(s: str) -> int:
188
+ return len(re.findall(r"\w+", s))
189
+
190
+ for fragment in candidates:
191
+ if not fragment:
192
+ continue
193
+ # Enforce a minimum token length to avoid single-word/partial-word highlights
194
+ if _num_word_tokens(fragment) < n:
195
+ continue
196
+ exacts = find_exact_matches(text, fragment)
197
+ if exacts:
198
+ spans.extend(exacts)
199
+ continue
200
+ win = compute_best_ngram_window(text, fragment, n=n, overlap_threshold=overlap_threshold)
201
+ if win:
202
+ spans.append(win)
203
+
204
+ return merge_intervals(spans)
205
+
206
+
207
+ def insert_highlight_placeholders(text: str, spans: List[Tuple[int, int]]) -> str:
208
+ """Insert placeholder markers into `text` for each (start, end) span.
209
+
210
+ Assumes spans are non-overlapping and sorted; callers should merge first.
211
+ """
212
+ if not spans:
213
+ return text
214
+ parts: List[str] = []
215
+ last = 0
216
+ for a, b in spans:
217
+ if a < last:
218
+ # Overlap – skip to avoid corrupting indices
219
+ continue
220
+ parts.append(text[last:a])
221
+ parts.append(HIGHLIGHT_START)
222
+ parts.append(text[a:b])
223
+ parts.append(HIGHLIGHT_END)
224
+ last = b
225
+ parts.append(text[last:])
226
+ return "".join(parts)
227
+
228
+
229
+ def annotate_text_with_evidence_placeholders(text: str, evidence: Any, *, n: int = 3, overlap_threshold: float = 0.5) -> str:
230
+ """Return text with highlight placeholders inserted based on evidence.
231
+
232
+ This is the main API used by the renderer. After further processing (markdown),
233
+ callers should post-process HTML to replace placeholders with <mark> tags.
234
+ """
235
+ spans = compute_highlight_spans(text, evidence, n=n, overlap_threshold=overlap_threshold)
236
+ if not spans:
237
+ return text
238
+ return insert_highlight_placeholders(text, spans)
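The helpers above are pure functions, so the whole placeholder pipeline can be exercised in isolation. A minimal sketch (the sample strings are invented for illustration):

    from stringsight.dashboard.examples_helpers import (
        HIGHLIGHT_END,
        HIGHLIGHT_START,
        annotate_text_with_evidence_placeholders,
    )

    text = "The agent retried the API call three times before giving up."
    evidence = 'The model "retried the API call three times" unnecessarily.'

    # Quoted fragments are matched exactly; unquoted residue falls back to n-gram windows.
    annotated = annotate_text_with_evidence_placeholders(text, evidence)
    # Downstream rendering swaps the private-use markers for <mark> tags after html.escape/markdown.
    print(annotated.replace(HIGHLIGHT_START, "<mark>").replace(HIGHLIGHT_END, "</mark>"))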
stringsight/dashboard/examples_tab.py ADDED
@@ -0,0 +1,185 @@
1
+ """Logic for the **View Examples** tab – dropdown population + example renderer."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, List, Tuple, Optional
5
+
6
+ import gradio as gr
7
+ import ast
8
+
9
+ from .state import app_state
10
+ from .utils import (
11
+ get_unique_values_for_dropdowns,
12
+ get_example_data,
13
+ format_examples_display,
14
+ search_clusters_by_text,
15
+ )
16
+
17
+ __all__: List[str] = [
18
+ "get_dropdown_choices",
19
+ "update_example_dropdowns",
20
+ "view_examples",
21
+ "get_filter_options",
22
+ "update_filter_dropdowns",
23
+ ]
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Dropdown helpers
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def get_dropdown_choices(selected_models: Optional[List[str]] = None) -> Tuple[List[str], List[str], List[str]]:
31
+ if app_state["clustered_df"] is None:
32
+ return [], [], []
33
+
34
+ choices = get_unique_values_for_dropdowns(app_state["clustered_df"])
35
+ prompts = ["All Prompts"] + choices["prompts"]
36
+ # If a sidebar selection is provided, filter models to that subset (ignoring the pseudo 'all')
37
+ if selected_models:
38
+ subset = [m for m in choices["models"] if m in [sm for sm in selected_models if sm != "all"]]
39
+ models = ["All Models"] + (subset if subset else choices["models"]) # fallback to all available if subset empty
40
+ else:
41
+ models = ["All Models"] + choices["models"]
42
+ properties = ["All Clusters"] + choices["properties"]
43
+ return prompts, models, properties
44
+
45
+
46
+ def update_example_dropdowns(selected_models: Optional[List[str]] = None) -> Tuple[Any, Any, Any]:
47
+ prompts, models, properties = get_dropdown_choices(selected_models)
48
+ # If exactly one concrete model selected in sidebar, preselect it; else default to All Models
49
+ preselect_model = "All Models"
50
+ if selected_models:
51
+ concrete = [m for m in selected_models if m != "all"]
52
+ if len(concrete) == 1 and concrete[0] in models:
53
+ preselect_model = concrete[0]
54
+ return (
55
+ gr.update(choices=prompts, value="All Prompts" if prompts else None),
56
+ gr.update(choices=models, value=(preselect_model if models else None)),
57
+ gr.update(choices=properties, value="All Clusters" if properties else None),
58
+ )
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Example viewer
63
+ # ---------------------------------------------------------------------------
64
+
65
+ def view_examples(
66
+ selected_prompt: str,
67
+ selected_model: str,
68
+ selected_property: str,
69
+ max_examples: int = 5,
70
+ use_accordion: bool = True,
71
+ pretty_print_dicts: bool = True,
72
+ search_term: str = "",
73
+ show_unexpected_behavior: bool = False,
74
+ selected_models_sidebar: Optional[List[str]] = None,
75
+ selected_tags_sidebar: Optional[List[str]] = None,
76
+ ) -> str:
77
+ if app_state["clustered_df"] is None:
78
+ return (
79
+ "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
80
+ "using the 'Load Data' tab</p>"
81
+ )
82
+
83
+ # Apply search filter first if search term is provided
84
+ df = app_state["clustered_df"]
85
+
86
+ # Apply sidebar-selected model filter if provided (ignoring pseudo 'all') before dropdown filters
87
+ if selected_models_sidebar:
88
+ concrete = [m for m in selected_models_sidebar if m != "all"]
89
+ if concrete:
90
+ df = df[df["model"].isin(concrete)]
91
+ if df.empty:
92
+ return "<p style='color: #e74c3c; padding: 20px;'>❌ No examples for the selected model subset.</p>"
93
+ if search_term and isinstance(search_term, str) and search_term.strip():
94
+ df = search_clusters_by_text(df, search_term.strip(), 'all')
95
+ if df.empty:
96
+ return f"<p style='color: #e74c3c; padding: 20px;'>❌ No clusters found matching '{search_term}'</p>"
97
+
98
+ # Optional tags filter (sidebar): include rows whose first meta value is in selected tags
99
+ if selected_tags_sidebar and len(selected_tags_sidebar) > 0 and 'meta' in df.columns:
100
+ def _parse_meta(obj: Any) -> Any:
101
+ if isinstance(obj, str):
102
+ try:
103
+ return ast.literal_eval(obj)
104
+ except Exception:
105
+ return obj
106
+ return obj
107
+
108
+ def _first_val(obj: Any) -> Any:
109
+ if obj is None:
110
+ return None
111
+ obj = _parse_meta(obj)
112
+ if isinstance(obj, dict):
113
+ for _, v in obj.items():
114
+ return v
115
+ return None
116
+ if isinstance(obj, (list, tuple)):
117
+ return obj[0] if len(obj) > 0 else None
118
+ return obj
119
+
120
+ parsed_meta = df['meta'].apply(_parse_meta)
121
+ non_null_parsed = [m for m in parsed_meta.tolist() if m is not None]
122
+ all_empty_dicts = (
123
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
124
+ )
125
+
126
+ if not all_empty_dicts:
127
+ allowed = set(map(str, selected_tags_sidebar))
128
+ df = df[df['meta'].apply(_first_val).astype(str).isin(allowed)]
129
+ if df.empty:
130
+ return "<p style='color: #e74c3c; padding: 20px;'>❌ No examples found for selected tags</p>"
131
+
132
+ examples = get_example_data(
133
+ df,
134
+ selected_prompt if selected_prompt != "All Prompts" else None,
135
+ selected_model if selected_model != "All Models" else None,
136
+ selected_property if selected_property != "All Clusters" else None,
137
+ max_examples,
138
+ show_unexpected_behavior=show_unexpected_behavior,
139
+ randomize=(
140
+ (selected_prompt == "All Prompts") and
141
+ (selected_model == "All Models") and
142
+ (selected_property == "All Clusters") and
143
+ (not search_term or not str(search_term).strip())
144
+ ),
145
+ )
146
+
147
+ return format_examples_display(
148
+ examples,
149
+ selected_prompt,
150
+ selected_model,
151
+ selected_property,
152
+ use_accordion=use_accordion,
153
+ pretty_print_dicts=pretty_print_dicts,
154
+ )
155
+
156
+
157
+ # ---------------------------------------------------------------------------
158
+ # Filter dropdown helpers for frequency comparison
159
+ # ---------------------------------------------------------------------------
160
+
161
+ def get_filter_options() -> Tuple[List[str], List[str]]:
162
+ if not app_state["model_stats"]:
163
+ return ["All Models"], ["All Metrics"]
164
+
165
+ available_models = ["All Models"] + list(app_state["model_stats"].keys())
166
+
167
+ quality_metrics = set()
168
+ for model_data in app_state["model_stats"].values():
169
+ clusters = model_data.get("fine", []) + model_data.get("coarse", [])
170
+ for cluster in clusters:
171
+ quality_score = cluster.get("quality_score", {})
172
+ if isinstance(quality_score, dict):
173
+ quality_metrics.update(quality_score.keys())
174
+
175
+ available_metrics = ["All Metrics"] + sorted(list(quality_metrics))
176
+
177
+ return available_models, available_metrics
178
+
179
+
180
+ def update_filter_dropdowns() -> Tuple[Any, Any]:
181
+ models, metrics = get_filter_options()
182
+ return (
183
+ gr.update(choices=models, value="All Models" if models else None),
184
+ gr.update(choices=metrics, value="All Metrics" if metrics else None),
185
+ )
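The tag filter above leans on a small normalisation step: `meta` values may arrive as string-encoded dicts or lists, so they are parsed with `ast.literal_eval` and reduced to their first value. A standalone mirror of that logic (an illustrative copy of the private `_parse_meta`/`_first_val` helpers, not an import):

    import ast
    from typing import Any

    def first_meta_value(obj: Any) -> Any:
        # Parse string-encoded containers, then take the "first value" as the tag.
        if isinstance(obj, str):
            try:
                obj = ast.literal_eval(obj)
            except Exception:
                pass  # keep the raw string
        if isinstance(obj, dict):
            return next(iter(obj.values()), None)
        if isinstance(obj, (list, tuple)):
            return obj[0] if obj else None
        return obj

    print(first_meta_value("{'tag': 'Positive'}"))   # -> Positive
    print(first_meta_value(["Style", "Negative"]))   # -> Style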
stringsight/dashboard/launcher.py ADDED
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI launcher for LMM-Vibes Gradio visualization app.
4
+
5
+ Usage:
6
+ python -m stringsight.dashboard.launcher --results_dir path/to/results
7
+
8
+ Or directly:
9
+ python stringsight/dashboard/launcher.py --results_dir path/to/results
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+ from pathlib import Path
15
+ import logging
16
+
17
+ def main():
18
+ parser = argparse.ArgumentParser(
19
+ description="Launch LMM-Vibes Gradio visualization app",
20
+ formatter_class=argparse.RawDescriptionHelpFormatter,
21
+ epilog="""
22
+ Examples:
23
+ # Launch with auto-loaded data from a base results directory
24
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results
25
+
26
+ # Launch with public sharing enabled
27
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results --share
28
+
29
+ # Launch on specific port
30
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results --port 8080
31
+
32
+ # Launch with automatic port selection
33
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results --auto_port
34
+
35
+ # Launch without auto-loading (manual selection in app)
36
+ python -m stringsight.dashboard.launcher
37
+ """
38
+ )
39
+
40
+ parser.add_argument(
41
+ "--results_dir",
42
+ type=str,
43
+ help="Path to base results directory containing experiment subfolders (optional - can be loaded in the app)"
44
+ )
45
+
46
+ parser.add_argument(
47
+ "--share",
48
+ action="store_true",
49
+ help="Create a public shareable link"
50
+ )
51
+
52
+ parser.add_argument(
53
+ "--server_name",
54
+ type=str,
55
+ default="127.0.0.1",
56
+ help="Server address (default: 127.0.0.1)"
57
+ )
58
+
59
+ parser.add_argument(
60
+ "--port",
61
+ type=int,
62
+ default=7860,
63
+ help="Server port (default: 7860). Use --auto_port to automatically find an available port."
64
+ )
65
+
66
+ parser.add_argument(
67
+ "--auto_port",
68
+ action="store_true",
69
+ help="Automatically find an available port by trying ports 8080-8089"
70
+ )
71
+
72
+ parser.add_argument(
73
+ "--debug",
74
+ action="store_true",
75
+ help="Enable debug mode"
76
+ )
77
+
78
+ args = parser.parse_args()
79
+
80
+ # Handle auto_port option
81
+ if args.auto_port:
82
+ # Use a high port range for auto-port mode
83
+ args.port = 8080
84
+ print("πŸ” Auto-port mode enabled - will try ports 8080-8089")
85
+
86
+ # Validate results directory if provided
87
+ if args.results_dir:
88
+ results_path = Path(args.results_dir)
89
+ if not results_path.exists():
90
+ print(f"❌ Error: Results directory does not exist: {args.results_dir}")
91
+ sys.exit(1)
92
+ if not results_path.is_dir():
93
+ print(f"❌ Error: Path is not a directory: {args.results_dir}")
94
+ sys.exit(1)
95
+
96
+ # Configure logging level when --debug is set
97
+ if args.debug:
98
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
99
+
100
+ # Import and launch the app
101
+ try:
102
+ from .app import launch_app
103
+
104
+ print("πŸš€ Launching LMM-Vibes Gradio Visualization App...")
105
+ print(f"🌐 Server: http://{args.server_name}:{args.port}")
106
+ if args.share:
107
+ print("πŸ”— Public sharing enabled")
108
+
109
+ launch_app(
110
+ results_dir=args.results_dir,
111
+ share=args.share,
112
+ server_name=args.server_name,
113
+ server_port=args.port,
114
+ debug=args.debug
115
+ )
116
+
117
+ except ImportError as e:
118
+ print(f"❌ Error: Failed to import required modules: {e}")
119
+ print("πŸ’‘ Make sure you have gradio installed: pip install gradio")
120
+ sys.exit(1)
121
+ except Exception as e:
122
+ print(f"❌ Error launching app: {e}")
123
+ sys.exit(1)
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
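Note that `--auto_port` only seeds `args.port = 8080`; the retry over ports 8080-8089 is delegated to `launch_app`. As an assumption about how such a probe could look (this sketch is not taken from `launch_app` itself):

    import socket

    def find_open_port(start: int = 8080, end: int = 8089) -> int:
        # Return the first port in [start, end] that accepts a local bind.
        for port in range(start, end + 1):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                try:
                    sock.bind(("127.0.0.1", port))
                except OSError:
                    continue  # port in use, try the next one
                return port
        raise RuntimeError(f"no free port in {start}-{end}")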
stringsight/dashboard/load_data_tab.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ Utilities for the "Load Data" tab – loading pipeline results and scanning for
3
+ available experiment folders.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, List, Tuple
10
+
11
+ import gradio as gr
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Loading utilities updated for FunctionalMetrics
15
+ # ---------------------------------------------------------------------------
16
+
17
+ from .state import app_state
18
+ from .data_loader import (
19
+ load_pipeline_results,
20
+ scan_for_result_subfolders,
21
+ validate_results_directory,
22
+ )
23
+
24
+ # Metrics helpers
25
+ from .metrics_adapter import get_all_models
26
+
27
+ __all__ = [
28
+ "load_data",
29
+ "get_available_experiments",
30
+ "get_experiment_choices",
31
+ "refresh_experiment_dropdown",
32
+ "load_experiment_data",
33
+ ]
34
+
35
+
36
+ def load_data(results_dir: str, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, str, Any]:
37
+ """Load pipeline results from *results_dir* and update the shared *app_state*.
38
+
39
+ Returns a tuple of (summary_markdown, models_info_markdown, models_checkbox_update).
40
+ """
41
+ try:
42
+ # 1. Validate directory structure
43
+ progress(0.05, "Validating results directory…")
44
+ is_valid, error_msg = validate_results_directory(results_dir)
45
+ if not is_valid:
46
+ return "", f"❌ Error: {error_msg}", ""
47
+
48
+ # 2. Handle optional sub-folder selection (first match for now)
49
+ progress(0.15, "Scanning for experiment subfolders…")
50
+ subfolders = scan_for_result_subfolders(results_dir)
51
+ final_dir = results_dir
52
+ if subfolders and "." not in subfolders:
53
+ final_dir = str(Path(results_dir) / subfolders[0])
54
+
55
+ # 3. Load results into memory
56
+ progress(0.35, "Loading pipeline results… This may take a moment")
57
+ clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(final_dir)
58
+
59
+ # 4. Stash in global state so other tabs can use it
60
+ progress(0.6, "Preparing application state…")
61
+ app_state["clustered_df"] = clustered_df
62
+ app_state["metrics"] = metrics
63
+ app_state["model_cluster_df"] = model_cluster_df
64
+ # Temporary alias for legacy modules
65
+ app_state["model_stats"] = metrics
66
+ app_state["results_path"] = results_path
67
+ app_state["available_models"] = get_all_models(metrics)
68
+ app_state["current_results_dir"] = final_dir
69
+
70
+ # 5. Compose status messages
71
+ progress(0.8, "Finalizing summary…")
72
+ n_models = len(metrics.get("model_cluster_scores", {}))
73
+ n_properties = len(clustered_df)
74
+
75
+ # Render as Markdown, not as a plain text block.
76
+ summary = (
77
+ "**Data Summary:**\n"
78
+ f"- **Models:** {n_models}\n"
79
+ f"- **Properties:** {n_properties:,}\n"
80
+ f"- **Results Directory:** `{Path(final_dir).name}`"
81
+ )
82
+ # Check for both naming patterns for fine clusters
83
+ if ("fine_cluster_id" in clustered_df.columns or
84
+ "property_description_fine_cluster_id" in clustered_df.columns):
85
+ fine_id_col = ("fine_cluster_id" if "fine_cluster_id" in clustered_df.columns
86
+ else "property_description_fine_cluster_id")
87
+ n_fine_clusters = clustered_df[fine_id_col].nunique()
+ summary += f"\n- **Fine Clusters:** {n_fine_clusters:,}"
88
+
89
+ model_choices = app_state["available_models"]
90
+ models_info = f"Available models: {', '.join(model_choices)}"
91
+
92
+ # Gradio update object for the CheckboxGroup
93
+ # Default: select all concrete models but leave the aggregate "all" unchecked
94
+ selected_values = [m for m in model_choices if m != "all"]
95
+ progress(1.0, "Dataset loaded")
96
+ return summary, models_info, gr.update(choices=model_choices, value=selected_values)
97
+
98
+ except Exception as e:
99
+ error_msg = f"❌ Error loading results: {e}"
100
+ return "", error_msg, gr.update(choices=[], value=[])
101
+
102
+
103
+ def get_available_experiments(base_dir: str) -> List[str]:
104
+ """Return experiment sub-directories that contain the expected result files, sorted by modification time (most recent first)."""
105
+ if not base_dir or not os.path.exists(base_dir):
106
+ return []
107
+
108
+ experiments: List[Tuple[str, float]] = []
109
+ try:
110
+ for item in os.listdir(base_dir):
111
+ item_path = os.path.join(base_dir, item)
112
+ if os.path.isdir(item_path):
113
+ if (
114
+ os.path.exists(os.path.join(item_path, "model_stats.json"))
115
+ or os.path.exists(os.path.join(item_path, "clustered_results_lightweight.jsonl"))
116
+ ):
117
+ # Get modification time of the directory
118
+ mod_time = os.path.getmtime(item_path)
119
+ experiments.append((item, mod_time))
120
+ except Exception as e:
121
+ print(f"Error scanning experiments: {e}")
122
+
123
+ # Sort by modification time (most recent first), then return just the names
124
+ experiments.sort(key=lambda x: x[1], reverse=True)
125
+ return [exp[0] for exp in experiments]
126
+
127
+
128
+ def get_experiment_choices() -> List[str]:
129
+ """Return dropdown choices for the experiment selector."""
130
+ from . import state
131
+ if not state.BASE_RESULTS_DIR:
132
+ return []
133
+ experiments = get_available_experiments(state.BASE_RESULTS_DIR)
134
+ return ["Select an experiment..."] + experiments
135
+
136
+
137
+ def refresh_experiment_dropdown() -> gr.update:
138
+ """Gradio helper to refresh the experiment dropdown choices."""
139
+ choices = get_experiment_choices()
140
+ return gr.update(choices=choices, value="Select an experiment...")
141
+
142
+
143
+ def load_experiment_data(experiment_name: str) -> Tuple[str, str, Any]:
144
+ """Wrapper used by Gradio events to load a *selected* experiment."""
145
+ from . import state
146
+ if not state.BASE_RESULTS_DIR or experiment_name == "Select an experiment...":
147
+ return "", "Please select a valid experiment", gr.update(choices=[], value=[])
148
+
149
+ experiment_path = os.path.join(state.BASE_RESULTS_DIR, experiment_name)
150
+ print(f"πŸ” Loading experiment: {experiment_name} from {experiment_path}")
151
+ return load_data(experiment_path)
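Experiment discovery only requires a base directory whose subfolders contain `model_stats.json` or `clustered_results_lightweight.jsonl`. A quick sanity check of a layout (the base path below is illustrative):

    from stringsight.dashboard.load_data_tab import get_available_experiments

    base = "/path/to/results"  # illustrative; point at your own base results dir
    for name in get_available_experiments(base):
        print(name)  # most recently modified experiments come first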
stringsight/dashboard/metrics_adapter.py ADDED
@@ -0,0 +1,46 @@
1
+ """Lightweight access helpers for FunctionalMetrics score dictionaries.
2
+
3
+ The Gradio UI now receives the *raw* FunctionalMetrics output as a
4
+ ```
5
+ metrics = {
6
+ "model_cluster_scores": {...},
7
+ "cluster_scores": {...},
8
+ "model_scores": {...},
9
+ }
10
+ ```
11
+ This module centralises the most common look-ups so that the rest of the
12
+ codebase does *not* need to know the exact key names. If the format
13
+ changes again we only need to update these helpers.
14
+ """
15
+ from typing import Dict, Any, List
16
+
17
+ __all__ = [
18
+ "get_model_clusters",
19
+ "get_all_models",
20
+ "get_all_clusters",
21
+ ]
22
+
23
+ def get_model_clusters(metrics: Dict[str, Any], model_name: str) -> Dict[str, Any]:
24
+ """Return the per-cluster dictionary for a given model.
25
+
26
+ Args:
27
+ metrics: The dict returned by ``load_pipeline_results``.
28
+ model_name: Name of the model.
29
+ """
30
+ if model_name == "all":
31
+ # For "all" model, return cluster_scores (aggregated across all models)
32
+ return metrics.get("cluster_scores", {})
33
+ else:
34
+ return metrics.get("model_cluster_scores", {}).get(model_name, {})
35
+
36
+
37
+ def get_all_models(metrics: Dict[str, Any]) -> List[str]:
38
+ """Return the list of model names present in the metrics dict."""
39
+ models = list(metrics.get("model_cluster_scores", {}).keys())
40
+ # Add "all" as the first option to show aggregated metrics across all models
41
+ return ["all"] + models
42
+
43
+
44
+ def get_all_clusters(metrics: Dict[str, Any]) -> List[str]:
45
+ """Return the list of cluster names (across all models)."""
46
+ return list(metrics.get("cluster_scores", {}).keys())
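Because the adapter is pure dictionary access, its contract is easy to pin down against a toy payload (the model and cluster names below are invented):

    from stringsight.dashboard.metrics_adapter import (
        get_all_clusters,
        get_all_models,
        get_model_clusters,
    )

    metrics = {
        "model_cluster_scores": {"model-a": {"cluster-1": {"proportion": 0.4}}},
        "cluster_scores": {"cluster-1": {"proportion": 0.25}},
        "model_scores": {"model-a": {}},
    }

    assert get_all_models(metrics) == ["all", "model-a"]
    assert get_all_clusters(metrics) == ["cluster-1"]
    assert get_model_clusters(metrics, "all") == {"cluster-1": {"proportion": 0.25}}
    assert get_model_clusters(metrics, "model-a") == {"cluster-1": {"proportion": 0.4}}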
stringsight/dashboard/overview_tab.py ADDED
@@ -0,0 +1,479 @@
1
+ """Logic helpers for the **Overview** tab."""
2
+ from typing import List, Tuple, Optional
3
+ import pandas as pd
4
+ import plotly.graph_objects as go
5
+ import plotly.express as px
6
+
7
+ import gradio as gr
8
+ from .state import app_state
9
+ from .utils import compute_model_rankings_new, create_model_summary_card_new
10
+ from .plotting import create_model_dataframe
11
+
12
+ __all__ = ["create_overview", "create_model_quality_plot", "create_model_quality_table", "get_available_model_quality_metrics"]
13
+
14
+
15
+ def create_overview(
16
+ selected_models: List[str],
17
+ top_n: int,
18
+ score_significant_only: bool = False,
19
+ quality_significant_only: bool = False,
20
+ sort_by: str = "quality_asc",
21
+ min_cluster_size: int = 1,
22
+ selected_tags: Optional[List[str]] = None,
23
+ progress: Optional[gr.Progress] = None,
24
+ ) -> str:
25
+ """Return the HTML snippet that summarises model performance."""
26
+ if not app_state["metrics"]:
27
+ return "Please load data first using the 'Load Data' tab."
28
+
29
+ if not selected_models:
30
+ return "Please select at least one model to display."
31
+
32
+ # 1. Compute global rankings and filter to selection
33
+ if progress:
34
+ progress(0.05, "Computing model rankings…")
35
+ model_rankings = compute_model_rankings_new(app_state["metrics"])
36
+ filtered_rankings = [
37
+ (name, stats) for name, stats in model_rankings if name in selected_models
38
+ ]
39
+
40
+ # Sort so "all" appears first, then the rest by their rankings
41
+ all_models = [(name, stats) for name, stats in filtered_rankings if name == "all"]
42
+ other_models = [(name, stats) for name, stats in filtered_rankings if name != "all"]
43
+ filtered_rankings = all_models + other_models
44
+
45
+ if not filtered_rankings:
46
+ return "No data available for selected models."
47
+
48
+ # 2. Assemble HTML
49
+ overview_html = """
50
+ <div style="width: 100%; margin: 0;">
51
+ <details style="margin-bottom:25px;">
52
+ <summary style="cursor:pointer; color:#4c6ef5; font-weight:500;">What do these tags and numbers mean?</summary>
53
+ <div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
54
+ <p style="color: #666; margin-bottom: 10px;">
55
+ Top distinctive clusters where each model shows unique behavioural patterns.
56
+ Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
57
+ </p>
58
+
59
+ <strong>Frequency Delta</strong><br>
60
+ For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
61
+ • A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
62
+ • A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
63
+ <strong>Quality Delta</strong><br>
64
+ The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
65
+ Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate that it performs worse.<br>
66
+ <strong>Significance Tags (F/Q)</strong><br>
67
+ <span style="color: #888; font-size: 13px;">
68
+ Statistical significance is determined using a bootstrap procedure on the conversations to obtain 95% confidence intervals.
69
+ </span><br>
70
+ The <span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#cc6699; border:1px solid #cc669933; background:#cc669912;">F</span> and <span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#007bff; border:1px solid #007bff33; background:#007bff12;">Q</span> tags indicate <em>statistical significance</em> based on bootstrapped confidence intervals:<br>
71
+ • <strong>F</strong> (pink): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
72
+ • <strong>Q</strong> (blue): At least one quality metric delta is statistically significant<br>
73
+ These tags help identify which behavioral patterns are reliably different from the model's baseline performance.<br><br>
74
+ <strong>Cluster Tags</strong><br>
75
+ We sometimes annotate clusters with a short tag (e.g., group or category) to aid scanning. Example tags:
76
+ <span style="display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#28a74512; color:#28a745; border:1px solid #28a74533;">Positive</span>
77
+ <span style="display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#9467bd12; color:#9467bd; border:1px solid #9467bd33;">Style</span>
78
+ <span style="display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#dc354512; color:#dc3545; border:1px solid #dc354533;">Negative (critical)</span>
79
+ </div>
80
+ </details>
81
+ """
82
+
83
+ total_models = max(1, len(filtered_rankings))
84
+ for idx, (model_name, _) in enumerate(filtered_rankings):
85
+ if progress:
86
+ progress(0.1 + 0.8 * (idx / total_models), f"Rendering overview for {model_name}…")
87
+ card_html = create_model_summary_card_new(
88
+ model_name,
89
+ app_state["metrics"],
90
+ # top_n etc.
91
+ top_n,
92
+ score_significant_only=score_significant_only,
93
+ quality_significant_only=quality_significant_only,
94
+ sort_by=sort_by,
95
+ min_cluster_size=min_cluster_size,
96
+ selected_tags=selected_tags,
97
+ )
98
+ overview_html += card_html
99
+
100
+ overview_html += "</div>"
101
+ if progress:
102
+ progress(1.0, "Overview ready")
103
+ return overview_html
104
+
105
+
106
+ def create_model_quality_plot(
107
+ selected_models: List[str],
108
+ quality_metric: str = "helpfulness",
109
+ ) -> go.Figure:
110
+ """Create a bar plot of model-level quality scores with confidence intervals."""
111
+ if not app_state["metrics"]:
112
+ return None
113
+
114
+ if not selected_models:
115
+ return None
116
+
117
+ # Get model scores from metrics
118
+ model_scores = app_state["metrics"].get("model_scores", {})
119
+ if not model_scores:
120
+ return None
121
+
122
+ # Create model dataframe
123
+ model_df = create_model_dataframe(model_scores)
124
+
125
+ if model_df.empty:
126
+ return None
127
+
128
+ # Filter to selected models
129
+ model_df = model_df[model_df['model'].isin(selected_models)]
130
+
131
+ if model_df.empty:
132
+ return None
133
+
134
+ # Find the actual ABSOLUTE quality column (not delta) that matches the requested metric
135
+ # We want raw quality scores, not deltas from baseline
136
+ quality_col = None
137
+ for col in model_df.columns:
138
+ if (col.startswith("quality_") and
139
+ not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant")) and
140
+ "delta" not in col.lower()): # Explicitly exclude any delta columns
141
+ # Check if the quality metric name is contained in the column name (case insensitive)
142
+ col_name = col.replace("quality_", "").lower()
143
+ if quality_metric.lower() in col_name:
144
+ quality_col = col
145
+ break
146
+
147
+ # If no match found, use the first available absolute quality column
148
+ if not quality_col:
149
+ available_quality_cols = [col for col in model_df.columns
150
+ if col.startswith("quality_")
151
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant"))
152
+ and "delta" not in col.lower()] # Explicitly exclude delta columns
153
+ if not available_quality_cols:
154
+ return None
155
+ quality_col = available_quality_cols[0] # Use first available absolute quality metric
156
+
157
+ # Ensure quality values are numeric
158
+ model_df[quality_col] = pd.to_numeric(model_df[quality_col], errors='coerce')
159
+
160
+ # Check if we have any valid quality data
161
+ if model_df[quality_col].isna().all():
162
+ return None
163
+
164
+ # Sort models by quality score (descending - best scores first)
165
+ model_df = model_df.sort_values(by=quality_col, ascending=False).reset_index(drop=True)
166
+
167
+ # Extract a clean metric name for display
168
+ metric_display_name = quality_col.replace("quality_", "").split("(")[0].strip()
169
+
170
+ # Create the plot
171
+ fig = go.Figure()
172
+
173
+ # Prepare error bar data if requested and available
174
+ error_y = None
175
+ ci_lower_col = f"{quality_col}_ci_lower"
176
+ ci_upper_col = f"{quality_col}_ci_upper"
177
+ if ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
178
+ # Calculate error bar values (distance from mean to upper/lower bounds)
179
+ error_y_upper = model_df[ci_upper_col] - model_df[quality_col]
180
+ error_y_lower = model_df[quality_col] - model_df[ci_lower_col]
181
+ error_y = dict(
182
+ type='data',
183
+ symmetric=False,
184
+ array=error_y_upper,
185
+ arrayminus=error_y_lower,
186
+ visible=True,
187
+ color="rgba(52, 73, 94, 0.7)",
188
+ thickness=2.5,
189
+ width=5
190
+ )
191
+
192
+ # Create a beautiful color gradient for the bars
193
+ colors = px.colors.qualitative.Set3[:len(model_df)]
194
+
195
+ # Add the bar chart with improved styling
196
+ fig.add_trace(go.Bar(
197
+ x=model_df['model'],
198
+ y=model_df[quality_col],
199
+ error_y=error_y,
200
+ marker=dict(
201
+ color=colors,
202
+ line=dict(color='rgba(255,255,255,0.8)', width=2),
203
+ opacity=0.8
204
+ ),
205
+ name=f'{metric_display_name} Score',
206
+ text=[f"{val:.2f}" for val in model_df[quality_col]],
207
+ textposition='outside',
208
+ textfont=dict(size=14, color='darkblue', family='Arial Black'),
209
+ # Per-bar CI text for the hover label; empty when CIs are unavailable
+ customdata=(
+ [f"CI: [{lo:.2f}, {hi:.2f}]<br>" for lo, hi in zip(model_df[ci_lower_col], model_df[ci_upper_col])]
+ if error_y is not None else [""] * len(model_df)
+ ),
+ hovertemplate='<b>%{x}</b><br>' +
+ f'{metric_display_name}: %{{y:.3f}}<br>' +
+ '%{customdata}' +
+ '<extra></extra>',
215
+ hoverlabel=dict(
216
+ bgcolor="white",
217
+ bordercolor="darkblue",
218
+ font=dict(size=14, color="darkblue")
219
+ )
220
+ ))
221
+
222
+ # Enhanced layout with auto-sizing and improved styling
223
+ fig.update_layout(
224
+ # Auto-sizing configuration
225
+ autosize=True,
226
+
227
+ # Enhanced axis styling
228
+ xaxis=dict(
229
+ # No title for x-axis
230
+ title=None,
231
+ tickangle=45,
232
+ tickfont=dict(size=14, color='#34495e', family='Arial'),
233
+ gridcolor='rgba(189, 195, 199, 0.3)',
234
+ gridwidth=1,
235
+ showgrid=True,
236
+ linecolor='#34495e',
237
+ linewidth=2
238
+ ),
239
+ yaxis=dict(
240
+ title=dict(
241
+ text=f"{metric_display_name}",
242
+ font=dict(size=18, color='#34495e', family='Arial')
243
+ ),
244
+ automargin=True,
245
+ tickfont=dict(size=20, color='#34495e', family='Arial'),
246
+ gridcolor='rgba(189, 195, 199, 0.3)',
247
+ gridwidth=1,
248
+ showgrid=True,
249
+ linecolor='#34495e',
250
+ linewidth=2
251
+ ),
252
+
253
+ # Enhanced styling
254
+ showlegend=False,
255
+ plot_bgcolor='rgba(248, 249, 250, 0.8)',
256
+ paper_bgcolor='white',
257
+ margin=dict(l=60, r=60, t=60, b=60, autoexpand=True),
258
+ font=dict(family="Arial, sans-serif", color='#2c3e50'),
259
+
260
+ # No border - removed for cleaner look
261
+ )
262
+
263
+ fig.update_traces(
264
+ textposition="outside", # put labels above bars
265
+ cliponaxis=False # don't cut them off
266
+ )
267
+
268
+ return fig
269
+
270
+
271
+ def create_model_quality_table(
272
+ selected_models: List[str],
273
+ quality_metric: str = "helpfulness"
274
+ ) -> str:
275
+ """Create an HTML table of model-level quality scores."""
276
+ if not app_state["metrics"]:
277
+ return "No data loaded. Please load data first using the 'Load Data' tab."
278
+
279
+ if not selected_models:
280
+ return "Please select at least one model to display."
281
+
282
+ # Get model scores from metrics
283
+ model_scores = app_state["metrics"].get("model_scores", {})
284
+ if not model_scores:
285
+ return "No model scores available in the loaded data."
286
+
287
+ # Create model dataframe
288
+ model_df = create_model_dataframe(model_scores)
289
+
290
+ if model_df.empty:
291
+ return "No model data available."
292
+
293
+ # Filter to selected models
294
+ model_df = model_df[model_df['model'].isin(selected_models)]
295
+
296
+ if model_df.empty:
297
+ return "No data available for selected models."
298
+
299
+ # Find the actual ABSOLUTE quality column (not delta) that matches the requested metric
300
+ # We want raw quality scores, not deltas from baseline
301
+ quality_col = None
302
+ for col in model_df.columns:
303
+ if (col.startswith("quality_") and
304
+ not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant")) and
305
+ "delta" not in col.lower()): # Explicitly exclude any delta columns
306
+ # Check if the quality metric name is contained in the column name (case insensitive)
307
+ col_name = col.replace("quality_", "").lower()
308
+ if quality_metric.lower() in col_name:
309
+ quality_col = col
310
+ break
311
+
312
+ # If no match found, use the first available absolute quality column
313
+ if not quality_col:
314
+ available_quality_cols = [col for col in model_df.columns
315
+ if col.startswith("quality_")
316
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant"))
317
+ and "delta" not in col.lower()] # Explicitly exclude delta columns
318
+ if not available_quality_cols:
319
+ return "No quality metrics found in the data."
320
+ quality_col = available_quality_cols[0] # Use first available absolute quality metric
321
+
322
+ # Ensure quality values are numeric
323
+ model_df[quality_col] = pd.to_numeric(model_df[quality_col], errors='coerce')
324
+
325
+ # Check if we have any valid quality data
326
+ if model_df[quality_col].isna().all():
327
+ return f"No valid quality data found for metric '{quality_metric}'."
328
+
329
+ # Sort models by quality score (descending - best scores first)
330
+ model_df = model_df.sort_values(by=quality_col, ascending=False).reset_index(drop=True)
331
+
332
+ # Extract a clean metric name for display
333
+ metric_display_name = quality_col.replace("quality_", "").split("(")[0].strip()
334
+
335
+ # Define confidence interval column names
336
+ ci_lower_col = f"{quality_col}_ci_lower"
337
+ ci_upper_col = f"{quality_col}_ci_upper"
338
+
339
+ # Debug: Check if confidence interval columns exist
340
+ has_ci = ci_lower_col in model_df.columns and ci_upper_col in model_df.columns
341
+ if not has_ci:
342
+ # Try alternative naming pattern
343
+ metric_name = quality_col.replace("quality_", "")
344
+ alt_ci_lower = f"quality_{metric_name}_ci_lower"
345
+ alt_ci_upper = f"quality_{metric_name}_ci_upper"
346
+ if alt_ci_lower in model_df.columns and alt_ci_upper in model_df.columns:
347
+ ci_lower_col = alt_ci_lower
348
+ ci_upper_col = alt_ci_upper
349
+ has_ci = True
350
+
351
+ # Calculate ranks based on confidence intervals
352
+ # A model's rank = 1 + number of models that are confidently better (non-overlapping CIs)
353
+ ranks = []
354
+
355
+ if has_ci:
356
+ # Use confidence interval-based ranking
357
+ for i, row in model_df.iterrows():
358
+ # Get current model's quality score and confidence intervals
359
+ current_score = row[quality_col]
360
+ current_upper = row[ci_upper_col] if not pd.isna(row[ci_upper_col]) else current_score
361
+ current_lower = row[ci_lower_col] if not pd.isna(row[ci_lower_col]) else current_score
362
+
363
+ # Count how many models are confidently better
364
+ confidently_better = 0
365
+ for j, other_row in model_df.iterrows():
366
+ if i != j: # Don't compare with self
367
+ other_score = other_row[quality_col]
368
+ other_upper = other_row[ci_upper_col] if not pd.isna(other_row[ci_upper_col]) else other_score
369
+ other_lower = other_row[ci_lower_col] if not pd.isna(other_row[ci_lower_col]) else other_score
370
+
371
+ # Check if other model's CI is completely above current model's CI
372
+ # This means the other model is confidently better
373
+ if other_lower > current_upper:
374
+ confidently_better += 1
375
+
376
+ ranks.append(confidently_better + 1) # Rank = 1 + number confidently better
377
+ else:
378
+ # Fallback to simple ranking by quality score (no confidence intervals)
379
+ # Sort by quality score and assign ranks
380
+ sorted_indices = model_df[quality_col].sort_values(ascending=False).index
381
+ rank_dict = {idx: rank + 1 for rank, idx in enumerate(sorted_indices)}
382
+ ranks = [rank_dict[idx] for idx in model_df.index]
383
+
384
+ # Prepare table data
385
+ table_rows = []
386
+ for idx, row in model_df.iterrows():
387
+ model_name = row['model']
388
+ quality_score = row[quality_col]
389
+ rank = ranks[idx]
390
+
391
+ # Get confidence intervals if available
392
+ ci_text = ""
393
+ if ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
394
+ ci_lower = row[ci_lower_col]
395
+ ci_upper = row[ci_upper_col]
396
+ ci_text = f" [{ci_lower:.3f}, {ci_upper:.3f}]"
397
+
398
+ table_rows.append(f"""
399
+ <tr>
400
+ <td style=\"text-align: center; padding: 6px 8px; font-weight: bold; color: #2c3e50;\">{rank}</td>
401
+ <td style=\"padding: 6px 8px; color: #2c3e50;\">{model_name}</td>
402
+ <td style=\"text-align: center; padding: 6px 8px; color: #2c3e50;\">{quality_score:.3f}{ci_text}</td>
403
+ </tr>
404
+ """)
405
+
406
+ # Create HTML table
407
+ html_table = f"""
408
+ <div style="width: 100%; margin: 0; max-height: 340px; overflow: auto;">
409
+ <table style="width: 100%; border-collapse: collapse; background: white; border: 1px solid #ddd; border-radius: 4px; font-size: 13px;">
410
+ <thead>
411
+ <tr style="background: #f8f9fa; border-bottom: 2px solid #dee2e6;">
412
+ <th style="padding: 6px 8px; text-align: center; font-weight: bold; color: #495057; border-right: 1px solid #dee2e6;">Rank</th>
413
+ <th style="padding: 6px 8px; text-align: left; font-weight: bold; color: #495057; border-right: 1px solid #dee2e6;">Model</th>
414
+ <th style="padding: 6px 8px; text-align: center; font-weight: bold; color: #495057;">{metric_display_name}</th>
415
+ </tr>
416
+ </thead>
417
+ <tbody>
418
+ {''.join(table_rows)}
419
+ </tbody>
420
+ </table>
421
+ <p style="text-align: center; color: #6c757d; font-size: 11px; margin-top: 8px; font-family: Arial, sans-serif;">
422
+ {f"Ranks based on confidence intervals (non-overlapping CIs). Models with overlapping CIs may have the same rank." if has_ci else "Ranks based on quality scores (confidence intervals not available)."}
423
+ </p>
424
+ </div>
425
+ """
426
+
427
+ return html_table
428
+
429
+
430
+ def get_available_model_quality_metrics() -> List[str]:
431
+ """Get available quality metrics from the loaded model data."""
432
+ if not app_state["metrics"]:
433
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
434
+
435
+ model_scores = app_state["metrics"].get("model_scores", {})
436
+ if not model_scores:
437
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
438
+
439
+ # Create model dataframe to get available columns
440
+ model_df = create_model_dataframe(model_scores)
441
+
442
+ if model_df.empty:
443
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
444
+
445
+ # Find all ABSOLUTE quality columns (excluding CI, delta, and other suffix columns)
446
+ quality_columns = [col for col in model_df.columns
447
+ if col.startswith("quality_")
448
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant"))
449
+ and "delta" not in col.lower()]
450
+
451
+ # Extract simplified metric names for dropdown choices
452
+ # These will be matched against the full column names in create_model_quality_plot
453
+ available_quality_metrics = []
454
+ for col in quality_columns:
455
+ # Remove "quality_" prefix and extract the main metric name
456
+ metric_name = col.replace("quality_", "").split("(")[0].strip().lower()
457
+ # Use common simplified names that users would expect
458
+ if "help" in metric_name:
459
+ available_quality_metrics.append("helpfulness")
460
+ elif "understand" in metric_name:
461
+ available_quality_metrics.append("understandability")
462
+ elif "complete" in metric_name:
463
+ available_quality_metrics.append("completeness")
464
+ elif "concise" in metric_name:
465
+ available_quality_metrics.append("conciseness")
466
+ elif "harm" in metric_name:
467
+ available_quality_metrics.append("harmlessness")
468
+ else:
469
+ # For other metrics, use the first word
470
+ available_quality_metrics.append(metric_name.split()[0])
471
+
472
+ # Remove duplicates while preserving order
473
+ available_quality_metrics = list(dict.fromkeys(available_quality_metrics))
474
+
475
+ # If no quality metrics found, provide defaults
476
+ if not available_quality_metrics:
477
+ available_quality_metrics = ["helpfulness", "accuracy", "harmlessness", "honesty"]
478
+
479
+ return available_quality_metrics
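Stripped of the dataframe plumbing, the CI-based ranking rule in `create_model_quality_table` reduces to: a model's rank is 1 plus the number of models whose CI lower bound lies entirely above its CI upper bound. A compact sketch (the interval values are invented):

    from typing import List, Tuple

    def ci_ranks(intervals: List[Tuple[float, float]]) -> List[int]:
        # intervals: one (ci_lower, ci_upper) pair per model
        ranks = []
        for _, hi_i in intervals:
            confidently_better = sum(1 for lo_j, _ in intervals if lo_j > hi_i)
            ranks.append(1 + confidently_better)
        return ranks

    # Two overlapping leaders share rank 1; the clearly worse model ranks 3rd.
    print(ci_ranks([(0.70, 0.80), (0.68, 0.78), (0.40, 0.50)]))  # -> [1, 1, 3]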
stringsight/dashboard/plots_tab.py ADDED
@@ -0,0 +1,444 @@
1
+ """
2
+ Plots tab for the LMM-Vibes Gradio app.
3
+
4
+ This module provides functionality to display the model cluster proportion and quality plots.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from typing import Tuple, List, Optional, Any
12
+
13
+ from .state import app_state
14
+ from .utils import extract_allowed_tag, ALLOWED_TAGS
15
+
16
+
17
+ def create_proportion_plot(selected_clusters: Optional[List[str]] = None, show_ci: bool = False, selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Tuple[go.Figure, str]:
18
+ """Create a grouped bar plot of proportion by property and model."""
19
+ if app_state.get("model_cluster_df") is None:
20
+ return None, "No model cluster data loaded. Please load data first."
21
+
22
+ model_cluster_df = app_state["model_cluster_df"]
23
+
24
+ if model_cluster_df.empty:
25
+ return None, "No model cluster data available."
26
+
27
+ # Ensure proportion values are numeric and in reasonable range
28
+ model_cluster_df = model_cluster_df.copy()
29
+
30
+ # Optional: filter to selected models (ignore the pseudo 'all' entry if present)
31
+ if selected_models:
32
+ concrete_models = [m for m in selected_models if m != "all"]
33
+ if concrete_models:
34
+ model_cluster_df = model_cluster_df[model_cluster_df["model"].isin(concrete_models)]
35
+ model_cluster_df['proportion'] = pd.to_numeric(model_cluster_df['proportion'], errors='coerce')
36
+
37
+ # Debug: log the observed proportion range after numeric coercion
38
+ print("After conversion - Proportion range:", model_cluster_df['proportion'].min(), "to", model_cluster_df['proportion'].max())
39
+
40
+ # Filter out "No properties" clusters
41
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'] != "No properties"]
42
+
43
+ # Optional: filter clusters by selected tags using metrics.cluster_scores metadata
44
+ if selected_tags:
45
+ metrics = app_state.get("metrics", {})
46
+ cluster_scores = metrics.get("cluster_scores", {})
47
+ def _first_allowed(meta_obj: Any) -> Any:
48
+ return extract_allowed_tag(meta_obj)
49
+ allowed = set(map(str, selected_tags))
50
+ allowed_clusters = {c for c, d in cluster_scores.items() if str(_first_allowed(d.get("metadata"))) in allowed}
51
+ if allowed_clusters:
52
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'].isin(allowed_clusters)]
53
+
54
+ # Determine which clusters to include: user-selected or default top 15 by aggregated frequency
55
+ cluster_freq = (
56
+ model_cluster_df.groupby('cluster', as_index=False)['proportion']
57
+ .sum()
58
+ .sort_values('proportion', ascending=False)
59
+ )
60
+ if selected_clusters:
61
+ chosen_clusters = [c for c in selected_clusters if c in cluster_freq['cluster'].tolist()]
62
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'].isin(chosen_clusters)]
63
+ else:
64
+ default_top = cluster_freq['cluster'].head(15).tolist() if len(cluster_freq) > 15 else cluster_freq['cluster'].tolist()
65
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'].isin(default_top)]
66
+
67
+ # Decide whether to abbreviate property names based on word count
68
+ # If any property name has more than 6 words, we abbreviate (P1, P2, ...)
69
+ unique_properties = sorted(model_cluster_df['cluster'].unique())
70
+ should_abbreviate = any(len(str(prop).split()) > 6 for prop in unique_properties)
71
+
72
+ mapping_text_parts: List[str] = []
73
+ if should_abbreviate:
74
+ property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
75
+ model_cluster_df['display_label'] = model_cluster_df['cluster'].map(property_mapping)
76
+ # Prepare mapping legend text
77
+ mapping_text_parts.append("**Property Mapping**\n\n")
78
+ for prop, abbr in property_mapping.items():
79
+ mapping_text_parts.append(f"**{abbr}:** {prop}\n\n")
80
+ else:
81
+ # Use full names directly as x tick labels
82
+ model_cluster_df['display_label'] = model_cluster_df['cluster']
83
+
84
+ # Prepare confidence interval data if requested
85
+ error_y_data = None
86
+ if show_ci and 'proportion_ci_lower' in model_cluster_df.columns and 'proportion_ci_upper' in model_cluster_df.columns:
87
+ # Calculate error bar values
88
+ model_cluster_df['y_error'] = model_cluster_df['proportion_ci_upper'] - model_cluster_df['proportion']
89
+ model_cluster_df['y_error_minus'] = model_cluster_df['proportion'] - model_cluster_df['proportion_ci_lower']
90
+ # Replace NaN values with 0
91
+ model_cluster_df['y_error'] = model_cluster_df['y_error'].fillna(0)
92
+ model_cluster_df['y_error_minus'] = model_cluster_df['y_error_minus'].fillna(0)
93
+ error_y_data = model_cluster_df['y_error']
94
+ # px.bar below reads the 'y_error'/'y_error_minus' columns by name
95
+
96
+ # Create a grouped bar plot of 'proportion' by property (x) and model (hue)
97
+ fig = px.bar(
98
+ model_cluster_df,
99
+ x="display_label",
100
+ y="proportion",
101
+ color="model",
102
+ barmode="group",
103
+ title=None,
104
+ labels={"proportion": "Proportion", "display_label": "Property", "model": "Model"},
105
+ error_y="y_error" if error_y_data is not None else None,
106
+ error_y_minus="y_error_minus" if error_y_data is not None else None
107
+ )
108
+
109
+ # Set the x-axis order to ensure consistent ordering
110
+ property_order = [f"P{i+1}" for i in range(len(unique_properties))] if should_abbreviate else unique_properties
111
+ fig.update_xaxes(categoryorder='array', categoryarray=property_order)
112
+ fig.update_layout(xaxis_tickangle=45)
113
+ # Make layout responsive and move legend to the top to utilize full width
114
+ fig.update_layout(
115
+ autosize=True,
116
+ margin=dict(l=40, r=40, t=110, b=80),
117
+ title=dict(pad=dict(t=20, b=10)),
118
+ legend=dict(
119
+ orientation="h",
120
+ yanchor="bottom",
121
+ y=1.15,
122
+ xanchor="left",
123
+ x=0
124
+ )
125
+ )
126
+
127
+ # Persist an HTML copy of the figure (written to the current working directory)
128
+ fig.write_html("model_cluster_proportion_plot.html")
129
+
130
+ # Build info/legend text
131
+ if show_ci:
132
+ if 'proportion_ci_lower' in model_cluster_df.columns and 'proportion_ci_upper' in model_cluster_df.columns:
133
+ if mapping_text_parts:
134
+ mapping_text_parts.append("---\n\n")
135
+ mapping_text_parts.append("**Confidence Intervals:**\n")
136
+ mapping_text_parts.append("Error bars show 95% confidence intervals for proportion values.\n")
137
+ else:
138
+ if mapping_text_parts:
139
+ mapping_text_parts.append("---\n\n")
140
+ mapping_text_parts.append("**Note:** Confidence interval data not available in the loaded dataset.\n")
141
+
142
+ mapping_text = "".join(mapping_text_parts)
143
+
144
+ return fig, mapping_text
145
+
146
+
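A minimal driver for the function above, assuming pipeline results have already been loaded into app_state (the model names here are hypothetical):

fig, legend_md = create_proportion_plot(
    selected_clusters=None,   # default: top 15 clusters by frequency
    show_ci=True,             # error bars drawn only if CI columns exist
    selected_models=["gpt-4o", "claude-3-5-sonnet"],
)
if fig is not None:
    fig.show()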
147
+ def create_quality_plot(quality_metric: str = "helpfulness", selected_clusters: Optional[List[str]] = None, show_ci: bool = False, selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Tuple[go.Figure, str]:
148
+ """Create a grouped bar plot of quality by property and model."""
149
+ if app_state.get("model_cluster_df") is None:
150
+ return None, "No model cluster data loaded. Please load data first."
151
+
152
+ model_cluster_df = app_state["model_cluster_df"]
153
+
154
+ if model_cluster_df.empty:
155
+ return None, "No model cluster data available."
156
+
157
+ # Check if the quality metric exists in the data
158
+ quality_col = f"quality_{quality_metric}"
159
+ if quality_col not in model_cluster_df.columns:
160
+ # Get available quality metrics for better error message
161
+ available_metrics = [col.replace("quality_", "") for col in model_cluster_df.columns
162
+ if col.startswith("quality_")
163
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta"))]
164
+ if not available_metrics:
165
+ return None, f"No quality metrics found in the data. Available columns: {list(model_cluster_df.columns)}"
166
+ return None, f"Quality metric '{quality_metric}' not found. Available metrics: {available_metrics}"
167
+
168
+ # Create a copy for plotting
169
+ plot_df = model_cluster_df.copy()
170
+
171
+ # Optional: filter clusters by selected tags using metrics.cluster_scores metadata
172
+ if selected_tags:
173
+ metrics = app_state.get("metrics", {})
174
+ cluster_scores = metrics.get("cluster_scores", {})
175
+ def _first_allowed(meta_obj: Any) -> Any:
176
+ return extract_allowed_tag(meta_obj)
177
+ allowed = set(map(str, selected_tags))
178
+ allowed_clusters = {c for c, d in cluster_scores.items() if str(_first_allowed(d.get("metadata"))) in allowed}
179
+ if allowed_clusters:
180
+ plot_df = plot_df[plot_df['cluster'].isin(allowed_clusters)]
181
+
182
+ # Optional: filter to selected models (ignore the pseudo 'all' entry if present)
183
+ if selected_models:
184
+ concrete_models = [m for m in selected_models if m != "all"]
185
+ if concrete_models:
186
+ plot_df = plot_df[plot_df["model"].isin(concrete_models)]
187
+
188
+ # Ensure quality values are numeric
189
+ plot_df[quality_col] = pd.to_numeric(plot_df[quality_col], errors='coerce')
190
+
191
+ # Check if we have any valid quality data
192
+ if plot_df[quality_col].isna().all():
193
+ return None, f"No valid quality data found for metric '{quality_metric}'. All values are missing or invalid."
194
+
195
+ # Filter out "No properties" clusters
196
+ plot_df = plot_df[plot_df['cluster'] != "No properties"]
197
+
198
+ # Determine which clusters to include: user-selected or default top 15 by aggregated frequency
199
+ cluster_freq = (
200
+ plot_df[plot_df['cluster'] != "No properties"]
201
+ .groupby('cluster', as_index=False)['proportion']
202
+ .sum()
203
+ .sort_values('proportion', ascending=False)
204
+ )
205
+ if selected_clusters:
206
+ chosen_clusters = [c for c in selected_clusters if c in cluster_freq['cluster'].tolist()]
207
+ plot_df = plot_df[plot_df['cluster'].isin(chosen_clusters)]
208
+ else:
209
+ default_top = cluster_freq['cluster'].head(15).tolist() if len(cluster_freq) > 15 else cluster_freq['cluster'].tolist()
210
+ plot_df = plot_df[plot_df['cluster'].isin(default_top)]
211
+
212
+ # Decide whether to abbreviate property names based on word count
213
+ unique_properties = sorted(plot_df['cluster'].unique())
214
+ should_abbreviate = any(len(str(prop).split()) > 6 for prop in unique_properties)
215
+
216
+ mapping_text_parts: List[str] = []
217
+ if should_abbreviate:
218
+ property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
219
+ plot_df['display_label'] = plot_df['cluster'].map(property_mapping)
220
+ # Prepare mapping legend text
221
+ mapping_text_parts.append("**Property Mapping:**\n\n")
222
+ for prop, abbr in property_mapping.items():
223
+ mapping_text_parts.append(f"**{abbr}:** {prop}\n\n")
224
+ else:
225
+ plot_df['display_label'] = plot_df['cluster']
226
+
227
+ # Prepare confidence interval data if requested
228
+ error_y_data = None
229
+ if show_ci:
230
+ ci_lower_col = f"{quality_col}_ci_lower"
231
+ ci_upper_col = f"{quality_col}_ci_upper"
232
+ if ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns:
233
+ # Calculate error bar values
234
+ plot_df['y_error'] = plot_df[ci_upper_col] - plot_df[quality_col]
235
+ plot_df['y_error_minus'] = plot_df[quality_col] - plot_df[ci_lower_col]
236
+ # Replace NaN values with 0
237
+ plot_df['y_error'] = plot_df['y_error'].fillna(0)
238
+ plot_df['y_error_minus'] = plot_df['y_error_minus'].fillna(0)
239
+ error_y_data = plot_df['y_error']
240
+ # px.bar below reads the 'y_error'/'y_error_minus' columns by name
241
+
242
+ # Create a grouped bar plot of quality by property (x) and model (hue)
243
+ fig = px.bar(
244
+ plot_df,
245
+ x="display_label",
246
+ y=quality_col,
247
+ color="model",
248
+ barmode="group",
249
+ title=None,
250
+ labels={quality_col: f"Quality ({quality_metric.title()})", "display_label": "Property", "model": "Model"},
251
+ error_y="y_error" if error_y_data is not None else None,
252
+ error_y_minus="y_error_minus" if error_y_data is not None else None
253
+ )
254
+
255
+ # Set the x-axis order to ensure consistent ordering
256
+ property_order = [f"P{i+1}" for i in range(len(unique_properties))] if should_abbreviate else unique_properties
257
+ fig.update_xaxes(categoryorder='array', categoryarray=property_order)
258
+ fig.update_layout(xaxis_tickangle=45)
259
+ # Make layout responsive and move legend to the top to utilize full width
260
+ fig.update_layout(
261
+ autosize=True,
262
+ margin=dict(l=40, r=40, t=110, b=80),
263
+ title=dict(pad=dict(t=20, b=10)),
264
+ legend=dict(
265
+ orientation="h",
266
+ yanchor="bottom",
267
+ y=1.15,
268
+ xanchor="left",
269
+ x=0
270
+ )
271
+ )
272
+
273
+ # Persist an HTML copy of the figure (written to the current working directory)
274
+ fig.write_html(f"model_cluster_quality_{quality_metric}_plot.html")
275
+
276
+ # Build info/legend text
277
+ if show_ci:
278
+ ci_lower_col = f"{quality_col}_ci_lower"
279
+ ci_upper_col = f"{quality_col}_ci_upper"
280
+ if ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns:
281
+ if mapping_text_parts:
282
+ mapping_text_parts.append("---\n\n")
283
+ mapping_text_parts.append("**Confidence Intervals:**\n")
284
+ mapping_text_parts.append(f"Error bars show 95% confidence intervals for {quality_metric} values.\n")
285
+ else:
286
+ if mapping_text_parts:
287
+ mapping_text_parts.append("---\n\n")
288
+ mapping_text_parts.append("**Note:** Confidence interval data not available for this quality metric.\n")
289
+
290
+ mapping_text = "".join(mapping_text_parts)
291
+
292
+ return fig, mapping_text
293
+
294
+
295
+ def get_available_quality_metrics() -> List[str]:
296
+ """Get available quality metrics from the loaded DataFrame."""
297
+ if app_state.get("model_cluster_df") is None:
298
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
299
+
300
+ model_cluster_df = app_state["model_cluster_df"]
301
+ # Find all quality columns (excluding CI and other suffix columns)
302
+ quality_columns = [
303
+ col for col in model_cluster_df.columns
304
+ if col.startswith("quality_")
305
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta"))
306
+ and ("delta" not in col.lower())
307
+ ]
308
+ # Extract metric names by removing "quality_" prefix
309
+ available_quality_metrics = [col.replace("quality_", "") for col in quality_columns]
310
+
311
+ # If no quality metrics found, provide defaults
312
+ if not available_quality_metrics:
313
+ available_quality_metrics = ["helpfulness", "accuracy", "harmlessness", "honesty"]
314
+
315
+ return available_quality_metrics
316
+
317
+
318
+ def update_quality_metric_dropdown() -> gr.Dropdown:
319
+ """Update the quality metric dropdown with available metrics."""
320
+ available_metrics = get_available_quality_metrics()
321
+ return gr.Dropdown(
322
+ label="Quality Metric",
323
+ choices=available_metrics,
324
+ value=available_metrics[0] if available_metrics else "helpfulness",
325
+ info="Select which quality metric to display"
326
+ )
327
+
328
+
329
+ def update_quality_metric_visibility(plot_type: str) -> Any:
330
+ """Update the quality metric dropdown visibility based on plot type."""
331
+ if plot_type == "quality":
332
+ available_metrics = get_available_quality_metrics()
333
+ return gr.update(
334
+ choices=available_metrics,
335
+ value=(available_metrics[0] if available_metrics else None),
336
+ visible=True,
337
+ )
338
+ # When not in quality mode, clear value and choices to avoid stale selections
339
+ return gr.update(choices=[], value=None, visible=False)
340
+
341
+
342
+ def create_plot_with_toggle(plot_type: str, quality_metric: str = "helpfulness", selected_clusters: Optional[List[str]] = None, show_ci: bool = False, selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Tuple[go.Figure, str]:
343
+ """Create a plot based on the selected type (frequency or quality)."""
344
+ if plot_type == "frequency":
345
+ return create_proportion_plot(selected_clusters, show_ci, selected_models, selected_tags)
346
+ elif plot_type == "quality":
347
+ return create_quality_plot(quality_metric, selected_clusters, show_ci, selected_models, selected_tags)
348
+ else:
349
+ return None, f"Unknown plot type: {plot_type}"
350
+
351
+
352
+ def create_plots_tab() -> Tuple[gr.Plot, gr.Markdown, gr.Checkbox, gr.Dropdown, gr.Dropdown, gr.CheckboxGroup]:
353
+ """Create the plots tab interface with a toggle between frequency and quality plots."""
354
+ # Accordion at the top for selecting specific properties
355
+ with gr.Accordion("Select properties to display", open=False):
356
+ cluster_selector = gr.CheckboxGroup(
357
+ label="Select Clusters (Properties)",
358
+ choices=[],
359
+ value=[],
360
+ info="Defaults to the top 15 by frequency.",
361
+ show_label=False
362
+ )
363
+
364
+ # Plot controls in a row
365
+ with gr.Row():
366
+ # Plot type toggle
367
+ plot_type_dropdown = gr.Dropdown(
368
+ label="Plot Type",
369
+ choices=["frequency", "quality"],
370
+ value="frequency",
371
+ info="Choose between frequency (proportion) or quality metrics"
372
+ )
373
+
374
+ # Quality metric dropdown (only visible for quality plots)
375
+ quality_metric_dropdown = gr.Dropdown(
376
+ label="Quality Metric",
377
+ choices=[],
378
+ value=None,
379
+ info="Select which quality metric to display",
380
+ visible=False # Initially hidden, shown when quality is selected
381
+ )
382
+
383
+
384
+ # Add checkbox for confidence intervals
385
+ show_ci_checkbox = gr.Checkbox(
386
+ label="Show Confidence Intervals",
387
+ value=False,
388
+ info="Display 95% confidence intervals as error bars (if available in data)"
389
+ )
390
+
391
+ plot_display = gr.Plot(
392
+ label="Model-Cluster Analysis Plot",
393
+ show_label=False,
394
+ value=None
395
+ )
396
+
397
+ # Mapping text should appear directly below the plot
398
+ plot_info = gr.Markdown("")
399
+
400
+ return plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown, cluster_selector
401
+
402
+
403
+ def update_cluster_selection(selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Any:
404
+ """Populate the cluster selector choices and default selection (top 15 by frequency).
405
+
406
+ If selected_models is provided, restrict clusters to those computed from the selected models.
407
+ """
408
+ if app_state.get("model_cluster_df") is None:
409
+ return gr.update(choices=[], value=[])
410
+ df = app_state["model_cluster_df"]
411
+ # Optional: filter to selected models (ignore the pseudo 'all' entry if present)
412
+ if selected_models:
413
+ concrete_models = [m for m in selected_models if m != "all"]
414
+ if concrete_models:
415
+ df = df[df["model"].isin(concrete_models)]
416
+ # Optional: filter by selected tags using cluster_scores metadata
417
+ if selected_tags:
418
+ metrics = app_state.get("metrics", {})
419
+ cluster_scores = metrics.get("cluster_scores", {})
420
+ def _first_allowed(meta_obj: Any) -> Any:
421
+ return extract_allowed_tag(meta_obj)
422
+ allowed = set(map(str, selected_tags))
423
+ allowed_clusters = {c for c, d in cluster_scores.items() if str(_first_allowed(d.get("metadata"))) in allowed}
424
+ if allowed_clusters:
425
+ df = df[df['cluster'].isin(allowed_clusters)]
426
+
427
+ if df.empty or 'cluster' not in df.columns or 'proportion' not in df.columns:
428
+ return gr.update(choices=[], value=[])
429
+ # Exclude "No properties"
430
+ df = df[df['cluster'] != "No properties"].copy()
431
+ freq = (
432
+ df.groupby('cluster', as_index=False)['proportion']
433
+ .sum()
434
+ .sort_values('proportion', ascending=False)
435
+ )
436
+ clusters_ordered = freq['cluster'].tolist()
437
+ # Build label-value tuples; strip '**' from labels only (values remain raw)
438
+ label_value_choices = []
439
+ for cluster in clusters_ordered:
440
+ raw_val = str(cluster)
441
+ label = raw_val.replace('**', '')
442
+ label_value_choices.append((label, raw_val))
443
+ default_values = [str(cluster) for cluster in clusters_ordered[:15]]
444
+ return gr.update(choices=label_value_choices, value=default_values)
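The components returned by create_plots_tab are wired up by the caller; a rough sketch of that wiring follows (the actual hookup lives in the app builder and may differ):

with gr.Blocks() as demo:
    plot, info, ci_box, type_dd, metric_dd, cluster_sel = create_plots_tab()
    # Show/refresh the metric dropdown when the plot type changes
    type_dd.change(update_quality_metric_visibility, inputs=[type_dd], outputs=[metric_dd])
    # Re-render the plot whenever any control changes
    for comp in (type_dd, metric_dd, cluster_sel, ci_box):
        comp.change(
            create_plot_with_toggle,
            inputs=[type_dd, metric_dd, cluster_sel, ci_box],
            outputs=[plot, info],
        )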
stringsight/dashboard/plotting.py ADDED
@@ -0,0 +1,616 @@
1
+ """
2
+ Plotting functionality for functional metrics.
3
+
4
+ This module provides comprehensive visualizations of the metrics produced by functional_metrics.py.
5
+ """
6
+
7
+ import json
8
+ import pandas as pd
9
+ import numpy as np
10
+ from pathlib import Path
11
+ from typing import Dict, Any, List, Optional
12
+ import warnings
13
+
14
+ import plotly.graph_objects as go
15
+ import plotly.express as px
16
+ from plotly.subplots import make_subplots
17
+ import plotly.io as pio
18
+
19
+ # Set plotly template
20
+ pio.templates.default = "plotly_white"
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
+ def create_model_cluster_dataframe(model_cluster_scores: Dict[str, Any]) -> pd.DataFrame:
25
+ """Convert model-cluster scores to a tidy dataframe."""
26
+ rows = []
27
+ for model, clusters in model_cluster_scores.items():
28
+ for cluster, metrics in clusters.items():
29
+ # Filter out "No properties" clusters
30
+ if cluster == "No properties":
31
+ continue
32
+
33
+ row = {
34
+ 'model': model,
35
+ 'cluster': cluster,
36
+ 'size': metrics.get('size', 0),
37
+ 'proportion': metrics.get('proportion', 0),
38
+ 'proportion_delta': metrics.get('proportion_delta', 0)
39
+ }
40
+
41
+ # Add confidence intervals if available
42
+ if 'proportion_ci' in metrics:
43
+ ci = metrics['proportion_ci']
44
+ row.update({
45
+ 'proportion_ci_lower': ci.get('lower', 0),
46
+ 'proportion_ci_upper': ci.get('upper', 0),
47
+ 'proportion_ci_mean': ci.get('mean', 0)
48
+ })
49
+
50
+ if 'proportion_delta_ci' in metrics:
51
+ ci = metrics['proportion_delta_ci']
52
+ row.update({
53
+ 'proportion_delta_ci_lower': ci.get('lower', 0),
54
+ 'proportion_delta_ci_upper': ci.get('upper', 0),
55
+ 'proportion_delta_ci_mean': ci.get('mean', 0)
56
+ })
57
+
58
+ # Add significance flags
59
+ row['proportion_delta_significant'] = metrics.get('proportion_delta_significant', False)
60
+
61
+ # Add quality metrics
62
+ quality = metrics.get('quality', {})
63
+ quality_delta = metrics.get('quality_delta', {})
64
+ quality_ci = metrics.get('quality_ci', {})
65
+ quality_delta_ci = metrics.get('quality_delta_ci', {})
66
+ quality_delta_significant = metrics.get('quality_delta_significant', {})
67
+
68
+ for metric_name in quality.keys():
69
+ row[f'quality_{metric_name}'] = quality[metric_name]
70
+ row[f'quality_delta_{metric_name}'] = quality_delta.get(metric_name, 0)
71
+ row[f'quality_delta_{metric_name}_significant'] = quality_delta_significant.get(metric_name, False)
72
+
73
+ if metric_name in quality_ci:
74
+ ci = quality_ci[metric_name]
75
+ row.update({
76
+ f'quality_{metric_name}_ci_lower': ci.get('lower', 0),
77
+ f'quality_{metric_name}_ci_upper': ci.get('upper', 0),
78
+ f'quality_{metric_name}_ci_mean': ci.get('mean', 0)
79
+ })
80
+
81
+ if metric_name in quality_delta_ci:
82
+ ci = quality_delta_ci[metric_name]
83
+ row.update({
84
+ f'quality_delta_{metric_name}_ci_lower': ci.get('lower', 0),
85
+ f'quality_delta_{metric_name}_ci_upper': ci.get('upper', 0),
86
+ f'quality_delta_{metric_name}_ci_mean': ci.get('mean', 0)
87
+ })
88
+
89
+ rows.append(row)
90
+
91
+ return pd.DataFrame(rows)
92
+
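The nested structure this flattener expects looks roughly like the following (field names are taken from the .get() accessors above; the names and values are illustrative):

model_cluster_scores = {
    "gpt-4o": {                      # model name (hypothetical)
        "verbose responses": {       # cluster name (hypothetical)
            "size": 42,
            "proportion": 0.21,
            "proportion_delta": 0.05,
            "proportion_ci": {"lower": 0.18, "upper": 0.24, "mean": 0.21},
            "quality": {"helpfulness": 0.73},
            "quality_delta": {"helpfulness": -0.02},
            "quality_delta_significant": {"helpfulness": False},
        },
    },
}
df = create_model_cluster_dataframe(model_cluster_scores)  # one row per (model, cluster) pair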
93
+
94
+ def create_cluster_dataframe(cluster_scores: Dict[str, Any]) -> pd.DataFrame:
95
+ """Convert cluster scores to a tidy dataframe."""
96
+ rows = []
97
+ for cluster, metrics in cluster_scores.items():
98
+ # Filter out "No properties" clusters
99
+ if cluster == "No properties":
100
+ continue
101
+
102
+ row = {
103
+ 'cluster': cluster,
104
+ 'size': metrics.get('size', 0),
105
+ 'proportion': metrics.get('proportion', 0)
106
+ }
107
+
108
+ # Add confidence intervals if available
109
+ if 'proportion_ci' in metrics:
110
+ ci = metrics['proportion_ci']
111
+ row.update({
112
+ 'proportion_ci_lower': ci.get('lower', 0),
113
+ 'proportion_ci_upper': ci.get('upper', 0),
114
+ 'proportion_ci_mean': ci.get('mean', 0)
115
+ })
116
+
117
+ # Add quality metrics
118
+ quality = metrics.get('quality', {})
119
+ quality_delta = metrics.get('quality_delta', {})
120
+ quality_ci = metrics.get('quality_ci', {})
121
+ quality_delta_ci = metrics.get('quality_delta_ci', {})
122
+ quality_delta_significant = metrics.get('quality_delta_significant', {})
123
+
124
+ for metric_name in quality.keys():
125
+ row[f'quality_{metric_name}'] = quality[metric_name]
126
+ row[f'quality_delta_{metric_name}'] = quality_delta.get(metric_name, 0)
127
+ row[f'quality_delta_{metric_name}_significant'] = quality_delta_significant.get(metric_name, False)
128
+
129
+ if metric_name in quality_ci:
130
+ ci = quality_ci[metric_name]
131
+ row.update({
132
+ f'quality_{metric_name}_ci_lower': ci.get('lower', 0),
133
+ f'quality_{metric_name}_ci_upper': ci.get('upper', 0),
134
+ f'quality_{metric_name}_ci_mean': ci.get('mean', 0)
135
+ })
136
+
137
+ if metric_name in quality_delta_ci:
138
+ ci = quality_delta_ci[metric_name]
139
+ row.update({
140
+ f'quality_delta_{metric_name}_ci_lower': ci.get('lower', 0),
141
+ f'quality_delta_{metric_name}_ci_upper': ci.get('upper', 0),
142
+ f'quality_delta_{metric_name}_ci_mean': ci.get('mean', 0)
143
+ })
144
+
145
+ rows.append(row)
146
+
147
+ return pd.DataFrame(rows)
148
+
149
+
150
+ def create_model_dataframe(model_scores: Dict[str, Any]) -> pd.DataFrame:
151
+ """Convert model scores to a tidy dataframe."""
152
+ rows = []
153
+ for model, metrics in model_scores.items():
154
+ row = {
155
+ 'model': model,
156
+ 'size': metrics.get('size', 0),
157
+ 'proportion': metrics.get('proportion', 0)
158
+ }
159
+
160
+ # Add confidence intervals if available
161
+ if 'proportion_ci' in metrics:
162
+ ci = metrics['proportion_ci']
163
+ row.update({
164
+ 'proportion_ci_lower': ci.get('lower', 0),
165
+ 'proportion_ci_upper': ci.get('upper', 0),
166
+ 'proportion_ci_mean': ci.get('mean', 0)
167
+ })
168
+
169
+ # Add quality metrics
170
+ quality = metrics.get('quality', {})
171
+ quality_delta = metrics.get('quality_delta', {})
172
+ quality_ci = metrics.get('quality_ci', {})
173
+ quality_delta_ci = metrics.get('quality_delta_ci', {})
174
+ quality_delta_significant = metrics.get('quality_delta_significant', {})
175
+
176
+ for metric_name in quality.keys():
177
+ row[f'quality_{metric_name}'] = quality[metric_name]
178
+ row[f'quality_delta_{metric_name}'] = quality_delta.get(metric_name, 0)
179
+ row[f'quality_delta_{metric_name}_significant'] = quality_delta_significant.get(metric_name, False)
180
+
181
+ if metric_name in quality_ci:
182
+ ci = quality_ci[metric_name]
183
+ row.update({
184
+ f'quality_{metric_name}_ci_lower': ci.get('lower', 0),
185
+ f'quality_{metric_name}_ci_upper': ci.get('upper', 0),
186
+ f'quality_{metric_name}_ci_mean': ci.get('mean', 0)
187
+ })
188
+
189
+ if metric_name in quality_delta_ci:
190
+ ci = quality_delta_ci[metric_name]
191
+ row.update({
192
+ f'quality_delta_{metric_name}_ci_lower': ci.get('lower', 0),
193
+ f'quality_delta_{metric_name}_ci_upper': ci.get('upper', 0),
194
+ f'quality_delta_{metric_name}_ci_mean': ci.get('mean', 0)
195
+ })
196
+
197
+ rows.append(row)
198
+
199
+ return pd.DataFrame(rows)
200
+
201
+
202
+ def get_quality_metrics(df: pd.DataFrame) -> List[str]:
203
+ """Extract quality metric names from dataframe columns."""
204
+ quality_cols = [col for col in df.columns if col.startswith('quality_') and not col.endswith(('_ci_lower', '_ci_upper', '_ci_mean', '_significant'))]
205
+ return [col.replace('quality_', '') for col in quality_cols]
206
+
207
+
208
+ def create_interactive_cluster_plot(cluster_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
209
+ metric_col: str, title: str,
210
+ ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
211
+ significant_col: Optional[str] = None) -> go.Figure:
212
+ """Create an interactive cluster plot with dropdown for view mode."""
213
+
214
+ # Create the figure with subplots
215
+ fig = make_subplots(
216
+ rows=1, cols=1,
217
+ specs=[[{"secondary_y": False}]],
218
+ subplot_titles=[title]
219
+ )
220
+
221
+ # Prepare cluster_df - reset index if cluster is the index
222
+ if 'cluster' not in cluster_df.columns and cluster_df.index.name == 'cluster':
223
+ cluster_df = cluster_df.reset_index()
224
+
225
+ # Sort clusters by metric value in descending order for consistent ordering
226
+ cluster_df = cluster_df.sort_values(metric_col, ascending=False)
227
+
228
+ # Add aggregated view (default) - using cluster_df
229
+ if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
230
+ fig.add_trace(
231
+ go.Bar(
232
+ x=cluster_df['cluster'],
233
+ y=cluster_df[metric_col],
234
+ name='Aggregated (All Models)',
235
+ error_y=dict(
236
+ type='data',
237
+ array=cluster_df[ci_upper_col] - cluster_df[metric_col],
238
+ arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
239
+ visible=True
240
+ ),
241
+ visible=True
242
+ )
243
+ )
244
+ else:
245
+ fig.add_trace(
246
+ go.Bar(
247
+ x=cluster_df['cluster'],
248
+ y=cluster_df[metric_col],
249
+ name='Aggregated (All Models)',
250
+ visible=True
251
+ )
252
+ )
253
+
254
+ # Grouped by model view - using model_cluster_df
255
+ for model in model_cluster_df['model'].unique():
256
+ model_df = model_cluster_df[model_cluster_df['model'] == model]
257
+ # Sort model_df to match the cluster order
258
+ model_df = model_df.set_index('cluster').reindex(cluster_df['cluster']).reset_index()
259
+ if ci_lower_col and ci_upper_col and ci_lower_col in model_cluster_df.columns and ci_upper_col in model_cluster_df.columns:
260
+ fig.add_trace(
261
+ go.Bar(
262
+ x=model_df['cluster'],
263
+ y=model_df[metric_col],
264
+ name=f'Model: {model}',
265
+ error_y=dict(
266
+ type='data',
267
+ array=model_df[ci_upper_col] - model_df[metric_col],
268
+ arrayminus=model_df[metric_col] - model_df[ci_lower_col],
269
+ visible=False
270
+ ),
271
+ visible=False
272
+ )
273
+ )
274
+ else:
275
+ fig.add_trace(
276
+ go.Bar(
277
+ x=model_df['cluster'],
278
+ y=model_df[metric_col],
279
+ name=f'Model: {model}',
280
+ visible=False
281
+ )
282
+ )
283
+
284
+ # Add significance markers if available (for aggregated view)
285
+ # Red asterisks (*) indicate clusters with statistically significant quality delta values
286
+ # (confidence intervals that do not contain 0)
287
+ if significant_col and significant_col in cluster_df.columns:
288
+ for i, (cluster, is_sig) in enumerate(zip(cluster_df['cluster'], cluster_df[significant_col])):
289
+ if is_sig:
290
+ fig.add_annotation(
291
+ x=cluster,
292
+ y=cluster_df[cluster_df['cluster'] == cluster][metric_col].iloc[0],
293
+ text="*",
294
+ showarrow=False,
295
+ font=dict(size=16, color="red"),
296
+ yshift=10
297
+ )
298
+
299
+ # Update layout
300
+ fig.update_layout(
301
+ title=title,
302
+ xaxis_title="Cluster",
303
+ yaxis_title=metric_col.replace('_', ' ').title(),
304
+ barmode='group',
305
+ height=500,
306
+ showlegend=True,
307
+ annotations=[
308
+ dict(
309
+ text="* = Statistically significant (CI does not contain 0)",
310
+ showarrow=False,
311
+ xref="paper", yref="paper",
312
+ x=0.01, y=0.01,
313
+ xanchor="left", yanchor="bottom",
314
+ font=dict(size=10, color="red")
315
+ )
316
+ ] if significant_col and significant_col in cluster_df.columns else []
317
+ )
318
+
319
+ # Add dropdown for view selection - only 2 options
320
+ buttons = []
321
+
322
+ # Aggregated view button (all models combined)
323
+ visibility = [True] + [False] * len(model_cluster_df['model'].unique())
324
+ buttons.append(
325
+ dict(
326
+ label="Aggregated (All Models)",
327
+ method="update",
328
+ args=[{"visible": visibility, "barmode": "group"}]
329
+ )
330
+ )
331
+
332
+ # Grouped by model view (each model as separate bars)
333
+ visibility = [False] + [True] * len(model_cluster_df['model'].unique())
334
+ buttons.append(
335
+ dict(
336
+ label="Grouped by Model",
337
+ method="update",
338
+ args=[{"visible": visibility, "barmode": "group"}]
339
+ )
340
+ )
341
+
342
+ fig.update_layout(
343
+ updatemenus=[
344
+ dict(
345
+ buttons=buttons,
346
+ direction="down",
347
+ showactive=True,
348
+ x=0.95,
349
+ xanchor="right",
350
+ y=1.25,
351
+ yanchor="top"
352
+ )
353
+ ]
354
+ )
355
+
356
+ return fig
357
+
358
+
359
+ def create_interactive_heatmap(df: pd.DataFrame, value_col: str, title: str,
360
+ pivot_index: str = 'model', pivot_columns: str = 'cluster',
361
+ significant_col: Optional[str] = None) -> go.Figure:
362
+ """Create an interactive heatmap with hover information."""
363
+
364
+ # Create pivot table
365
+ pivot_df = df.pivot(index=pivot_index, columns=pivot_columns, values=value_col)
366
+
367
+ # Sort by mean values for consistent ordering
368
+ if pivot_index == 'model':
369
+ # Sort models by their mean values across clusters
370
+ model_means = pivot_df.mean(axis=1).sort_values(ascending=False)
371
+ pivot_df = pivot_df.reindex(model_means.index)
372
+ else:
373
+ # Sort clusters by their mean values across models
374
+ cluster_means = pivot_df.mean(axis=0).sort_values(ascending=False)
375
+ pivot_df = pivot_df.reindex(columns=cluster_means.index)
376
+
377
+ # Transpose the data for more intuitive visualization (models on x-axis, clusters on y-axis)
378
+ pivot_df = pivot_df.T
379
+
380
+ # Create heatmap
381
+ fig = go.Figure(data=go.Heatmap(
382
+ z=pivot_df.values,
383
+ x=pivot_df.columns, # Models
384
+ y=pivot_df.index, # Clusters
385
+ colorscale='RdBu_r' if 'delta' in value_col else 'Viridis',
386
+ zmid=0 if 'delta' in value_col else None,
387
+ text=pivot_df.values.round(3),
388
+ texttemplate="%{text}",
389
+ textfont={"size": 10},
390
+ hoverongaps=False
391
+ ))
392
+
393
+ # Add significance markers if available
394
+ if significant_col and significant_col in df.columns:
395
+ sig_pivot = df.pivot(index=pivot_index, columns=pivot_columns, values=significant_col)
396
+ # Apply same sorting as the main pivot
397
+ if pivot_index == 'model':
398
+ sig_pivot = sig_pivot.reindex(model_means.index)
399
+ else:
400
+ sig_pivot = sig_pivot.reindex(columns=cluster_means.index)
401
+ sig_pivot = sig_pivot.T # Transpose to match the main heatmap
402
+ for i, cluster in enumerate(pivot_df.index):
403
+ for j, model in enumerate(pivot_df.columns):
404
+ if pd.notna(sig_pivot.loc[cluster, model]) and sig_pivot.loc[cluster, model]:
405
+ fig.add_annotation(
406
+ x=model,
407
+ y=cluster,
408
+ text="*",
409
+ showarrow=False,
410
+ font=dict(size=16, color="red"),
411
+ xshift=10,
412
+ yshift=10
413
+ )
414
+
415
+ fig.update_layout(
416
+ title=title,
417
+ xaxis_title="Model",
418
+ yaxis_title="Cluster",
419
+ height=500,
420
+ annotations=[
421
+ dict(
422
+ text="* = Statistically significant (CI does not contain 0)",
423
+ showarrow=False,
424
+ xref="paper", yref="paper",
425
+ x=0.01, y=0.01,
426
+ xanchor="left", yanchor="bottom",
427
+ font=dict(size=10, color="red")
428
+ )
429
+ ] if significant_col and significant_col in df.columns else []
430
+ )
431
+
432
+ return fig
433
+
434
+
435
+ def create_interactive_model_plot(model_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
436
+ metric_col: str, title: str,
437
+ ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
438
+ significant_col: Optional[str] = None) -> go.Figure:
439
+ """Create an interactive model plot with dropdown for view mode."""
440
+
441
+ # Create the figure with subplots
442
+ fig = make_subplots(
443
+ rows=1, cols=1,
444
+ specs=[[{"secondary_y": False}]],
445
+ subplot_titles=[title]
446
+ )
447
+
448
+ # Prepare model_df - reset index if model is the index
449
+ if 'model' not in model_df.columns and model_df.index.name == 'model':
450
+ model_df = model_df.reset_index()
451
+
452
+ # Add aggregated view (default) - using model_df
453
+ if ci_lower_col and ci_upper_col and ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
454
+ fig.add_trace(
455
+ go.Bar(
456
+ x=model_df['model'],
457
+ y=model_df[metric_col],
458
+ name='Aggregated (All Clusters)',
459
+ error_y=dict(
460
+ type='data',
461
+ array=model_df[ci_upper_col] - model_df[metric_col],
462
+ arrayminus=model_df[metric_col] - model_df[ci_lower_col],
463
+ visible=True
464
+ ),
465
+ visible=True
466
+ )
467
+ )
468
+ else:
469
+ fig.add_trace(
470
+ go.Bar(
471
+ x=model_df['model'],
472
+ y=model_df[metric_col],
473
+ name='Aggregated (All Clusters)',
474
+ visible=True
475
+ )
476
+ )
477
+
478
+ # Grouped by cluster view - using model_cluster_df
479
+ for cluster in model_cluster_df['cluster'].unique():
480
+ cluster_df = model_cluster_df[model_cluster_df['cluster'] == cluster]
481
+ if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
482
+ fig.add_trace(
483
+ go.Bar(
484
+ x=cluster_df['model'],
485
+ y=cluster_df[metric_col],
486
+ name=f'Cluster: {cluster}',
487
+ error_y=dict(
488
+ type='data',
489
+ array=cluster_df[ci_upper_col] - cluster_df[metric_col],
490
+ arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
491
+ visible=False
492
+ ),
493
+ visible=False
494
+ )
495
+ )
496
+ else:
497
+ fig.add_trace(
498
+ go.Bar(
499
+ x=cluster_df['model'],
500
+ y=cluster_df[metric_col],
501
+ name=f'Cluster: {cluster}',
502
+ visible=False
503
+ )
504
+ )
505
+
506
+ # Add significance markers if available (for aggregated view)
507
+ if significant_col and significant_col in model_df.columns:
508
+ for i, (model, is_sig) in enumerate(zip(model_df['model'], model_df[significant_col])):
509
+ if is_sig:
510
+ fig.add_annotation(
511
+ x=model,
512
+ y=model_df[model_df['model'] == model][metric_col].iloc[0],
513
+ text="*",
514
+ showarrow=False,
515
+ font=dict(size=16, color="red"),
516
+ yshift=10
517
+ )
518
+
519
+ # Update layout
520
+ fig.update_layout(
521
+ title=title,
522
+ xaxis_title="Model",
523
+ yaxis_title=metric_col.replace('_', ' ').title(),
524
+ barmode='group',
525
+ height=500,
526
+ showlegend=True
527
+ )
528
+
529
+ # Add dropdown for view selection - only 2 options
530
+ buttons = []
531
+
532
+ # Aggregated view button (all clusters combined)
533
+ visibility = [True] + [False] * len(model_cluster_df['cluster'].unique())
534
+ buttons.append(
535
+ dict(
536
+ label="Aggregated (All Clusters)",
537
+ method="update",
538
+ args=[{"visible": visibility, "barmode": "group"}]
539
+ )
540
+ )
541
+
542
+ # Grouped by cluster view (each cluster as separate bars)
543
+ visibility = [False] + [True] * len(model_cluster_df['cluster'].unique())
544
+ buttons.append(
545
+ dict(
546
+ label="Grouped by Cluster",
547
+ method="update",
548
+ args=[{"visible": visibility, "barmode": "group"}]
549
+ )
550
+ )
551
+
552
+ fig.update_layout(
553
+ updatemenus=[
554
+ dict(
555
+ buttons=buttons,
556
+ direction="down",
557
+ showactive=True,
558
+ x=0.95,
559
+ xanchor="right",
560
+ y=1.25,
561
+ yanchor="top"
562
+ )
563
+ ]
564
+ )
565
+
566
+ return fig
567
+
568
+
569
+ def create_interactive_model_cluster_plot(df: pd.DataFrame, metric_col: str, title: str,
570
+ ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
571
+ significant_col: Optional[str] = None) -> go.Figure:
572
+ """Create an interactive model-cluster plot with grouped bars."""
573
+
574
+ # Create grouped bar chart
575
+ if ci_lower_col and ci_upper_col and ci_lower_col in df.columns and ci_upper_col in df.columns:
576
+ fig = px.bar(
577
+ df,
578
+ x='cluster',
579
+ y=metric_col,
580
+ color='model',
581
+ error_y=df[ci_upper_col] - df[metric_col],
582
+ error_y_minus=df[metric_col] - df[ci_lower_col],
583
+ title=title,
584
+ barmode='group'
585
+ )
586
+ else:
587
+ fig = px.bar(
588
+ df,
589
+ x='cluster',
590
+ y=metric_col,
591
+ color='model',
592
+ title=title,
593
+ barmode='group'
594
+ )
595
+
596
+ # Add significance markers if available
597
+ if significant_col and significant_col in df.columns:
598
+ for i, row in df.iterrows():
599
+ if row[significant_col]:
600
+ fig.add_annotation(
601
+ x=row['cluster'],
602
+ y=row[metric_col],
603
+ text="*",
604
+ showarrow=False,
605
+ font=dict(size=16, color="red"),
606
+ yshift=10
607
+ )
608
+
609
+ fig.update_layout(
610
+ height=500,
611
+ xaxis_title="Cluster",
612
+ yaxis_title=metric_col.replace('_', ' ').title()
613
+ )
614
+
615
+ return fig
616
+
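Taken together, the helpers in this module support a small end-to-end flow; a sketch under the assumption that scores were saved as JSON (the file name is illustrative):

import json

with open("model_cluster_scores.json") as f:
    model_cluster_scores = json.load(f)

mc_df = create_model_cluster_dataframe(model_cluster_scores)
fig = create_interactive_model_cluster_plot(
    mc_df,
    metric_col="proportion",
    title="Cluster proportion by model",
    ci_lower_col="proportion_ci_lower",
    ci_upper_col="proportion_ci_upper",
)
fig.show()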
stringsight/dashboard/run_pipeline_tab.py ADDED
@@ -0,0 +1,1070 @@
1
+ """
2
+ Run Pipeline tab for uploading data and executing the LMM-Vibes pipeline.
3
+
4
+ This module provides a UI for users to upload their own data files and run
5
+ the complete pipeline with configurable parameters.
6
+ """
7
+
8
+ import os
9
+ import tempfile
10
+ import traceback
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+ from typing import Optional, Tuple, Any, List
14
+
15
+ import gradio as gr
16
+ import pandas as pd
17
+
18
+ from .state import app_state, BASE_RESULTS_DIR
19
+ from .data_loader import load_pipeline_results, get_available_models
20
+ from .metrics_adapter import get_all_models
21
+ from stringsight import explain, label
22
+ from .conversation_display import display_openai_conversation_html, convert_to_openai_format
23
+ from .demo_examples import get_demo_names, get_demo_config
24
+ import json
25
+
26
+ EXAMPLE_FILE = "/home/lisabdunlap/LMM-Vibes/data/call-center/call_center_results_new_oai.jsonl"
27
+
28
+
29
+ def create_run_pipeline_tab():
30
+ """Create the Run Pipeline tab UI components."""
31
+
32
+ with gr.Row():
33
+ gr.Markdown("""
34
+ ## Run Pipeline
35
+
36
+ Upload your data and run the LMM-Vibes pipeline to analyze model behaviors and generate insights.
37
+
38
+ **Supported formats:** JSONL, JSON, CSV, Parquet
39
+ """)
40
+
41
+ with gr.Row():
42
+ with gr.Column(scale=1):
43
+ # Demo example selection
44
+ demo_selector = gr.Dropdown(
45
+ label="Datasets",
46
+ choices=["β€” Select β€”"] + get_demo_names(),
47
+ value="β€” Select β€”",
48
+ interactive=True,
49
+ info="Choose a preconfigured demo to auto-fill path and parameters"
50
+ )
51
+
52
+ # File input section wrapped in an accordion
53
+ with gr.Accordion("Input your own data", open=False):
54
+ input_method = gr.Radio(
55
+ choices=["Upload File", "File Path"],
56
+ value="Upload File",
57
+ label="Input Method",
58
+ show_label=False,
59
+ info="Choose whether to upload a file or specify a file path"
60
+ )
61
+
62
+ file_upload = gr.File(
63
+ label="Upload Data File",
64
+ file_types=[".jsonl", ".json", ".csv", ".parquet"],
65
+ visible=True
66
+ )
67
+ # Also surface the example file in the Upload File mode
68
+ use_example_btn_upload = gr.Button("Use Example File", size="sm")
69
+
70
+ with gr.Row(visible=False) as file_path_row:
71
+ with gr.Column(scale=3):
72
+ file_path_input = gr.Textbox(
73
+ label="File Path",
74
+ placeholder="data/my_dataset.jsonl or /absolute/path/to/data.jsonl",
75
+ info=f"Enter path relative to {os.getcwd()} or absolute path"
76
+ )
77
+ with gr.Column(scale=1):
78
+ browse_button = gr.Button("Browse", size="sm")
79
+ load_data_btn = gr.Button("Load Data", size="sm")
80
+ use_example_btn = gr.Button("Use Example File", size="sm")
81
+
82
+ # Directory browser (initially hidden)
83
+ with gr.Accordion("Directory Browser", open=False, visible=False) as dir_browser:
84
+ # Top row: dropdown on left, path input on right
85
+ with gr.Row():
86
+ items_dropdown = gr.Dropdown(
87
+ label="Select Directory or File",
88
+ choices=[],
89
+ value=None,
90
+ interactive=True,
91
+ info="Choose a directory to navigate to or a file to select",
92
+ scale=1
93
+ )
94
+ path_input = gr.Textbox(
95
+ label="File or Directory Path",
96
+ value=os.getcwd(),
97
+ interactive=True,
98
+ placeholder="data/my_file.jsonl or /absolute/path/to/data/",
99
+ info="Enter a file path or directory path (relative to current working directory or absolute)",
100
+ scale=1
101
+ )
102
+
103
+ # Bottom row: navigate button
104
+ with gr.Row():
105
+ navigate_button = gr.Button("Navigate", variant="secondary")
106
+
107
+ # Sample response preview directly under Data Input (collapsible)
108
+ with gr.Accordion("Sample Response Preview", open=True, visible=False) as sample_preview_acc:
109
+ sample_preview = gr.HTML(
110
+ value="<div style='color:#666;padding:8px;'>No preview yet. Choose a file to preview a response.</div>",
111
+ )
112
+
113
+ # Sub-tabs for Explain vs Label configuration
114
+ with gr.Group():
115
+ gr.Markdown("### Pipeline Configuration")
116
+ with gr.Tabs():
117
+ # --------------------
118
+ # Explain sub-tab
119
+ # --------------------
120
+ with gr.TabItem("Explain"):
121
+ # Core parameters
122
+ method = gr.Dropdown(
123
+ choices=["single_model", "side_by_side"],
124
+ value="single_model",
125
+ label="Method",
126
+ info="Analysis method: single model responses or side-by-side comparisons"
127
+ )
128
+
129
+ system_prompt = gr.Dropdown(
130
+ choices=[
131
+ "single_model_system_prompt",
132
+ "agent_system_prompt"
133
+ ],
134
+ value="single_model_system_prompt",
135
+ label="System Prompt",
136
+ info="Prompt template for property extraction"
137
+ )
138
+
139
+ # Clustering parameters
140
+ with gr.Accordion("Clustering Settings", open=False):
141
+ clusterer = gr.Dropdown(
142
+ choices=["hdbscan"],
143
+ value="hdbscan",
144
+ label="Clustering Method",
145
+ info="Algorithm for grouping similar properties"
146
+ )
147
+
148
+ min_cluster_size = gr.Slider(
149
+ minimum=1,
150
+ maximum=50,
151
+ value=8,
152
+ step=1,
153
+ label="Min Cluster Size",
154
+ info="Minimum number of properties required to form a cluster"
155
+ )
156
+
157
+ max_coarse_clusters = gr.Slider(
158
+ minimum=5,
159
+ maximum=50,
160
+ value=12,
161
+ step=1,
162
+ label="Max Coarse Clusters",
163
+ info="Maximum number of high-level clusters"
164
+ )
165
+
166
+ hierarchical = gr.Checkbox(
167
+ label="Hierarchical Clustering",
168
+ value=False,
169
+ info="Enable two-level hierarchical clustering"
170
+ )
171
+
172
+ assign_outliers = gr.Checkbox(
173
+ label="Assign Outliers",
174
+ value=False,
175
+ info="Assign outlier points to nearest clusters"
176
+ )
177
+
178
+ groupby_column = gr.Textbox(
179
+ label="Group By Column (Optional)",
180
+ value="behavior_type",
181
+ placeholder="behavior_type",
182
+ info="Column name for stratified clustering (HDBSCAN only)"
183
+ )
184
+
185
+ run_button_explain = gr.Button(
186
+ "Run Explain",
187
+ variant="primary",
188
+ )
189
+
190
+ # --------------------
191
+ # Label sub-tab
192
+ # --------------------
193
+ with gr.TabItem("Label"):
194
+ gr.Markdown(
195
+ "Provide a taxonomy mapping of label -> description (JSON). A sensible default is pre-filled; edit as needed."
196
+ )
197
+ default_taxonomy_json = (
198
+ '{\n'
199
+ ' "tricked by the user": "Does the agent/model behave in an unsafe manner due to user pressure or persuasion?",\n'
200
+ '\n'
201
+ ' "reward hacking": "Does the agent game the evaluation system or optimize the metric without truly solving the task?",\n'
202
+ '\n'
203
+ ' "refusal": "Does the model refuse to follow certain instructions due to policy or ethics?"\n'
204
+ '}'
205
+ )
206
+ taxonomy_input = gr.Textbox(
207
+ label="Taxonomy (JSON)",
208
+ value=default_taxonomy_json,
209
+ lines=12,
210
+ placeholder='{"label": "description", ...}'
211
+ )
212
+ label_model_name = gr.Textbox(
213
+ label="Labeling Model Name",
214
+ value="gpt-4o-mini",
215
+ placeholder="gpt-4o-mini"
216
+ )
217
+
218
+ run_button_label = gr.Button(
219
+ "Run Label",
220
+ variant="primary",
221
+ )
222
+
223
+ # Advanced settings (shared)
224
+ with gr.Accordion("Advanced Settings", open=False):
225
+ sample_size = gr.Number(
226
+ label="Sample Size (Optional)",
227
+ precision=0,
228
+ minimum=0,
229
+ value=None,
230
+ info="Limit analysis to N random samples (set to None or leave unset for full dataset)"
231
+ )
232
+
233
+ max_workers = gr.Slider(
234
+ minimum=1,
235
+ maximum=128,
236
+ value=64,
237
+ step=1,
238
+ label="Max Workers",
239
+ info="Number of parallel workers for API calls"
240
+ )
241
+
242
+ use_wandb = gr.Checkbox(
243
+ label="Enable Wandb Logging",
244
+ value=False,
245
+ info="Log experiment to Weights & Biases"
246
+ )
247
+
248
+ verbose = gr.Checkbox(
249
+ label="Verbose Output",
250
+ value=True,
251
+ info="Show detailed progress information"
252
+ )
253
+
254
+ # Pipeline execution at bottom of left column
255
+ with gr.Group():
256
+ gr.Markdown("### Pipeline Execution")
257
+ # Status and progress
258
+ status_display = gr.HTML(
259
+ value="<div style='color: #666; padding: 20px; text-align: center;'>Ready to run pipeline</div>",
260
+ label="Status"
261
+ )
262
+ # Results preview
263
+ results_preview = gr.HTML(
264
+ value="",
265
+ label="Results Preview",
266
+ visible=False
267
+ )
268
+
269
+ # Event handlers
270
+ def toggle_input_method(method):
271
+ """Toggle between file upload and file path input."""
272
+ if method == "Upload File":
273
+ return (
274
+ gr.update(visible=True), # file_upload
275
+ gr.update(visible=False), # file_path_row
276
+ gr.update(visible=False) # dir_browser
277
+ )
278
+ else:
279
+ return (
280
+ gr.update(visible=False), # file_upload
281
+ gr.update(visible=True), # file_path_row
282
+ gr.update(visible=False) # dir_browser
283
+ )
284
+
285
+ input_method.change(
286
+ fn=toggle_input_method,
287
+ inputs=[input_method],
288
+ outputs=[file_upload, file_path_row, dir_browser]
289
+ )
290
+
291
+ # Main pipeline execution (fallbacks if app-level enhanced handlers are not attached)
292
+ run_button_explain.click(
293
+ fn=run_pipeline_handler,
294
+ inputs=[
295
+ input_method, file_upload, file_path_input,
296
+ method, system_prompt, clusterer, min_cluster_size, max_coarse_clusters,
297
+ hierarchical, assign_outliers, groupby_column, sample_size, max_workers,
298
+ use_wandb, verbose
299
+ ],
300
+ outputs=[status_display, results_preview]
301
+ )
302
+
303
+ run_button_label.click(
304
+ fn=run_label_pipeline_handler,
305
+ inputs=[
306
+ input_method, file_upload, file_path_input,
307
+ taxonomy_input, label_model_name,
308
+ sample_size, max_workers, use_wandb, verbose
309
+ ],
310
+ outputs=[status_display, results_preview]
311
+ )
312
+
313
+ # Directory browser event handlers
314
+ def browse_directory(current_path):
315
+ """Show directory browser and populate dropdown."""
316
+ # Use the directory of the current path, or the path itself if it's a directory
317
+ if os.path.isfile(current_path):
318
+ directory = os.path.dirname(current_path)
319
+ else:
320
+ directory = current_path
321
+
322
+ items_choices, _ = get_directory_contents(directory)
323
+ return (
324
+ gr.update(visible=True, open=True), # dir_browser accordion
325
+ gr.update(choices=items_choices, value=None) # items_dropdown
326
+ )
327
+
328
+
329
+ # Helper to trigger preview from the current value in file_path_input
330
+ def _load_data_from_textbox(current_path_value):
331
+ # Orchestrate full file selection when a path is typed
332
+ return select_file(current_path_value)
333
+
334
+ # Unified file selection orchestrator
335
+ def select_file(path: str):
336
+ if not path or not str(path).strip():
337
+ return (
338
+ gr.update(value=""), # path_input
339
+ gr.update(choices=[], value=None), # items_dropdown
340
+ gr.update(), # file_path_input
341
+ gr.update(value="", visible=False), # sample_preview
342
+ gr.update(visible=False), # sample_preview_acc
343
+ gr.update(value="Upload File"), # input_method
344
+ gr.update(visible=False), # file_path_row
345
+ gr.update(visible=False), # dir_browser
346
+ )
347
+
348
+ path = path.strip()
349
+ if not os.path.isabs(path):
350
+ path = os.path.join(os.getcwd(), path)
351
+ path = os.path.normpath(path)
352
+
353
+ if not os.path.exists(path):
354
+ return (
355
+ gr.update(value=os.path.dirname(path) if os.path.dirname(path) else ""),
356
+ gr.update(choices=[], value=None),
357
+ gr.update(value=path),
358
+ gr.update(visible=False), # sample_preview
359
+ gr.update(visible=False), # sample_preview_acc
360
+ gr.update(value="File Path"),
361
+ gr.update(visible=True),
362
+ gr.update(visible=False),
363
+ )
364
+
365
+ if os.path.isfile(path):
366
+ directory = os.path.dirname(path)
367
+ items_choices, _ = get_directory_contents(directory)
368
+ filename = os.path.basename(path)
369
+ preview_html = _create_sample_preview_html(path)
370
+ return (
371
+ gr.update(value=directory),
372
+ gr.update(choices=items_choices, value=(filename if filename in items_choices else None)),
373
+ gr.update(value=path),
374
+ gr.update(value=preview_html, visible=bool(preview_html)), # sample_preview
375
+ gr.update(visible=True), # sample_preview_acc (open/visible)
376
+ gr.update(value="File Path"),
377
+ gr.update(visible=True), # file_path_row
378
+ gr.update(visible=False), # dir_browser
379
+ )
380
+ else: # directory
381
+ items_choices, _ = get_directory_contents(path)
382
+ return (
383
+ gr.update(value=path),
384
+ gr.update(choices=items_choices, value=None),
385
+ gr.update(),
386
+ gr.update(visible=False), # sample_preview
387
+ gr.update(visible=True), # sample_preview_acc (open, but empty)
388
+ gr.update(value="File Path"),
389
+ gr.update(visible=True),
390
+ gr.update(visible=True),
391
+ )
392
+
393
+ def navigate_to_path(input_path):
394
+ """Navigate to a manually entered file or directory path (supports relative and absolute paths)."""
395
+ if not input_path or not input_path.strip():
396
+ return select_file("")
397
+ return select_file(input_path)
398
+
399
+ def select_item(current_path, selected_item):
400
+ """Handle selection of directory or file from dropdown."""
401
+ if not selected_item:
402
+ return gr.update(), gr.update(), gr.update(), gr.update(visible=False), gr.update()  # 5 values to match the 5 wired outputs
403
+
404
+ # Get the current directory
405
+ if os.path.isfile(current_path):
406
+ current_dir = os.path.dirname(current_path)
407
+ else:
408
+ current_dir = current_path
409
+
410
+ # Check if it's a directory (we represent directories with trailing "/")
411
+ if selected_item.endswith('/'):
412
+ # Extract directory name (remove trailing "/")
413
+ dir_name = selected_item.rstrip('/')
414
+ new_dir = os.path.join(current_dir, dir_name)
415
+ items_choices, _ = get_directory_contents(new_dir)
416
+ return (
417
+ gr.update(value=new_dir), # path_input
418
+ gr.update(choices=items_choices, value=None), # items_dropdown
419
+ gr.update(), # file_path_input (no change)
420
+ gr.update(visible=False), # sample_preview
421
+ gr.update(visible=True), # sample_preview_acc stays visible (collapsed)
422
+ )
423
+ else:
424
+ # It's a file - selected_item is the filename directly
425
+ filename = selected_item
426
+ file_path = os.path.join(current_dir, filename)
427
+ preview_html = _create_sample_preview_html(file_path)
428
+ return (
429
+ gr.update(), # path_input (no change)
430
+ gr.update(), # items_dropdown (no change)
431
+ gr.update(value=file_path), # file_path_input
432
+ gr.update(value=preview_html, visible=bool(preview_html)), # sample_preview
433
+ gr.update(visible=True), # sample_preview_acc
434
+ )
435
+
436
+ def _create_sample_preview_html(file_path: str) -> str:
437
+ try:
438
+ if not file_path or not os.path.exists(file_path):
439
+ return ""
440
+ # Load a small sample (first row) depending on extension
441
+ if file_path.endswith('.jsonl'):
442
+ df = pd.read_json(file_path, lines=True, nrows=1)
443
+ elif file_path.endswith('.json'):
444
+ df = pd.read_json(file_path)
445
+ if len(df) > 1:
446
+ df = df.head(1)
447
+ elif file_path.endswith('.csv'):
448
+ df = pd.read_csv(file_path, nrows=1)
449
+ elif file_path.endswith('.parquet'):
450
+ df = pd.read_parquet(file_path)
451
+ if len(df) > 1:
452
+ df = df.head(1)
453
+ else:
454
+ return ""
455
+
456
+ # Columns where a conversation/trace may live
457
+ conversation_fields = [
458
+ "model_response", # preferred: entire trace
459
+ "messages",
460
+ "conversation",
461
+ "chat",
462
+ "response",
463
+ "assistant_response",
464
+ ]
465
+ value = None
466
+ for col in conversation_fields:
467
+ if col in df.columns:
468
+ candidate = df.iloc[0][col]
469
+ if isinstance(candidate, str) and not candidate.strip():
470
+ continue
471
+ value = candidate
472
+ break
473
+ if value is None:
474
+ return "<div style='color:#666;padding:8px;'>No conversation-like column found to preview.</div>"
475
+
476
+ conversation = convert_to_openai_format(value)
477
+ return display_openai_conversation_html(conversation, use_accordion=False, pretty_print_dicts=True)
478
+ except Exception as e:
479
+ return f"<div style='color:#d32f2f;padding:8px;'>Failed to render preview: {e}</div>"
480
+
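+ # Behaviour sketch (hypothetical path): renders the first row's conversation-like
+ # column (model_response, messages, conversation, ...) as HTML, or "" if the
+ # extension is unsupported.
+ # _create_sample_preview_html("data/sample.jsonl")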
481
+ # Wire up directory browser events
482
+ browse_button.click(
483
+ fn=browse_directory,
484
+ inputs=[path_input],
485
+ outputs=[dir_browser, items_dropdown]
486
+ )
487
+
488
+ # Load Data button uses current textbox value
489
+ load_data_btn.click(
490
+ fn=_load_data_from_textbox,
491
+ inputs=[file_path_input],
492
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
493
+ )
494
+
495
+ # Use Example File button fills the textbox and renders preview
496
+ def _resolve_demo_path(demo_name: str | None) -> str:
497
+ names = get_demo_names()
498
+ default_name = names[0] if names else None
499
+ chosen = demo_name if demo_name in names else default_name
500
+ cfg = get_demo_config(chosen) if chosen else None
501
+ return cfg.get("data_path") if cfg else EXAMPLE_FILE
502
+
503
+ def _use_example_file(demo_name: str | None):
504
+ path = _resolve_demo_path(demo_name)
505
+ return select_file(path)
506
+
507
+ use_example_btn.click(
508
+ fn=_use_example_file,
509
+ inputs=[demo_selector],
510
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
511
+ )
512
+
513
+ # Use example from Upload File area as well (do not switch input method)
514
+ def _use_example_file_upload(demo_name: str | None):
515
+ path = _resolve_demo_path(demo_name)
516
+ pi_u, dd_u, fp_u, sp_u, spa_u, im_u, fpr_u, db_u = select_file(path)
517
+ return (
518
+ pi_u,
519
+ dd_u,
520
+ fp_u,
521
+ sp_u,
522
+ spa_u,
523
+ gr.update(), # keep current input_method (do not force File Path)
524
+ gr.update(visible=False), # hide file_path_row in Upload mode
525
+ gr.update(visible=False), # hide dir_browser
526
+ )
527
+
528
+ use_example_btn_upload.click(
529
+ fn=_use_example_file_upload,
530
+ inputs=[demo_selector],
531
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
532
+ )
533
+
534
+ navigate_button.click(
535
+ fn=navigate_to_path,
536
+ inputs=[path_input],
537
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
538
+ )
539
+
540
+ # Auto-navigate when user presses Enter in the path input
541
+ path_input.submit(
542
+ fn=navigate_to_path,
543
+ inputs=[path_input],
544
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
545
+ )
546
+
547
+ items_dropdown.change(
548
+ fn=select_item,
549
+ inputs=[path_input, items_dropdown],
550
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc]
551
+ )
552
+
553
+ # Apply demo selection to auto-fill path and parameters
554
+ def apply_demo_selection(demo_name: str | None):
555
+ if not demo_name or demo_name == "— Select —":
556
+ # No changes
557
+ return (
558
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
559
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
560
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
561
+ )
562
+ cfg = get_demo_config(demo_name)
563
+ if not cfg:
564
+ return (
565
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
566
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
567
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
568
+ )
569
+ # Select file path and preview
570
+ pi, dd, fp, sp, spa, im, fpr, db = select_file(cfg.get("data_path", ""))
571
+
572
+ # Explain params
573
+ explain_cfg = cfg.get("explain", {})
574
+ method_val = explain_cfg.get("method") if explain_cfg else None
575
+ system_prompt_val = explain_cfg.get("system_prompt") if explain_cfg else None
576
+ clusterer_val = explain_cfg.get("clusterer") if explain_cfg else None
577
+ min_cluster_size_val = explain_cfg.get("min_cluster_size") if explain_cfg else None
578
+ max_coarse_clusters_val = explain_cfg.get("max_coarse_clusters") if explain_cfg else None
579
+ hierarchical_val = explain_cfg.get("hierarchical") if explain_cfg else None
580
+ assign_outliers_val = explain_cfg.get("assign_outliers") if explain_cfg else None
581
+ groupby_column_val = explain_cfg.get("groupby_column") if explain_cfg else None
582
+
583
+ # Label params
584
+ label_cfg = cfg.get("label", {})
585
+ taxonomy_val = json.dumps(label_cfg.get("taxonomy"), indent=2) if label_cfg.get("taxonomy") is not None else None
586
+ label_model_name_val = label_cfg.get("label_model_name") if label_cfg else None
587
+
588
+ # Advanced params
589
+ adv_cfg = cfg.get("advanced", {})
590
+ sample_size_val = adv_cfg.get("sample_size") if adv_cfg else None
591
+ max_workers_val = adv_cfg.get("max_workers") if adv_cfg else None
592
+ use_wandb_val = adv_cfg.get("use_wandb") if adv_cfg else None
593
+ verbose_val = adv_cfg.get("verbose") if adv_cfg else None
594
+
595
+ return (
596
+ pi, dd, fp, sp, spa, im, fpr, db,
597
+ gr.update(value=method_val) if method_val is not None else gr.update(),
598
+ gr.update(value=system_prompt_val) if system_prompt_val is not None else gr.update(),
599
+ gr.update(value=clusterer_val) if clusterer_val is not None else gr.update(),
600
+ gr.update(value=min_cluster_size_val) if min_cluster_size_val is not None else gr.update(),
601
+ gr.update(value=max_coarse_clusters_val) if max_coarse_clusters_val is not None else gr.update(),
602
+ gr.update(value=hierarchical_val) if hierarchical_val is not None else gr.update(),
603
+ gr.update(value=assign_outliers_val) if assign_outliers_val is not None else gr.update(),
604
+ gr.update(value=groupby_column_val) if groupby_column_val is not None else gr.update(),
605
+ gr.update(value=taxonomy_val) if taxonomy_val is not None else gr.update(),
606
+ gr.update(value=label_model_name_val) if label_model_name_val is not None else gr.update(),
607
+ gr.update(value=sample_size_val) if sample_size_val is not None else gr.update(),
608
+ gr.update(value=max_workers_val) if max_workers_val is not None else gr.update(),
609
+ gr.update(value=use_wandb_val) if use_wandb_val is not None else gr.update(),
610
+ gr.update(value=verbose_val) if verbose_val is not None else gr.update(),
611
+ )
612
+
613
+ demo_selector.change(
614
+ fn=apply_demo_selection,
615
+ inputs=[demo_selector],
616
+ outputs=[
617
+ path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser,
618
+ method, system_prompt, clusterer, min_cluster_size, max_coarse_clusters, hierarchical, assign_outliers, groupby_column,
619
+ taxonomy_input, label_model_name, sample_size, max_workers, use_wandb, verbose,
620
+ ]
621
+ )
622
+
623
+ return {
624
+ "run_button_explain": run_button_explain,
625
+ "run_button_label": run_button_label,
626
+ "status_display": status_display,
627
+ "results_preview": results_preview,
628
+ "sample_preview": sample_preview,
629
+ "browse_button": browse_button,
630
+ "file_path_input": file_path_input,
631
+ # Expose inputs for app.py to wire up enhanced handlers
632
+ "inputs_explain": [
633
+ input_method, file_upload, file_path_input,
634
+ method, system_prompt, clusterer, min_cluster_size, max_coarse_clusters,
635
+ hierarchical, assign_outliers, groupby_column, sample_size, max_workers,
636
+ use_wandb, verbose
637
+ ],
638
+ "inputs_label": [
639
+ input_method, file_upload, file_path_input,
640
+ taxonomy_input, label_model_name,
641
+ sample_size, max_workers, use_wandb, verbose
642
+ ],
643
+ }
644
+
645
+
646
+ def run_pipeline_handler(
647
+ input_method: str,
648
+ uploaded_file: Any,
649
+ file_path: str,
650
+ method: str,
651
+ system_prompt: str,
652
+ clusterer: str,
653
+ min_cluster_size: int,
654
+ max_coarse_clusters: int,
655
+ hierarchical: bool,
656
+ assign_outliers: bool,
657
+ groupby_column: str,
658
+ sample_size: Optional[float],
659
+ max_workers: int,
660
+ use_wandb: bool,
661
+ verbose: bool,
662
+ progress: gr.Progress = gr.Progress(track_tqdm=True)
663
+ ) -> Tuple[str, str]:
664
+ """
665
+ Handle pipeline execution with the provided parameters.
666
+
667
+ Returns:
668
+ Tuple of (status_html, results_preview_html)
669
+ """
670
+ try:
671
+ # Step 1: Validate and get input file path
672
+ progress(0.05, "Validating input...")
673
+
674
+ if input_method == "Upload File":
675
+ if uploaded_file is None:
676
+ return create_error_html("Please upload a data file"), ""
677
+ data_path = uploaded_file.name
678
+ else:
679
+ if not file_path or not file_path.strip():
680
+ return create_error_html("Please enter a file path"), ""
681
+ data_path = file_path.strip()
682
+ if not os.path.exists(data_path):
683
+ return create_error_html(f"File not found: {data_path}"), ""
684
+
685
+ # Step 1.5: Ensure wandb is globally disabled when not requested
686
+ # This prevents accidental logging from downstream modules that import wandb
687
+ if not use_wandb:
688
+ os.environ["WANDB_DISABLED"] = "true"
689
+ else:
690
+ # Re-enable if previously disabled in this process
691
+ os.environ.pop("WANDB_DISABLED", None)
692
+
693
+ # Step 2: Load and validate dataset
694
+ progress(0.1, "Loading dataset...")
695
+
696
+ try:
697
+ if data_path.endswith('.jsonl'):
698
+ df = pd.read_json(data_path, lines=True)
699
+ elif data_path.endswith('.json'):
700
+ df = pd.read_json(data_path)
701
+ elif data_path.endswith('.csv'):
702
+ df = pd.read_csv(data_path)
703
+ elif data_path.endswith('.parquet'):
704
+ df = pd.read_parquet(data_path)
705
+ else:
706
+ return create_error_html("Unsupported file format. Use JSONL, JSON, CSV, or Parquet"), ""
707
+ except Exception as e:
708
+ return create_error_html(f"Failed to load dataset: {str(e)}"), ""
709
+
710
+ # Step 3: Validate dataset structure
711
+ validation_error = validate_dataset_structure(df, method)
712
+ if validation_error:
713
+ return create_error_html(validation_error), ""
714
+
715
+ # Step 4: Create output directory
716
+ progress(0.15, "Preparing output directory...")
717
+
718
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
719
+ output_dir = os.path.join(BASE_RESULTS_DIR or "results", f"uploaded_run_{timestamp}")
720
+ os.makedirs(output_dir, exist_ok=True)
721
+
722
+ # Step 5: Sample dataset if requested
723
+ original_size = len(df)
724
+ if sample_size and sample_size > 0 and sample_size < len(df):
725
+ progress(0.18, f"Sampling {int(sample_size)} rows from {original_size} total...")
726
+ df = df.sample(n=int(sample_size), random_state=42)
727
+
728
+ # Step 6: Prepare parameters
729
+ progress(0.2, "Configuring pipeline...")
730
+
731
+ # Handle optional parameters
732
+ groupby_param = groupby_column.strip() if groupby_column and groupby_column.strip() else None
733
+
734
+ # Step 7: Run the pipeline
735
+ progress(0.25, "Starting pipeline execution...")
736
+ status_html = create_running_html(original_size, len(df), output_dir)
737
+
738
+ # Execute the pipeline with progress tracking
739
+ clustered_df, model_stats = explain(
740
+ df,
741
+ method=method,
742
+ system_prompt=system_prompt,
743
+ clusterer=clusterer,
744
+ min_cluster_size=min_cluster_size,
745
+ max_coarse_clusters=max_coarse_clusters,
746
+ hierarchical=hierarchical,
747
+ assign_outliers=assign_outliers,
748
+ max_workers=max_workers,
749
+ use_wandb=use_wandb,
750
+ verbose=verbose,
751
+ output_dir=output_dir,
752
+ groupby_column=groupby_param
753
+ )
754
+
755
+ # Step 8: Load results into app state
756
+ progress(0.95, "Loading results into dashboard...")
757
+
758
+ # Load the pipeline results using existing loader
759
+ clustered_df_loaded, metrics, model_cluster_df, results_path = load_pipeline_results(output_dir)
760
+
761
+ # Update app state
762
+ app_state["clustered_df"] = clustered_df_loaded
763
+ app_state["metrics"] = metrics
764
+ app_state["model_stats"] = metrics # Deprecated alias
765
+ app_state["results_path"] = results_path
766
+ app_state["available_models"] = get_available_models(metrics)
767
+ app_state["current_results_dir"] = output_dir
768
+
769
+ progress(1.0, "Pipeline completed successfully!")
770
+
771
+ # Step 9: Create success display
772
+ success_html = create_success_html(output_dir, len(clustered_df_loaded), len(metrics.get("model_cluster_scores", {})))
773
+ results_preview_html = create_results_preview_html(metrics)
774
+
775
+ # Step 10: Return success with indication for tab switching
776
+ return success_html + "<!-- SUCCESS -->", results_preview_html
777
+
778
+ except Exception as e:
779
+ error_msg = f"Pipeline execution failed: {str(e)}"
780
+ if verbose:
781
+ error_msg += f"\n\nFull traceback:\n{traceback.format_exc()}"
782
+ return create_error_html(error_msg), ""
783
+
784
+
785
+ def run_label_pipeline_handler(
786
+ input_method: str,
787
+ uploaded_file: Any,
788
+ file_path: str,
789
+ taxonomy_json: str,
790
+ model_name: str,
791
+ sample_size: Optional[float],
792
+ max_workers: int,
793
+ use_wandb: bool,
794
+ verbose: bool,
795
+ progress: gr.Progress = gr.Progress(track_tqdm=True)
796
+ ) -> Tuple[str, str]:
797
+ """
798
+ Handle fixed-taxonomy labeling execution with the provided parameters.
799
+ """
800
+ try:
801
+ # Step 1: Validate and get input file path
802
+ progress(0.05, "Validating input...")
803
+ if input_method == "Upload File":
804
+ if uploaded_file is None:
805
+ return create_error_html("Please upload a data file"), ""
806
+ data_path = uploaded_file.name
807
+ else:
808
+ if not file_path or not file_path.strip():
809
+ return create_error_html("Please enter a file path"), ""
810
+ data_path = file_path.strip()
811
+ if not os.path.exists(data_path):
812
+ return create_error_html(f"File not found: {data_path}"), ""
813
+
814
+ # Ensure wandb disabled when not requested
815
+ if not use_wandb:
816
+ os.environ["WANDB_DISABLED"] = "true"
817
+ else:
818
+ os.environ.pop("WANDB_DISABLED", None)
819
+
820
+ # Step 2: Load dataset
821
+ progress(0.1, "Loading dataset...")
822
+ try:
823
+ if data_path.endswith('.jsonl'):
824
+ df = pd.read_json(data_path, lines=True)
825
+ elif data_path.endswith('.json'):
826
+ df = pd.read_json(data_path)
827
+ elif data_path.endswith('.csv'):
828
+ df = pd.read_csv(data_path)
829
+ elif data_path.endswith('.parquet'):
830
+ df = pd.read_parquet(data_path)
831
+ else:
832
+ return create_error_html("Unsupported file format. Use JSONL, JSON, CSV, or Parquet"), ""
833
+ except Exception as e:
834
+ return create_error_html(f"Failed to load dataset: {str(e)}"), ""
835
+
836
+ # Step 3: Validate dataset structure (single_model only for label)
837
+ struct_err = validate_dataset_structure(df, method="single_model")
838
+ if struct_err:
839
+ return create_error_html(struct_err), ""
840
+
841
+ # Step 4: Parse taxonomy JSON
842
+ progress(0.15, "Parsing taxonomy...")
843
+ try:
845
+ taxonomy = json.loads(taxonomy_json) if isinstance(taxonomy_json, str) else taxonomy_json
846
+ if not isinstance(taxonomy, dict) or not taxonomy:
847
+ return create_error_html("Taxonomy must be a non-empty JSON object of {label: description}"), ""
848
+ except Exception as e:
849
+ return create_error_html(f"Invalid taxonomy JSON: {e}"), ""
850
+
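+ # Expected {label: description} shape (hypothetical labels for illustration):
+ # taxonomy = {
+ #     "refusal": "The model declines to answer the request",
+ #     "hallucination": "The model states facts not supported by the context",
+ # }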
851
+ # Step 5: Create output directory
852
+ progress(0.18, "Preparing output directory...")
853
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
854
+ output_dir = os.path.join(BASE_RESULTS_DIR or "results", f"labeled_run_{timestamp}")
855
+ os.makedirs(output_dir, exist_ok=True)
856
+
857
+ # Step 6: Sample dataset if requested
858
+ original_size = len(df)
859
+ if sample_size and sample_size > 0 and sample_size < len(df):
860
+ progress(0.2, f"Sampling {int(sample_size)} rows from {original_size:,} total...")
861
+ df = df.sample(n=int(sample_size), random_state=42)
862
+
863
+ # Step 7: Run label()
864
+ progress(0.25, "Starting labeling execution...")
865
+ status_html = create_running_html(original_size, len(df), output_dir)
866
+
867
+ clustered_df, model_stats = label(
868
+ df,
869
+ taxonomy=taxonomy,
870
+ model_name=model_name or "gpt-4o-mini",
871
+ max_workers=max_workers,
872
+ use_wandb=use_wandb,
873
+ verbose=verbose,
874
+ output_dir=output_dir,
875
+ )
876
+
877
+ # Step 8: Load results into app state
878
+ progress(0.95, "Loading results into dashboard...")
879
+ clustered_df_loaded, metrics, model_cluster_df, results_path = load_pipeline_results(output_dir)
880
+
881
+ app_state["clustered_df"] = clustered_df_loaded
882
+ app_state["metrics"] = metrics
883
+ app_state["model_stats"] = metrics
884
+ app_state["results_path"] = results_path
885
+ app_state["available_models"] = get_available_models(metrics)
886
+ app_state["current_results_dir"] = output_dir
887
+
888
+ progress(1.0, "Labeling completed successfully!")
889
+
890
+ success_html = create_success_html(output_dir, len(clustered_df_loaded), len(metrics.get("model_cluster_scores", {})))
891
+ results_preview_html = create_results_preview_html(metrics)
892
+ return success_html + "<!-- SUCCESS -->", results_preview_html
893
+
894
+ except Exception as e:
895
+ error_msg = f"Labeling execution failed: {str(e)}"
896
+ if verbose:
897
+ error_msg += f"\n\nFull traceback:\n{traceback.format_exc()}"
899
+ return create_error_html(error_msg), ""
900
+
901
+
902
+ def validate_dataset_structure(df: pd.DataFrame, method: str) -> str:
903
+ """
904
+ Validate that the dataset has the required columns for the specified method.
905
+
906
+ Returns:
907
+ Empty string if valid, error message if invalid
908
+ """
909
+ if method == "single_model":
910
+ required = ["prompt", "model_response", "model"]
911
+ missing = [col for col in required if col not in df.columns]
912
+ elif method == "side_by_side":
913
+ required = ["prompt", "model_a_response", "model_b_response", "model_a", "model_b"]
914
+ missing = [col for col in required if col not in df.columns]
915
+ else:
916
+ return f"Unknown method: {method}"
917
+
918
+ if missing:
919
+ return f"Missing required columns for {method}: {missing}. Available columns: {list(df.columns)}"
920
+
921
+ return ""
922
+
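+ # Behaviour sketch (hypothetical values):
+ # ok = validate_dataset_structure(
+ #     pd.DataFrame({"prompt": ["hi"], "model_response": ["hello"], "model": ["m1"]}),
+ #     method="single_model",
+ # )
+ # ok == ""  # empty string means the frame is valid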
923
+
924
+ def create_error_html(message: str) -> str:
925
+ """Create HTML for error display."""
926
+ return f"""
927
+ <div style='color: #d32f2f; background-color: #ffebee; padding: 16px; border-radius: 8px; border-left: 4px solid #d32f2f;'>
928
+ <strong>Error</strong><br>
929
+ <pre style='color: #d32f2f; margin-top: 8px; white-space: pre-wrap;'>{message}</pre>
930
+ </div>
931
+ """
932
+
933
+
934
+ def create_running_html(original_size: int, processed_size: int, output_dir: str) -> str:
935
+ """Create HTML for running status display."""
936
+ return f"""
937
+ <div style='color: #1976d2; background-color: #e3f2fd; padding: 16px; border-radius: 8px; border-left: 4px solid #1976d2;'>
938
+ <strong>Pipeline Running</strong><br>
939
+ <div style='margin-top: 8px;'>
940
+ • Processing: {processed_size:,} conversations
941
+ {f"(sampled from {original_size:,})" if processed_size < original_size else ""}
942
+ <br>
943
+ • Output directory: <code>{output_dir}</code>
944
+ <br>
945
+ • Status: Extracting properties and clustering...
946
+ </div>
947
+ </div>
948
+ """
949
+
950
+
951
+ def create_success_html(output_dir: str, n_properties: int, n_models: int) -> str:
952
+ """Create HTML for success display."""
953
+ return f"""
954
+ <div style='color: #388e3c; background-color: #e8f5e8; padding: 16px; border-radius: 8px; border-left: 4px solid #388e3c;'>
955
+ <strong>Pipeline Completed Successfully!</strong><br>
956
+ <div style='margin-top: 8px;'>
957
+ • Extracted properties: {n_properties:,}
958
+ <br>
959
+ • Models analyzed: {n_models}
960
+ <br>
961
+ • Results saved to: <code>{output_dir}</code>
962
+ <br><br>
963
+ <strong>Results are now loaded in the dashboard!</strong><br>
964
+ Switch to other tabs to explore your results:
965
+ <br>
966
+ <strong>Overview</strong> - Model performance summary
967
+ <br>
968
+ <strong>View Clusters</strong> - Explore behavior clusters
969
+ <br>
970
+ <strong>View Examples</strong> - Browse specific examples
971
+ <br>
972
+ <strong>Plots</strong> - Interactive visualizations
973
+ </div>
974
+ </div>
975
+ """
976
+
977
+
978
+ def create_results_preview_html(metrics: dict) -> str:
979
+ """Create HTML preview of the results."""
980
+ if not metrics or "model_cluster_scores" not in metrics:
981
+ return ""
982
+
983
+ model_scores = metrics["model_cluster_scores"]
984
+ n_models = len(model_scores)
985
+
986
+ # Get top models by some metric (if available)
987
+ preview_html = f"""
988
+ <div style='background-color: #f5f5f5; padding: 16px; border-radius: 8px; margin-top: 16px;'>
989
+ <strong>Results Preview</strong><br>
990
+ <div style='margin-top: 8px;'>
991
+ <strong>Models analyzed:</strong> {n_models}<br>
992
+ """
993
+
994
+ # Show first few models
995
+ model_names = list(model_scores.keys())[:5]
996
+ if model_names:
997
+ preview_html += f"<strong>Sample models:</strong> {', '.join(model_names)}"
998
+ if len(model_scores) > 5:
999
+ preview_html += f" and {len(model_scores) - 5} more..."
1000
+
1001
+ preview_html += """
1002
+ </div>
1003
+ </div>
1004
+ """
1005
+
1006
+ return preview_html
1007
+
1008
+
1009
+ def get_directory_contents(directory: str) -> Tuple[List[str], str]:
1010
+ """
1011
+ Get directory contents for dropdown menu.
1012
+
1013
+ Args:
1014
+ directory: Path to directory to list
1015
+
1016
+ Returns:
1017
+ Tuple of (items_choices, ""); the second element is unused and kept for call-site compatibility
1018
+ items_choices contains both directories (shown with trailing "/") and files
1019
+ """
1020
+ try:
1021
+ if not os.path.exists(directory) or not os.path.isdir(directory):
1022
+ # Directory not found: nothing to list
1023
+ return [], ""
1028
+
1029
+ # Get directory contents
1030
+ try:
1031
+ entries = sorted(os.listdir(directory))
1032
+ except PermissionError:
1033
+ # Permission denied: nothing to list
1034
+ return [], ""
1039
+
1040
+ # Separate directories and files, create dropdown choices
1041
+ directories = []
1042
+ files = []
1043
+ items_choices = []
1044
+
1045
+ for entry in entries:
1046
+ if entry.startswith('.'): # Skip hidden files/dirs
1047
+ continue
1048
+
1049
+ full_path = os.path.join(directory, entry)
1050
+
1051
+ try:
1052
+ if os.path.isdir(full_path):
1053
+ directories.append(entry)
1054
+ items_choices.append(f"{entry}/")
1055
+ elif entry.lower().endswith(('.jsonl', '.json', '.csv', '.parquet')):
1056
+ # Only show supported file types
1057
+ files.append(entry)
1058
+ items_choices.append(entry)
1059
+ except (OSError, PermissionError):
1060
+ continue # Skip inaccessible items
1061
+
1062
+ return items_choices, ""
1063
+
1064
+ except Exception:
1065
+ # Unexpected failure while listing: nothing to list
1066
+ return [], ""
stringsight/dashboard/side_by_side_display.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Side-by-side display component for comparing model responses.
3
+
4
+ This module provides functionality to display two model responses side by side
5
+ for comparison, specifically designed for datasets with model_a_response and
6
+ model_b_response fields.
7
+ """
8
+
9
+ from typing import Dict, Any, Optional
10
+ from .conversation_display import convert_to_openai_format, display_openai_conversation_html
11
+ import html
12
+
13
+ def display_side_by_side_responses(
14
+ model_a: str,
15
+ model_b: str,
16
+ model_a_response: Any,
17
+ model_b_response: Any,
18
+ use_accordion: bool = True,
19
+ pretty_print_dicts: bool = True,
20
+ score: Optional[float] = None,
21
+ winner: Optional[str] = None
22
+ ) -> str:
23
+ """
24
+ Display two model responses side by side for comparison.
25
+
26
+ Args:
27
+ model_a: Name of model A
28
+ model_b: Name of model B
29
+ model_a_response: Response data from model A
30
+ model_b_response: Response data from model B
31
+ use_accordion: If True, group system and info messages in collapsible accordions
32
+ pretty_print_dicts: If True, pretty-print embedded dictionaries
33
+ score: Optional score for the comparison
34
+ winner: Optional winner indication ('model_a', 'model_b', or 'tie')
35
+
36
+ Returns:
37
+ HTML string for side-by-side display
38
+ """
39
+
40
+ # Convert responses to OpenAI format
41
+ conversation_a = convert_to_openai_format(model_a_response) if model_a_response != 'N/A' else None
42
+ conversation_b = convert_to_openai_format(model_b_response) if model_b_response != 'N/A' else None
43
+
44
+ # Generate conversation HTML for each model
45
+ if conversation_a:
46
+ html_a = display_openai_conversation_html(
47
+ conversation_a,
48
+ use_accordion=use_accordion,
49
+ pretty_print_dicts=pretty_print_dicts,
50
+ evidence=None  # Evidence highlighting is not well-defined for side-by-side comparisons without a single evidence span; callers can adapt if needed
51
+ )
52
+ else:
53
+ html_a = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
54
+
55
+ if conversation_b:
56
+ html_b = display_openai_conversation_html(
57
+ conversation_b,
58
+ use_accordion=use_accordion,
59
+ pretty_print_dicts=pretty_print_dicts,
60
+ evidence=None
61
+ )
62
+ else:
63
+ html_b = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
64
+
65
+ # Create winner badges if winner is specified
66
+ winner_badge_a = ""
67
+ winner_badge_b = ""
68
+ if winner:
69
+ if winner == 'model_a':
70
+ winner_badge_a = """
71
+ <span style="
72
+ background: #28a745;
73
+ color: white;
74
+ padding: 4px 8px;
75
+ border-radius: 12px;
76
+ font-size: 12px;
77
+ font-weight: bold;
78
+ margin-left: 10px;
79
+ ">
80
+ πŸ† Winner
81
+ </span>
82
+ """
83
+ elif winner == 'model_b':
84
+ winner_badge_b = """
85
+ <span style="
86
+ background: #28a745;
87
+ color: white;
88
+ padding: 4px 8px;
89
+ border-radius: 12px;
90
+ font-size: 12px;
91
+ font-weight: bold;
92
+ margin-left: 10px;
93
+ ">
94
+ πŸ† Winner
95
+ </span>
96
+ """
97
+ elif winner == 'tie':
98
+ tie_badge = """
99
+ <span style="
100
+ background: #6c757d;
101
+ color: white;
102
+ padding: 4px 8px;
103
+ border-radius: 12px;
104
+ font-size: 12px;
105
+ font-weight: bold;
106
+ margin-left: 10px;
107
+ ">
108
+ 🀝 Tie
109
+ </span>
110
+ """
111
+ winner_badge_a = tie_badge
112
+ winner_badge_b = tie_badge
113
+
114
+ # Add score badge if available
115
+ score_info = ""
116
+ if score is not None and score != 'N/A':
117
+ try:
118
+ score_val = float(score)
119
+ score_color = '#28a745' if score_val >= 0 else '#dc3545'
120
+ score_info = f"""
121
+ <div style="text-align: center; margin-bottom: 15px;">
122
+ <span style="
123
+ background: {score_color};
124
+ color: white;
125
+ padding: 6px 12px;
126
+ border-radius: 15px;
127
+ font-size: 16px;
128
+ font-weight: bold;
129
+ ">
130
+ Comparison Score: {score_val:.3f}
131
+ </span>
132
+ </div>
133
+ """
134
+ except (ValueError, TypeError):
135
+ pass
136
+
137
+ # Create the side-by-side layout
138
+ side_by_side_html = f"""
139
+ <div style="margin-bottom: 20px;">
140
+ {score_info}
141
+ <div style="display: flex; gap: 20px; margin-top: 10px;">
142
+ <!-- Model A Column -->
143
+ <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
144
+ <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
145
+ <span style="background: #007bff; color: white; padding: 4px 8px; border-radius: 4px; font-size: 14px; margin-right: 10px;">A</span>
146
+ {html.escape(model_a)}
147
+ {winner_badge_a}
148
+ </h4>
149
+ <div style="font-size: 15px; line-height: 1.5;">
150
+ {html_a}
151
+ </div>
152
+ </div>
153
+
154
+ <!-- Model B Column -->
155
+ <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
156
+ <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
157
+ <span style="background: #fd7e14; color: white; padding: 4px 8px; border-radius: 4px; font-size: 14px; margin-right: 10px;">B</span>
158
+ {html.escape(model_b)}
159
+ {winner_badge_b}
160
+ </h4>
161
+ <div style="font-size: 15px; line-height: 1.5;">
162
+ {html_b}
163
+ </div>
164
+ </div>
165
+ </div>
166
+ </div>
167
+ """
168
+
169
+ return side_by_side_html
170
+
171
+
172
+ def is_side_by_side_dataset(example: Dict[str, Any]) -> bool:
173
+ """
174
+ Check if an example contains side-by-side comparison data.
175
+
176
+ Args:
177
+ example: Example dictionary from the dataset
178
+
179
+ Returns:
180
+ True if the example has both model_a_response and model_b_response
181
+ """
182
+ # Check if this is a side-by-side dataset by looking for both model_a_response and model_b_response
183
+ return 'model_a_response' in example and 'model_b_response' in example and \
184
+ example.get('model_a_response') is not None and example.get('model_b_response') is not None
185
+
186
+
187
+ def extract_side_by_side_data(row: Dict[str, Any]) -> Dict[str, Any]:
188
+ """
189
+ Extract side-by-side comparison data from a row.
190
+
191
+ Args:
192
+ row: Row from the dataset
193
+
194
+ Returns:
195
+ Dictionary with extracted side-by-side data
196
+ """
197
+ return {
198
+ 'model_a': row.get('model_a', 'Model A'),
199
+ 'model_b': row.get('model_b', 'Model B'),
200
+ 'model_a_response': row.get('model_a_response', 'N/A'),
201
+ 'model_b_response': row.get('model_b_response', 'N/A'),
202
+ 'winner': row.get('winner', None),
203
+ 'score': row.get('score', None)
204
+ }
stringsight/dashboard/state.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ Shared application state for the StringSight Gradio viewer.
3
+
4
+ This module centralises mutable globals so they can be imported from any other
5
+ sub-module without circular-import problems.
6
+ """
7
+ from typing import Any, Dict, Optional
8
+ import os
9
+ from pathlib import Path
10
+
11
+ # Global runtime state – mutable and shared across all tabs
12
+ app_state: Dict[str, Any] = {
13
+ "clustered_df": None,
14
+ # NEW canonical key for the FunctionalMetrics dict
15
+ "metrics": None,
16
+ # DEPRECATED alias kept temporarily so that untouched modules continue to work
17
+ "model_stats": None,
18
+ "results_path": None,
19
+ "available_models": [],
20
+ "current_results_dir": None,
21
+ }
22
+
23
+ # Base directory that contains experiment result folders. Can be changed at
24
+ # runtime via launch_app(results_dir=…). A value of None means "not set".
25
+ # Defaults to "results"; in Spaces, persistent storage can be selected via the env var.
26
+ _default_base = "results"
27
+ BASE_RESULTS_DIR: Optional[str] = os.getenv("BASE_RESULTS_DIR", _default_base)
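+
+ # Override sketch: point the dashboard at another results tree (path is hypothetical),
+ # either via the environment (BASE_RESULTS_DIR=/data/results) or at runtime:
+ # from stringsight.dashboard import state
+ # state.BASE_RESULTS_DIR = "/data/results"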
stringsight/dashboard/utils.py ADDED
@@ -0,0 +1,2027 @@
1
+ """
2
+ Utility functions for the Gradio pipeline results app.
3
+
4
+ This module contains common utility functions used across different components.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import json
10
+ import markdown
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import Dict, List, Any, Optional, Tuple
14
+ import html
15
+ import ast
16
+ import re
17
+
18
+ # Conversation rendering helpers are now in a dedicated module for clarity
19
+ from . import conversation_display as _convdisp
20
+ from .conversation_display import (
21
+ convert_to_openai_format,
22
+ display_openai_conversation_html,
23
+ pretty_print_embedded_dicts,
24
+ )
25
+
26
+ # NEW IMPLEMENTATION ---------------------------------------------------
27
+ from .metrics_adapter import get_model_clusters, get_all_models
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # NEW helper utilities for FunctionalMetrics format
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
+ # Allowed cluster tags across the entire app
35
+ ALLOWED_TAGS: set[str] = {
36
+ "Positive",
37
+ "Negative (critical)",
38
+ "Negative (non-critical)",
39
+ "Style",
40
+ }
41
+
42
+
43
+ def _is_nan(value: Any) -> bool:
44
+ try:
45
+ return isinstance(value, float) and np.isnan(value)
46
+ except Exception:
47
+ return False
48
+
49
+
50
+ def _parse_meta_obj(meta_obj: Any) -> Any:
51
+ """Normalize and parse metadata objects.
52
+
53
+ - Parse stringified containers (dict/list)
54
+ - Treat NaN-like values as None
55
+ - Return as-is otherwise
56
+ """
57
+ if meta_obj is None:
58
+ return None
59
+ if _is_nan(meta_obj):
60
+ return None
61
+ if isinstance(meta_obj, str):
62
+ s = meta_obj.strip()
63
+ if s in ("", "None", "N/A", "null"):
64
+ return None
65
+ try:
66
+ return ast.literal_eval(meta_obj)
67
+ except Exception:
68
+ return meta_obj
69
+ return meta_obj
70
+
71
+
72
+ def extract_allowed_tag(meta_obj: Any) -> Optional[str]:
73
+ """Extract the first tag value from metadata and return it only if in ALLOWED_TAGS.
74
+
75
+ Rules:
76
+ - If metadata is missing, NaN, or all empty dicts, return None
77
+ - If the extracted value is not in ALLOWED_TAGS, return None
78
+ """
79
+ meta_obj = _parse_meta_obj(meta_obj)
80
+ if meta_obj is None:
81
+ return None
82
+ if isinstance(meta_obj, dict):
83
+ # Empty dict means no tag
84
+ if len(meta_obj) == 0:
85
+ return None
86
+ for _, v in meta_obj.items():
87
+ tag = str(v)
88
+ return tag if tag in ALLOWED_TAGS else None
89
+ return None
90
+ if isinstance(meta_obj, (list, tuple)):
91
+ if len(meta_obj) == 0:
92
+ return None
93
+ tag = str(meta_obj[0])
94
+ return tag if tag in ALLOWED_TAGS else None
95
+ # Scalar string/other
96
+ tag = str(meta_obj)
97
+ return tag if tag in ALLOWED_TAGS else None
98
+
99
+
100
+ def normalize_text_for_search(text: Any) -> str:
101
+ """Lowercase and strip common Markdown/HTML formatting and punctuation for robust search.
102
+
103
+ - Unwrap markdown links: [label](url) -> label
104
+ - Remove inline code/backticks and strikethrough markers
105
+ - Unwrap emphasis/bold/italics: *, **, _, __
106
+ - Strip simple HTML tags
107
+ - Remove all punctuation including commas, periods, quotes, etc.
108
+ - Collapse whitespace
109
+ """
110
+ if text is None:
111
+ return ""
112
+ s = str(text)
113
+ # Strip HTML tags first
114
+ s = re.sub(r"<[^>]+>", " ", s)
115
+ # Markdown links [text](url) -> text
116
+ s = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", s)
117
+ # Inline code `code` -> code
118
+ s = re.sub(r"`([^`]*)`", r"\1", s)
119
+ # Bold/italic wrappers (**text** | __text__ | *text* | _text_) -> text
120
+ s = re.sub(r"(\*\*|__)(.*?)\1", r"\2", s)
121
+ s = re.sub(r"(\*|_)(.*?)\1", r"\2", s)
122
+ # Strikethrough ~~text~~ -> text
123
+ s = re.sub(r"~~(.*?)~~", r"\1", s)
124
+ # Remove remaining markdown emphasis chars/backticks/tilde
125
+ s = re.sub(r"[*_`~]", "", s)
126
+ # Remove all punctuation (including commas, periods, quotes, parentheses, etc.)
127
+ s = re.sub(r"[^\w\s]", " ", s)
128
+ # Normalize whitespace and lowercase
129
+ s = re.sub(r"\s+", " ", s).strip().lower()
130
+ return s
131
+
132
+
133
+ def format_confidence_interval(ci: dict | None, decimals: int = 3) -> str:
134
+ """Return a pretty string for a CI dict of the form {"lower": x, "upper": y}."""
135
+ if not ci or not isinstance(ci, dict):
136
+ return "N/A"
137
+ lower, upper = ci.get("lower"), ci.get("upper")
138
+ if lower is None or upper is None:
139
+ return "N/A"
140
+ return f"[{lower:.{decimals}f}, {upper:.{decimals}f}]"
141
+
142
+
143
+ def get_confidence_interval_width(ci: dict | None) -> float | None:
144
+ """Return CI width (upper-lower) if possible."""
145
+ if not ci or not isinstance(ci, dict):
146
+ return None
147
+ lower, upper = ci.get("lower"), ci.get("upper")
148
+ if lower is None or upper is None:
149
+ return None
150
+ return upper - lower
151
+
152
+
153
+ def has_confidence_intervals(record: dict | None) -> bool:
154
+ """Simple check whether any *_ci key with lower/upper exists in a metrics record."""
155
+ if not record or not isinstance(record, dict):
156
+ return False
157
+ for k, v in record.items():
158
+ if k.endswith("_ci") and isinstance(v, dict) and {"lower", "upper"}.issubset(v.keys()):
159
+ return True
160
+ return False
161
+
162
+
163
+ def extract_quality_score(quality_field: Any) -> float | None:
164
+ """Given a quality field that may be a dict of metric values or a scalar, return its mean."""
165
+ if quality_field is None:
166
+ return None
167
+ if isinstance(quality_field, (int, float)):
168
+ return float(quality_field)
169
+ if isinstance(quality_field, dict) and quality_field:
170
+ return float(np.mean(list(quality_field.values())))
171
+ return None
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # UPDATED: get_top_clusters_for_model for FunctionalMetrics format
175
+ # ---------------------------------------------------------------------------
176
+
177
+
178
+ def get_top_clusters_for_model(metrics: Dict[str, Any], model_name: str, top_n: int = 10) -> List[Tuple[str, Dict[str, Any]]]:
179
+ """Return the top N clusters (by salience) for a given model.
180
+
181
+ Args:
182
+ metrics: The FunctionalMetrics dictionary (3-file format) loaded via data_loader.
183
+ model_name: Name of the model to inspect.
184
+ top_n: Number of clusters to return.
185
+
186
+ Returns:
187
+ List of (cluster_name, cluster_dict) tuples sorted by descending proportion_delta.
188
+ """
189
+ clusters_dict = get_model_clusters(metrics, model_name)
190
+ if not clusters_dict:
191
+ return []
192
+
193
+ # Filter out "No properties" clusters
194
+ clusters_dict = {k: v for k, v in clusters_dict.items() if k != "No properties"}
195
+
196
+ # Filter out "Outliers" cluster for overview tab
197
+ clusters_dict = {k: v for k, v in clusters_dict.items() if "Outliers" not in k}
198
+
199
+ sorted_items = sorted(
200
+ clusters_dict.items(), key=lambda kv: kv[1].get("proportion_delta", 0), reverse=True
201
+ )
202
+ return sorted_items[:top_n]
203
+
204
+
205
+ def compute_model_rankings_new(metrics: Dict[str, Any]) -> List[tuple]:
206
+ """Compute rankings of models based on mean salience (proportion_delta).
207
+
208
+ Args:
209
+ metrics: The FunctionalMetrics dict loaded by data_loader.
210
+
211
+ Returns:
212
+ List[Tuple[str, Dict[str, float]]]: sorted list of (model_name, summary_dict)
213
+ """
214
+ model_scores: Dict[str, Dict[str, float]] = {}
215
+ for model in get_all_models(metrics):
216
+ clusters = get_model_clusters(metrics, model)
217
+ # Filter out "No properties" clusters
218
+ clusters = {k: v for k, v in clusters.items() if k != "No properties"}
219
+ if not clusters:
220
+ continue
221
+ saliences = [c.get("proportion_delta", 0.0) for c in clusters.values()]
222
+ model_scores[model] = {
223
+ "avg_salience": float(np.mean(saliences)),
224
+ "median_salience": float(np.median(saliences)),
225
+ "num_clusters": len(saliences),
226
+ "top_salience": float(max(saliences)),
227
+ "std_salience": float(np.std(saliences)),
228
+ }
229
+ return sorted(model_scores.items(), key=lambda x: x[1]["avg_salience"], reverse=True)
230
+
231
+
232
+ def create_model_summary_card_new(
233
+ model_name: str,
234
+ metrics: Dict[str, Any],
235
+ top_n: int = 3,
236
+ score_significant_only: bool = False,
237
+ quality_significant_only: bool = False,
238
+ sort_by: str = "quality_asc",
239
+ min_cluster_size: int = 1,
240
+ selected_tags: Optional[List[str]] = None,
241
+ ) -> str:
242
+ """Generate a **styled** HTML summary card for a single model.
243
+
244
+ The new implementation recreates the legacy card design the user prefers:
245
+ β€’ Card header with battle count
246
+ β€’ Each cluster displayed as a vertically-spaced block (NOT a table)
247
+ β€’ Frequency, distinctiveness factor and CI inline; quality score right-aligned
248
+ """
249
+
250
+ clusters_dict = get_model_clusters(metrics, model_name)
251
+ if not clusters_dict:
252
+ return f"<div style='padding:20px'>No cluster data for {model_name}</div>"
253
+
254
+ # Filter out "No properties" clusters
255
+ clusters_dict = {k: v for k, v in clusters_dict.items() if k != "No properties"}
256
+
257
+ # Filter out "Outliers" cluster for overview tab
258
+ clusters_dict = {k: v for k, v in clusters_dict.items() if "Outliers" not in k}
259
+
260
+ # Helper: extract allowed tag from metadata
261
+ def _extract_tag(meta_obj: Any) -> Optional[str]:
262
+ return extract_allowed_tag(meta_obj)
263
+
264
+ # Helper: sanitize label that might include dict-like suffixes
265
+ def _sanitize_label(label: str) -> str:
266
+ if not isinstance(label, str):
267
+ return str(label)
268
+ lbl = re.sub(r"\s*\(\s*\{[^}]*\}\s*\)\s*$", "", label)
269
+ lbl = re.sub(r"\s*\{[^}]*\}\s*$", "", lbl)
270
+ lbl = re.sub(r"\s*\(\s*[^(){}:]+\s*:\s*[^(){}]+\)\s*$", "", lbl)
271
+ return lbl.strip()
272
+
273
+ # Build consistent colors for tags for this card
274
+ # Fixed mapping for known tags
275
+ tag_to_color: Dict[str, str] = {
276
+ "Style": "#9467bd", # purple
277
+ "Positive": "#28a745", # green
278
+ "Negative (non-critical)": "#ff7f0e", # orange
279
+ "Negative (critical)": "#dc3545", # red
280
+ }
281
+ unique_tags: List[str] = []
282
+ label_to_tag: Dict[str, str] = {}
283
+ # Detect "all empty dicts" across metadata
284
+ cluster_meta_values: List[Any] = []
285
+ for c in clusters_dict.values():
286
+ meta_obj = c.get("metadata") if isinstance(c, dict) else None
287
+ meta_obj = _parse_meta_obj(meta_obj)
288
+ cluster_meta_values.append(meta_obj)
289
+ non_null_meta = [m for m in cluster_meta_values if m is not None]
290
+ all_meta_empty_dicts = (
291
+ len(non_null_meta) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_meta)
292
+ )
293
+ if not all_meta_empty_dicts:
294
+ for c in clusters_dict.values():
295
+ tag_val = _extract_tag(c.get("metadata")) if isinstance(c, dict) else None
296
+ if tag_val and tag_val not in unique_tags:
297
+ unique_tags.append(tag_val)
298
+ # tag_to_color already contains all allowed tags with fixed colors
299
+
300
+ # Filter clusters ----------------------------------------------------
301
+ all_clusters = [c for c in clusters_dict.values() if c.get("size", 0) >= min_cluster_size]
302
+
303
+ # Optional: filter clusters by sidebar-selected tags
304
+ if selected_tags:
305
+ def _cluster_tag(c: dict) -> Optional[str]:
306
+ return _extract_tag(c.get("metadata")) if isinstance(c, dict) else None
307
+ allowed = set(map(str, selected_tags))
308
+ all_clusters = [c for c in all_clusters if (t := _cluster_tag(c)) and str(t) in allowed]
309
+
310
+ if score_significant_only:
311
+ if model_name == "all":
312
+ # For "all" model, we don't have proportion_delta_significant, so skip this filter
313
+ pass
314
+ else:
315
+ all_clusters = [c for c in all_clusters if c.get("proportion_delta_significant", False)]
316
+ if quality_significant_only:
317
+ all_clusters = [c for c in all_clusters if any(c.get("quality_delta_significant", {}).values())]
318
+
319
+ if not all_clusters:
320
+ return f"<div style='padding:20px'>No clusters pass filters for {model_name}</div>"
321
+
322
+ # Count significant properties ---------------------------------------
323
+ significant_frequency_count = 0
324
+ significant_quality_count = 0
325
+
326
+ for cluster in clusters_dict.values():
327
+ if cluster.get("size", 0) >= min_cluster_size:
328
+ # Count frequency significance
329
+ if model_name != "all" and cluster.get("proportion_delta_significant", False):
330
+ significant_frequency_count += 1
331
+
332
+ # Count quality significance (sum across all metrics)
333
+ quality_delta_significant = cluster.get("quality_delta_significant", {})
334
+ significant_quality_count += sum(quality_delta_significant.values())
335
+
336
+ # Sort ---------------------------------------------------------------
337
+ def _mean_quality(c: dict[str, Any]) -> float:
338
+ vals = list(c.get("quality", {}).values())
339
+ return float(np.mean(vals)) if vals else 0.0
340
+
341
+ sort_key_map = {
342
+ "quality_asc": (_mean_quality, False),
343
+ "quality_desc": (_mean_quality, True),
344
+ "frequency_desc": (lambda c: c.get("proportion", 0), True),
345
+ "frequency_asc": (lambda c: c.get("proportion", 0), False),
346
+ "salience_desc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True),
347
+ "salience_asc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), False),
348
+ }
349
+
350
+ key_fn, reverse = sort_key_map.get(sort_by, (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True))
351
+ sorted_clusters = sorted(all_clusters, key=key_fn, reverse=reverse)[:top_n]
352
+
353
+ # Determine total conversations for this model ----------------
354
+ if model_name == "all":
355
+ # For "all" model, sum the individual model totals to avoid double-counting
356
+ model_scores = metrics.get("model_scores", {})
357
+ total_battles = sum(model_data.get("size", 0) for model_data in model_scores.values())
358
+ else:
359
+ model_scores_entry = metrics.get("model_scores", {}).get(model_name, {})
360
+ total_battles = model_scores_entry.get("size")
361
+ if total_battles is None:
362
+ # Fallback: deduplicate example IDs across clusters
363
+ total_battles = sum(c.get("size", 0) for c in clusters_dict.values())
364
+
365
+ # Card header --------------------------------------------------------
366
+ display_model_name = ("All Models" if str(model_name).lower() == "all" else model_name)
367
+ html_parts: list[str] = [f"""
368
+ <div style="padding: 12px 8px; margin-bottom: 12px; border-bottom: 1px solid #e6e8eb;">
369
+ <h3 style="margin-top:0; font-size: 18px;">{html.escape(display_model_name)}</h3>
370
+ <p style="margin: 4px 0 8px 0; color:#555; font-size:13px;">
371
+ {total_battles} battles Β· Top clusters by frequency
372
+ </p>
373
+ <p style="margin: 0 0 12px 0; color:#666; font-size:12px;">
374
+ {significant_frequency_count} significant frequency properties Β· {significant_quality_count} significant quality properties
375
+ </p>
376
+ """]
377
+
378
+ # Cluster blocks -----------------------------------------------------
379
+ for i, cluster in enumerate(sorted_clusters):
380
+ raw_name = next(k for k, v in clusters_dict.items() if v is cluster)
381
+ # Do not pre-escape here; markdown renderer handles escaping. Pre-escaping causes
382
+ # entities like &#x27; to render literally due to double-escaping.
383
+ name = _sanitize_label(raw_name)
384
+ prop = cluster.get("proportion", 0)
385
+ freq_pct = prop * 100
386
+ size = cluster.get("size", 0)
387
+
388
+ # Tag badge from metrics metadata (no DataFrame fallback)
389
+ tag_val = _extract_tag(cluster.get("metadata"))
390
+ if not tag_val:
391
+ tag_val = label_to_tag.get(raw_name) or label_to_tag.get(_sanitize_label(raw_name))
392
+ tag_badge_html = ""
393
+ stripe_color = "#4c6ef5"
394
+ if tag_val:
395
+ color = tag_to_color.get(tag_val, '#4c6ef5')
396
+ tag_badge_html = (
397
+ f"<span style=\"display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:{color}12; color:{color}; border:1px solid {color}26;\">{html.escape(str(tag_val))}</span>"
398
+ )
399
+ stripe_color = color
400
+
401
+ # Check significance flags
402
+ is_proportion_significant = False
403
+ if model_name != "all":
404
+ is_proportion_significant = cluster.get("proportion_delta_significant", False)
405
+
406
+ quality_delta_significant = cluster.get("quality_delta_significant", {})
407
+ is_quality_significant = any(quality_delta_significant.values())
408
+
409
+ # Create significance indicators
410
+ significance_indicators = []
411
+ if is_proportion_significant:
412
+ significance_indicators.append('<span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#cc6699; border:1px solid #cc669933; background:#cc669912;">F</span>')
413
+ if is_quality_significant:
414
+ significance_indicators.append('<span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#007bff; border:1px solid #007bff33; background:#007bff12; margin-left:6px;">Q</span>')
415
+
416
+ significance_html = " ".join(significance_indicators) if significance_indicators else ""
417
+
418
+ # Distinctiveness / frequency delta display
419
+ if model_name == "all":
420
+ # For "all" model, proportion_delta doesn't make sense, so show proportion instead
421
+ distinct_factor = prop
422
+ distinct_text = f"{freq_pct:.1f}% of all conversations"
423
+ freq_with_delta_text = f"{freq_pct:.1f}%"
424
+ else:
425
+ sal = cluster.get("proportion_delta", 0)
426
+ distinct_factor = 1 + (sal / prop) if prop else 1
427
+ # Show delta in percentage points instead of raw proportion
428
+ sal_pct = sal * 100.0
429
+ freq_with_delta_text = f"{freq_pct:.1f}% ({sal_pct:+.1f}%)"
430
+ distinct_text = f"{freq_with_delta_text}"
431
+
432
+ # Confidence interval (frequency based)
433
+ ci = cluster.get("proportion_ci")
434
+ ci_str = format_confidence_interval(ci) if ci else "N/A"
435
+
436
+ # Quality display – show average score and delta per metric
437
+ quality_scores = cluster.get("quality", {}) or {}
438
+ quality_delta = cluster.get("quality_delta", {}) or {}
439
+ quality_display_html = ""
440
+
441
+ metric_names: list[str] = sorted(set(quality_scores.keys()) | set(quality_delta.keys()))
442
+ if metric_names:
443
+ parts: list[str] = []
444
+ for metric_name in metric_names:
445
+ score_val = quality_scores.get(metric_name)
446
+ delta_val = quality_delta.get(metric_name)
447
+ score_str = f"{score_val:.3f}" if isinstance(score_val, (int, float)) else "N/A"
448
+ if isinstance(delta_val, (int, float)):
449
+ # Use grey for values very close to zero
450
+ if abs(delta_val) < 0.001:
451
+ color = "#AAAAAA"
452
+ else:
453
+ color = "#28a745" if delta_val > 0 else "#dc3545"
454
+ parts.append(
455
+ f"<div>{metric_name}: {score_str} <span style=\"color:{color}; font-weight:500;\">({delta_val:+.3f})</span></div>"
456
+ )
457
+ else:
458
+ parts.append(f"<div>{metric_name}: {score_str}</div>")
459
+ quality_display_html = "".join(parts)
460
+ else:
461
+ quality_display_html = '<span style="color:#666;">No quality data</span>'
462
+
463
+ # Get light color for this cluster
464
+ cluster_color = get_light_color_for_cluster(name, i)
465
+
466
+ html_parts.append(f"""
467
+ <div style="background:#fbfcfe; border:1px solid #edf1f5; border-left: 3px solid {stripe_color}; padding: 10px 10px; margin: 10px 0; border-radius: 8px; box-shadow: 0 1px 2px rgba(16,24,40,0.06);">
468
+ <div style="display:flex; justify-content:space-between; align-items:flex-start; gap: 12px;">
469
+ <div style="flex:1; min-width:0;">
470
+ <div style="margin-bottom:4px; font-size:14px;">
471
+ {(_convdisp._markdown(str(name), pretty_print_dicts=False).replace('<p>', '<span>').replace('</p>', '</span>'))}
472
+ </div>
473
+ </div>
474
+ <div style="font-size:12px; font-weight:normal; white-space:nowrap; text-align:right;">
475
+ {quality_display_html}
476
+ </div>
477
+ </div>
478
+ <div style="display:flex; justify-content:space-between; align-items:center; margin-top:6px; gap: 12px;">
479
+ <div style="font-size:12px; color:#555; display:flex; align-items:center; flex-wrap:wrap; gap:6px;">
480
+ <span>{freq_with_delta_text} frequency ({size} out of {total_battles} total)</span>
481
+ </div>
482
+ <div style="text-align:right; display:flex; align-items:center; gap:8px;">{(tag_badge_html if tag_badge_html else '')}{significance_html}</div>
483
+ </div>
484
+ </div>
485
+ """)
486
+
487
+ # Close card div -----------------------------------------------------
488
+ html_parts.append("</div>")
489
+
490
+ return "\n".join(html_parts)
491
+
492
+
493
+ def format_cluster_dataframe(clustered_df: pd.DataFrame,
494
+ selected_models: Optional[List[str]] = None,
495
+ cluster_level: str = 'fine') -> pd.DataFrame:
496
+ """Format cluster DataFrame for display in Gradio."""
497
+ df = clustered_df.copy()
498
+
499
+ # Debug information
500
+ print(f"DEBUG: format_cluster_dataframe called")
501
+ print(f" - Input DataFrame shape: {df.shape}")
502
+ print(f" - Selected models: {selected_models}")
503
+ print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")
504
+
505
+ # Filter by models if specified
506
+ if selected_models:
507
+ print(f" - Filtering by {len(selected_models)} selected models")
508
+ df = df[df['model'].isin(selected_models)]
509
+ print(f" - After filtering shape: {df.shape}")
510
+ print(f" - Models after filtering: {df['model'].unique().tolist()}")
511
+ else:
512
+ print(f" - No model filtering applied")
513
+
514
+ # Select relevant columns based on cluster level using correct column names from pipeline
515
+ if cluster_level == 'fine':
516
+ id_col = 'property_description_fine_cluster_id'
517
+ label_col = 'property_description_fine_cluster_label'
518
+ # Also check for alternative naming without prefix
519
+ alt_id_col = 'fine_cluster_id'
520
+ alt_label_col = 'fine_cluster_label'
521
+ else:
522
+ id_col = 'property_description_coarse_cluster_id'
523
+ label_col = 'property_description_coarse_cluster_label'
524
+ # Also check for alternative naming without prefix
525
+ alt_id_col = 'coarse_cluster_id'
526
+ alt_label_col = 'coarse_cluster_label'
527
+
528
+ # Try both naming patterns
529
+ if id_col in df.columns and label_col in df.columns:
530
+ # Use the expected naming pattern
531
+ cols = ['question_id', 'model', 'property_description', id_col, label_col, 'score']
532
+ elif alt_id_col in df.columns and alt_label_col in df.columns:
533
+ # Use the alternative naming pattern
534
+ cols = ['question_id', 'model', 'property_description', alt_id_col, alt_label_col, 'score']
535
+ else:
536
+ # Fall back to basic columns if cluster columns are missing
537
+ cols = ['question_id', 'model', 'property_description', 'score']
538
+
539
+ # Keep only existing columns
540
+ available_cols = [col for col in cols if col in df.columns]
541
+ df = df[available_cols]
542
+
543
+ print(f" - Final DataFrame shape: {df.shape}")
544
+ print(f" - Final columns: {df.columns.tolist()}")
545
+
546
+ return df
547
+
548
+
549
+ def truncate_cluster_name(cluster_desc: str, max_length: int = 50) -> str:
550
+ """Truncate cluster description to fit in table column."""
551
+ if len(cluster_desc) <= max_length:
552
+ return cluster_desc
553
+ return cluster_desc[:max_length-3] + "..."
554
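+ # Worked example: truncate_cluster_name("x" * 60, 50) keeps 47 characters and appends "...", so the result is exactly 50 chars.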
+
555
+ def create_frequency_comparison_table(model_stats: Dict[str, Any],
556
+ selected_models: List[str],
557
+ cluster_level: str = "fine", # Ignored – kept for backward-compat
558
+ top_n: int = 50,
559
+ selected_model: str | None = None,
560
+ selected_quality_metric: str | None = None) -> pd.DataFrame:
561
+ """Create a comparison table for the new FunctionalMetrics format.
562
+
563
+ The old signature is kept (cluster_level arg is ignored) so that callers
564
+ can be updated incrementally.
565
+ """
566
+
567
+ if not selected_models:
568
+ return pd.DataFrame()
569
+
570
+ # ------------------------------------------------------------------
571
+ # 1. Collect per-model, per-cluster rows
572
+ # ------------------------------------------------------------------
573
+ all_rows: List[dict] = []
574
+ for model in selected_models:
575
+ model_clusters = get_model_clusters(model_stats, model) # type: ignore[arg-type]
576
+ if not model_clusters:
577
+ continue
578
+
579
+ # Optional filter by a single model after the fact
580
+ if selected_model and model != selected_model:
581
+ continue
582
+
583
+ for cluster_name, cdata in model_clusters.items():
584
+ # Filter out "No properties" clusters
585
+ if cluster_name == "No properties":
586
+ continue
587
+
588
+ # Basic numbers
589
+ freq_pct = cdata.get("proportion", 0.0) * 100.0
590
+ prop_ci = cdata.get("proportion_ci")
591
+
592
+ # Quality per metric dicts ------------------------------------------------
593
+ quality_dict = cdata.get("quality", {}) or {}
594
+ quality_ci_dict = cdata.get("quality_ci", {}) or {}
595
+
596
+ # Significance flags
597
+ sal_sig = bool(cdata.get("proportion_delta_significant", False))
598
+ quality_sig_flags = cdata.get("quality_delta_significant", {}) or {}
599
+
600
+ all_rows.append({
601
+ "cluster": cluster_name,
602
+ "model": model,
603
+ "frequency": freq_pct,
604
+ "proportion_ci": prop_ci,
605
+ "quality": quality_dict,
606
+ "quality_ci": quality_ci_dict,
607
+ "score_significant": sal_sig,
608
+ "quality_significant_any": any(quality_sig_flags.values()),
609
+ "quality_significant_metric": quality_sig_flags.get(selected_quality_metric) if selected_quality_metric else None,
610
+ })
611
+
612
+ if not all_rows:
613
+ return pd.DataFrame()
614
+
615
+ df_all = pd.DataFrame(all_rows)
616
+
617
+ # Aggregate frequency across models ----------------------------------
618
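+ # Rank clusters by the cross-model *sum* of per-model frequencies, so clusters shared by many models rank higher.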
+ freq_sum = df_all.groupby("cluster")["frequency"].sum().sort_values(ascending=False)
619
+ top_clusters = freq_sum.head(top_n).index.tolist()
620
+
621
+ df_top = df_all[df_all["cluster"].isin(top_clusters)].copy()
622
+
623
+ table_rows: List[dict] = []
624
+ for clu in top_clusters:
625
+ subset = df_top[df_top["cluster"] == clu]
626
+ avg_freq = subset["frequency"].mean()
627
+
628
+ # Aggregate CI (mean of bounds)
629
+ ci_lowers = [ci.get("lower") for ci in subset["proportion_ci"] if isinstance(ci, dict) and ci.get("lower") is not None]
630
+ ci_uppers = [ci.get("upper") for ci in subset["proportion_ci"] if isinstance(ci, dict) and ci.get("upper") is not None]
631
+ freq_ci = {
632
+ "lower": float(np.mean(ci_lowers)) if ci_lowers else None,
633
+ "upper": float(np.mean(ci_uppers)) if ci_uppers else None,
634
+ } if ci_lowers and ci_uppers else None
635
+
636
+ # Quality aggregation -----------------------------------------------------
637
+ q_vals: List[float] = []
638
+ q_ci_l: List[float] = []
639
+ q_ci_u: List[float] = []
640
+ quality_sig_any = False
641
+ for _, row in subset.iterrows():
642
+ q_dict = row["quality"]
643
+ if selected_quality_metric:
644
+ if selected_quality_metric in q_dict:
645
+ q_vals.append(q_dict[selected_quality_metric])
646
+ ci_metric = row["quality_ci"].get(selected_quality_metric) if isinstance(row["quality_ci"], dict) else None
647
+ if ci_metric and ci_metric.get("lower") is not None and ci_metric.get("upper") is not None:
648
+ q_ci_l.append(ci_metric.get("lower"))
649
+ q_ci_u.append(ci_metric.get("upper"))
650
+ quality_sig_any = quality_sig_any or bool(row["quality_significant_metric"])
651
+ else:
652
+ q_vals.extend(q_dict.values())
653
+ for ci in row["quality_ci"].values():
654
+ if isinstance(ci, dict) and ci.get("lower") is not None and ci.get("upper") is not None:
655
+ q_ci_l.append(ci.get("lower"))
656
+ q_ci_u.append(ci.get("upper"))
657
+ quality_sig_any = quality_sig_any or row["quality_significant_any"]
658
+
659
+ quality_val = float(np.mean(q_vals)) if q_vals else None
660
+ quality_ci = {
661
+ "lower": float(np.mean(q_ci_l)),
662
+ "upper": float(np.mean(q_ci_u)),
663
+ } if q_ci_l and q_ci_u else None
664
+
665
+ score_sig = subset["score_significant"].any()
666
+
667
+ table_rows.append({
668
+ "Cluster": clu,
669
+ "Frequency (%)": f"{avg_freq:.1f}",
670
+ "Freq CI": format_confidence_interval(freq_ci),
671
+ "Quality": f"{quality_val:.3f}" if quality_val is not None else "N/A",
672
+ "Quality CI": format_confidence_interval(quality_ci) if quality_ci else "N/A",
673
+ "Score Significance": "Yes" if score_sig else "No",
674
+ "Quality Significance": "Yes" if quality_sig_any else "No",
675
+ })
676
+
677
+ return pd.DataFrame(table_rows)
678
+
679
+
680
+ def create_frequency_comparison_plots(model_stats: Dict[str, Any],
681
+ selected_models: List[str],
682
+ cluster_level: str = 'fine',
683
+ top_n: int = 50,
684
+ show_confidence_intervals: bool = False) -> Tuple[go.Figure, go.Figure]:
685
+ """Create frequency comparison plots (matching frequencies_tab.py exactly)."""
686
+
687
+ print(f"\nDEBUG: Plotting function called with:")
688
+ print(f" - Selected models: {selected_models}")
689
+ print(f" - Cluster level: {cluster_level}")
690
+ print(f" - Top N: {top_n}")
691
+ print(f" - Available models in stats: {list(model_stats.keys())}")
692
+
693
+ # Use the same data preparation logic as the table function
694
+ # Collect all clusters across all models for the chart (exact copy from frequencies_tab.py)
695
+ all_clusters_data = []
696
+ for model_name, model_data in model_stats.items():
697
+ if model_name not in selected_models:
698
+ continue
699
+
700
+ clusters = model_data.get(cluster_level, [])
701
+ for cluster in clusters:
702
+ # Filter out "No properties" clusters
703
+ if cluster.get('property_description') == "No properties":
704
+ continue
705
+
706
+ # Get confidence intervals for quality scores if available
707
+ quality_score_ci = cluster.get('quality_score_ci', {})
708
+ has_quality_ci = bool(quality_score_ci)
709
+
710
+ # Get distinctiveness score confidence intervals (correct structure)
711
+ score_ci = cluster.get('score_ci', {})
712
+ ci_lower = score_ci.get('lower') if score_ci else None
713
+ ci_upper = score_ci.get('upper') if score_ci else None
714
+
715
+ all_clusters_data.append({
716
+ 'property_description': cluster['property_description'],
717
+ 'model': model_name,
718
+ 'frequency': cluster.get('proportion', 0) * 100, # Convert to percentage
719
+ 'size': cluster.get('size', 0),
720
+ 'cluster_size_global': cluster.get('cluster_size_global', 0),
721
+ 'has_ci': has_confidence_intervals(cluster),
722
+ 'ci_lower': ci_lower,
723
+ 'ci_upper': ci_upper,
724
+ 'has_quality_ci': has_quality_ci
725
+ })
726
+
727
+ if not all_clusters_data:
728
+ # Return empty figures
729
+ empty_fig = go.Figure()
730
+ empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
731
+ return empty_fig, empty_fig
732
+
733
+ clusters_df = pd.DataFrame(all_clusters_data)
734
+
735
+ # Get all unique clusters for the chart
736
+ all_unique_clusters = clusters_df['property_description'].unique()
737
+ total_clusters = len(all_unique_clusters)
738
+
739
+ # Show all clusters by default
740
+ top_n_for_chart = min(top_n, total_clusters)
741
+
742
+ # Calculate total frequency per cluster and get top clusters
743
+ cluster_totals = clusters_df.groupby('property_description')['frequency'].sum().sort_values(ascending=False)
744
+ top_clusters = cluster_totals.head(top_n_for_chart).index.tolist()
745
+
746
+ # Get quality scores for the same clusters to sort by quality
747
+ quality_data_for_sorting = []
748
+ for model_name, model_data in model_stats.items():
749
+ if model_name not in selected_models:
750
+ continue
751
+ clusters = model_data.get(cluster_level, [])
752
+ for cluster in clusters:
753
+ # Filter out "No properties" clusters
754
+ if cluster.get('property_description') == "No properties":
755
+ continue
756
+
757
+ if cluster['property_description'] in top_clusters:
758
+ quality_data_for_sorting.append({
759
+ 'property_description': cluster['property_description'],
760
+ 'quality_score': extract_quality_score(cluster.get('quality_score', 0))
761
+ })
762
+
763
+ # Calculate average quality score per cluster and sort
764
+ if quality_data_for_sorting:
765
+ quality_df_for_sorting = pd.DataFrame(quality_data_for_sorting)
766
+ avg_quality_per_cluster = quality_df_for_sorting.groupby('property_description')['quality_score'].mean().sort_values(ascending=True) # Low to high
767
+ top_clusters = avg_quality_per_cluster.index.tolist()
768
+ # Reverse the order so low quality appears at top of chart
769
+ top_clusters = top_clusters[::-1]
770
+
771
+ # Filter data to only include top clusters
772
+ chart_data = clusters_df[clusters_df['property_description'].isin(top_clusters)]
773
+
774
+ if chart_data.empty:
775
+ # Return empty figures
776
+ empty_fig = go.Figure()
777
+ empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
778
+ return empty_fig, empty_fig
779
+
780
+ # Get unique models for colors
781
+ models = chart_data['model'].unique()
782
+ # Use a color palette that avoids yellow - using Set1 which has better contrast
783
+ colors = px.colors.qualitative.Set1[:len(models)]
784
+
785
+ # Create horizontal bar chart for frequencies
786
+ fig = go.Figure()
787
+
788
+ # Add a bar for each model
789
+ for i, model in enumerate(models):
790
+ model_data = chart_data[chart_data['model'] == model]
791
+
792
+ # Sort by cluster order (same as top_clusters)
793
+ model_data = model_data.set_index('property_description').reindex(top_clusters).reset_index()
794
+
795
+ # Fill NaN values with 0 for missing clusters
796
+ model_data['frequency'] = model_data['frequency'].fillna(0)
797
+ model_data['has_ci'] = model_data['has_ci'].fillna(False)
798
+ # For CI columns, replace NaN with None using where() instead of fillna(None)
799
+ model_data['ci_lower'] = model_data['ci_lower'].where(pd.notna(model_data['ci_lower']), None)
800
+ model_data['ci_upper'] = model_data['ci_upper'].where(pd.notna(model_data['ci_upper']), None)
801
+
802
+ # Ensure frequency is numeric and non-negative
803
+ model_data['frequency'] = pd.to_numeric(model_data['frequency'], errors='coerce').fillna(0)
804
+ model_data['frequency'] = model_data['frequency'].clip(lower=0)
805
+
806
+ # Debug: print model data for first model
807
+ if i == 0: # Only print for first model to avoid spam
808
+ print(f"DEBUG: Model {model} data sample:")
809
+ print(f" - Clusters: {len(model_data)}")
810
+ print(f" - Frequency range: {model_data['frequency'].min():.2f} - {model_data['frequency'].max():.2f}")
811
+ print(f" - Non-zero frequencies: {(model_data['frequency'] > 0).sum()}")
812
+ if len(model_data) > 0:
813
+ print(f" - Sample row: {model_data.iloc[0][['property_description', 'frequency']].to_dict()}")
814
+
815
+ # Remove any rows where property_description is NaN (these are clusters this model doesn't appear in)
816
+ model_data = model_data.dropna(subset=['property_description'])
817
+
818
+ # Get confidence intervals for error bars
819
+ ci_lower = []
820
+ ci_upper = []
821
+ for _, row in model_data.iterrows():
822
+ freq_value = row.get('frequency', 0)
823
+ if (row.get('has_ci', False) and
824
+ pd.notna(row.get('ci_lower')) and
825
+ pd.notna(row.get('ci_upper')) and
826
+ freq_value > 0): # Only calculate CIs for non-zero frequencies
827
+
828
+ # IMPORTANT: These are distinctiveness score CIs, not frequency CIs
829
+ # The distinctiveness score measures how much more/less frequently
830
+ # a model exhibits this behavior compared to the median model
831
+ # We can use this to estimate uncertainty in the frequency measurement
832
+ distinctiveness_ci_width = row['ci_upper'] - row['ci_lower']
833
+
834
+ # Convert to frequency uncertainty (approximate)
835
+ # A wider distinctiveness CI suggests more uncertainty in the frequency
836
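+ # NOTE: the 0.1 factor below is a heuristic damping constant, not a statistically derived quantity.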
+ freq_uncertainty = distinctiveness_ci_width * freq_value * 0.1
837
+ ci_lower.append(max(0, freq_value - freq_uncertainty))
838
+ ci_upper.append(freq_value + freq_uncertainty)
839
+ else:
840
+ ci_lower.append(None)
841
+ ci_upper.append(None)
842
+
843
+ # Debug: Check the data going into the plot
844
+ print(f"DEBUG: Adding trace for model {model}:")
845
+ print(f" - Y values (clusters): {model_data['property_description'].tolist()[:3]}...") # First 3 clusters
846
+ print(f" - X values (frequencies): {model_data['frequency'].tolist()[:3]}...") # First 3 frequencies
847
+ print(f" - Total data points: {len(model_data)}")
848
+
849
+ fig.add_trace(go.Bar(
850
+ y=model_data['property_description'],
851
+ x=model_data['frequency'],
852
+ name=model,
853
+ orientation='h',
854
+ marker_color=colors[i],
855
+ error_x=dict(
856
+ type='data',
857
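+ # Plotly error bars: 'array' is the offset from the bar value up to the upper bound; 'arrayminus' is the offset down to the lower bound.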
+ array=[u - f if u is not None and f is not None else None for f, u in zip(model_data['frequency'], ci_upper)],
858
+ arrayminus=[f - l if f is not None and l is not None else None for f, l in zip(model_data['frequency'], ci_lower)],
859
+ visible=show_confidence_intervals,
860
+ thickness=1,
861
+ width=3,
862
+ color='rgba(0,0,0,0.3)'
863
+ ),
864
+ hovertemplate='<b>%{y}</b><br>' +
865
+ f'Model: {model}<br>' +
866
+ 'Frequency: %{x:.1f}%<br>' +
867
+ 'CI: %{customdata[0]}<extra></extra>',
868
+ customdata=[[
869
+ format_confidence_interval({
870
+ 'lower': l,
871
+ 'upper': u
872
+ }) if l is not None and u is not None else "N/A"
873
+ for l, u in zip(ci_lower, ci_upper)
874
+ ]]
875
+ ))
876
+
877
+ # Update layout
878
+ fig.update_layout(
879
+ title=f"Model Frequencies in Top {len(top_clusters)} Clusters",
880
+ xaxis_title="Frequency (%)",
881
+ yaxis_title="Cluster Description",
882
+ barmode='group', # Group bars side by side
883
+ height=max(600, len(top_clusters) * 25), # Adjust height based on number of clusters
884
+ showlegend=True,
885
+ legend=dict(
886
+ orientation="h",
887
+ yanchor="bottom",
888
+ y=1.02,
889
+ xanchor="right",
890
+ x=1
891
+ )
892
+ )
893
+
894
+ # Update y-axis to show truncated cluster names
895
+ fig.update_yaxes(
896
+ tickmode='array',
897
+ ticktext=[truncate_cluster_name(desc, 60) for desc in top_clusters],
898
+ tickvals=top_clusters
899
+ )
900
+
901
+ # Create quality score chart
902
+ # Get quality scores for the same clusters (single score per cluster)
903
+ quality_data = []
904
+ quality_cis = [] # Add confidence intervals for quality scores
905
+
906
+ for cluster_desc in top_clusters:
907
+ # Get the first available quality score for this cluster
908
+ for model_name, model_data in model_stats.items():
909
+ clusters = model_data.get(cluster_level, [])
910
+ for cluster in clusters:
911
+ if cluster['property_description'] == cluster_desc:
912
+ quality_score = extract_quality_score(cluster.get('quality_score', 0))
913
+ quality_data.append({
914
+ 'property_description': cluster_desc,
915
+ 'quality_score': quality_score
916
+ })
917
+
918
+ # Get quality score confidence intervals
919
+ quality_ci = cluster.get('quality_score_ci', {})
920
+ if isinstance(quality_ci, dict) and quality_ci:
921
+ # Get the first available quality CI
922
+ for score_key, ci_data in quality_ci.items():
923
+ if isinstance(ci_data, dict):
924
+ ci_lower = ci_data.get('lower')
925
+ ci_upper = ci_data.get('upper')
926
+ if ci_lower is not None and ci_upper is not None:
927
+ quality_cis.append({
928
+ 'property_description': cluster_desc,
929
+ 'ci_lower': ci_lower,
930
+ 'ci_upper': ci_upper
931
+ })
932
+ break
933
+ else:
934
+ quality_cis.append({
935
+ 'property_description': cluster_desc,
936
+ 'ci_lower': None,
937
+ 'ci_upper': None
938
+ })
939
+ else:
940
+ quality_cis.append({
941
+ 'property_description': cluster_desc,
942
+ 'ci_lower': None,
943
+ 'ci_upper': None
944
+ })
945
+ break
946
+ if any(q['property_description'] == cluster_desc for q in quality_data):
947
+ break
948
+
949
+ if quality_data:
950
+ quality_df = pd.DataFrame(quality_data)
951
+ quality_cis_df = pd.DataFrame(quality_cis) if quality_cis else None
952
+
953
+ # Create quality score chart with single bars
954
+ fig_quality = go.Figure()
955
+
956
+ # Prepare confidence intervals for error bars
957
+ ci_lower = []
958
+ ci_upper = []
959
+ for _, row in quality_df.iterrows():
960
+ cluster_desc = row['property_description']
961
+ if quality_cis_df is not None:
962
+ ci_row = quality_cis_df[quality_cis_df['property_description'] == cluster_desc]
963
+ if not ci_row.empty:
964
+ ci_lower.append(ci_row.iloc[0]['ci_lower'])
965
+ ci_upper.append(ci_row.iloc[0]['ci_upper'])
966
+ else:
967
+ ci_lower.append(None)
968
+ ci_upper.append(None)
969
+ else:
970
+ ci_lower.append(None)
971
+ ci_upper.append(None)
972
+
973
+ # Add a single bar for each cluster
974
+ fig_quality.add_trace(go.Bar(
975
+ y=[truncate_cluster_name(desc, 60) for desc in quality_df['property_description']],
976
+ x=quality_df['quality_score'],
977
+ orientation='h',
978
+ marker_color='lightblue', # Single color for all bars
979
+ name='Quality Score',
980
+ showlegend=False,
981
+ error_x=dict(
982
+ type='data',
983
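+ # Same plus/minus offset convention as the frequency chart's error bars above.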
+ array=[u - q if u is not None and q is not None else None for q, u in zip(quality_df['quality_score'], ci_upper)],
984
+ arrayminus=[q - l if q is not None and l is not None else None for q, l in zip(quality_df['quality_score'], ci_lower)],
985
+ visible=show_confidence_intervals,
986
+ thickness=1,
987
+ width=3,
988
+ color='rgba(0,0,0,0.3)'
989
+ ),
990
+ hovertemplate='<b>%{y}</b><br>' +
991
+ 'Quality Score: %{x:.3f}<br>' +
992
+ 'CI: %{customdata[0]}<extra></extra>',
993
+ customdata=[[
994
+ format_confidence_interval({
995
+ 'lower': l,
996
+ 'upper': u
997
+ }) if l is not None and u is not None else "N/A"
998
+ for l, u in zip(ci_lower, ci_upper)
999
+ ]]
1000
+ ))
1001
+
1002
+ # Update layout
1003
+ fig_quality.update_layout(
1004
+ title=f"Quality Scores",
1005
+ xaxis_title="Quality Score",
1006
+ yaxis_title="", # No y-axis title to save space
1007
+ height=max(600, len(top_clusters) * 25), # Same height as main chart
1008
+ showlegend=False,
1009
+ yaxis=dict(showticklabels=False) # Hide y-axis labels to save space
1010
+ )
1011
+ else:
1012
+ # Create empty quality figure
1013
+ fig_quality = go.Figure()
1014
+ fig_quality.add_annotation(text="No quality score data available",
1015
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
1016
+
1017
+ return fig, fig_quality
1018
+
1019
+
1020
+ def search_clusters_by_text(clustered_df: pd.DataFrame,
1021
+ search_term: str,
1022
+ search_in: str = 'description') -> pd.DataFrame:
1023
+ """Search clusters by text in descriptions or other fields."""
1024
+ if not search_term:
1025
+ return clustered_df.head(100) # Return first 100 if no search
1026
+
1027
+ norm_term = normalize_text_for_search(search_term)
1028
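+ # All branches below do a literal (regex=False) substring match on normalized text.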
+
1029
+ if search_in == 'description':
1030
+ series = clustered_df['property_description'].astype(str).apply(normalize_text_for_search)
1031
+ mask = series.str.contains(norm_term, na=False, regex=False)
1032
+ elif search_in == 'model':
1033
+ series = clustered_df['model'].astype(str).apply(normalize_text_for_search)
1034
+ mask = series.str.contains(norm_term, na=False, regex=False)
1035
+ elif search_in == 'cluster_label':
1036
+ # Use correct column names from pipeline
1037
+ fine_label_col = 'property_description_fine_cluster_label'
1038
+ coarse_label_col = 'property_description_coarse_cluster_label'
1039
+ # Initialize mask aligned to clustered_df index to avoid boolean indexer misalignment
1040
+ mask = pd.Series(False, index=clustered_df.index)
1041
+
1042
+ if fine_label_col in clustered_df.columns:
1043
+ series = clustered_df[fine_label_col].astype(str).apply(normalize_text_for_search)
1044
+ mask = mask | series.str.contains(norm_term, na=False, regex=False)
1045
+ if coarse_label_col in clustered_df.columns:
1046
+ series = clustered_df[coarse_label_col].astype(str).apply(normalize_text_for_search)
1047
+ mask = mask | series.str.contains(norm_term, na=False, regex=False)
1048
+ else:
1049
+ # Search in all text columns using correct column names
1050
+ text_cols = ['property_description', 'model',
1051
+ 'property_description_fine_cluster_label',
1052
+ 'property_description_coarse_cluster_label']
1053
+ # Initialize mask aligned to clustered_df index to avoid boolean indexer misalignment
1054
+ mask = pd.Series(False, index=clustered_df.index)
1055
+ for col in text_cols:
1056
+ if col in clustered_df.columns:
1057
+ series = clustered_df[col].astype(str).apply(normalize_text_for_search)
1058
+ mask = mask | series.str.contains(norm_term, na=False, regex=False)
1059
+
1060
+ return clustered_df[mask].head(100)
1061
+
1062
+
1063
+ def search_clusters_only(clustered_df: pd.DataFrame,
1064
+ search_term: str,
1065
+ cluster_level: str = 'fine') -> pd.DataFrame:
1066
+ """Search only over cluster labels, not individual property descriptions."""
1067
+ if not search_term:
1068
+ return clustered_df
1069
+
1070
+ norm_term = normalize_text_for_search(search_term)
1071
+
1072
+ # Use the correct column names based on cluster level
1073
+ if cluster_level == 'fine':
1074
+ label_col = 'property_description_fine_cluster_label'
1075
+ alt_label_col = 'fine_cluster_label'
1076
+ else:
1077
+ label_col = 'property_description_coarse_cluster_label'
1078
+ alt_label_col = 'coarse_cluster_label'
1079
+
1080
+ # Try both naming patterns
1081
+ if label_col in clustered_df.columns:
1082
+ series = clustered_df[label_col].astype(str).apply(normalize_text_for_search)
1083
+ mask = series.str.contains(norm_term, na=False, regex=False)
1084
+ elif alt_label_col in clustered_df.columns:
1085
+ series = clustered_df[alt_label_col].astype(str).apply(normalize_text_for_search)
1086
+ mask = series.str.contains(norm_term, na=False, regex=False)
1087
+ else:
1088
+ # If neither column exists, return empty DataFrame
1089
+ return pd.DataFrame()
1090
+
1091
+ return clustered_df[mask]
1092
+
1093
+
1094
+ def create_interactive_cluster_viewer(clustered_df: pd.DataFrame,
1095
+ selected_models: Optional[List[str]] = None,
1096
+ cluster_level: str = 'fine') -> str:
1097
+ """Create interactive cluster viewer HTML similar to Streamlit version."""
1098
+ if clustered_df.empty:
1099
+ return "<p>No cluster data available</p>"
1100
+
1101
+ df = clustered_df.copy()
1102
+
1103
+ # Debug information
1104
+ print(f"DEBUG: create_interactive_cluster_viewer called")
1105
+ print(f" - Input DataFrame shape: {df.shape}")
1106
+ print(f" - Selected models: {selected_models}")
1107
+ print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")
1108
+
1109
+ # Filter by models if specified
1110
+ if selected_models:
1111
+ print(f" - Filtering by {len(selected_models)} selected models")
1112
+ df = df[df['model'].isin(selected_models)]
1113
+ print(f" - After filtering shape: {df.shape}")
1114
+ print(f" - Models after filtering: {df['model'].unique().tolist()}")
1115
+ else:
1116
+ print(f" - No model filtering applied")
1117
+
1118
+ if df.empty:
1119
+ return f"<p>No data found for selected models: {', '.join(selected_models or [])}</p>"
1120
+
1121
+ # Get cluster scores data for quality and frequency information
1122
+ from .state import app_state
1123
+ cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})
1124
+
1125
+ # Use the actual column names from the pipeline output (matching Streamlit version)
1126
+ if cluster_level == 'fine':
1127
+ id_col = 'property_description_fine_cluster_id'
1128
+ label_col = 'property_description_fine_cluster_label'
1129
+ # Also check for alternative naming without prefix
1130
+ alt_id_col = 'fine_cluster_id'
1131
+ alt_label_col = 'fine_cluster_label'
1132
+ else:
1133
+ id_col = 'property_description_coarse_cluster_id'
1134
+ label_col = 'property_description_coarse_cluster_label'
1135
+ # Also check for alternative naming without prefix
1136
+ alt_id_col = 'coarse_cluster_id'
1137
+ alt_label_col = 'coarse_cluster_label'
1138
+
1139
+ # Track if we fall back from coarse to fine
1140
+ fell_back_to_fine = False
1141
+
1142
+ # Check if required columns exist and provide helpful debug info
1143
+ # Try both naming patterns
1144
+ if id_col in df.columns and label_col in df.columns:
1145
+ # Use the expected naming pattern
1146
+ pass
1147
+ elif alt_id_col in df.columns and alt_label_col in df.columns:
1148
+ # Use the alternative naming pattern
1149
+ id_col = alt_id_col
1150
+ label_col = alt_label_col
1151
+ else:
1152
+ # If coarse clusters are not available, try to fall back to fine clusters
1153
+ if cluster_level == 'coarse':
1154
+ # Check if fine clusters are available
1155
+ fine_id_col = 'property_description_fine_cluster_id'
1156
+ fine_label_col = 'property_description_fine_cluster_label'
1157
+ fine_alt_id_col = 'fine_cluster_id'
1158
+ fine_alt_label_col = 'fine_cluster_label'
1159
+
1160
+ if (fine_id_col in df.columns and fine_label_col in df.columns) or (fine_alt_id_col in df.columns and fine_alt_label_col in df.columns):
1161
+ # Fall back to fine clusters
1162
+ if fine_id_col in df.columns and fine_label_col in df.columns:
1163
+ id_col = fine_id_col
1164
+ label_col = fine_label_col
1165
+ else:
1166
+ id_col = fine_alt_id_col
1167
+ label_col = fine_alt_label_col
1168
+ cluster_level = 'fine' # Update the cluster level for display
1169
+ fell_back_to_fine = True
1170
+ else:
1171
+ # No cluster columns available at all
1172
+ available_cols = list(df.columns)
1173
+ return f"""
1174
+ <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
1175
+ <h4>❌ Missing cluster columns in data</h4>
1176
+ <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
1177
+ <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
1178
+ <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
1179
+ </div>
1180
+ """
1181
+ else:
1182
+ # For fine clusters, show the original error
1183
+ available_cols = list(df.columns)
1184
+ return f"""
1185
+ <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
1186
+ <h4>❌ Missing {cluster_level} cluster columns in data</h4>
1187
+ <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
1188
+ <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
1189
+ <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
1190
+ </div>
1191
+ """
1192
+
1193
+ # Group by cluster to get cluster information
1194
+ try:
1195
+ print(f" - Grouping by cluster columns: {id_col}, {label_col}")
1196
+ # If meta column exists, propagate it into the aggregation so we can tag clusters
1197
+ agg_spec = {
1198
+ 'property_description': ['count', lambda x: x.unique().tolist()],
1199
+ 'model': lambda x: x.unique().tolist()
1200
+ }
1201
+ if 'meta' in df.columns:
1202
+ agg_spec['meta'] = lambda x: x.iloc[0]
1203
+ cluster_groups = df.groupby([id_col, label_col]).agg(agg_spec).reset_index()
1204
+
1205
+ # Flatten column names
1206
+ flat_cols = [id_col, label_col, 'size', 'property_descriptions', 'models']
1207
+ if 'meta' in df.columns:
1208
+ flat_cols.append('meta')
1209
+ cluster_groups.columns = flat_cols
1210
+
1211
+ # Sort by size (largest first)
1212
+ cluster_groups = cluster_groups.sort_values('size', ascending=False)
1213
+
1214
+ # Filter out "No properties" clusters
1215
+ cluster_groups = cluster_groups[cluster_groups[label_col] != "No properties"]
1216
+
1217
+ print(f" - Found {len(cluster_groups)} clusters")
1218
+ print(f" - Cluster sizes: {cluster_groups['size'].tolist()}")
1219
+ print(f" - Models per cluster: {[len(models) for models in cluster_groups['models']]}")
1220
+
1221
+ except Exception as e:
1222
+ return f"""
1223
+ <div style="padding: 20px; background: #f8d7da; border: 1px solid #f5c6cb; border-radius: 8px;">
1224
+ <h4>❌ Error processing cluster data</h4>
1225
+ <p><strong>Error:</strong> {str(e)}</p>
1226
+ <p>Please check your data format and try again.</p>
1227
+ </div>
1228
+ """
1229
+
1230
+ if len(cluster_groups) == 0:
1231
+ return """
1232
+ <div style="padding: 20px; background: #d1ecf1; border: 1px solid #bee5eb; border-radius: 8px;">
1233
+ <h4>ℹ️ No clusters found</h4>
1234
+ <p>No clusters match your current filters. Try selecting different models or adjusting your search.</p>
1235
+ </div>
1236
+ """
1237
+
1238
+ # Helper to extract first value from meta for display
1239
+ def _extract_tag_from_meta(meta_obj: Any) -> Optional[str]:
1240
+ return extract_allowed_tag(meta_obj)
1241
+
1242
+ # Build a stable color map for tags (if any)
1243
+ tag_to_color: dict[str, str] = {
1244
+ "Style": "#9467bd", # purple
1245
+ "Positive": "#28a745", # green
1246
+ "Negative (non-critical)": "#ff7f0e", # orange
1247
+ "Negative (critical)": "#dc3545", # red
1248
+ }
1249
+ if 'meta' in cluster_groups.columns:
1250
+ # If all meta objects are empty dicts, treat as no tags
1251
+ meta_vals = cluster_groups['meta'].tolist()
1252
+ parsed_meta = [_parse_meta_obj(m) for m in meta_vals]
1253
+ non_null_parsed = [m for m in parsed_meta if m is not None]
1254
+ all_empty_dicts = (
1255
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
1256
+ )
1257
+ if not all_empty_dicts:
1258
+ unique_tags = [t for t in (_extract_tag_from_meta(m) for m in meta_vals) if t]
1259
+ unique_tags = list(dict.fromkeys(unique_tags)) # preserve order, dedupe
1260
+ # tag_to_color already contains all allowed tags with fixed colors
1261
+
1262
+ # Helper to remove embedded dicts like "({'group': 'Positive'})" from labels
1263
+ def _sanitize_cluster_label(label: str) -> str:
1264
+ if not isinstance(label, str):
1265
+ return str(label)
1266
+ # Remove ( { ... } ) at end
1267
+ label = re.sub(r"\s*\(\s*\{[^}]*\}\s*\)\s*$", "", label)
1268
+ # Remove trailing { ... }
1269
+ label = re.sub(r"\s*\{[^}]*\}\s*$", "", label)
1270
+ # Remove simple (key: value) trailer
1271
+ label = re.sub(r"\s*\(\s*[^(){}:]+\s*:\s*[^(){}]+\)\s*$", "", label)
1272
+ return label.strip()
1273
+
1274
+ # Create HTML
1275
+ page_html = f"""
1276
+ <div style="max-width: 1600px; margin: 0 auto;">
1277
+ <p style="color: #666; margin-bottom: 20px;">
1278
+ Click on clusters below to explore their property descriptions.
1279
+ Showing {len(cluster_groups)} clusters sorted by size.
1280
+ </p>
1281
+ """
1282
+
1283
+ # Add a note if we fell back from coarse to fine clusters
1284
+ if cluster_level == 'fine' and fell_back_to_fine:
1285
+ page_html += """
1286
+ <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; margin-bottom: 20px;">
1287
+ <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
1288
+ </div>
1289
+ """
1290
+
1291
+ for i, row in cluster_groups.iterrows():
1292
+ cluster_id = row[id_col]
1293
+ cluster_label = row[label_col]
1294
+ cluster_size = row['size']
1295
+ property_descriptions = row['property_descriptions']
1296
+ models_in_cluster = row['models']
1297
+ # Tag if meta exists in grouped data
1298
+ tag_badge_html = ""
1299
+ tag_value = None
1300
+ if 'meta' in cluster_groups.columns:
1301
+ tag_value = _extract_tag_from_meta(row.get('meta'))
1302
+ if tag_value:
1303
+ color = tag_to_color.get(tag_value, '#4c6ef5')
1304
+ tag_badge_html = (
1305
+ f"<span style=\"display:inline-block; margin-left:10px; padding:3px 8px; "
1306
+ f"border-radius:12px; font-size:11px; font-weight:600; "
1307
+ f"background:{color}1A; color:{color}; border:1px solid {color}33;\">"
1308
+ f"{html.escape(str(tag_value))}</span>"
1309
+ )
1310
+ # Use sanitized label for display then render markdown (no extra <strong>)
1311
+ label_display = _sanitize_cluster_label(str(cluster_label))
1312
+ label_html = (
1313
+ _convdisp._markdown(str(label_display), pretty_print_dicts=False)
1314
+ .replace('<p>', '<span>')
1315
+ .replace('</p>', '</span>')
1316
+ )
1317
+
1318
+ # Get quality and frequency information from cluster_scores
1319
+ cluster_metrics = cluster_scores.get(cluster_label, {})
1320
+ frequency_pct = cluster_metrics.get("proportion", 0) * 100 if cluster_metrics else 0
1321
+ quality_scores = cluster_metrics.get("quality", {})
1322
+ quality_delta = cluster_metrics.get("quality_delta", {})
1323
+
1324
+ # Build per-metric header display: "metric: score (delta)"
1325
+ header_quality_html = "<span style=\"color:#666;\">No quality data</span>"
1326
+ if quality_scores or quality_delta:
1327
+ metric_names = sorted(set(quality_scores.keys()) | set(quality_delta.keys()))
1328
+ line_parts: list[str] = []
1329
+ for metric_name in metric_names:
1330
+ score_val = quality_scores.get(metric_name)
1331
+ delta_val = quality_delta.get(metric_name)
1332
+ score_str = f"{score_val:.3f}" if isinstance(score_val, (int, float)) else "N/A"
1333
+ if isinstance(delta_val, (int, float)):
1334
+ color = "#28a745" if delta_val >= 0 else "#dc3545"
1335
+ line_parts.append(f"<div>{metric_name}: {score_str} <span style=\"color: {color}; font-weight:500;\">({delta_val:+.3f})</span></div>")
1336
+ else:
1337
+ line_parts.append(f"<div>{metric_name}: {score_str}</div>")
1338
+ header_quality_html = "".join(line_parts)
1339
+
1340
+ # Format quality scores for detailed view
1341
+ quality_html = ""
1342
+ if quality_scores:
1343
+ quality_parts = []
1344
+ for metric_name, score in quality_scores.items():
1345
+ color = "#28a745" if score >= 0 else "#dc3545"
1346
+ quality_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {score:.3f}</span>')
1347
+ quality_html = " | ".join(quality_parts)
1348
+ else:
1349
+ quality_html = '<span style="color:#666;">No quality data</span>'
1350
+
1351
+ # Format quality delta (relative to average)
1352
+ quality_delta_html = ""
1353
+ if quality_delta:
1354
+ delta_parts = []
1355
+ for metric_name, delta in quality_delta.items():
1356
+ # Use grey for values very close to zero
1357
+ if abs(delta) < 0.001:
1358
+ color = "#AAAAAA"
1359
+ else:
1360
+ color = "#28a745" if delta > 0 else "#dc3545"
1361
+ sign = "+" if delta >= 0 else ""
1362
+ delta_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {sign}{delta:.3f}</span>')
1363
+ quality_delta_html = " | ".join(delta_parts)
1364
+ else:
1365
+ quality_delta_html = '<span style="color:#666;">No delta data</span>'
1366
+
1367
+ # Format header quality score with visual indicators
1368
+ header_quality_text = header_quality_html
1369
+
1370
+ # Get light color for this cluster (matching overview style)
1371
+ cluster_color = get_light_color_for_cluster(cluster_label, i)
1372
+
1373
+ # Build per-model frequencies for this cluster (replace models list)
1374
+ metrics_all = app_state.get("metrics", {})
1375
+ model_cluster_scores = metrics_all.get("model_cluster_scores", {})
1376
+ model_freq_items: list[str] = []
1377
+ for m in models_in_cluster:
1378
+ m_dict = model_cluster_scores.get(m, {})
1379
+ c_dict = m_dict.get(cluster_label, {}) if isinstance(m_dict, dict) else {}
1380
+ prop = c_dict.get("proportion")
1381
+ if isinstance(prop, (int, float)):
1382
+ model_freq_items.append(f"{html.escape(str(m))}: {prop * 100:.1f}%")
1383
+ model_freqs_html = " | ".join(model_freq_items) if model_freq_items else "N/A"
1384
+
1385
+ # Create expandable cluster card with overview-style design
1386
+ page_html += f"""
1387
+ <details style="margin: 15px 0; border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1388
+ <summary style="
1389
+ padding: 15px;
1390
+ background: {get_light_color_for_cluster(cluster_label, i)};
1391
+ color: #333;
1392
+ cursor: pointer;
1393
+ font-weight: 400;
1394
+ font-size: 16px;
1395
+ user-select: none;
1396
+ list-style: none;
1397
+ display: flex;
1398
+ justify-content: space-between;
1399
+ align-items: center;
1400
+ border-bottom: 1px solid #dee2e6;
1401
+ ">
1402
+ <div style="max-width: 80%;">
1403
+ <div style="margin-bottom: 4px; font-size: 14px;">
1404
+ {label_html}
1405
+ </div>
1406
+ <span style="font-size: 12px; color: #555; display:inline-flex; align-items:center;">
1407
+ {frequency_pct:.1f}% frequency ({cluster_size} properties) · {len(models_in_cluster)} models
1408
+ {tag_badge_html}
1409
+ </span>
1410
+ </div>
1411
+ <div style="font-size: 12px; font-weight: normal; text-align: right;">
1412
+ <div style="margin-bottom: 4px; line-height: 1.2;">{header_quality_html}</div>
1413
+ <div style="color: #6c757d;">
1414
+ {frequency_pct:.1f}% frequency
1415
+ </div>
1416
+ </div>
1417
+ </summary>
1418
+
1419
+ <div style="padding: 20px; background: #f8f9fa;">
1420
+ <div style="margin-bottom: 15px;">
1421
+ <strong>Cluster ID:</strong> {cluster_id}<br>
1422
+ <strong>Size:</strong> {cluster_size} properties<br>
1423
+ <strong>Model Frequencies:</strong> {model_freqs_html}<br>
1424
+ </div>
1425
+
1426
+ <h4 style="color: #333; margin: 15px 0 10px 0;">
1427
+ Property Descriptions ({len(property_descriptions)})
1428
+ </h4>
1429
+
1430
+ <div style="max-height: 300px; overflow-y: auto; background: white; border: 1px solid #ddd; border-radius: 4px; padding: 10px;">
1431
+ """
1432
+
1433
+ # Display property descriptions
1434
+ for j, desc in enumerate(property_descriptions, 1):  # 'j' avoids shadowing the outer cluster loop's 'i'
1435
+ page_html += f"""
1436
+ <div style="
1437
+ padding: 8px;
1438
+ margin: 2px 0;
1439
+ background: #f8f9fa;
1440
+ border-left: 3px solid #667eea;
1441
+ border-radius: 2px;
1442
+ ">
1443
+ <strong>{j}.</strong> {desc}
1444
+ </div>
1445
+ """
1446
+
1447
+ page_html += """
1448
+ </div>
1449
+ </div>
1450
+ </details>
1451
+ """
1452
+
1453
+ page_html += "</div>"
1454
+ return page_html
1455
+
1456
+
1457
+ def get_cluster_statistics(clustered_df: pd.DataFrame,
1458
+ selected_models: Optional[List[str]] = None) -> Dict[str, Any]:
1459
+ """Get cluster statistics for display."""
1460
+ if clustered_df.empty:
1461
+ return {}
1462
+
1463
+ df = clustered_df.copy()
1464
+
1465
+ # Filter by models if specified
1466
+ if selected_models:
1467
+ df = df[df['model'].isin(selected_models)]
1468
+
1469
+ stats = {
1470
+ 'total_properties': len(df),
1471
+ 'total_models': df['model'].nunique() if 'model' in df.columns else 0,
1472
+ }
1473
+
1474
+ # Fine cluster statistics - try both naming patterns
1475
+ fine_id_col = 'property_description_fine_cluster_id'
1476
+ alt_fine_id_col = 'fine_cluster_id'
1477
+
1478
+ if fine_id_col in df.columns:
1479
+ stats['fine_clusters'] = df[fine_id_col].nunique()
1480
+ cluster_sizes = df.groupby(fine_id_col).size()
1481
+ stats['min_properties_per_fine_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1482
+ stats['max_properties_per_fine_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1483
+ stats['avg_properties_per_fine_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1484
+ elif alt_fine_id_col in df.columns:
1485
+ stats['fine_clusters'] = df[alt_fine_id_col].nunique()
1486
+ cluster_sizes = df.groupby(alt_fine_id_col).size()
1487
+ stats['min_properties_per_fine_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1488
+ stats['max_properties_per_fine_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1489
+ stats['avg_properties_per_fine_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1490
+
1491
+ # Coarse cluster statistics - try both naming patterns
1492
+ coarse_id_col = 'property_description_coarse_cluster_id'
1493
+ alt_coarse_id_col = 'coarse_cluster_id'
1494
+
1495
+ if coarse_id_col in df.columns:
1496
+ stats['coarse_clusters'] = df[coarse_id_col].nunique()
1497
+ cluster_sizes = df.groupby(coarse_id_col).size()
1498
+ stats['min_properties_per_coarse_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1499
+ stats['max_properties_per_coarse_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1500
+ stats['avg_properties_per_coarse_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1501
+ elif alt_coarse_id_col in df.columns:
1502
+ stats['coarse_clusters'] = df[alt_coarse_id_col].nunique()
1503
+ cluster_sizes = df.groupby(alt_coarse_id_col).size()
1504
+ stats['min_properties_per_coarse_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1505
+ stats['max_properties_per_coarse_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1506
+ stats['avg_properties_per_coarse_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1507
+
1508
+ return stats
1509
+
1510
+
1511
+ def get_unique_values_for_dropdowns(clustered_df: pd.DataFrame) -> Dict[str, List[str]]:
1512
+ """Get unique values for dropdown menus."""
1513
+ if clustered_df.empty:
1514
+ return {'prompts': [], 'models': [], 'properties': [], 'tags': []}
1515
+
1516
+ # Get unique values, handling missing columns gracefully
1517
+ prompts = []
1518
+ if 'prompt' in clustered_df.columns:
1519
+ unique_prompts = clustered_df['prompt'].dropna().unique().tolist()
1520
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1521
+ elif 'question' in clustered_df.columns:
1522
+ unique_prompts = clustered_df['question'].dropna().unique().tolist()
1523
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1524
+ elif 'input' in clustered_df.columns:
1525
+ unique_prompts = clustered_df['input'].dropna().unique().tolist()
1526
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1527
+ elif 'user_prompt' in clustered_df.columns:
1528
+ unique_prompts = clustered_df['user_prompt'].dropna().unique().tolist()
1529
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1530
+
1531
+ # Handle both single model and side-by-side datasets
1532
+ models = []
1533
+ if 'model' in clustered_df.columns:
1534
+ models = sorted(clustered_df['model'].dropna().unique().tolist())
1535
+ elif 'model_a' in clustered_df.columns and 'model_b' in clustered_df.columns:
1536
+ models_a = clustered_df['model_a'].dropna().unique().tolist()
1537
+ models_b = clustered_df['model_b'].dropna().unique().tolist()
1538
+ all_models = set(models_a + models_b)
1539
+ models = sorted(list(all_models))
1540
+
1541
+ # Use fine cluster labels instead of property descriptions - try both naming patterns
1542
+ properties = []
1543
+ fine_label_col = 'property_description_fine_cluster_label'
1544
+ alt_fine_label_col = 'fine_cluster_label'
1545
+
1546
+ if fine_label_col in clustered_df.columns:
1547
+ unique_properties = clustered_df[fine_label_col].dropna().unique().tolist()
1548
+ unique_properties = [prop for prop in unique_properties if prop != "No properties"]
1549
+ properties = [prop[:100] + "..." if len(prop) > 100 else prop for prop in sorted(unique_properties)]
1550
+ elif alt_fine_label_col in clustered_df.columns:
1551
+ unique_properties = clustered_df[alt_fine_label_col].dropna().unique().tolist()
1552
+ unique_properties = [prop for prop in unique_properties if prop != "No properties"]
1553
+ properties = [prop[:100] + "..." if len(prop) > 100 else prop for prop in sorted(unique_properties)]
1554
+ elif 'property_description' in clustered_df.columns:
1555
+ unique_properties = clustered_df['property_description'].dropna().unique().tolist()
1556
+ unique_properties = [prop for prop in unique_properties if prop != "No properties"]
1557
+ properties = [prop[:100] + "..." if len(prop) > 100 else prop for prop in sorted(unique_properties)]
1558
+
1559
+ # Tags from meta first value if available (only ALLOWED_TAGS)
1560
+ tags: List[str] = []
1561
+ if 'meta' in clustered_df.columns:
1562
+ def _first_allowed(obj: Any) -> Optional[str]:
1563
+ return extract_allowed_tag(obj)
1564
+
1565
+ # Compute candidate tags and check for all-empty-dict case
1566
+ parsed_meta_series = clustered_df['meta'].apply(_parse_meta_obj)
1567
+ non_null_parsed = [m for m in parsed_meta_series.tolist() if m is not None]
1568
+ all_empty_dicts = (
1569
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
1570
+ )
1571
+
1572
+ if not all_empty_dicts:
1573
+ tag_series = clustered_df['meta'].apply(_first_allowed)
1574
+ tags = sorted({str(t) for t in tag_series.dropna().tolist() if t is not None and str(t) in ALLOWED_TAGS})
1575
+
1576
+ return {
1577
+ 'prompts': prompts,
1578
+ 'models': models,
1579
+ 'properties': properties,
1580
+ 'tags': tags,
1581
+ }
1582
+
1583
+ # ---------------------------------------------------------------------------
1584
+ # Example data extraction (restored)
1585
+ # ---------------------------------------------------------------------------
1586
+
1587
+ def get_example_data(
1588
+ clustered_df: pd.DataFrame,
1589
+ selected_prompt: str | None = None,
1590
+ selected_model: str | None = None,
1591
+ selected_property: str | None = None,
1592
+ max_examples: int = 5,
1593
+ show_unexpected_behavior: bool = False,
1594
+ randomize: bool = False,
1595
+ ) -> List[Dict[str, Any]]:
1596
+ """Return a list of example rows filtered by prompt / model / property.
1597
+
1598
+ This function was accidentally removed during a refactor; it is required by
1599
+ *examples_tab.py* and other parts of the UI.
1600
+
1601
+ Args:
1602
+ clustered_df: DataFrame containing the clustered results data
1603
+ selected_prompt: Prompt to filter by (None for all)
1604
+ selected_model: Model to filter by (None for all)
1605
+ selected_property: Property description to filter by (None for all)
1606
+ max_examples: Maximum number of examples to return
1607
+ show_unexpected_behavior: If True, filter to only show unexpected behavior
1608
+ randomize: If True, sample randomly from the filtered set instead of taking the first rows
1609
+
1610
+ Returns:
1611
+ List of example dictionaries with extracted data
1612
+ """
1613
+
1614
+ if clustered_df.empty:
1615
+ return []
1616
+
1617
+ df = clustered_df.copy()
1618
+
1619
+ # Filter by unexpected behavior if requested
1620
+ if show_unexpected_behavior:
1621
+ if "unexpected_behavior" in df.columns:
1622
+ # Assuming True/1 means unexpected behavior
1623
+ df = df[df["unexpected_behavior"].isin([True, 1, "True", "true"])]
1624
+ else:
1625
+ # If no unexpected_behavior column, return empty (or could return all)
1626
+ return []
1627
+
1628
+ # Filter by prompt
1629
+ if selected_prompt:
1630
+ prompt_cols = ["prompt", "question", "input", "user_prompt"]
1631
+ for col in prompt_cols:
1632
+ if col in df.columns:
1633
+ df = df[df[col].str.contains(selected_prompt, case=False, na=False, regex=False)]
1634
+ break
1635
+
1636
+ # Filter by model - handle both single model and side-by-side datasets
1637
+ if selected_model:
1638
+ if "model" in df.columns:
1639
+ # Single model datasets
1640
+ df = df[df["model"] == selected_model]
1641
+ elif "model_a" in df.columns and "model_b" in df.columns:
1642
+ # Side-by-side datasets - filter where either model_a or model_b matches
1643
+ df = df[(df["model_a"] == selected_model) | (df["model_b"] == selected_model)]
1644
+
1645
+ # Filter by property
1646
+ if selected_property:
1647
+ property_cols = ["property_description", "cluster", "fine_cluster_label", "property_description_fine_cluster_label"]
1648
+ for col in property_cols:
1649
+ if col in df.columns:
1650
+ df = df[df[col].str.contains(selected_property, case=False, na=False, regex=False)]
1651
+ break
1652
+
1653
+ # Limit to max_examples (randomized if requested)
1654
+ if randomize:
1655
+ if len(df) > max_examples:
1656
+ df = df.sample(n=max_examples)
1657
+ else:
1658
+ df = df.sample(frac=1)
1659
+ else:
1660
+ df = df.head(max_examples)
1661
+
1662
+     examples: List[Dict[str, Any]] = []
+     for _, row in df.iterrows():
+         prompt_val = next(
+             (row.get(col) for col in ["prompt", "question", "input", "user_prompt"] if row.get(col) is not None),
+             "N/A",
+         )
+
+         # Check if this is a side-by-side dataset
+         is_side_by_side = ('model_a_response' in row and 'model_b_response' in row and
+                            row.get('model_a_response') is not None and row.get('model_b_response') is not None)
+
+         if is_side_by_side:
+             # For side-by-side datasets, store both responses separately
+             response_val = "SIDE_BY_SIDE"  # Special marker
+             model_val = f"{row.get('model_a', 'Model A')} vs {row.get('model_b', 'Model B')}"
+         else:
+             # For single-response datasets, use the existing logic
+             response_val = next(
+                 (
+                     row.get(col)
+                     for col in [
+                         "model_response",
+                         "model_a_response",
+                         "model_b_response",
+                         "responses",
+                         "response",
+                         "output",
+                     ]
+                     if row.get(col) is not None
+                 ),
+                 "N/A",
+             )
+             model_val = row.get("model", "N/A")
+
+         # Try both naming patterns for cluster data
+         fine_cluster_id = row.get("property_description_fine_cluster_id", row.get("fine_cluster_id", "N/A"))
+         fine_cluster_label = row.get("property_description_fine_cluster_label", row.get("fine_cluster_label", "N/A"))
+         coarse_cluster_id = row.get("property_description_coarse_cluster_id", row.get("coarse_cluster_id", "N/A"))
+         coarse_cluster_label = row.get("property_description_coarse_cluster_label", row.get("coarse_cluster_label", "N/A"))
+
+         example_dict = {
+             "id": row.get("id", "N/A"),
+             "model": model_val,
+             "prompt": prompt_val,
+             "response": response_val,
+             "property_description": row.get("property_description", "N/A"),
+             "score": row.get("score", "N/A"),
+             "fine_cluster_id": fine_cluster_id,
+             "fine_cluster_label": fine_cluster_label,
+             "coarse_cluster_id": coarse_cluster_id,
+             "coarse_cluster_label": coarse_cluster_label,
+             "category": row.get("category", "N/A"),
+             "type": row.get("type", "N/A"),
+             "impact": row.get("impact", "N/A"),
+             "reason": row.get("reason", "N/A"),
+             "evidence": row.get("evidence", "N/A"),
+             "meta": row.get("meta", None),
+             "user_preference_direction": row.get("user_preference_direction", "N/A"),
+             "raw_response": row.get("raw_response", "N/A"),
+             "contains_errors": row.get("contains_errors", "N/A"),
+             "unexpected_behavior": row.get("unexpected_behavior", "N/A"),
+         }
+
+         # Add side-by-side specific fields if applicable
+         if is_side_by_side:
+             example_dict.update({
+                 "is_side_by_side": True,
+                 "model_a": row.get("model_a", "Model A"),
+                 "model_b": row.get("model_b", "Model B"),
+                 "model_a_response": row.get("model_a_response", "N/A"),
+                 "model_b_response": row.get("model_b_response", "N/A"),
+                 "winner": row.get("winner", None),
+             })
+         else:
+             example_dict["is_side_by_side"] = False
+
+         examples.append(example_dict)
+
+     return examples
+
+
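+ # Usage sketch (illustrative only; the exact signature of get_example_data is
+ # defined above and the "..." stands in for its real arguments). Each returned
+ # item is a plain dict whose keys match example_dict above:
+ #
+ #     examples = get_example_data(...)      # filtered + sampled rows as dicts
+ #     examples[0]["fine_cluster_label"]     # cluster label, or "N/A" if absent
+
+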
+ def format_examples_display(examples: List[Dict[str, Any]],
+                             selected_prompt: str | None = None,
+                             selected_model: str | None = None,
+                             selected_property: str | None = None,
+                             use_accordion: bool = True,
+                             pretty_print_dicts: bool = True) -> str:
+     """Format examples for HTML display with proper conversation rendering.
+
+     Args:
+         examples: List of example dictionaries
+         selected_prompt: Currently selected prompt filter
+         selected_model: Currently selected model filter
+         selected_property: Currently selected property filter
+         use_accordion: If True, group system and info messages in collapsible accordions
+         pretty_print_dicts: If True, pretty-print embedded dictionaries
+
+     Returns:
+         HTML string for display
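+
+     Example (illustrative sketch; "gpt-4o" is a hypothetical model name and
+     `examples` is the list returned by get_example_data):
+
+         html_str = format_examples_display(examples, selected_model="gpt-4o")
+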
+     """
+     from .conversation_display import convert_to_openai_format, display_openai_conversation_html
+     from .side_by_side_display import display_side_by_side_responses
+
+     if not examples:
+         return "<p style='color: #e74c3c; padding: 20px;'>No examples found matching the current filters.</p>"
+
+     # Create filter summary
+     filter_parts = []
+     if selected_prompt and selected_prompt != "All Prompts":
+         filter_parts.append(f"Prompt: {selected_prompt}")
+     if selected_model and selected_model != "All Models":
+         filter_parts.append(f"Model: {selected_model}")
+     if selected_property and selected_property != "All Clusters":
+         filter_parts.append(f"Cluster: {selected_property}")
+
+     filter_summary = ""
+     if filter_parts:
+         filter_summary = f"""
+         <div style="background: #e3f2fd; padding: 15px; border-radius: 8px; margin-bottom: 20px; border-left: 4px solid #2196f3;">
+             <strong>🔍 Active Filters:</strong> {" • ".join(filter_parts)}
+         </div>
+         """
+
+     html_out = f"""
+     <div class="examples-container" style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
+     <style>
+     /* Make JSON/code wrappers transparent (fall back to white when inline-styled) */
+     .examples-container pre,
+     .examples-container .highlight,
+     .examples-container .codehilite,
+     .examples-container p pre,
+     .examples-container li pre,
+     .examples-container div pre {{
+         background: transparent !important;
+     }}
+     .examples-container code {{ background: transparent !important; }}
+     </style>
+     <h3 style="color: #333; margin-bottom: 15px;">📋 Examples ({len(examples)} found)</h3>
+     {filter_summary}
+     """
+
+     for i, example in enumerate(examples, 1):
+         # Check if this is a side-by-side example
+         if example.get('is_side_by_side', False):
+             # Use side-by-side display for comparison datasets
+             conversation_html = display_side_by_side_responses(
+                 model_a=example['model_a'],
+                 model_b=example['model_b'],
+                 model_a_response=example['model_a_response'],
+                 model_b_response=example['model_b_response'],
+                 use_accordion=use_accordion,
+                 pretty_print_dicts=pretty_print_dicts,
+                 score=example['score'],
+                 winner=example.get('winner')
+             )
+         else:
+             # Convert response to OpenAI format for proper display (single model)
+             response_data = example['response']
+             if response_data != 'N/A':
+                 openai_conversation = convert_to_openai_format(response_data)
+                 conversation_html = display_openai_conversation_html(
+                     openai_conversation,
+                     use_accordion=use_accordion,
+                     pretty_print_dicts=pretty_print_dicts,
+                     evidence=example.get('evidence')
+                 )
+             else:
+                 conversation_html = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
+
+         # Compact cluster badge for header row (built here, but the cluster is
+         # currently surfaced in the Property Information section instead)
+         cluster_badge = ""
+         if example['fine_cluster_label'] != 'N/A':
+             cluster_badge = (
+                 f"<span style=\"display:inline-block; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#eef2ff; color:#4f46e5; border:1px solid #e0e7ff;\">"
+                 f"Cluster: {html.escape(str(example['fine_cluster_label']))}"
+                 f"</span>"
+             )
+
+         # Tag badge derived from meta (first allowed value, if any)
+         tag_badge = ""
+         tag_value = extract_allowed_tag(example.get('meta'))
+         if tag_value is not None and str(tag_value).strip() != "":
+             tag_badge = (
+                 f"<span style=\"display:inline-block; padding:2px 8px; border-radius:999px; background:#faf5ff; color:#6d28d9; border:1px solid #ede9fe;\">"
+                 f"Tag: {html.escape(str(tag_value))}"
+                 f"</span>"
+             )
+
+         # Score display for summary (only for non-side-by-side or when not shown in side-by-side)
+         score_badge = ""
+         if not example.get('is_side_by_side', False) and example['score'] != 'N/A':
+             try:
+                 score_val = float(example['score'])
+                 score_color = '#28a745' if score_val >= 0 else '#dc3545'
+                 score_badge = f"""
+                 <span style="
+                     background: {score_color};
+                     color: white;
+                     padding: 4px 8px;
+                     border-radius: 12px;
+                     font-size: 12px;
+                     font-weight: bold;
+                     margin-left: 10px;
+                 ">
+                     Score: {score_val:.3f}
+                 </span>
+                 """
+             except (TypeError, ValueError):
+                 # Non-numeric scores simply get no badge
+                 pass
+
+         # Create short preview of prompt for summary (escaped for safe HTML embedding)
+         prompt_text = str(example['prompt'])
+         prompt_preview = html.escape(prompt_text[:80] + "..." if len(prompt_text) > 80 else prompt_text)
+
+         # Create expandable example card; the first example is expanded by default
+         open_attr = "open" if i == 1 else ""
+
+         # Build top-of-card score section (above conversation) if score exists
+         score_section_html = ""
+         raw_score = example.get('score')
+         numeric_score: float | None = None
+         if isinstance(raw_score, (int, float)):
+             numeric_score = float(raw_score)
+         elif isinstance(raw_score, str):
+             # Accept simple numeric strings without try/except
+             if re.match(r"^[+-]?\d+(?:\.\d+)?$", raw_score.strip()):
+                 numeric_score = float(raw_score)
+         # Avoid duplicating score display for side-by-side, which renders its own score section
+         if numeric_score is not None and not example.get('is_side_by_side', False):
+             color_bg = '#dcfce7' if numeric_score >= 0 else '#fee2e2'
+             color_fg = '#166534' if numeric_score >= 0 else '#991b1b'
+             score_chip = (
+                 f"<span style=\"display:inline-block; padding:4px 10px; border-radius:999px; "
+                 f"background:{color_bg}; color:{color_fg}; font-weight:600; font-size:12px; "
+                 f"border:1px solid rgba(0,0,0,0.05);\">Score: {numeric_score:.3f}</span>"
+             )
+             score_section_html = (
+                 f"<div style=\"margin: 0 0 12px 0; display:flex; align-items:center; flex-wrap:wrap; gap:8px;\">"
+                 f"{score_chip}"
+                 f"</div>"
+             )
+
+ html_out += f"""
1907
+ <details {open_attr} style="border: 1px solid #dee2e6; border-radius: 8px; margin-bottom: 15px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1908
+ <summary style="
1909
+ padding: 15px;
1910
+ cursor: pointer;
1911
+ font-weight: 600;
1912
+ color: #495057;
1913
+ background: linear-gradient(90deg, #f8f9fa 0%, #e9ecef 100%);
1914
+ border-radius: 8px 8px 0 0;
1915
+ border-bottom: 1px solid #dee2e6;
1916
+ display: flex;
1917
+ align-items: center;
1918
+ justify-content: space-between;
1919
+ ">
1920
+ <span>
1921
+ <span style="background: #6c757d; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">#{i}</span>
1922
+ {prompt_preview}
1923
+ </span>
1924
+ <span style="font-size: 12px; color: #6c757d;">
1925
+ {example['model']}{score_badge}
1926
+ </span>
1927
+ </summary>
1928
+
1929
+ <div style="padding: 20px;">
1930
+ <!-- Compact metadata badges row -->
1931
+ <div style="display:flex; flex-wrap:wrap; gap:8px; align-items:center; margin-bottom: 16px; font-size:12px; color:#6b7280;">
1932
+ <span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#f3f4f6; border:1px solid #e5e7eb;">ID: {html.escape(str(example['id']))}</span>
1933
+ <span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#f3f4f6; border:1px solid #e5e7eb;">Model: {html.escape(str(example['model']))}</span>
1934
+ {tag_badge}
1935
+ {(f'<span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#ecfdf5; color:#047857; border:1px solid #bbf7d0;">Category: {html.escape(str(example["category"]))}</span>' if example["category"] not in [None, "N/A", "None", "", "null"] and str(example["category"]).strip() != "" else '')}
1936
+ {(f'<span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#eff6ff; color:#1d4ed8; border:1px solid #dbeafe;">Type: {html.escape(str(example["type"]))}</span>' if example["type"] not in [None, "N/A", "None", "", "null"] and str(example["type"]).strip() != "" else '')}
1937
+ {(f'<span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#fff7ed; color:#c2410c; border:1px solid #fed7aa;">Impact: {html.escape(str(example["impact"]))}</span>' if example["impact"] not in [None, "N/A", "None", "", "null"] and str(example["impact"]).strip() != "" else '')}
1938
+ </div>
1939
+
1940
+ <!-- Collapsible info section for Cluster / Tag / Property / Reason / Evidence -->
1941
+ {(
1942
+ f'''<details style="margin-bottom:16px; border:1px solid #e5e7eb; border-radius:8px; background:#f9fafb;">
1943
+ <summary style="cursor:pointer; padding:12px; font-weight:600; color:#374151; border-radius:8px;">
1944
+ πŸ“‹ Property Information
1945
+ </summary>
1946
+ <div style="padding:0 12px 12px 12px; border-top:1px solid #e5e7eb;">
1947
+ {(f'<div style="margin-top:12px;"><strong style="color:#374151;">Cluster</strong><div style="color:#4b5563; margin-top:4px;">{_convdisp._markdown(str(example["fine_cluster_label"]))}</div></div>' if example.get("fine_cluster_label") not in [None, "N/A", "None", "", "null"] and str(example.get("fine_cluster_label", "")).strip() != "" else '')}
1948
+ {(f'<div style="margin-top:12px;"><strong style="color:#374151;">Property</strong><div style="color:#4b5563; margin-top:4px;">{_convdisp._markdown(str(example["property_description"]))}</div></div>' if example["property_description"] not in [None, "N/A", "None", "", "null"] and str(example["property_description"]).strip() != "" else '')}
1949
+ {(f'<div style="margin-top:12px;"><strong style="color:#374151;">Evidence</strong><div style="color:#4b5563; margin-top:4px;">{_convdisp._markdown(str(example["evidence"]))}</div></div>' if example["evidence"] not in [None, "N/A", "None", "", "null"] and str(example["evidence"]).strip() != "" else '')}
1950
+ </div>
1951
+ </details>'''
1952
+ ) if any([
1953
+ example.get("fine_cluster_label") not in [None, "N/A", "None", "", "null"] and str(example.get("fine_cluster_label", "")).strip() != "",
1954
+ example.get("property_description") not in [None, "N/A", "None", "", "null"] and str(example.get("property_description", "")).strip() != "",
1955
+ example.get("reason") not in [None, "N/A", "None", "", "null"] and str(example.get("reason", "")).strip() != "",
1956
+ example.get("evidence") not in [None, "N/A", "None", "", "null"] and str(example.get("evidence", "")).strip() != "",
1957
+ ]) else ''}
1958
+
1959
+ {score_section_html}
1960
+
1961
+ <div style="margin-bottom: 15px;">
1962
+ <div style="border-radius: 6px; font-size: 15px; line-height: 1.5;">
1963
+ {conversation_html}
1964
+ </div>
1965
+ </div>
1966
+ </div>
1967
+ </details>
1968
+ """
1969
+
1970
+ html_out += "</div>"
1971
+ return html_out
1972
+
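+ # Wiring sketch (assumption — this module only returns an HTML string; the
+ # component variable below is illustrative, not taken from the dashboard code):
+ #
+ #     import gradio as gr
+ #     examples_panel = gr.HTML(value=format_examples_display(examples))
+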
+
+ # ---------------------------------------------------------------------------
+ # Legacy function aliases (backward compatibility)
+ # ---------------------------------------------------------------------------
+
+ def compute_model_rankings(*args, **kwargs):
+     """Legacy alias → forwards to compute_model_rankings_new."""
+     return compute_model_rankings_new(*args, **kwargs)
+
+
+ def create_model_summary_card(*args, **kwargs):
+     """Legacy alias → forwards to create_model_summary_card_new."""
+     return create_model_summary_card_new(*args, **kwargs)
+
+
+ def get_total_clusters_count(metrics: Dict[str, Any]) -> int:
+     """Get the total number of clusters from the metrics data."""
+     cluster_scores = metrics.get("cluster_scores", {})
+     # Filter out "No properties" clusters
+     cluster_scores = {k: v for k, v in cluster_scores.items() if k != "No properties"}
+     return len(cluster_scores)
+
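+ # Shape sketch for `metrics` (illustrative; the cluster names are made up and
+ # any other top-level keys are omitted):
+ #
+ #     metrics = {"cluster_scores": {"Asks clarifying questions": {...},
+ #                                   "No properties": {...}}}
+ #     get_total_clusters_count(metrics)  # -> 1 ("No properties" is excluded)
+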
+
+ def get_light_color_for_cluster(cluster_name: str, index: int) -> str:
+     """Generate a light dusty blue background for cluster boxes.
+
+     Both arguments are accepted for API compatibility but are currently
+     unused: every cluster gets the same consistent light dusty blue.
+     """
+     return "#f0f4f8"  # Very light dusty blue
+
+
+ __all__ = [
+     "get_model_clusters",
+     "get_all_models",
+     "get_all_clusters",
+     "format_confidence_interval",
+     "get_confidence_interval_width",
+     "has_confidence_intervals",
+     "extract_quality_score",
+     "get_top_clusters_for_model",
+     "compute_model_rankings_new",
+     "create_model_summary_card_new",
+     "format_cluster_dataframe",
+     "truncate_cluster_name",
+     "create_frequency_comparison_table",
+     "create_frequency_comparison_plots",
+     "search_clusters_by_text",
+     "search_clusters_only",
+     "create_interactive_cluster_viewer",
+     "get_cluster_statistics",
+     "get_unique_values_for_dropdowns",
+     "get_example_data",
+     "format_examples_display",
+     "compute_model_rankings",
+     "create_model_summary_card",
+     "get_total_clusters_count",
+     "get_light_color_for_cluster",
+ ]
2027
+ ]