Lisa Dunlap committed on
Commit
1af0726
·
1 Parent(s): 224c2b6

initial commit

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/**/*.jsonl filter=lfs diff=lfs merge=lfs -text
+ data/**/*.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
+ # Python cache
+ __pycache__/
+ *.py[cod]
+
+ # Local gradio artifacts
+ .gradio/
app.py ADDED
@@ -0,0 +1,26 @@
+ import os
+ from pathlib import Path
+
+ import gradio as gr
+
+ # Hard-disable the Run Pipeline tab for this app entrypoint
+ os.environ["ENABLE_RUN_PIPELINE_TAB"] = "0"
+
+ from stringsight.dashboard import state
+ from stringsight.dashboard.app import create_app
+
+
+ # Point the dashboard to the repository's ./data directory
+ state.BASE_RESULTS_DIR = str((Path(__file__).parent / "data").resolve())
+
+ # Build the Gradio application; expose for Spaces
+ demo: gr.Blocks = create_app()
+ app: gr.Blocks = demo
+
+
+ if __name__ == "__main__":
+     # Local launch settings; Spaces will auto-serve the "demo/app" object
+     port = int(os.environ.get("PORT", 7860))
+     demo.launch(server_name="0.0.0.0", server_port=port)
+
+
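As a sanity check, the entrypoint above can be exercised locally (a minimal sketch; importing `app` runs the module-level setup and builds `demo` without launching, since the launch call is guarded by `__main__`):

```python
# Hypothetical local smoke test for the Space entrypoint above.
# Importing app disables the Run Pipeline tab, points state.BASE_RESULTS_DIR
# at ./data, and builds the `demo` Blocks object without launching it.
import app

app.demo.launch(server_name="127.0.0.1", server_port=8000)
```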
data/taubench_airline/cluster_scores.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fb852010abe92b482d5880f6ef89859545e00609cf5999a37319b7de89aee81
+ size 74398946
data/taubench_airline/cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dd541fd6c20c51c6d6c2061bcae12fd979aa563ade3cebe7a93cf7364da044ca
+ size 58028003
data/taubench_airline/clustered_results_lightweight.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14f3aece9c76950a4cd31fa33e02ada38cb7e38e1a0f544d650c9031cba08869
+ size 121263508
data/taubench_airline/model_cluster_scores.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:101a4f1137ca6ae230fa27b6bb3309d5d5e31ce5c41e6686ce6291c344cb4fa3
+ size 76420820
data/taubench_airline/model_cluster_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90176fb8107e8c31fdb4a62e7076adbc573889bd8e4bcad41fb2eb5d67d1e6f9
+ size 58100860
data/taubench_airline/model_scores.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef23296a9c714064f0363155f558267a4c8679b0b2dd4f828f1d77552ae0b0db
+ size 74327684
data/taubench_airline/model_scores_df.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8aee6668d6abbc9e5ce1db3927b65f8c5e10da1fa1f066a2dc73822401af06ad
+ size 57978938
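Each `data/taubench_airline/*` file above is committed as a Git LFS pointer (the three `version`/`oid`/`size` fields) rather than the data itself. A minimal sketch of reading those fields back, assuming the three-line layout shown in the hunks above:

```python
# Minimal sketch: parse a Git LFS pointer file into a dict of its fields.
# Assumes the "version / oid / size" layout committed above.
from pathlib import Path

def read_lfs_pointer(path: str) -> dict:
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        if key:
            fields[key] = value
    return fields

# e.g. read_lfs_pointer("data/taubench_airline/model_scores.json")
# -> {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:...", "size": "74327684"}
```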
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==5.31.0
+ pandas>=2.0.0
+ numpy>=1.24.0
+ plotly>=5.15.0
+ scikit-learn>=1.3.0
+ plotly-express>=0.4.1
+ markdown
+ pygments
+ pyflakes
stringsight/dashboard/__init__.py ADDED
@@ -0,0 +1,13 @@
+ """Dashboard visualization for StringSight pipeline results.
+
+ This module provides a Gradio interface for exploring model performance,
+ cluster analysis, and detailed examples from pipeline output.
+
+ Usage:
+     from stringsight.dashboard import launch_app
+     launch_app(results_dir="path/to/results")
+ """
+
+ from .app import launch_app, create_app
+
+ __all__ = ["launch_app", "create_app"]
stringsight/dashboard/app.py ADDED
@@ -0,0 +1,1312 @@
+ """
+ Main Gradio application for LMM-Vibes pipeline results visualization.
+
+ This module creates a comprehensive Gradio interface for exploring model performance,
+ cluster analysis, and detailed examples from pipeline output.
+ """
+
+ import gradio as gr
+ from gradio.themes import Soft
+ import pandas as pd
+ import numpy as np
+ import plotly.graph_objects as go
+ from pathlib import Path
+ from typing import Dict, List, Any, Optional, Tuple
+ import os
+
+ from .data_loader import (
+     load_pipeline_results,
+     load_property_examples,
+     scan_for_result_subfolders,
+     validate_results_directory,
+     get_available_models
+ )
+ from .metrics_adapter import get_all_models
+ from .utils import (
+     compute_model_rankings,
+     create_model_summary_card,
+     format_cluster_dataframe,
+
+     search_clusters_by_text,
+     get_top_clusters_for_model,
+     create_interactive_cluster_viewer,
+     get_cluster_statistics,
+     get_unique_values_for_dropdowns,
+     get_example_data,
+     format_examples_display,
+     get_total_clusters_count
+ )
+
+ # ---------------------------------------------------------------------------
+ # NEW: centralised state + logic split into per-tab modules
+ # ---------------------------------------------------------------------------
+ from .state import app_state
+
+ # Tab-specific logic (moved out of this file)
+ from .load_data_tab import (
+     load_data,
+     get_available_experiments,
+     get_experiment_choices,
+     refresh_experiment_dropdown,
+     load_experiment_data,
+ )
+ from .overview_tab import create_overview, create_model_quality_plot, create_model_quality_table, get_available_model_quality_metrics
+ from .clusters_tab import view_clusters_interactive, view_clusters_table
+ from .examples_tab import (
+     get_dropdown_choices,
+     update_example_dropdowns,
+     view_examples,
+ )
+ from .plots_tab import create_plots_tab, create_plot_with_toggle, update_quality_metric_visibility, update_cluster_selection, get_available_quality_metrics
+
+ # app_state now comes from dashboard.state
+
+ # Feature flag: enable or disable the Run Pipeline tab (default: disabled)
+ ENABLE_RUN_PIPELINE_TAB = os.environ.get("ENABLE_RUN_PIPELINE_TAB", "0") not in ("0", "false", "False", "off", "")
+
+
+ def update_top_n_slider_maximum():
+     """Update the top N slider maximum based on total clusters in loaded data."""
+     from .state import app_state
+
+     if not app_state.get("metrics"):
+         return gr.Slider(minimum=1, maximum=10, value=3, step=1)
+
+     total_clusters = get_total_clusters_count(app_state["metrics"])
+     max_value = max(10, total_clusters)  # At least 10, or total clusters if more
+
+     return gr.Slider(
+         label="Top N Clusters per Model",
+         minimum=1,
+         maximum=max_value,
+         value=min(3, max_value),
+         step=1,
+         info=f"Number of top clusters to show per model (max: {total_clusters})"
+     )
+
+
+ def clear_search_bars():
+     """Clear all search bars when new data is loaded."""
+     return "", ""  # Returns empty strings for search_clusters and search_examples
+
+
+ def create_app() -> gr.Blocks:
+     """Create the main Gradio application."""
+
+     # Custom CSS for minimal margins and better sidebar layout + polished header/tabs
+     custom_css = """
+     /* Ensure the app itself spans the full page width (inside shadow root) */
+     :host {
+         width: 100% !important;
+         max-width: 100% !important;
+         margin: 0 !important;
+         padding: 0 !important;
+         /* Override Gradio's layout max width if present */
+         --layout-max-width: 100% !important;
+     }
+     /* Base font stack for broad compatibility */
+     body, .gradio-container {
+         font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", "Liberation Sans", sans-serif;
+     }
+     /* Ensure Examples tab inherits same font (avoid code blocks) */
+     #examples-container, #examples-container *:not(code):not(pre) {
+         font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", "Liberation Sans", sans-serif !important;
+     }
+
+     /* Universal reset for all elements */
+     * {
+         box-sizing: border-box !important;
+     }
+
+     .main-container {
+         width: 100% !important;
+         max-width: 100% !important;
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     .gradio-container {
+         width: 100% !important;
+         max-width: none !important;
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     /* --- Polished sticky header --- */
+     #app-header {
+         position: sticky;
+         top: 0;
+         z-index: 50;
+         backdrop-filter: saturate(180%) blur(8px);
+         -webkit-backdrop-filter: saturate(180%) blur(8px);
+         background: rgba(255,255,255,.85);
+         border-bottom: 1px solid rgba(15,23,42,.06);
+         padding: 12px 16px;
+         margin: 0 0 8px 0 !important;
+         display: flex;
+         align-items: center;
+         justify-content: space-between;
+         width: 100%;
+     }
+     .brand { display:flex; align-items:center; gap:10px; font-weight:600; font-size:18px; color:#0f172a; }
+     .brand small { font-weight:500; color:#64748b; }
+     .header-right { display:flex; gap:8px; align-items:center; margin-left:auto; }
+     /* Ensure the right group actually sticks to the right */
+     #app-header > *:last-child { margin-left: auto !important; }
+     #app-header .header-right { margin-left: auto !important; justify-content: flex-end !important; }
+     #app-header .header-right > * { margin-left: 0 !important; }
+     .header-badge { background:#eef2ff; color:#3730a3; border-radius:9999px; padding:2px 8px; font-size:12px; border:1px solid #c7d2fe; }
+     /* Round the tab buttons into pills with clear active state */
+     .tabs .tab-nav button { border-radius:9999px !important; padding:6px 12px !important; }
+     .tabs .tab-nav button.selected { background:#eef2ff !important; color:#3730a3 !important; }
+     /* Tone down color for model selection group (Gradio renders as pill labels) */
+     #selected-models label { background: #f8fafc !important; color: #111827 !important; border: 1px solid #e2e8f0 !important; }
+     #selected-models label:hover { background: #f1f5f9 !important; }
+     #selected-models .selected, #selected-models [data-selected="true"],
+     #selected-models label[aria-pressed="true"],
+     #selected-models label:has(input:checked) { background: #f1f5f9 !important; border-color: #e2e8f0 !important; color: #111827 !important; }
+     #selected-models input[type="checkbox"] { accent-color: #94a3b8 !important; }
+     /* Help panel card */
+     #help-panel { margin: 8px 12px; padding: 12px; background: #ffffff; border: 1px solid #e5e7eb; border-radius: 8px; }
+     #help-panel .gr-prose, #help-panel .prose, #help-panel .markdown, #help-panel p, #help-panel div { background: #ffffff !important; }
+     /* Style the Close button with a light tint */
+     #help-close-btn button { background: #eef2ff !important; color: #3730a3 !important; border: 1px solid #c7d2fe !important; }
+     #help-close-btn button:hover { background: #e0e7ff !important; }
+     /* Compact Help button */
+     #help-btn { flex: 0 0 auto !important; width: auto !important; display: inline-flex !important; }
+     #help-btn button { padding: 2px 8px !important; min-width: unset !important; width: auto !important; }
+
+     .tabs {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     .tab-nav {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     .tab-content {
+         margin: 0 !important;
+         padding: 5px 0 2px 8px !important;
+     }
+     .sidebar {
+         border-left: 1px solid #e0e0e0;
+         background-color: #f8f9fa;
+         padding: 8px !important;
+         order: 2;
+     }
+     .main-content {
+         padding: 5px 0 2px 8px !important;
+         order: 1;
+     }
+     /* Additional selectors to override Gradio's default margins */
+     .block {
+         margin: 0 !important;
+         padding: 2px 0 2px 8px !important;
+     }
+     .form {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     body {
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     .app {
+         margin: 0 !important;
+         padding: 5px 0 0 8px !important;
+     }
+     /* Target specific Gradio container classes */
+     .gradio-row {
+         margin: 0 !important;
+         padding: 0 !important;
+     }
+     .gradio-column {
+         margin: 0 !important;
+         padding: 0 0 0 8px !important;
+     }
+     /* Override any container padding */
+     .container {
+         width: 100% !important;
+         max-width: none !important;
+         padding: 5px 0 0 8px !important;
+         margin: 0 !important;
+     }
+     /* Target the root element */
+     #root {
+         padding: 5px 0 0 8px !important;
+         margin: 0 !important;
+     }
+     /* Make sure no right padding on wrapper elements */
+     .wrap {
+         width: 100% !important;
+         max-width: none !important;
+         padding: 0 !important;
+         margin: 0 !important;
+     }
+     /* Aggressive targeting of common Gradio elements */
+     div[class*="gradio"] {
+         padding-right: 0 !important;
+         margin-right: 0 !important;
+     }
+     /* Target any div that might have padding */
+     .gradio-blocks > div,
+     .gradio-blocks div[style*="padding"] {
+         padding-right: 0 !important;
+         margin-right: 0 !important;
+     }
+     /* Ensure content fills width */
+     .gradio-blocks {
+         width: 100% !important;
+         max-width: none !important;
+         padding: 5px 0 0 8px !important;
+         margin: 0 !important;
+     }
+
+     /* Catch-all: remove max-width and auto-centering from any container-like nodes */
+     [class*="container"], [class*="Container"], [class*="main"], [class*="Main"], [class*="block"], [class*="Block"] {
+         max-width: none !important;
+         margin-left: 0 !important;
+         margin-right: 0 !important;
+     }
+
+     /* Slight right margin for overall app */
+     .gradio-container {
+         margin-right: 12px !important;
+     }
+
+     /* Ensure slight right padding inside the app content */
+     .main-container,
+     .gradio-blocks,
+     .tab-content,
+     .main-content,
+     .container,
+     #root,
+     .app,
+     .wrap,
+     .gradio-column {
+         padding-right: 12px !important;
+     }
+
+     /* Final override: ensure host has slight right padding so it's always visible */
+     :host {
+         padding-right: 12px !important;
+     }
+     """
+
+     # Modern theme setup (Inter font, neutral slate, indigo primary)
+     theme = Soft(
+         primary_hue="indigo",
+         neutral_hue="slate",
+     )
+
+     with gr.Blocks(title="LMM-Vibes Pipeline Results Explorer", theme=theme, css=custom_css, fill_width=True) as app:
+         # Header helpers
+         def _current_experiment_name() -> str:
+             from .state import app_state
+             from . import state
+             path = app_state.get("current_results_dir") or state.BASE_RESULTS_DIR or ""
+             if not path:
+                 return "No experiment loaded"
+             try:
+                 return Path(path).name
+             except Exception:
+                 return str(path)
+
+         def _render_badge_html() -> str:
+             exp = _current_experiment_name()
+             return f"<span class=\"header-badge\">{exp}</span>"
+
+         # Polished sticky header
+         with gr.Row(elem_id="app-header"):
+             with gr.Row(elem_classes=["header-left"]):
+                 gr.HTML(
+                     value=(
+                         "<div class=\"brand\">🧡 StringSight <small>Evaluation Console</small></div>"
+                     )
+                 )
+             # Move experiment selection to the header when a base directory is provided
+             from . import state
+             if state.BASE_RESULTS_DIR:
+                 experiment_dropdown = gr.Dropdown(
+                     label="Select Experiment",
+                     choices=get_experiment_choices(),
+                     value="Select an experiment...",
+                     show_label=False,
+                     interactive=True,
+                 )
+             with gr.Row(elem_classes=["header-right"]):
+                 help_btn = gr.Button("Help", variant="secondary", elem_id="help-btn")
+                 # Separate badge element we can update after data loads
+                 current_experiment_badge = gr.HTML(value=_render_badge_html(), visible=False)
+
+         # Contextual Help panel (hidden by default)
+         with gr.Group(visible=False, elem_id="help-panel") as help_panel:
+             help_md = gr.Markdown(
+                 """
+                 **Overview**: Compare model quality metrics and view model cards with top behavior clusters. Use Filter Controls to refine and switch between Plot/Table.
+
+                 **View Clusters**: Explore clusters interactively. Use the search field in this tab to filter cluster labels; optional tag filter appears when available.
+
+                 **View Examples**: Inspect individual examples with rich conversation rendering. Filter by prompt/model/cluster; adjust max examples and formatting options.
+                 """
+             )
+             help_close_btn = gr.Button("Close", variant="secondary", elem_id="help-close-btn")
+
+         with gr.Row():
+             # Sidebar for data loading and model selection
+             with gr.Column(scale=1, min_width=180, elem_classes=["sidebar"]):
+                 from . import state
+                 if state.BASE_RESULTS_DIR:
+                     gr.Markdown(f"Base Results Directory: `{state.BASE_RESULTS_DIR}`")
+                 else:
+                     gr.Markdown("Provide the path to your pipeline results directory containing either:")
+                     gr.Markdown("• **Legacy format**: `model_stats.json` + `clustered_results.jsonl`")
+                     gr.Markdown("• **Functional format**: `model_cluster_scores.json` + `cluster_scores.json` + `model_scores.json` + `clustered_results.jsonl`")
+                     gr.Markdown("*The app will automatically detect which format you're using.*")
+
+                 if not state.BASE_RESULTS_DIR:
+                     results_dir_input = gr.Textbox(
+                         label="Results Directory Path",
+                         placeholder="/path/to/your/results/directory",
+                         info="Directory containing pipeline results (legacy or functional format)"
+                     )
+
+                 data_status = gr.Markdown("")
+                 models_info = gr.Markdown("", visible=False)
+
+                 # Model selection (will be updated after loading)
+                 selected_models = gr.CheckboxGroup(
+                     label="Select Models for Analysis",
+                     show_label=False,
+                     choices=["all"],  # Provide default to prevent errors
+                     value=[],
+                     info="Choose which models to include in comparisons",
+                     elem_id="selected-models"
+                 )
+                 # Consolidated Tag selection (hidden until data provides tags)
+                 selected_tags = gr.CheckboxGroup(
+                     label="Filter by Tags",
+                     show_label=False,
+                     choices=[],
+                     value=[],
+                     info="Filter clusters/examples/plots by tags (derived from metadata)",
+                     visible=False,
+                 )
+
+             # Main content area with reduced margins
+             with gr.Column(scale=6, elem_classes=["main-content"]):
+                 with gr.Tabs(selected=1) as main_tabs:
+
+                     # Tab 0: Run Pipeline (conditionally enabled)
+                     if ENABLE_RUN_PIPELINE_TAB:
+                         from .run_pipeline_tab import create_run_pipeline_tab
+                         with gr.TabItem("🚀 Run Pipeline", id=0) as pipeline_tab:
+                             pipeline_components = create_run_pipeline_tab()
+                             # Store pipeline components for later event handler setup
+
+                     # Tab 1: Overview
+                     with gr.TabItem("📊 Overview", id=1) as overview_tab:
+                         # Accordion for Filter Controls
+                         with gr.Accordion("Filter Controls", open=False, visible=True) as filter_controls_acc:
+                             with gr.Row():
+                                 min_cluster_size = gr.Slider(
+                                     label="Minimum Cluster Size",
+                                     minimum=1, maximum=50, value=5, step=1,
+                                     # info="Hide clusters with fewer than this many examples"
+                                 )
+                                 score_significant_only = gr.Checkbox(
+                                     label="Show Only Frequency Significant Clusters",
+                                     value=False,
+                                     info="Only show clusters where the distinctiveness score is statistically significant"
+                                 )
+                                 quality_significant_only = gr.Checkbox(
+                                     label="Show Only Quality Significant Clusters",
+                                     value=False,
+                                     info="Only show clusters where the quality score is statistically significant"
+                                 )
+
+                             with gr.Row():
+                                 sort_by = gr.Dropdown(
+                                     label="Sort Clusters By",
+                                     choices=[
+                                         ("Relative Frequency (Descending)", "salience_desc"),
+                                         ("Relative Frequency (Ascending)", "salience_asc"),
+                                         ("Quality (Ascending)", "quality_asc"),
+                                         ("Quality (Descending)", "quality_desc"),
+                                         ("Frequency (Descending)", "frequency_desc"),
+                                         ("Frequency (Ascending)", "frequency_asc")
+                                     ],
+                                     value="salience_desc",
+                                     # info="How to sort clusters within each model card"
+                                 )
+                                 top_n_overview = gr.Slider(
+                                     label="Top N Clusters per Model",
+                                     minimum=1, maximum=10, value=3, step=1,
+                                     # info="Number of top clusters to show per model"
+                                 )
+
+                         # Accordion for Quality Plot
+                         with gr.Accordion("Benchmark Metrics", open=True, visible=True) as metrics_acc:
+                             with gr.Row():
+                                 quality_metric_overview = gr.Dropdown(
+                                     label="Quality Metric",
+                                     show_label=False,
+                                     choices=["helpfulness", "accuracy", "harmlessness", "honesty"],
+                                     value="accuracy",
+                                     # info="Select quality metric to display"
+                                 )
+                                 quality_view_type = gr.Dropdown(
+                                     label="View Type",
+                                     show_label=False,
+                                     choices=["Plot", "Table"],
+                                     value="Table",
+                                     # info="Choose between plot or table view"
+                                 )
+
+                             quality_plot_display = gr.Plot(
+                                 label="Model Quality Comparison",
+                                 show_label=False,
+                                 elem_id="quality-plot",
+                                 visible=True
+                             )
+
+                             quality_table_display = gr.HTML(
+                                 label="Model Quality Table",
+                                 visible=True,
+                                 value="<div style='color:#666;padding:8px;'>Switch view to Table or Plot as desired.</div>"
+                             )
+                         overview_display = gr.HTML(
+                             label="Model Overview",
+                             value="<p style='color: #666; padding: 20px;'>Select your experiment to begin.</p>",
+                             visible=True
+                         )
+
+                         refresh_overview_btn = gr.Button("Refresh Overview", visible=True)
+
+                     # Tab 2: View Clusters
+                     with gr.TabItem("📋 View Clusters", id=2) as clusters_tab:
+                         # gr.Markdown("### Interactive Cluster Viewer")
+
+                         with gr.Row():
+                             search_clusters = gr.Textbox(
+                                 label="Search Properties",
+                                 show_label=False,
+                                 placeholder="Search in property clusters...",
+                                 info="Search for specific terms in property clusters"
+                             )
+
+                         clusters_display = gr.HTML(
+                             label="Interactive Cluster Viewer",
+                             value="<p style='color: #666; padding: 20px;'>Load data and select models to view clusters</p>"
+                         )
+
+                         refresh_clusters_btn = gr.Button("Refresh Clusters")
+
+                     # Tab 3: View Examples
+                     with gr.TabItem("🔍 View Examples", id=3) as examples_tab:
+                         # gr.Markdown("### Individual Example Viewer")
+                         # gr.Markdown("Explore individual examples with full prompts, model responses, and property information. Click on examples to expand and view full details.")
+                         with gr.Row():
+                             search_examples = gr.Textbox(
+                                 label="Search Properties",
+                                 show_label=False,
+                                 placeholder="Search clusters or property descriptions...",
+                                 info="Search for specific terms in cluster names or property descriptions to filter examples"
+                             )
+
+                         with gr.Accordion("Search & Filter Options", open=False):
+
+                             with gr.Row():
+                                 with gr.Column(scale=1):
+                                     example_prompt_dropdown = gr.Dropdown(
+                                         label="Select Prompt",
+                                         show_label=False,
+                                         choices=["All Prompts"],
+                                         value="All Prompts",
+                                         info="Choose a specific prompt or 'All Prompts'"
+                                     )
+                                 with gr.Column(scale=1):
+                                     example_model_dropdown = gr.Dropdown(
+                                         label="Select Model",
+                                         show_label=False,
+                                         choices=["All Models"],
+                                         value="All Models",
+                                         info="Choose a specific model or 'All Models'"
+                                     )
+                                 with gr.Column(scale=1):
+                                     example_property_dropdown = gr.Dropdown(
+                                         label="Select Cluster",
+                                         show_label=False,
+                                         choices=["All Clusters"],
+                                         value="All Clusters",
+                                         info="Choose a specific cluster or 'All Clusters'"
+                                     )
+                                 # Tags are consolidated in the sidebar
+
+                             with gr.Row():
+                                 max_examples_slider = gr.Slider(
+                                     label="Max Examples",
+                                     show_label=False,
+                                     minimum=1, maximum=20, value=5, step=1,
+                                     info="Maximum number of examples to display"
+                                 )
+                                 use_accordion_checkbox = gr.Checkbox(
+                                     label="Use Accordion for System/Info Messages",
+                                     value=True,
+                                     info="Group system and info messages in collapsible sections"
+                                 )
+                                 pretty_print_checkbox = gr.Checkbox(
+                                     label="Pretty-print dictionaries",
+                                     value=False,
+                                     info="Format embedded dictionaries for readability"
+                                 )
+                                 show_unexpected_behavior_checkbox = gr.Checkbox(
+                                     label="Show Unexpected Behavior Only",
+                                     value=False,
+                                     info="Filter to show only examples with unexpected behavior"
+                                 )
+                             view_examples_btn = gr.Button("View Examples", variant="primary")
+
+                         examples_display = gr.HTML(
+                             label="Examples",
+                             value="<p style='color: #666; padding: 20px;'>Load data and select filters to view examples</p>",
+                             elem_id="examples-container",
+                         )
+
+                     # Tab 4: Plots
+                     with gr.TabItem("📊 Plots", id=4) as plots_tab:
+                         plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown, cluster_selector = create_plots_tab()
+                         # Internal state to carry a valid metric during chained updates
+                         quality_metric_state = gr.State(value=None)
+
+         # Define helper functions for event handlers
+         def show_overview_controls():
+             return (
+                 gr.update(visible=True),  # filter_controls_acc
+                 gr.update(visible=True),  # metrics_acc
+                 gr.update(visible=True),  # refresh_overview_btn
+             )
+
+         def compute_plots_quality_metric(plot_type: str, dropdown_value: str | None):
+             # Ensure we always pass a valid metric to the plot function during chained updates
+             if plot_type != "quality":
+                 return None
+             metrics = get_available_quality_metrics()
+             if not metrics:
+                 return None
+             if dropdown_value in metrics:
+                 return dropdown_value
+             return metrics[0]
+
+         def update_quality_metric_dropdown():
+             available_metrics = get_available_model_quality_metrics()
+             # Ensure value is valid for the updated choices
+             return gr.update(choices=available_metrics, value=(available_metrics[0] if available_metrics else None))
+
+         def update_quality_plot(selected_models, quality_metric):
+             return create_model_quality_plot(selected_models, quality_metric)
+
+         def _placeholder_plot(text: str = "Switch to the Plot view to see a chart"):
+             fig = go.Figure()
+             fig.update_layout(
+                 xaxis=dict(visible=False),
+                 yaxis=dict(visible=False),
+                 annotations=[dict(text=text, x=0.5, y=0.5, showarrow=False, xref="paper", yref="paper")],
+                 height=320,
+                 margin=dict(l=20, r=20, t=20, b=20)
+             )
+             return fig
+
+         def update_quality_display(selected_models, quality_metric, view_type):
+             # Hide the non-selected view to avoid showing placeholders
+             if view_type == "Plot":
+                 plot_val = create_model_quality_plot(selected_models, quality_metric) or _placeholder_plot("No data available for selected models")
+                 return (
+                     gr.update(value=plot_val, visible=True),
+                     gr.update(visible=False),
+                 )
+             else:  # Table
+                 table_val = create_model_quality_table(selected_models, quality_metric)
+                 return (
+                     gr.update(visible=False),
+                     gr.update(value=table_val, visible=True),
+                 )
+
+         def update_experiment_badge():
+             return _render_badge_html()
+
+         def safe_update_quality_display(selected_models, quality_metric, view_type):
+             # Simplified: always update directly
+             return update_quality_display(selected_models, quality_metric, view_type)
+
+         def update_overview_content_only(selected_models, top_n, score_sig, quality_sig, sort_by_val, min_cluster_sz, selected_tags_sidebar):
+             """Update only the overview model cards content, without affecting UI state or controls."""
+             if not app_state.get("metrics"):
+                 return "<p style='color: #666; padding: 20px;'>Please load data first.</p>"
+
+             # Just build and return the overview HTML
+             overview_html = create_overview(
+                 selected_models,
+                 top_n,
+                 score_sig,
+                 quality_sig,
+                 sort_by_val,
+                 min_cluster_sz,
+                 selected_tags=selected_tags_sidebar,
+             )
+             return overview_html
+
+         def update_sidebar_tags(selected_models_current: Optional[List[str]] = None):
+             # Populate sidebar tag checkboxes from clustered_df (respect selected models if provided)
+             if app_state.get("clustered_df") is None:
+                 return gr.update(choices=[], value=[], visible=False)
+             df = app_state["clustered_df"]
+             if selected_models_current:
+                 concrete = [m for m in selected_models_current if m != "all"]
+                 if concrete:
+                     df = df[df["model"].isin(concrete)]
+             choices = get_unique_values_for_dropdowns(df)
+             tags = choices.get("tags", []) or []
+             # Default select all tags (no filter)
+             return gr.update(choices=tags, value=tags, visible=bool(tags))
+
+
+         def create_overview_page(selected_models,
+                                  top_n,
+                                  score_sig,
+                                  quality_sig,
+                                  sort_by_val,
+                                  min_cluster_sz,
+                                  quality_metric,
+                                  view_type,
+                                  selected_tags_sidebar,
+                                  progress: gr.Progress = None):
+             # Simplified: no loading gate or build flag
+             if not app_state.get("metrics"):
+                 landing_html = "<p style='color: #666; padding: 20px;'>Select your experiment to begin.</p>"
+                 # Respect current view type: show only the chosen view
+                 if view_type == "Plot":
+                     return (
+                         gr.update(),
+                         gr.update(),
+                         gr.update(),
+                         gr.update(value=_placeholder_plot("Load data to view model quality."), visible=True),
+                         gr.update(visible=False),
+                         gr.update(value=landing_html),
+                     )
+                 else:
+                     return (
+                         gr.update(),
+                         gr.update(),
+                         gr.update(),
+                         gr.update(visible=False),
+                         gr.update(value="<div style='color:#666;padding:8px;'>Load data to view the quality table.</div>", visible=True),
+                         gr.update(value=landing_html),
+                     )
+
+             # Pre-compute ALL content before making any UI updates to ensure simultaneous display
+             if progress:
+                 progress(0.1, "Preparing benchmark metrics...")
+
+             # Prepare quality display; hide the non-selected view
+             if view_type == "Plot":
+                 plot_val = create_model_quality_plot(selected_models, quality_metric) or _placeholder_plot("No data available for selected models")
+                 table_val = None
+             else:
+                 table_val = create_model_quality_table(selected_models, quality_metric)
+                 plot_val = None
+
+             if progress:
+                 progress(0.5, "Building model overview cards...")
+
+             # Build overview cards
+             overview_html = create_overview(
+                 selected_models,
+                 top_n,
+                 score_sig,
+                 quality_sig,
+                 sort_by_val,
+                 min_cluster_sz,
+                 selected_tags=selected_tags_sidebar,
+             )
+
+             if progress:
+                 progress(0.9, "Finalizing display...")
+
+             # Do not toggle control visibility to avoid layout flicker
+             filter_controls_update = gr.update()
+             metrics_controls_update = gr.update()
+             refresh_btn_update = gr.update()
+
+             if progress:
+                 progress(1.0, "Overview ready")
+
+             return (
+                 filter_controls_update,
+                 metrics_controls_update,
+                 refresh_btn_update,
+                 (gr.update(value=plot_val, visible=True) if view_type == "Plot" else gr.update(visible=False)),
+                 (gr.update(value=table_val, visible=True) if view_type == "Table" else gr.update(visible=False)),
+                 gr.update(value=overview_html),
+             )
+
+
+         # Enhanced pipeline handler with tab switching and dropdown refresh
+         def enhanced_pipeline_handler(*args):
+             """Enhanced pipeline handler with tab switching and dropdown refresh."""
+             from .run_pipeline_tab import run_pipeline_handler
+             from .load_data_tab import get_experiment_choices
+
+             # Call the original pipeline handler
+             status_html, results_preview_html = run_pipeline_handler(*args)
+
+             # Check if pipeline completed successfully
+             pipeline_success = "<!-- SUCCESS -->" in status_html
+
+             if pipeline_success:
+                 # Clean up the success indicator from HTML
+                 status_html = status_html.replace("<!-- SUCCESS -->", "")
+
+                 # Refresh experiment dropdown choices
+                 experiment_choices = get_experiment_choices()
+
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(selected=1),  # Switch to Overview tab
+                     gr.update(choices=experiment_choices, value=experiment_choices[1] if len(experiment_choices) > 1 else None) if experiment_choices else gr.update()
+                 )
+             else:
+                 # Pipeline failed or still running - no changes
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(),  # No tab change
+                     gr.update()  # No dropdown change
+                 )
+
+         # Enhanced labeling handler with tab switching and dropdown refresh
+         def enhanced_label_handler(*args):
+             from .run_pipeline_tab import run_label_pipeline_handler
+             from .load_data_tab import get_experiment_choices
+             status_html, results_preview_html = run_label_pipeline_handler(*args)
+             pipeline_success = "<!-- SUCCESS -->" in status_html
+             if pipeline_success:
+                 status_html = status_html.replace("<!-- SUCCESS -->", "")
+                 experiment_choices = get_experiment_choices()
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(selected=1),
+                     gr.update(choices=experiment_choices, value=experiment_choices[1] if len(experiment_choices) > 1 else None) if experiment_choices else gr.update()
+                 )
+             else:
+                 return (
+                     status_html,
+                     results_preview_html,
+                     gr.Tabs(),
+                     gr.update()
+                 )
+
+         # Event handlers
+         from . import state
+         if state.BASE_RESULTS_DIR:
+             # Use dropdown for experiment selection
+             if 'experiment_dropdown' in locals():
+                 (experiment_dropdown.change(
+                     fn=load_experiment_data,
+                     inputs=[experiment_dropdown],
+                     outputs=[data_status, models_info, selected_models]
+                 ).then(
+                     fn=update_experiment_badge,
+                     outputs=[current_experiment_badge]
+                 ).then(
+                     fn=update_example_dropdowns,
+                     inputs=[selected_models],
+                     outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
+                 ).then(
+                     fn=update_sidebar_tags,
+                     inputs=[selected_models],
+                     outputs=[selected_tags]
+                 ).then(
+                     fn=update_quality_metric_dropdown,
+                     outputs=[quality_metric_overview]
+                 ).then(
+                     fn=view_examples,
+                     inputs=[
+                         example_prompt_dropdown,
+                         example_model_dropdown,
+                         example_property_dropdown,
+                         max_examples_slider,
+                         use_accordion_checkbox,
+                         pretty_print_checkbox,
+                         search_examples,
+                         show_unexpected_behavior_checkbox,
+                         selected_models,
+                         selected_tags,
+                     ],
+                     outputs=[examples_display]
+                 ).then(
+                     fn=update_top_n_slider_maximum,
+                     outputs=[top_n_overview]
+                 ).then(
+                     fn=clear_search_bars,
+                     outputs=[search_clusters, search_examples]
+                 ).then(
+                     fn=view_clusters_interactive,
+                     inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
+                     outputs=[clusters_display]
+                 ).then(
+                     fn=create_overview_page,
+                     inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
+                     outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
+                 ).then(
+                     fn=update_cluster_selection,
+                     inputs=[selected_models, selected_tags],
+                     outputs=[cluster_selector]
+                 ).then(
+                     fn=update_quality_metric_visibility,
+                     inputs=[plot_type_dropdown],
+                     outputs=[quality_metric_dropdown]
+                 ).then(
+                     fn=compute_plots_quality_metric,
+                     inputs=[plot_type_dropdown, quality_metric_dropdown],
+                     outputs=[quality_metric_state]
+                 ).then(
+                     fn=create_plot_with_toggle,
+                     inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
+                     outputs=[plot_display, plot_info]
+                 ))
+         else:
+             # Use textbox for manual path entry
+             if 'results_dir_input' in locals():
+                 (results_dir_input.submit(
+                     fn=load_data,
+                     inputs=[results_dir_input],
+                     outputs=[data_status, models_info, selected_models]
+                 ).then(
+                     fn=update_experiment_badge,
+                     outputs=[current_experiment_badge]
+                 ).then(
+                     fn=update_example_dropdowns,
+                     inputs=[selected_models],
+                     outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
+                 ).then(
+                     fn=update_sidebar_tags,
+                     inputs=[selected_models],
+                     outputs=[selected_tags]
+                 ).then(
+                     fn=update_quality_metric_dropdown,
+                     outputs=[quality_metric_overview]
+                 ).then(
+                     fn=view_examples,
+                     inputs=[
+                         example_prompt_dropdown,
+                         example_model_dropdown,
+                         example_property_dropdown,
+                         max_examples_slider,
+                         use_accordion_checkbox,
+                         pretty_print_checkbox,
+                         search_examples,
+                         show_unexpected_behavior_checkbox,
+                         selected_models,
+                         selected_tags,
+                     ],
+                     outputs=[examples_display]
+                 ).then(
+                     fn=update_top_n_slider_maximum,
+                     outputs=[top_n_overview]
+                 ).then(
+                     fn=clear_search_bars,
+                     outputs=[search_clusters, search_examples]
+                 ).then(
+                     fn=view_clusters_interactive,
+                     inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
+                     outputs=[clusters_display]
+                 ).then(
+                     fn=create_overview_page,
+                     inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
+                     outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
+                 ).then(
+                     fn=update_cluster_selection,
+                     inputs=[selected_models, selected_tags],
+                     outputs=[cluster_selector]
+                 ).then(
+                     fn=update_quality_metric_visibility,
+                     inputs=[plot_type_dropdown],
+                     outputs=[quality_metric_dropdown]
+                 ).then(
+                     fn=compute_plots_quality_metric,
+                     inputs=[plot_type_dropdown, quality_metric_dropdown],
+                     outputs=[quality_metric_state]
+                 ).then(
+                     fn=create_plot_with_toggle,
+                     inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
+                     outputs=[plot_display, plot_info]
+                 ))
942
+
943
+ # Tab switching should not trigger any updates - content should persist
944
+
945
+ refresh_overview_btn.click(
946
+ fn=create_overview_page,
947
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
948
+ outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
949
+ )
950
+
951
+ # Help button show/hide
952
+ help_btn.click(
953
+ fn=lambda: gr.update(visible=True),
954
+ outputs=[help_panel]
955
+ )
956
+ help_close_btn.click(
957
+ fn=lambda: gr.update(visible=False),
958
+ outputs=[help_panel]
959
+ )
960
+
961
+ # Quality plot interactions
962
+ # Update quality display when controls change
963
+ quality_metric_overview.change(
964
+ fn=update_quality_display,
965
+ inputs=[selected_models, quality_metric_overview, quality_view_type],
966
+ outputs=[quality_plot_display, quality_table_display]
967
+ )
968
+
969
+ quality_view_type.change(
970
+ fn=update_quality_display,
971
+ inputs=[selected_models, quality_metric_overview, quality_view_type],
972
+ outputs=[quality_plot_display, quality_table_display]
973
+ )
974
+
975
+ # Update quality display when selected models change
976
+ selected_models.change(
977
+ fn=update_quality_display,
978
+ inputs=[selected_models, quality_metric_overview, quality_view_type],
979
+ outputs=[quality_plot_display, quality_table_display]
980
+ )
981
+
982
+ refresh_clusters_btn.click(
983
+ fn=view_clusters_interactive,
984
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
985
+ outputs=[clusters_display]
986
+ )
987
+
988
+ # View Examples handlers
989
+ view_examples_btn.click(
990
+ fn=view_examples,
991
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
992
+ outputs=[examples_display]
993
+ )
994
+
995
+ # Auto-refresh examples when dropdowns change
996
+ example_prompt_dropdown.change(
997
+ fn=view_examples,
998
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
999
+ outputs=[examples_display]
1000
+ )
1001
+
1002
+ example_model_dropdown.change(
1003
+ fn=view_examples,
1004
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1005
+ outputs=[examples_display]
1006
+ )
1007
+
1008
+ example_property_dropdown.change(
1009
+ fn=view_examples,
1010
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1011
+ outputs=[examples_display]
1012
+ )
1013
+
1014
+ # Removed per-tab tag dropdown; using sidebar tags
1015
+
1016
+ # Auto-refresh examples when search term changes
1017
+ search_examples.change(
1018
+ fn=view_examples,
1019
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1020
+ outputs=[examples_display]
1021
+ )
1022
+
1023
+ # Auto-refresh examples when unexpected behavior checkbox changes
1024
+ show_unexpected_behavior_checkbox.change(
1025
+ fn=view_examples,
1026
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1027
+ outputs=[examples_display]
1028
+ )
1029
+
1030
+
1031
+
1032
+ # (Search Examples tab removed – no search_btn handler required)
1033
+
1034
+ # Plots Tab Handlers
1035
+ show_ci_checkbox.change(
1036
+ fn=create_plot_with_toggle,
1037
+ inputs=[plot_type_dropdown, quality_metric_dropdown, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1038
+ outputs=[plot_display, plot_info]
1039
+ )
1040
+
1041
+ # Quality metric dropdown handlers (only for quality plots)
1042
+ quality_metric_dropdown.change(
1043
+ fn=create_plot_with_toggle,
1044
+ inputs=[plot_type_dropdown, quality_metric_dropdown, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1045
+ outputs=[plot_display, plot_info]
1046
+ )
1047
+
1048
+ # Cluster selector change updates the plot and mapping text
1049
+ cluster_selector.change(
1050
+ fn=create_plot_with_toggle,
1051
+ inputs=[plot_type_dropdown, quality_metric_dropdown, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1052
+ outputs=[plot_display, plot_info]
1053
+ )
1054
+
1055
+ # Update quality metric visibility and plot based on plot type
1056
+ plot_type_dropdown.change(
1057
+ fn=update_quality_metric_visibility,
1058
+ inputs=[plot_type_dropdown],
1059
+ outputs=[quality_metric_dropdown]
1060
+ ).then(
1061
+ fn=compute_plots_quality_metric,
1062
+ inputs=[plot_type_dropdown, quality_metric_dropdown],
1063
+ outputs=[quality_metric_state]
1064
+ ).then(
1065
+ fn=create_plot_with_toggle,
1066
+ inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1067
+ outputs=[plot_display, plot_info]
1068
+ )
1069
+
1070
+ # Remove duplicate Overview rebuild on model selection; quality plot and clusters still update below
1071
+
1072
+ # Auto-refresh on significance filter changes - only update model cards content
1073
+ score_significant_only.change(
1074
+ fn=update_overview_content_only,
1075
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1076
+ outputs=[overview_display]
1077
+ )
1078
+
1079
+ quality_significant_only.change(
1080
+ fn=update_overview_content_only,
1081
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1082
+ outputs=[overview_display]
1083
+ )
1084
+
1085
+ # Auto-refresh on sort dropdown change - only update model cards content
1086
+ sort_by.change(
1087
+ fn=update_overview_content_only,
1088
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1089
+ outputs=[overview_display]
1090
+ )
1091
+
1092
+ # Auto-refresh on top N change - only update model cards content
1093
+ top_n_overview.change(
1094
+ fn=update_overview_content_only,
1095
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1096
+ outputs=[overview_display]
1097
+ )
1098
+
1099
+ # Auto-refresh on minimum cluster size change - only update model cards content
1100
+ min_cluster_size.change(
1101
+ fn=update_overview_content_only,
1102
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1103
+ outputs=[overview_display]
1104
+ )
1105
+
1106
+ # Update overview content and clusters when selected models change
1107
+ selected_models.change(
1108
+ fn=update_overview_content_only,
1109
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, selected_tags],
1110
+ outputs=[overview_display]
1111
+ ).then(
1112
+ fn=view_clusters_interactive,
1113
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
1114
+ outputs=[clusters_display]
1115
+ ).then(
1116
+ fn=update_example_dropdowns,
1117
+ inputs=[selected_models],
1118
+ outputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown]
1119
+ ).then(
1120
+ fn=view_examples,
1121
+ inputs=[
1122
+ example_prompt_dropdown,
1123
+ example_model_dropdown,
1124
+ example_property_dropdown,
1125
+ max_examples_slider,
1126
+ use_accordion_checkbox,
1127
+ pretty_print_checkbox,
1128
+ search_examples,
1129
+ show_unexpected_behavior_checkbox,
1130
+ selected_models,
1131
+ selected_tags,
1132
+ ],
1133
+ outputs=[examples_display]
1134
+ ).then(
1135
+ fn=update_cluster_selection,
1136
+ inputs=[selected_models],
1137
+ outputs=[cluster_selector]
1138
+ ).then(
1139
+ fn=compute_plots_quality_metric,
1140
+ inputs=[plot_type_dropdown, quality_metric_dropdown],
1141
+ outputs=[quality_metric_state]
1142
+ ).then(
1143
+ fn=create_plot_with_toggle,
1144
+ inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1145
+ outputs=[plot_display, plot_info]
1146
+ )
1147
+
1148
+ # Auto-refresh clusters when search term changes (with debouncing)
1149
+ search_clusters.change(
1150
+ fn=view_clusters_interactive,
1151
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
1152
+ outputs=[clusters_display]
1153
+ )
1154
+
1155
+ # Sidebar tags: update clusters, overview, plots, and examples
1156
+ selected_tags.change(
1157
+ fn=view_clusters_interactive,
1158
+ inputs=[selected_models, gr.State("fine"), search_clusters, selected_tags],
1159
+ outputs=[clusters_display]
1160
+ ).then(
1161
+ fn=create_overview_page,
1162
+ inputs=[selected_models, top_n_overview, score_significant_only, quality_significant_only, sort_by, min_cluster_size, quality_metric_overview, quality_view_type, selected_tags],
1163
+ outputs=[filter_controls_acc, metrics_acc, refresh_overview_btn, quality_plot_display, quality_table_display, overview_display]
1164
+ ).then(
1165
+ fn=update_cluster_selection,
1166
+ inputs=[selected_models, selected_tags],
1167
+ outputs=[cluster_selector]
1168
+ ).then(
1169
+ fn=create_plot_with_toggle,
1170
+ inputs=[plot_type_dropdown, quality_metric_state, cluster_selector, show_ci_checkbox, selected_models, selected_tags],
1171
+ outputs=[plot_display, plot_info]
1172
+ ).then(
1173
+ fn=view_examples,
1174
+ inputs=[example_prompt_dropdown, example_model_dropdown, example_property_dropdown, max_examples_slider, use_accordion_checkbox, pretty_print_checkbox, search_examples, show_unexpected_behavior_checkbox, selected_models, selected_tags],
1175
+ outputs=[examples_display]
1176
+ )
1177
+
1178
+ # (No global header search)
1179
+
1180
+ # Wire up enhanced handlers for Explain and Label with tab switching
1181
+ if ENABLE_RUN_PIPELINE_TAB:
1182
+ pipeline_components["run_button_explain"].click(
1183
+ fn=enhanced_pipeline_handler,
1184
+ inputs=pipeline_components["inputs_explain"],
1185
+ outputs=[
1186
+ pipeline_components["status_display"],
1187
+ pipeline_components["results_preview"],
1188
+ main_tabs,
1189
+ experiment_dropdown if 'experiment_dropdown' in locals() else results_dir_input
1190
+ ],
1191
+ show_progress="full"
1192
+ )
1193
+ pipeline_components["run_button_label"].click(
1194
+ fn=enhanced_label_handler,
1195
+ inputs=pipeline_components["inputs_label"],
1196
+ outputs=[
1197
+ pipeline_components["status_display"],
1198
+ pipeline_components["results_preview"],
1199
+ main_tabs,
1200
+ experiment_dropdown if 'experiment_dropdown' in locals() else results_dir_input
1201
+ ],
1202
+ show_progress="full"
1203
+ )
1204
+
1205
+ return app
1206
+
1207
+
1208
+ def launch_app(results_dir: Optional[str] = None,
1209
+ share: bool = False,
1210
+ server_name: str = "127.0.0.1",
1211
+ server_port: int = 7860,
1212
+ **kwargs) -> None:
1213
+ """Launch the Gradio application.
1214
+
1215
+ Args:
1216
+ results_dir: Optional path to base results directory containing experiment subfolders
1217
+ share: Whether to create a public link
1218
+ server_name: Server address
1219
+ server_port: Server port
1220
+ **kwargs: Additional arguments for gr.Blocks.launch()
1221
+ """
1222
+ # Set the base results directory in state BEFORE creating the app
1223
+ from . import state
1224
+ if results_dir:
1225
+ state.BASE_RESULTS_DIR = results_dir
1226
+ print(f"πŸ“ Base results directory set to: {results_dir}")
1227
+
1228
+ # Check if it's a valid directory
1229
+ if not os.path.exists(results_dir):
1230
+ print(f"⚠️ Warning: Base results directory does not exist: {results_dir}")
1231
+ state.BASE_RESULTS_DIR = None
1232
+ else:
1233
+ # Scan for available experiments
1234
+ experiments = get_available_experiments(results_dir)
1235
+ print(f"πŸ” Found {len(experiments)} experiments: {experiments}")
1236
+
1237
+ app = create_app()
1238
+
1239
+ # Auto-load data if BASE_RESULTS_DIR is set - automatically load the most recent experiment
1240
+ if state.BASE_RESULTS_DIR and os.path.exists(state.BASE_RESULTS_DIR):
1241
+ experiments = get_available_experiments(state.BASE_RESULTS_DIR)
1242
+ if len(experiments) >= 1:
1243
+ # Auto-load the most recent experiment (first in the sorted list)
1244
+ most_recent_experiment = experiments[0]
1245
+ experiment_path = os.path.join(state.BASE_RESULTS_DIR, most_recent_experiment)
1246
+ try:
1247
+ clustered_df, model_stats, model_cluster_df, results_path = load_pipeline_results(experiment_path)
1248
+ app_state['clustered_df'] = clustered_df
1249
+ app_state['model_stats'] = model_stats
1250
+ app_state['metrics'] = model_stats # Ensure metrics is also populated
1251
+ app_state['model_cluster_df'] = model_cluster_df
1252
+ app_state['results_path'] = results_path
1253
+ available_models = get_all_models(model_stats)
1254
+ app_state['available_models'] = available_models
1255
+ app_state['current_results_dir'] = experiment_path
1256
+ print(f"βœ… Auto-loaded most recent experiment: {most_recent_experiment}")
1257
+ print(f"πŸ“‹ Available models: {available_models}")
1258
+ if len(experiments) > 1:
1259
+ print(f"πŸ“‹ Found {len(experiments)} experiments. Loaded the most recent: {most_recent_experiment}")
1260
+ except Exception as e:
1261
+ print(f"❌ Failed to auto-load data: {e}")
1262
+ else:
1263
+ print(f"πŸ“‹ No valid experiments found in {state.BASE_RESULTS_DIR}")
1264
+
1265
+ print(f"πŸš€ Launching Gradio app on {server_name}:{server_port}")
1266
+ print(f"Share mode: {share}")
1267
+ print(f"πŸ”§ Additional kwargs: {kwargs}")
1268
+
1269
+ try:
1270
+ app.launch(
1271
+ share=share,
1272
+ server_name=server_name,
1273
+ server_port=server_port,
1274
+ show_error=True, # Show detailed error messages
1275
+ quiet=False, # Show more verbose output
1276
+ **kwargs
1277
+ )
1278
+ except Exception as e:
1279
+ print(f"❌ Failed to launch on port {server_port}: {e}")
1280
+ print("πŸ”„ Trying alternative port configuration...")
1281
+
1282
+ # Try with a port range instead of port 0
1283
+ try:
1284
+ # Try ports in a reasonable range
1285
+ for alt_port in [8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8088, 8089]:
1286
+ try:
1287
+ print(f"πŸ”„ Trying port {alt_port}...")
1288
+ app.launch(
1289
+ share=share,
1290
+ server_name=server_name,
1291
+ server_port=alt_port,
1292
+ show_error=True,
1293
+ quiet=False,
1294
+ **kwargs
1295
+ )
1296
+ break # If successful, break out of the loop
1297
+ except Exception as port_error:
1298
+ if "Cannot find empty port" in str(port_error):
1299
+ print(f" Port {alt_port} is busy, trying next...")
1300
+ continue
1301
+ else:
1302
+ raise port_error
1303
+ else:
1304
+ # If we get here, all ports in our range were busy
1305
+ raise Exception("All attempted ports (8080-8089) are busy")
1306
+
1307
+ except Exception as e2:
1308
+ print(f"❌ Failed to launch with alternative ports: {e2}")
1309
+ print("πŸ’‘ Try specifying a different port manually:")
1310
+ print(f" python -m stringsight.dashboard.launcher --port 9000")
1311
+ print(f" python -m stringsight.dashboard.launcher --auto_port")
1312
+ raise e2
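For reference, the `launch_app` entry point above can also be driven directly from Python rather than through a launcher CLI. A minimal sketch, assuming the package is importable; the results path is a placeholder:

```python
# Minimal usage sketch for launch_app (the results path is a placeholder).
from stringsight.dashboard import launch_app

launch_app(
    results_dir="results/",   # hypothetical base dir containing experiment subfolders
    share=False,              # set True to create a public Gradio link
    server_name="127.0.0.1",
    server_port=7860,
)
```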
stringsight/dashboard/clusters_tab.py ADDED
@@ -0,0 +1,234 @@
1
+ """Helpers for the **View Clusters** tab – both the interactive HTML and
2
+ fallback dataframe view."""
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+ import ast
7
+
8
+ from .state import app_state
9
+ from .utils import (
10
+ search_clusters_by_text,
11
+ search_clusters_only,
12
+ create_interactive_cluster_viewer,
13
+ get_cluster_statistics,
14
+ format_cluster_dataframe,
15
+ extract_allowed_tag,
16
+ )
17
+
18
+ __all__ = ["view_clusters_interactive", "view_clusters_table"]
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # Interactive HTML view
23
+ # ---------------------------------------------------------------------------
24
+
25
+ def view_clusters_interactive(
26
+ selected_models: List[str],
27
+ cluster_level: str,
28
+ search_term: str = "",
29
+ selected_tags: List[str] | None = None,
30
+ ) -> str:
31
+ if app_state["clustered_df"] is None:
32
+ return (
33
+ "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
34
+ "using the 'Load Data' tab</p>"
35
+ )
36
+
37
+ df = app_state["clustered_df"].dropna(subset=["property_description"]).copy()
38
+
39
+ # Apply search filter first
40
+ if search_term and search_term.strip():
41
+ df = search_clusters_only(df, search_term.strip(), cluster_level)
42
+
43
+ # Optional tags filter – only keep rows whose meta resolves to an allowed tag in selected_tags
44
+ if selected_tags and len(selected_tags) > 0 and 'meta' in df.columns:
45
+ def _first_allowed_tag(obj):
46
+ return extract_allowed_tag(obj)
47
+
48
+ # Check if all meta are empty dicts (means no tags)
49
+ def _parse_try(obj):
50
+ if isinstance(obj, str):
51
+ try:
52
+ return ast.literal_eval(obj)
53
+ except Exception:
54
+ return obj
55
+ return obj
56
+
57
+ parsed_meta = df['meta'].apply(_parse_try)
58
+ non_null_parsed = [m for m in parsed_meta.tolist() if m is not None]
59
+ all_empty_dicts = (
60
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
61
+ )
62
+ if not all_empty_dicts:
63
+ allowed = set(map(str, selected_tags))
64
+ df = df[df['meta'].apply(_first_allowed_tag).astype(str).isin(allowed)]
65
+
66
+ # Build interactive viewer
67
+ cluster_html = create_interactive_cluster_viewer(df, selected_models, cluster_level)
68
+
69
+ # Statistics summary at the top
70
+ stats = get_cluster_statistics(df, selected_models)
71
+ if not stats:
72
+ return (
73
+ "<p style='color: #e74c3c; padding: 20px;'>❌ No cluster data available</p>"
74
+ )
75
+
76
+ # Get additional metrics from cluster_scores
77
+ cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})
78
+
79
+ # Calculate average quality scores and frequency
80
+ total_frequency = 0
81
+ quality_scores_list = []
82
+ metric_names = set()
83
+
84
+ for cluster_name, cluster_data in cluster_scores.items():
85
+ total_frequency += cluster_data.get("proportion", 0) * 100
86
+ quality_scores = cluster_data.get("quality", {})
87
+ if quality_scores:
88
+ quality_scores_list.extend(quality_scores.values())
89
+ metric_names.update(quality_scores.keys())
90
+
91
+ avg_quality = sum(quality_scores_list) / len(quality_scores_list) if quality_scores_list else 0
92
+ metrics_suffix = f" ({', '.join(sorted(metric_names))})" if metric_names else ""
93
+
94
+ stats_html = f"""
95
+ <div style="
96
+ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
97
+ color: white;
98
+ padding: 20px;
99
+ border-radius: 8px;
100
+ margin-bottom: 20px;
101
+ box-shadow: 0 4px 6px rgba(0,0,0,0.1);
102
+ ">
103
+ <h3 style="margin: 0 0 15px 0;">Cluster Statistics</h3>
104
+ <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); gap: 8px;">
105
+ <div>
106
+ <div style="font-size: 24px; font-weight: bold;">{stats['total_properties']:,}</div>
107
+ <div style="opacity: 0.9;">Total Properties</div>
108
+ </div>
109
+ <div>
110
+ <div style="font-size: 24px; font-weight: bold;">{stats['total_models']}</div>
111
+ <div style="opacity: 0.9;">Models</div>
112
+ </div>
113
+ """
114
+
115
+ if cluster_level == "fine" and "fine_clusters" in stats:
116
+ stats_html += f"""
117
+ <div>
118
+ <div style="font-size: 24px; font-weight: bold;">{stats['fine_clusters']}</div>
119
+ <div style="opacity: 0.9;">Fine Clusters</div>
120
+ </div>
121
+ <div>
122
+ <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_fine_cluster']:.1f}</div>
123
+ <div style="opacity: 0.9;">Avg Properties/Cluster</div>
124
+ </div>
125
+ """
126
+ elif cluster_level == "coarse" and "coarse_clusters" in stats:
127
+ stats_html += f"""
128
+ <div>
129
+ <div style="font-size: 24px; font-weight: bold;">{stats['coarse_clusters']}</div>
130
+ <div style="opacity: 0.9;">Coarse Clusters</div>
131
+ </div>
132
+ <div>
133
+ <div style="font-size: 24px; font-weight: bold;">{stats['avg_properties_per_coarse_cluster']:.1f}</div>
134
+ <div style="opacity: 0.9;">Avg Properties/Cluster</div>
135
+ </div>
136
+ """
137
+
138
+ stats_html += """
139
+ </div>
140
+ </div>
141
+ """
142
+
143
+ # Add a note if coarse clusters were requested but not available
144
+ if cluster_level == "coarse" and "coarse_clusters" not in stats and "fine_clusters" in stats:
145
+ stats_html += """
146
+ <div style="
147
+ background: #fff3cd;
148
+ border-left: 4px solid #ffc107;
149
+ padding: 10px 15px;
150
+ margin-bottom: 15px;
151
+ border-radius: 4px;
152
+ ">
153
+ ⚠️ <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
154
+ </div>
155
+ """
156
+
157
+ # Additional filter chips
158
+ filter_info = ""
159
+ if search_term and search_term.strip():
160
+ filter_info += f"""
161
+ <div style="
162
+ background: #e3f2fd;
163
+ border-left: 4px solid #2196f3;
164
+ padding: 10px 15px;
165
+ margin-bottom: 15px;
166
+ border-radius: 4px;
167
+ ">
168
+ 🔍 <strong>Search Filter:</strong> "{search_term}"
169
+ </div>
170
+ """
171
+
172
+ if selected_models:
173
+ filter_info += f"""
174
+ <div style="
175
+ background: #f3e5f5;
176
+ border-left: 4px solid #9c27b0;
177
+ padding: 10px 15px;
178
+ margin-bottom: 15px;
179
+ border-radius: 4px;
180
+ ">
181
+ 🎯 <strong>Selected Models:</strong> {', '.join(selected_models)}
182
+ </div>
183
+ """
184
+
185
+ if selected_tags and len(selected_tags) > 0:
186
+ filter_info += f"""
187
+ <div style="
188
+ background: #e8f5e9;
189
+ border-left: 4px solid #4caf50;
190
+ padding: 10px 15px;
191
+ margin-bottom: 15px;
192
+ border-radius: 4px;
193
+ ">
194
+ 🏷️ <strong>Tag Filter:</strong> {', '.join(selected_tags)}
195
+ </div>
196
+ """
197
+
198
+ return stats_html + filter_info + cluster_html
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Dataframe fallback view
203
+ # ---------------------------------------------------------------------------
204
+
205
+ def view_clusters_table(
206
+ selected_models: List[str],
207
+ cluster_level: str,
208
+ search_term: str = "",
209
+ ) -> pd.DataFrame:
210
+ if app_state["clustered_df"] is None:
211
+ return pd.DataFrame({"Message": ["Please load data first using the 'Load Data' tab"]})
212
+
213
+ df = app_state["clustered_df"].copy()
214
+
215
+ if search_term and search_term.strip():
216
+ df = search_clusters_only(df, search_term.strip(), cluster_level)
217
+
218
+ formatted_df = format_cluster_dataframe(df, selected_models, cluster_level)
219
+
220
+ if formatted_df.empty:
221
+ if search_term and search_term.strip():
222
+ return pd.DataFrame({"Message": [f"No results found for search term '{search_term}'. Try a different search term."]})
223
+ elif selected_models:
224
+ available_models = df["model"].unique().tolist() if "model" in df.columns else []
225
+ return pd.DataFrame({"Message": [
226
+ f"No data found for selected models: {', '.join(selected_models)}. "
227
+ f"Available models: {', '.join(available_models)}"
228
+ ]})
229
+ else:
230
+ return pd.DataFrame({"Message": [
231
+ "No data available. Please check your data files and try reloading."
232
+ ]})
233
+
234
+ return formatted_df
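As a usage note, `view_clusters_table` can be exercised outside the UI once `app_state` is populated. A sketch that points at the results bundled under this repo's `data/` directory; the model name and search term are hypothetical:

```python
# Sketch: load results into app_state, then query the fallback table view.
from stringsight.dashboard.state import app_state
from stringsight.dashboard.data_loader import load_pipeline_results
from stringsight.dashboard.clusters_tab import view_clusters_table

clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(
    "data/taubench_airline"
)
app_state["clustered_df"] = clustered_df
app_state["metrics"] = metrics

table = view_clusters_table(
    selected_models=["gpt-4o"],  # hypothetical model name
    cluster_level="fine",
    search_term="baggage",       # hypothetical search term
)
print(table.head())
```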
stringsight/dashboard/conversation_display.py ADDED
@@ -0,0 +1,674 @@
1
+ from __future__ import annotations
2
+
3
+ """Conversation display helpers for dashboard.
4
+
5
+ This module encapsulates everything related to:
6
+ • safely parsing model responses (lists / dicts / JSON strings)
7
+ • pretty-printing embedded dictionaries for readability
8
+ • converting multiple conversation formats to the OpenAI chat list format
9
+ • rendering that list as HTML (including accordion grouping + raw JSON viewer).
10
+
11
+ Moving this logic out of utils.py keeps the latter lean and focused on general
12
+ analytics utilities.
13
+ """
14
+
15
+ from typing import List, Dict, Any
16
+ import ast
17
+ import json
18
+ import html
19
+ import markdown
20
+ import re
21
+
22
+ __all__: List[str] = [
23
+ "convert_to_openai_format",
24
+ "display_openai_conversation_html",
25
+ "pretty_print_embedded_dicts",
26
+ ]
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Pretty-printing helpers
30
+ # ---------------------------------------------------------------------------
31
+
32
+ def _find_balanced_spans(text: str):
33
+ """Return (start, end) spans of balanced {...} or [...] regions in *text*."""
34
+ spans, stack = [], []
35
+ for i, ch in enumerate(text):
36
+ if ch in "{[":
37
+ stack.append((ch, i))
38
+ elif ch in "]}" and stack:
39
+ opener, start = stack.pop()
40
+ if (opener, ch) in {("{", "}"), ("[", "]")} and not stack:
41
+ spans.append((start, i + 1))
42
+ return spans
43
+
44
+
45
+ def _try_parse_slice(slice_: str):
46
+ """Attempt to parse *slice_* into a Python object; return None on failure."""
47
+ try:
48
+ return ast.literal_eval(slice_)
49
+ except Exception:
50
+ try:
51
+ return json.loads(slice_)
52
+ except Exception:
53
+ return None
54
+
55
+
56
+ def _find_code_spans(text: str) -> List[tuple]:
57
+ """Return spans for markdown code regions to be preserved as-is.
58
+
59
+ Includes:
60
+ - fenced code blocks delimited by ``` ... ```
61
+ - inline code segments delimited by `...`
62
+ """
63
+ spans: List[tuple] = []
64
+
65
+ # Fenced blocks ``` ... ``` (language spec allowed after opening fence)
66
+ idx = 0
67
+ while True:
68
+ start = text.find("```", idx)
69
+ if start == -1:
70
+ break
71
+ # Find the end fence
72
+ end = text.find("```", start + 3)
73
+ if end == -1:
74
+ # Unclosed fence: treat rest of string as code
75
+ spans.append((start, len(text)))
76
+ break
77
+ spans.append((start, end + 3))
78
+ idx = end + 3
79
+
80
+ # Inline code `...`
81
+ for m in re.finditer(r"`[^`]*`", text, flags=re.DOTALL):
82
+ spans.append((m.start(), m.end()))
83
+
84
+ # Sort and merge overlapping spans
85
+ spans.sort()
86
+ merged: List[tuple] = []
87
+ for s, e in spans:
88
+ if not merged or s > merged[-1][1]:
89
+ merged.append((s, e))
90
+ else:
91
+ merged[-1] = (merged[-1][0], max(merged[-1][1], e))
92
+ return merged
93
+
94
+
95
+ def _is_inside_any_span(start: int, end: int, spans: List[tuple]) -> bool:
96
+ for s, e in spans:
97
+ if start >= s and end <= e:
98
+ return True
99
+ return False
100
+
101
+
102
+ def pretty_print_embedded_dicts(text: str) -> str:
103
+ """Replace dicts, lists, or other complex structures with pretty-printed JSON, except inside code.
104
+
105
+ Dict-like regions that fall within markdown code spans (inline backticks
106
+ or fenced code blocks) are left untouched so code examples render verbatim.
107
+ """
108
+ if not text:
109
+ return text
110
+
111
+ code_spans = _find_code_spans(text)
112
+
113
+ def _to_json_safe(obj: Any):
114
+ """Recursively convert Python objects to JSON-serializable equivalents.
115
+
116
+ - Ellipsis (…) or ... becomes "..."
117
+ - Unsupported objects become str(obj)
118
+ """
119
+ if obj is ... or isinstance(obj, type(Ellipsis)):
120
+ return "..."
121
+ if isinstance(obj, dict):
122
+ return {str(k): _to_json_safe(v) for k, v in obj.items()}
123
+ if isinstance(obj, list):
124
+ return [_to_json_safe(v) for v in obj]
125
+ if isinstance(obj, tuple):
126
+ return [_to_json_safe(v) for v in obj]
127
+ if isinstance(obj, (str, int, float, bool)) or obj is None:
128
+ return obj
129
+ return str(obj)
130
+
131
+ def _is_complex_structure(obj):
132
+ """Check if object is worth pretty-printing (not just a simple value)"""
133
+ if isinstance(obj, dict):
134
+ return len(obj) > 0
135
+ elif isinstance(obj, list):
136
+ return len(obj) > 0 and any(isinstance(item, (dict, list)) for item in obj)
137
+ return False
138
+
139
+ def _format_with_preserved_spacing(json_str):
140
+ """Convert JSON string to HTML with preserved indentation and wrapping.
141
+
142
+ Use a <pre> block with white-space: pre-wrap so that long tokens can wrap
143
+ while preserving indentation and newlines without converting spaces to
144
+ non-breaking spaces (which prevents wrapping).
145
+ """
146
+ formatted = html.escape(json_str, quote=False)
147
+ return (
148
+ "<pre style=\"font-family: monospace; line-height: 1.4; font-size: 14px; "
149
+ "white-space: pre-wrap !important; word-break: break-word; overflow-wrap: anywhere; "
150
+ "background: #ffffff; padding: 10px; border-radius: 4px; margin: 0;\">"
151
+ f"{formatted}"
152
+ "</pre>"
153
+ )
154
+
155
+ new_parts, last_idx = [], 0
156
+ for start, end in _find_balanced_spans(text):
157
+ candidate = text[start:end]
158
+ parsed = _try_parse_slice(candidate)
159
+
160
+ if _is_complex_structure(parsed) and not _is_inside_any_span(start, end, code_spans):
161
+ new_parts.append(html.escape(text[last_idx:start], quote=False))
162
+ pretty = json.dumps(_to_json_safe(parsed), indent=2, ensure_ascii=False)
163
+ new_parts.append(_format_with_preserved_spacing(pretty))
164
+ last_idx = end
165
+ new_parts.append(html.escape(text[last_idx:], quote=False))
166
+ return "".join(new_parts)
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # Format conversion
170
+ # ---------------------------------------------------------------------------
171
+
172
+ def convert_to_openai_format(response_data: Any):
173
+ """Convert various response payloads into the OpenAI chat format list."""
174
+ if isinstance(response_data, list):
175
+ return response_data
176
+ if isinstance(response_data, dict):
177
+ # If it already looks like an OpenAI-style message, wrap it in a list
178
+ if "role" in response_data and "content" in response_data:
179
+ return [response_data]
180
+ # Otherwise treat dict as assistant content (preserve structure for tool_calls)
181
+ return [{"role": "assistant", "content": response_data}]
182
+ if isinstance(response_data, str):
183
+ # Try Python literal first (handles single quotes)
184
+ try:
185
+ parsed = ast.literal_eval(response_data)
186
+ if isinstance(parsed, list):
187
+ return parsed
188
+ except (ValueError, SyntaxError):
189
+ pass
190
+ # Try JSON
191
+ try:
192
+ parsed = json.loads(response_data)
193
+ if isinstance(parsed, list):
194
+ return parsed
195
+ except json.JSONDecodeError:
196
+ pass
197
+ # Fallback plain-text assistant message
198
+ return [{"role": "assistant", "content": response_data}]
199
+ # Fallback for any other type
200
+ return [{"role": "assistant", "content": str(response_data)}]
201
+
202
+ # ---------------------------------------------------------------------------
203
+ # HTML rendering
204
+ # ---------------------------------------------------------------------------
205
+
206
+ def _markdown(text: str, *, pretty_print_dicts: bool = True) -> str:
207
+ """Render markdown, optionally pretty-printing any embedded dicts."""
208
+ processed = pretty_print_embedded_dicts(text) if pretty_print_dicts else html.escape(text, quote=False)
209
+
210
+ # Configure extensions for proper code block handling
211
+ extensions = ["fenced_code"]
212
+ extension_configs = {}
213
+
214
+ try:
215
+ import pygments
216
+ extensions.append("codehilite")
217
+ extension_configs['codehilite'] = {
218
+ 'css_class': 'highlight',
219
+ 'use_pygments': True,
220
+ 'guess_lang': True,
221
+ 'linenums': False
222
+ }
223
+ except ImportError:
224
+ pass
225
+
226
+ # Convert newlines to <br> only outside of code blocks
227
+ # Process fenced code blocks first, then handle line breaks
228
+ result = markdown.markdown(processed, extensions=extensions, extension_configs=extension_configs)
229
+
230
+ # IMPORTANT: Avoid injecting <br> tags when lists are present, as this can
231
+ # introduce empty bullets or odd spacing in nested lists.
232
+ # "re" is already imported at module scope
233
+ if re.search(r'<(ul|ol)\b', result):
234
+ return result
235
+
236
+ # Otherwise, add line breaks for non-code content only
237
+ code_block_pattern = r'(<pre[^>]*>.*?</pre>|<code[^>]*>.*?</code>)'
238
+ parts = re.split(code_block_pattern, result, flags=re.DOTALL)
239
+
240
+ for i in range(0, len(parts), 2): # Process non-code parts only
241
+ if i < len(parts):
242
+ parts[i] = re.sub(r'(?<!\n)\n(?!\n)', '<br>\n', parts[i])
243
+
244
+ return ''.join(parts)
245
+
246
+
247
+ def display_openai_conversation_html(conversation_data: List[Dict[str, Any]], *, use_accordion: bool = True, pretty_print_dicts: bool = True, evidence: Any = None) -> str:
248
+ """Convert an OpenAI-style conversation list into styled HTML for Gradio."""
249
+ from .examples_helpers import annotate_text_with_evidence_placeholders, HIGHLIGHT_START, HIGHLIGHT_END
250
+ if not conversation_data:
251
+ return "<p>No conversation data available</p>"
252
+
253
+ # Collapsed raw JSON section for debugging
254
+ raw_json = json.dumps(conversation_data, indent=2, ensure_ascii=False)
255
+ html_out = f"""
256
+ <details style="margin: 8px 0;">
257
+ <summary style="cursor: pointer; font-weight: 600;">
258
+ Click to see raw response ({len(conversation_data)})
259
+ </summary>
260
+ <div style="padding: 8px 15px;">
261
+ <pre style="white-space: pre-wrap; word-wrap: break-word; overflow-wrap: anywhere; background: #ffffff; padding: 10px; border-radius: 4px;">{html.escape(raw_json, quote=False)}</pre>
262
+ </div>
263
+ </details>
264
+ """
265
+
266
+ role_colors = {
267
+ "system": "#ff6b6b",
268
+ "info": "#4ecdc4",
269
+ "assistant": "#45b7d1",
270
+ "tool": "#96ceb4",
271
+ "user": "#feca57",
272
+ }
273
+
274
+ def _maybe_annotate(content_str: str) -> str:
275
+ if evidence is None or not isinstance(content_str, str) or not content_str.strip():
276
+ return content_str
277
+ return annotate_text_with_evidence_placeholders(content_str, evidence)
278
+
279
+ def _replace_placeholders_with_mark(html_str: str) -> str:
280
+ if not html_str:
281
+ return html_str
282
+ return (
283
+ html_str
284
+ .replace(HIGHLIGHT_START, "<mark class=\"evidence-highlight\">")
285
+ .replace(HIGHLIGHT_END, "</mark>")
286
+ )
287
+
288
+ def _format_tool_calls(content: Dict[str, Any]) -> str:
289
+ """Format tool calls in a more readable way."""
290
+ if not isinstance(content, dict) or "tool_calls" not in content:
291
+ return f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
292
+
293
+ tool_calls = content["tool_calls"]
294
+ if not isinstance(tool_calls, list):
295
+ return f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
296
+
297
+ html_parts = []
298
+
299
+ for i, tool_call in enumerate(tool_calls, 1):
300
+ if not isinstance(tool_call, dict):
301
+ continue
302
+
303
+ # Extract tool call information
304
+ name = tool_call.get("name", "Unknown tool")
305
+ arguments = tool_call.get("arguments", "")
306
+ tool_id = tool_call.get("id", tool_call.get("tool_call_id", ""))
307
+ # Coerce call type to a safe uppercase string
308
+ raw_call_type = tool_call.get("type", "function")
309
+ call_type = str(raw_call_type or "function")
310
+
311
+ # Parse arguments if they're a JSON string
312
+ formatted_args = arguments
313
+ if isinstance(arguments, str) and arguments.strip():
314
+ try:
315
+ parsed_args = json.loads(arguments)
316
+ formatted_args = json.dumps(parsed_args, indent=2, ensure_ascii=False)
317
+ except json.JSONDecodeError:
318
+ formatted_args = arguments
319
+ elif isinstance(arguments, (dict, list, tuple, int, float, bool)) or arguments is None:
320
+ # Stringify any non-string argument type
321
+ try:
322
+ formatted_args = json.dumps(arguments, indent=2, ensure_ascii=False)
323
+ except Exception:
324
+ formatted_args = str(arguments)
325
+
326
+ # Format with preserved spacing for proper indentation
327
+ if formatted_args and isinstance(formatted_args, str) and ('\n' in formatted_args or ' ' in formatted_args):
328
+ escaped_args = html.escape(formatted_args, quote=False)
329
+ formatted_args = (
330
+ "<pre style=\"font-family: monospace; line-height: 1.4; font-size: 14px; "
331
+ "white-space: pre-wrap !important; word-break: break-word; overflow-wrap: anywhere; "
332
+ "background: #ffffff; padding: 10px; border-radius: 4px; margin: 0;\">"
333
+ f"{escaped_args}"
334
+ "</pre>"
335
+ )
336
+ else:
337
+ formatted_args = html.escape(str(formatted_args), quote=False)
338
+
339
+ # Create the tool call display
340
+ tool_html = f"""
341
+ <div style="border: 1px solid #ff7f00; border-radius: 8px; margin: 8px 0; padding: 12px; background: #fff8f0;">
342
+ <div style="display: flex; align-items: center; margin-bottom: 8px;">
343
+ <span style="background: #ff7f00; color: white; padding: 2px 6px; border-radius: 4px; font-size: 11px; font-weight: bold; margin-right: 8px;">
344
+ {call_type.upper()}
345
+ </span>
346
+ <span style="font-weight: 600; color: #d2691e; font-size: 14px;">{html.escape(name)}</span>
347
+ {f'<span style="margin-left: auto; font-size: 11px; color: #666;">ID: {html.escape(tool_id)}</span>' if tool_id else ''}
348
+ </div>
349
+
350
+ {f'''<div style="margin-top: 8px;">
351
+ <div style="font-weight: 600; color: #666; margin-bottom: 4px; font-size: 12px;">Arguments:</div>
352
+ <div style="font-size: 12px; line-height: 1.4; color: #333;">{formatted_args}</div>
353
+ </div>''' if formatted_args else ''}
354
+ </div>
355
+ """
356
+
357
+ html_parts.append(tool_html)
358
+
359
+ if len(tool_calls) > 1:
360
+ return f"""
361
+ <div style="border-left: 3px solid #ff7f00; padding-left: 12px; margin: 8px 0;">
362
+ <div style="font-weight: 600; color: #d2691e; margin-bottom: 8px; font-size: 14px;">
363
+ {len(tool_calls)} tool call{'s' if len(tool_calls) != 1 else ''}:
364
+ </div>
365
+ {''.join(html_parts)}
366
+ </div>
367
+ """
368
+ else:
369
+ return ''.join(html_parts)
370
+
371
+ def _format_msg(role: str, content: Any) -> str:
372
+ # Check if this is a tool call by examining the content
373
+ is_tool_call = False
374
+ if isinstance(content, dict) and "tool_calls" in content:
375
+ is_tool_call = True
376
+
377
+ if isinstance(content, dict) or (isinstance(content, list) and content and all(isinstance(d, dict) for d in content)):
378
+ if is_tool_call:
379
+ # Render assistant text (if provided) plus styled tool calls
380
+ text_html = ""
381
+ if isinstance(content, dict) and isinstance(content.get("text"), str) and content.get("text").strip():
382
+ annotated = _maybe_annotate(content.get("text", ""))
383
+ text_html = _markdown(annotated, pretty_print_dicts=pretty_print_dicts)
384
+ text_html = _replace_placeholders_with_mark(text_html)
385
+ content_html = text_html + _format_tool_calls(content)
386
+ elif pretty_print_dicts:
387
+ def _to_json_safe_inline(obj: Any):
388
+ if obj is ... or isinstance(obj, type(Ellipsis)):
389
+ return "..."
390
+ if isinstance(obj, dict):
391
+ return {str(k): _to_json_safe_inline(v) for k, v in obj.items()}
392
+ if isinstance(obj, list):
393
+ return [_to_json_safe_inline(v) for v in obj]
394
+ if isinstance(obj, tuple):
395
+ return [_to_json_safe_inline(v) for v in obj]
396
+ if isinstance(obj, (str, int, float, bool)) or obj is None:
397
+ return obj
398
+ return str(obj)
399
+
400
+ safe_json = html.escape(json.dumps(_to_json_safe_inline(content), indent=2, ensure_ascii=False), quote=False)
401
+ content_html = (
402
+ f"<pre style='background: #ffffff; padding: 10px; border-radius: 4px; "
403
+ f"white-space: pre-wrap !important; word-break: break-word; overflow-wrap: anywhere;'>{safe_json}</pre>"
404
+ )
405
+ else:
406
+ content_html = f"<code>{html.escape(json.dumps(content, ensure_ascii=False))}</code>"
407
+ elif isinstance(content, str):
408
+ # Insert highlight placeholders before markdown so offsets make sense in plain text
409
+ annotated = _maybe_annotate(content)
410
+ content_html = _markdown(annotated, pretty_print_dicts=pretty_print_dicts)
411
+ # Convert placeholders to <mark> after markdown
412
+ content_html = _replace_placeholders_with_mark(content_html)
413
+ elif content is None:
414
+ content_html = "<em>(No content)</em>"
415
+ else:
416
+ content_html = str(content)
417
+
418
+ # Determine role display text and color
419
+ if is_tool_call:
420
+ # Keep assistant styling; tool blocks are styled within
421
+ role_display = "assistant"
422
+ color = role_colors.get("assistant", "#95a5a6")
423
+ else:
424
+ role_display = role
425
+ color = role_colors.get(role.lower(), "#95a5a6")
426
+
427
+ return (
428
+ f"<div style='border-left: 4px solid {color}; margin: 8px 0; background-color: #ffffff; padding: 12px; border-radius: 0 8px 8px 0;'>"
429
+ f"<div style='font-weight: 600; color: {color}; margin-bottom: 8px; text-transform: capitalize; font-size: 16px;'>{role_display}</div>"
430
+ f"<div style='color: #333; line-height: 1.6; font-family: inherit; font-size: 15px;'>{content_html}</div>"
431
+ "</div>"
432
+ )
433
+
434
+ if use_accordion:
435
+ system_msgs, info_msgs, other_msgs = [], [], []
436
+ for m in conversation_data:
437
+ if not isinstance(m, dict):
438
+ continue
439
+ role = m.get("role", "unknown").lower()
440
+ content = m.get("content", "")
441
+ if isinstance(content, dict) and "text" in content and "tool_calls" not in content:
442
+ content = content["text"]
443
+ if role == "system":
444
+ system_msgs.append((role, content))
445
+ elif role == "info":
446
+ info_msgs.append((role, content))
447
+ else:
448
+ other_msgs.append((role, content))
449
+
450
+ def _accordion(title: str, items: List):
451
+ if not items:
452
+ return ""
453
+ inner = "".join(_format_msg(r, c) for r, c in items)
454
+ return (
455
+ f"<details style='margin: 8px 0;'>"
456
+ f"<summary style='cursor: pointer; font-weight: 600;'>"
457
+ f"{html.escape(title)} ({len(items)})" # e.g. "Click to see system messages (3)"
458
+ f"</summary>"
459
+ f"<div style='padding: 8px 15px;'>{inner}</div>"
460
+ "</details>"
461
+ )
462
+
463
+ html_out += _accordion("Click to see system messages", system_msgs)
464
+ html_out += _accordion("Click to see info messages", info_msgs)
465
+ for r, c in other_msgs:
466
+ html_out += _format_msg(r, c)
467
+ else:
468
+ # No accordion: just render everything
469
+ for m in conversation_data:
470
+ if not isinstance(m, dict):
471
+ continue
472
+ role = m.get("role", "unknown").lower()
473
+ content = m.get("content", "")
474
+ if isinstance(content, dict) and "text" in content and "tool_calls" not in content:
475
+ content = content["text"]
476
+ html_out += _format_msg(role, content)
477
+
478
+ # CSS for proper code block styling and summary hover effects
479
+ css_styles = """
480
+ <style>
481
+ .evidence-highlight { background: #ffff8b; padding: 0 2px; }
482
+ :root {
483
+ /* Code block color palette - GitHub Light inspired */
484
+ --code-bg: transparent; /* make JSON/code wrapper background transparent */
485
+ --code-text: #24292f;
486
+ --code-comment: #6a737d;
487
+ --code-keyword: #d73a49;
488
+ --code-string: #032f62;
489
+ --code-number: #005cc5;
490
+ --code-operator: #24292f;
491
+ --code-function: #6f42c1;
492
+ --code-border: #d0d7de;
493
+
494
+ /* Inline code colors - same light theme */
495
+ --inline-code-bg: #f3f4f6;
496
+ --inline-code-text: #24292f;
497
+ --inline-code-border: #d1d5db;
498
+
499
+ /* Code block structure */
500
+ --code-border-radius: 8px;
501
+ --code-padding: 16px;
502
+ --code-font-size: 14px;
503
+ --code-line-height: 1.5;
504
+ --code-font-family: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', 'SF Mono', Consolas, 'Liberation Mono', Menlo, Courier, monospace;
505
+ }
506
+
507
+ /* Base code styling */
508
+ pre, code {
509
+ font-family: var(--code-font-family) !important;
510
+ font-size: var(--code-font-size) !important;
511
+ line-height: var(--code-line-height) !important;
512
+ font-variant-ligatures: normal !important;
513
+ -webkit-font-smoothing: antialiased !important;
514
+ -moz-osx-font-smoothing: grayscale !important;
515
+ }
516
+
517
+ /* Fenced code blocks - light theme */
518
+ .highlight, .codehilite, pre.highlight, pre.codehilite,
519
+ .language-python, .language-text, .language-bash {
520
+ background: var(--code-bg) !important;
521
+ color: var(--code-text) !important;
522
+ border: 1px solid var(--code-border) !important;
523
+ border-radius: var(--code-border-radius) !important;
524
+ padding: var(--code-padding) !important;
525
+ margin: 12px 0 !important;
526
+ overflow-x: auto !important;
527
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05) !important;
528
+ position: relative !important;
529
+ white-space: pre !important;
530
+ display: block !important;
531
+ }
532
+
533
+ .highlight pre, .codehilite pre {
534
+ background: transparent !important;
535
+ color: inherit !important;
536
+ margin: 0 !important;
537
+ padding: 0 !important;
538
+ border: none !important;
539
+ border-radius: 0 !important;
540
+ overflow: visible !important;
541
+ white-space: pre !important;
542
+ display: block !important;
543
+ }
544
+
545
+ /* Ensure code blocks preserve formatting */
546
+ .highlight code, .codehilite code {
547
+ white-space: pre !important;
548
+ display: block !important;
549
+ padding: 0 !important;
550
+ margin: 0 !important;
551
+ background: transparent !important;
552
+ border: none !important;
553
+ font-size: inherit !important;
554
+ line-height: inherit !important;
555
+ }
556
+
557
+ /* Add a language label to fenced blocks (hardcoded to 'python' for now) */
558
+ .highlight::before, .codehilite::before {
559
+ content: 'python';
560
+ position: absolute;
561
+ top: 8px;
562
+ right: 12px;
563
+ background: rgba(0, 0, 0, 0.05);
564
+ color: #586069;
565
+ padding: 2px 8px;
566
+ border-radius: 4px;
567
+ font-size: 11px;
568
+ font-weight: 500;
569
+ text-transform: uppercase;
570
+ letter-spacing: 0.5px;
571
+ }
572
+
573
+ /* Syntax highlighting for Python - Light theme */
574
+ .highlight .k, .codehilite .k, /* keywords */
575
+ .highlight .kn, .codehilite .kn, /* keyword.namespace */
576
+ .highlight .kp, .codehilite .kp, /* keyword.pseudo */
577
+ .highlight .kr, .codehilite .kr, /* keyword.reserved */
578
+ .highlight .kt, .codehilite .kt /* keyword.type */
579
+ {
580
+ color: var(--code-keyword) !important;
581
+ font-weight: 600 !important;
582
+ }
583
+
584
+ .highlight .s, .codehilite .s, /* strings */
585
+ .highlight .s1, .codehilite .s1, /* string.single */
586
+ .highlight .s2, .codehilite .s2, /* string.double */
587
+ .highlight .se, .codehilite .se /* string.escape */
588
+ {
589
+ color: var(--code-string) !important;
590
+ }
591
+
592
+ .highlight .c, .codehilite .c, /* comments */
593
+ .highlight .c1, .codehilite .c1, /* comment.single */
594
+ .highlight .cm, .codehilite .cm /* comment.multiline */
595
+ {
596
+ color: var(--code-comment) !important;
597
+ font-style: italic !important;
598
+ }
599
+
600
+ .highlight .m, .codehilite .m, /* numbers */
601
+ .highlight .mi, .codehilite .mi, /* number.integer */
602
+ .highlight .mf, .codehilite .mf, /* number.float */
603
+ .highlight .mo, .codehilite .mo /* number.octal */
604
+ {
605
+ color: var(--code-number) !important;
606
+ font-weight: 600 !important;
607
+ }
608
+
609
+ .highlight .nf, .codehilite .nf, /* function names */
610
+ .highlight .fm, .codehilite .fm /* function.magic */
611
+ {
612
+ color: var(--code-function) !important;
613
+ font-weight: 600 !important;
614
+ }
615
+
616
+ .highlight .o, .codehilite .o, /* operators */
617
+ .highlight .ow, .codehilite .ow /* operator.word */
618
+ {
619
+ color: var(--code-operator) !important;
620
+ }
621
+
622
+ /* Inline code - light theme */
623
+ p code, li code, div code, span code,
624
+ h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
625
+ background: var(--inline-code-bg) !important;
626
+ color: var(--inline-code-text) !important;
627
+ border: 1px solid var(--inline-code-border) !important;
628
+ padding: 2px 6px !important;
629
+ border-radius: 4px !important;
630
+ font-size: 0.9em !important;
631
+ font-weight: 600 !important;
632
+ white-space: nowrap !important;
633
+ box-shadow: none !important;
634
+ display: inline !important;
635
+ }
636
+
637
+ /* Code blocks inside paragraphs should not be treated as inline */
638
+ p pre, li pre, div pre {
639
+ background: var(--code-bg) !important;
640
+ color: var(--code-text) !important;
641
+ border: 1px solid var(--code-border) !important;
642
+ border-radius: var(--code-border-radius) !important;
643
+ padding: var(--code-padding) !important;
644
+ margin: 8px 0 !important;
645
+ white-space: pre !important;
646
+ overflow-x: auto !important;
647
+ display: block !important;
648
+ }
649
+
650
+ /* Scrollbar styling for code blocks - light theme */
651
+ .highlight::-webkit-scrollbar, .codehilite::-webkit-scrollbar,
652
+ pre::-webkit-scrollbar {
653
+ height: 8px !important;
654
+ background: #f1f3f4 !important;
655
+ border-radius: 4px !important;
656
+ }
657
+
658
+ .highlight::-webkit-scrollbar-thumb, .codehilite::-webkit-scrollbar-thumb,
659
+ pre::-webkit-scrollbar-thumb {
660
+ background: #c1c8cd !important;
661
+ border-radius: 4px !important;
662
+ }
663
+
664
+ .highlight::-webkit-scrollbar-thumb:hover, .codehilite::-webkit-scrollbar-thumb:hover,
665
+ pre::-webkit-scrollbar-thumb:hover {
666
+ background: #a8b3ba !important;
667
+ }
668
+ </style>
669
+ """
670
+
671
+ css_styles += "</style>"
672
+ html_out = css_styles + html_out
673
+
674
+ return html_out
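To make the rendering path concrete, here is a small sketch that normalizes a raw JSON payload with `convert_to_openai_format` and renders it with `display_openai_conversation_html` (it assumes the sibling `examples_helpers` module imported above is available):

```python
# Sketch: raw string -> OpenAI-style message list -> styled HTML fragment.
from stringsight.dashboard.conversation_display import (
    convert_to_openai_format,
    display_openai_conversation_html,
)

raw = '[{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "**Hello!**"}]'
conversation = convert_to_openai_format(raw)  # JSON string parsed into a list
fragment = display_openai_conversation_html(conversation, use_accordion=False)
print(fragment[:300])  # HTML suitable for a Gradio HTML component
```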
stringsight/dashboard/data_loader.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ Data loading functionality for the StringSight Gradio app.
3
+
4
+ This module handles loading pipeline results and converting them to formats
5
+ suitable for the Gradio interface.
6
+ """
7
+
8
+ import json
9
+ import pandas as pd
10
+ from pathlib import Path
11
+ from typing import Dict, List, Any, Tuple, Optional
12
+ import os
13
+
14
+ from .state import app_state
15
+ from .plotting import create_model_cluster_dataframe
16
+
17
+
18
+ class DataCache:
19
+ """Simple cache for loaded data to avoid re-loading."""
20
+ _cache = {}
21
+
22
+ @classmethod
23
+ def get(cls, key: str):
24
+ return cls._cache.get(key)
25
+
26
+ @classmethod
27
+ def set(cls, key: str, value: Any):
28
+ cls._cache[key] = value
29
+
30
+ @classmethod
31
+ def clear(cls):
32
+ cls._cache.clear()
33
+
34
+
35
+ def scan_for_result_subfolders(base_dir: str) -> List[str]:
36
+ """Scan for subfolders that might contain pipeline results."""
37
+ base_path = Path(base_dir)
38
+ if not base_path.exists():
39
+ return []
40
+
41
+ # Look for subfolders that contain the required files
42
+ subfolders = []
43
+ for item in base_path.iterdir():
44
+ if item.is_dir():
45
+ # Check if this subfolder contains pipeline results
46
+ required_files = [
47
+ "model_cluster_scores.json",
48
+ "cluster_scores.json",
49
+ "model_scores.json",
50
+ "clustered_results_lightweight.jsonl"
51
+ ]
52
+ if all((item / f).exists() for f in required_files):
53
+ subfolders.append(item.name)
54
+
55
+ return subfolders
56
+
57
+
58
+ def validate_results_directory(results_dir: str) -> Tuple[bool, str]:
59
+ """Validate that the results directory contains the expected files."""
60
+ results_path = Path(results_dir)
61
+
62
+ if not results_path.exists():
63
+ return False, f"Directory does not exist: {results_dir}"
64
+
65
+ if not results_path.is_dir():
66
+ return False, f"Path is not a directory: {results_dir}"
67
+
68
+ # Check for FunctionalMetrics format files
69
+ required_files = [
70
+ "model_cluster_scores.json",
71
+ "cluster_scores.json",
72
+ "model_scores.json",
73
+ ]
74
+
75
+ missing_files = []
76
+ for filename in required_files:
77
+ if not (results_path / filename).exists():
78
+ missing_files.append(filename)
79
+
80
+ # Check for clustered results
81
+ if not (results_path / "clustered_results_lightweight.jsonl").exists():
82
+ missing_files.append("clustered_results_lightweight.jsonl")
83
+
84
+ if missing_files:
85
+ return False, f"Missing required files: {', '.join(missing_files)}"
86
+
87
+ return True, ""
88
+
89
+
90
+ def get_available_models(metrics: Dict[str, Any]) -> List[str]:
91
+ """Extract available models from metrics data."""
92
+ model_cluster_scores = metrics.get("model_cluster_scores", {})
93
+ return list(model_cluster_scores.keys())
94
+
95
+
96
+ def get_all_models(metrics: Dict[str, Any]) -> List[str]:
97
+ """Get all available models from metrics data."""
98
+ return get_available_models(metrics)
99
+
100
+
101
+ def load_pipeline_results(results_dir: str) -> Tuple[pd.DataFrame, Dict[str, Any], pd.DataFrame, Path]:
102
+ """Load pipeline outputs (FunctionalMetrics format only).
103
+ Returns:
104
+ clustered_df: DataFrame of per-conversation data loaded from clustered_results_lightweight.jsonl
105
+ metrics: Dict containing the three FunctionalMetrics score dictionaries
106
+ model_cluster_df: DataFrame created from model_cluster_scores for plotting/analysis
107
+ results_path: Path to the results directory
108
+ """
109
+ cache_key = f"pipeline_results_{results_dir}"
110
+ cached = DataCache.get(cache_key)
111
+ if cached:
112
+ return cached
113
+
114
+ results_path = Path(results_dir)
115
+ if not results_path.exists():
116
+ raise FileNotFoundError(f"Results directory does not exist: {results_dir}")
117
+
118
+ # ------------------------------------------------------------------
119
+ # 1. Load FunctionalMetrics score files (must ALL be present)
120
+ # ------------------------------------------------------------------
121
+ required_files = [
122
+ "model_cluster_scores.json",
123
+ "cluster_scores.json",
124
+ "model_scores.json",
125
+ ]
126
+ missing = [f for f in required_files if not (results_path / f).exists()]
127
+ if missing:
128
+ raise FileNotFoundError(
129
+ f"Missing required metrics files in {results_dir}: {', '.join(missing)}"
130
+ )
131
+
132
+ with open(results_path / "model_cluster_scores.json") as f:
133
+ model_cluster_scores = json.load(f)
134
+ with open(results_path / "cluster_scores.json") as f:
135
+ cluster_scores = json.load(f)
136
+ with open(results_path / "model_scores.json") as f:
137
+ model_scores = json.load(f)
138
+
139
+ metrics = {
140
+ "model_cluster_scores": model_cluster_scores,
141
+ "cluster_scores": cluster_scores,
142
+ "model_scores": model_scores,
143
+ }
144
+
145
+ # ------------------------------------------------------------------
146
+ # 2. Load clustered conversation data (JSON-Lines)
147
+ # ------------------------------------------------------------------
148
+ clustered_path = results_path / "clustered_results_lightweight.jsonl"
149
+ if not clustered_path.exists():
150
+ raise FileNotFoundError(f"clustered_results_lightweight.jsonl not found in {results_dir}")
151
+
152
+ try:
153
+ clustered_df = pd.read_json(clustered_path, lines=True)
154
+ except Exception as e:
155
+ raise ValueError(f"Could not load clustered results: {e}")
156
+
157
+ # ------------------------------------------------------------------
158
+ # 3. Create model_cluster_df from metrics for plotting/analysis
159
+ # ------------------------------------------------------------------
160
+ model_cluster_df = create_model_cluster_dataframe(model_cluster_scores)
161
+
162
+ result = (clustered_df, metrics, model_cluster_df, results_path)
163
+ DataCache.set(cache_key, result)
164
+ return result
165
+
166
+
167
+ def load_property_examples(results_path: Path, property_ids: List[str]) -> pd.DataFrame:
168
+ """Load specific property examples on-demand"""
169
+ if not property_ids:
170
+ return pd.DataFrame()
171
+
172
+ cache_key = f"examples_{results_path}_{hash(tuple(sorted(property_ids)))}"
173
+ cached = DataCache.get(cache_key)
174
+ if cached is not None:
175
+ return cached
176
+
177
+ # Load full dataset to get prompt/response details
178
+ clustered_path = results_path / "clustered_results_lightweight.jsonl"
179
+
180
+ if not clustered_path.exists():
181
+ raise FileNotFoundError("Could not load example data - clustered_results_lightweight.jsonl not found")
182
+
183
+ try:
184
+ full_df = pd.read_json(clustered_path, lines=True)
185
+ result = full_df[full_df['id'].isin(property_ids)]
186
+ DataCache.set(cache_key, result)
187
+ return result
188
+ except Exception as e:
189
+ raise ValueError(f"Failed to load examples: {e}")
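A minimal loading sketch, pointed at one of the experiment folders bundled under this repo's `data/` directory:

```python
# Sketch: validate a results folder, then load (and cache) its contents.
from stringsight.dashboard.data_loader import (
    validate_results_directory,
    load_pipeline_results,
)

results_dir = "data/taubench_airline"
ok, err = validate_results_directory(results_dir)
if ok:
    clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(results_dir)
    print(len(clustered_df), "conversations; metrics keys:", list(metrics.keys()))
else:
    print("Invalid results dir:", err)
```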
stringsight/dashboard/demo.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ Demo script showing different ways to use the StringSight Gradio visualization.
3
+
4
+ This demonstrates the Python API for launching the Gradio app.
5
+ """
6
+
7
+ import argparse
8
+ from pathlib import Path
9
+ from stringsight.dashboard import launch_app, create_app
10
+
11
+
12
+ def demo_basic_launch():
13
+ """Demo: Basic launch without pre-loading data."""
14
+ print("πŸš€ Demo: Basic launch - data can be loaded through the UI")
15
+ launch_app()
16
+
17
+
18
+ def demo_preload_data(results_dir: str):
19
+ """Demo: Launch with pre-loaded data."""
20
+ print(f"πŸš€ Demo: Launch with pre-loaded data from {results_dir}")
21
+ launch_app(results_dir=results_dir)
22
+
23
+
24
+ def demo_custom_settings(results_dir: str = None):
25
+ """Demo: Launch with custom settings."""
26
+ print("πŸš€ Demo: Launch with custom settings")
27
+ launch_app(
28
+ results_dir=results_dir,
29
+ share=True, # Create public shareable link
30
+ server_name="0.0.0.0", # Allow access from other machines
31
+ server_port=8080, # Custom port
32
+ )
33
+
34
+
35
+ def demo_programmatic_access():
36
+ """Demo: Create app object for programmatic access."""
37
+ print("πŸš€ Demo: Programmatic app creation")
38
+
39
+ # Create the app object without launching
40
+ app = create_app()
41
+
42
+ # You could modify the app here if needed
43
+ # app.title = "My Custom Title"
44
+
45
+ # Launch when ready
46
+ print("Launching app...")
47
+ app.launch(share=False, server_port=7861)
48
+
49
+
50
+ def main():
51
+ parser = argparse.ArgumentParser(description="StringSight Gradio Visualization Demo")
52
+ parser.add_argument("--results_dir", help="Path to results directory for demos")
53
+ parser.add_argument("--demo", choices=[
54
+ "basic", "preload", "custom", "programmatic"
55
+ ], default="basic", help="Which demo to run")
56
+
57
+ args = parser.parse_args()
58
+
59
+ if args.demo == "basic":
60
+ demo_basic_launch()
61
+ elif args.demo == "preload":
62
+ if not args.results_dir:
63
+ print("❌ Error: --results_dir required for preload demo")
64
+ return
65
+ demo_preload_data(args.results_dir)
66
+ elif args.demo == "custom":
67
+ demo_custom_settings(args.results_dir)
68
+ elif args.demo == "programmatic":
69
+ demo_programmatic_access()
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
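The demos can also be invoked programmatically instead of via the argparse flags above; a sketch reusing the repo's bundled results folder:

```python
# Sketch: call a demo function directly instead of going through main().
from stringsight.dashboard.demo import demo_preload_data

demo_preload_data("data/taubench_airline")  # launches the app with data pre-loaded
```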
stringsight/dashboard/demo_examples.py ADDED
@@ -0,0 +1,86 @@
1
+ """Predefined demo example configurations for the Gradio launcher.
2
+
3
+ Each demo contains:
4
+ - data_path: absolute path to the dataset file
5
+ - explain: parameters for the Explain pipeline (aligned with exposed UI controls)
6
+ - label: parameters for the Label pipeline (aligned with exposed UI controls)
7
+ - advanced: shared advanced parameters (optional)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Dict, Any, List
13
+
14
+
15
+ # Single initial example using the existing project demo file and default params
16
+ EXAMPLES: Dict[str, Dict[str, Any]] = {
17
+ "Summarizing IT Support Calls": {
18
+ "data_path": "data/demo_data/call_center.jsonl",
19
+ "explain": {
20
+ "method": "single_model",
21
+ "system_prompt": "single_model_system_prompt",
22
+ "clusterer": "hdbscan",
23
+ "min_cluster_size": 8,
24
+ "max_coarse_clusters": 12,
25
+ "hierarchical": False,
26
+ "assign_outliers": False,
27
+ "groupby_column": "behavior_type",
28
+ },
29
+ "label": {
30
+ "taxonomy": {
31
+ "incorrectly states resolution": "The model incorrectly says how/if the issue was resolved.",
32
+ "fabricates information": "The model fabricates information about the transcript.",
33
+ "missing important information": "The model does not include important details about the user, the problem, or the resolution in its summary.",
34
+ },
35
+ "label_model_name": "gpt-5-mini",
36
+ },
37
+ "advanced": {
38
+ "sample_size": None,
39
+ "max_workers": 64,
40
+ "use_wandb": False,
41
+ "verbose": True,
42
+ },
43
+ },
44
+ "Airline Agent Customer Service": {
45
+ "data_path": "data/demo_data/taubench_airline.jsonl",
46
+ "explain": {
47
+ "method": "single_model",
48
+ "system_prompt": "agent_system_prompt",
49
+ "clusterer": "hdbscan",
50
+ "min_cluster_size": 8,
51
+ "max_coarse_clusters": 12,
52
+ "hierarchical": False,
53
+ "assign_outliers": False,
54
+ "groupby_column": "behavior_type",
55
+ },
56
+ "label": {
57
+ "taxonomy": {
58
+ "Disobey Task Specification": "This error occurs when an agent or system fails to adhere to specified constraints, guidelines, or requirements associated with a particular task. Non-compliance can result from unclear, incomplete, or ambiguous instructions provided by the user, system prompts, or task descriptions. It may also arise from an agent's inadequate ability to interpret or apply constraints effectively. Consequences of poor task constraint compliance include incorrect, suboptimal, or irrelevant outputs, reduced system performance and increased resource consumption.",
59
+ "Disobey Role Specification": "Failure to adhere to the defined responsibilities and constraints of an assigned role, potentially leading to an agent behaving like another.",
60
+ "Step Repetition": "Step repetition occurs when an agent or system unnecessarily repeats a phase, a task, a stage that have already been completed. Such redundancy can arise from inadequate state or context tracking, inefficient workflow management, unclear or ambiguous instructions, or failure to recognize completed tasks.\n\nAn example of step repetition is in the following Hyperagent trace where the Planner repeated exactly the same thought twice. \n\n[...]\nHyperAgent_matplotlib__matplotlib-23563 - INFO - Planner's Response: Thought: To address this issue, we need to understand the root cause of the 'Line3D' object not having the '_verts3d' attribute. This error typically occurs when the data passed to the plot function is invalid or when there's an issue with the 3D rendering. Let's start by examining the code and reproducing the issue.\n[...]\nHyperAgent_matplotlib__matplotlib-23563 - INFO - Planner's Response: Thought: To address this issue, we need to understand the root cause of the 'Line3D' object not having the '_verts3d' attribute. This error typically occurs when the data passed to the plot function is invalid or when there's an issue with the 3D rendering. Let's start by examining the code and reproducing the issue.\n[...]\nIntern Name: Codebase Editor\nSubgoal: Please prepare a response for the user with the following content:\n1. Explain the identified issue and recommend updating matplotlib to version 3.5.1 or later.\n2. Provide a modified version of their code that attempts to work around the issue.\n3. Suggest an alternative plotting method using scatter plots instead of lines.\n[...]",
61
+ "Loss of Conversation History": "Unexpected context truncation, disregarding recent interaction history and reverting to an antecedent conversational state.",
62
+ "Unaware of Termination Conditions": "This error occurs when an agent or system fails to adhere to criteria designed to trigger the termination of an interaction, conversation, phase, or task. Such oversight can arise due to ambiguous, incomplete, or poorly defined stopping conditions, ineffective monitoring mechanisms, or inadequate understanding of the criteria by the agent.\n\nConsequences of being unaware of stopping conditions include unnecessary conversation turns, inefficient use of resources, or potential harm to the correctness of the system due to performing actions beyond what is necessary.\n\nAn example of \"Unaware of Stopping Conditions\" involving two agents attempting to collaboratively solve a math problem is the following:\n{\n \"content\": [\n \"In the current problem, I don't have enough information to solve it. If I were to ask you for the relevant missing information now, it would be:\",\n \"1. What is the total length of the ribbon that Monica has?\",\n \"or\",\n \"2. How many inches of ribbon does Monica use for each gift bow?\",\n \"\",\n \"Could you please provide either piece of information so that I can solve the problem? Without either piece, the problem remains unsolvable.\"\n ],\n \"role\": \"user\",\n \"name\": \"assistant\"\n},\n{\n \"content\": [\n \"Continue. Please keep solving the problem until you need to query. (If you get to the answer, put it in \\\\boxed{}.)\"\n ],\n \"role\": \"assistant\",\n \"name\": \"mathproxyagent\"\n},\n{\n \"content\": [\n \"I'm sorry, but I really need more information to solve this problem. It can't be solved with the current data.\"\n ],\n \"role\": \"user\",\n \"name\": \"assistant\"\n},\n{\n \"content\": [\n \"Continue. Please keep solving the problem until you need to query. (If you get to the answer, put it in \\\\boxed{}.)\"\n ],\n \"role\": \"assistant\",\n \"name\": \"mathproxyagent\"\n}\n\nIn this example, the mathproxyagent is poorly designed as it does not recognize that the assistant agent requires additional information to continue. The mathproxyagent either needs to provide the necessary information or acknowledge that it does not have it, thereby appropriately terminating the interaction rather than repeating instructions unnecessarily.",
63
+ "Conversation Reset": "Unexpected or unwarranted restarting of a dialogue, potentially losing context and progress made in the interaction.",
64
+ "Fail to Ask for Clarification": "Inability to request additional information between agent when faced with unclear or incomplete data, potentially resulting in incorrect actions.",
65
+ "Task Derailment": "Deviation from the intended objective or focus of a given task, potentially resulting in irrelevant or unproductive actions.",
66
+ "Information Withholding": "This error occurs when an agent or group of agents possesses critical information but fails to share it promptly or effectively with other agents or system components that rely upon this information for their operations. The failure to disseminate relevant information may arise from ineffective or insufficient communication protocols, erroneous assumptions regarding the relevance or priority of the information, inadequate system coordination mechanisms, or deliberate withholding stemming from overly restrictive privacy policies or security constraints. Consequences of withholding relevant information can be severe, potentially leading to reduced operational efficiency, increased latency in task completion, unnecessary redundant processing, incorrect or suboptimal decision-making, and even complete system failures. Additionally, this error can significantly impair collaborative effectiveness, leading to misunderstandings, mistrust, or inefficiencies within the multi-agent environment. Furthermore, initial failures due to withheld information can trigger cascading errors, amplifying the negative impact on overall system performance and reliability. For instance, consider a scenario where a bug localization agent identifies a software defect, accurately determining the affected file and specific line number. The intended process requires this agent to immediately report such detailed bug information to a coding or repair agent responsible for addressing and resolving the issue. However, if the bug localization agent instead attempts to fix the bug independently without sharing the vital bug identification details with the coding agent, this withholding of relevant information could lead to duplicated effort, delayed resolution, incorrect fixes, or further system instability.",
67
+ "Ignored Other Agent's Input": "Not properly considering input or recommendations provided by other agents in the system (ignore their suggestions), potentially leading to bad decisions, stalled progress, or missed opportunities for solving the task.",
68
+ "Action-Reasoning Mismatch": "This error occurs when there is a discrepancy or mismatch between agents' logical discussion conclusion or a single agent's internal decision-making processes and the actual actions or outputs the system produces. Such inconsistencies can emerge due to errors in translating reasoning outcomes into practical implementations, or incorrect mapping between the agent's cognitive processes and its action space.\n\nThe consequences of this inconsistency can include unexpected, unintended, or counterproductive behaviors, reduced reliability, and diminished user trust. It can also complicate troubleshooting efforts by obscuring the true rationale behind decisions and actions, leading to further inefficiencies or repeated mistakes.\n\nFor example, in the trace below the agent states that `_add_prefix_for_feature_names_out` method is not explicitly shown in the code snippet, but only mentioned in the context of the `_iter` method. This conclusion is not coherent with the previosu step where the agent showed the implementation of this method. \n\n[...]\n ```python\ndef _add_prefix_for_feature_names_out(self, feature_names_out):\n \"\"\"\n Add prefix to feature names.\",\n \"\"\"\n if self.verbose_feature_names_out:\n[...]\nNote that the `_add_prefix_for_feature_names_out` method is not explicitly shown in the code snippet, but it is mentioned in the context of the `_iter` method.\n[...]",
69
+ "Premature Termination": "Ending a dialogue, interaction or task before all necessary information has been exchanged or objectives have been met. Necessary information constitutes verification of outputs, key data (e.g. api tokens) etc. that are necessary for the success of the task, and agents could have obtained if they tried more or already obtained but failed to communicate to other agents before termination.",
70
+ "Weak Verification": "Weak verification refers to situations where verification mechanisms (agent or step) exist within the system but fail to comprehensively cover all essential aspects of the design necessary for generating robust and reliable outputs. While verification steps are present, they may be incomplete, superficial, or insufficiently rigorous, thereby overlooking critical system attributes or interactions.\n\nConsequences of weak verification include partial validation that allows subtle errors, inconsistencies, or vulnerabilities to remain undetected, potentially compromising overall system reliability and effectiveness. This inadequacy can result in suboptimal system performance, unforeseen failures, cascade to final output if occur during substeps.\n\n\"You are a Code Reviewer. We are both working at ChatDev. We share a common interest in collaborating to successfully complete a task assigned by a new customer. You can help programmers assess source code for software troubleshooting, fix bugs to enhance code quality and robustness, and propose improvements to the source code. Here is a new customer's task: {task}. To complete the task, you must write a response that appropriately solves the requested instruction based on your expertise and the customer's needs.\"\n\nHowever, when asked to review generated code for a Sudoku game, the reviewer failed to recognize that standard Sudoku puzzles typically come pre-filled with numbers for the player to solve, an element absent in the generated implementation. Numerous Sudoku implementations and specifications are readily available online, which the verification agent could easily consult to ensure robustness and completeness.\n\nAnother example occurred with a TicTacToe implementation. While the game was functional and playable, the system incorrectly announced the winning player at the game's conclusion, despite employing the same ChatDev code reviewer prompt.",
71
+ "No or Incorrect Verification": "Omission of proper checking or confirmation of task outcomes or system outputs, potentially allowing errors or inconsistencies to propagate undetected. So, either no verification or verification is designed to exist in MAS, but verifier fail to complete what was exactly prompted to do. Eg: make sure the code compiles, but the code doesn't even compile.\nVerification is particularly critical in cases where tasks or outputs are readily verifiable by the system itself without human intervention.\n\nConsequences of inadequate or absent verification include the propagation of undetected errors, system inconsistencies, reduced reliability, and failure in the generated output.\n\nA few examples are as follows:\n1. In ChatDev, when prompted by a user to generate a game (e.g., \"textBasedSpaceInvaders\"), verification steps failed despite multiple review stages. Although the code was reportedly verified, compilation errors persisted, leading to runtime failures:\nyes Error: The file 'ship.bmp' was not found in the directory /Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911.\nTraceback (most recent call last):\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/main.py\", line 31, in <module>\n run_game()\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/main.py\", line 22, in run_game\n gf.create_fleet(ai_settings, screen, aliens)\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/game_functions.py\", line 64, in create_fleet\n alien = Alien(ai_settings, screen)\n File \"/Users/user/Documents/*/ChatDev/WareHouse/TextBasedSpaceInvaders_DefaultOrganization_20250117121911/alien.py\", line 13, in __init__\n self.image = pygame.image.load('alien.bmp')\nFileNotFoundError: No file 'alien.bmp' found in working directory '/Users/*/Documents/*/ChatDev'."
72
+ },
73
+ "label_model_name": "gpt-5",
74
+ },
75
+ }
76
+ }
77
+
78
+
79
+ def get_demo_names() -> List[str]:
80
+ return list(EXAMPLES.keys())
81
+
82
+
83
+ def get_demo_config(name: str) -> Dict[str, Any] | None:
84
+ return EXAMPLES.get(name)
85
+
86
+
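A minimal usage sketch for the two demo helpers above (standalone and illustrative; the module that defines EXAMPLES is only partially shown in this excerpt):

    names = get_demo_names()
    config = get_demo_config(names[0]) if names else None  # None if the name is unknown
    if config is not None:
        print(names[0], "->", sorted(config.keys()))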
stringsight/dashboard/examples_helpers.py ADDED
@@ -0,0 +1,238 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Tuple, Iterable, Optional, Dict, Any
4
+ import re
5
+
6
+ # We use private-use unicode placeholders so they survive html.escape/markdown
7
+ HIGHLIGHT_START = "\uE000"
8
+ HIGHLIGHT_END = "\uE001"
9
+
10
+ __all__ = [
11
+ "extract_quoted_fragments",
12
+ "find_exact_matches",
13
+ "compute_best_ngram_window",
14
+ "merge_intervals",
15
+ "compute_highlight_spans",
16
+ "insert_highlight_placeholders",
17
+ "annotate_text_with_evidence_placeholders",
18
+ ]
19
+
20
+
21
+ def extract_quoted_fragments(evidence: Any) -> Dict[str, List[str]]:
22
+ """Extract quoted fragments from evidence.
23
+
24
+ Returns a dict with keys:
25
+ - "quoted": list of quoted strings
26
+ - "unquoted": list of unquoted fragments (may be empty)
27
+
28
+ Evidence may be a string (possibly containing quotes) or a list of strings.
29
+ We treat double quotes (") and single quotes (').
30
+ """
31
+ quoted: List[str] = []
32
+ unquoted: List[str] = []
33
+
34
+ def _from_str(s: str) -> None:
35
+ # Capture content inside matching quotes
36
+ # Handles multiple quoted segments, keeps inner text only
37
+ q = re.findall(r'"([^"]+)"|\'([^\']+)\'', s)
38
+ if q:
39
+ for g1, g2 in q:
40
+ frag = g1 or g2
41
+ frag = frag.strip()
42
+ if frag:
43
+ # Split on ellipses (ASCII ... or Unicode …) and contiguous sequences thereof
44
+ parts = re.split(r'(?:\.{3}|…)+', frag)
45
+ for p in parts:
46
+ p = re.sub(r"\s+", " ", p).strip()
47
+ if p:
48
+ quoted.append(p)
49
+ # Remove the quoted parts from the string to detect remaining unquoted
50
+ s_wo = re.sub(r'"[^"]+"|\'[^\']+\'', " ", s)
51
+ residue = s_wo.strip()
52
+ if residue:
53
+ unquoted.append(residue)
54
+ else:
55
+ s = s.strip()
56
+ if s:
57
+ unquoted.append(s)
58
+
59
+ if isinstance(evidence, list):
60
+ for item in evidence:
61
+ if isinstance(item, str):
62
+ _from_str(item)
63
+ else:
64
+ # Non-string items are ignored; caller can decide how to handle
65
+ continue
66
+ elif isinstance(evidence, str):
67
+ _from_str(evidence)
68
+ else:
69
+ # Unknown evidence type β†’ nothing to extract
70
+ pass
71
+
72
+ return {"quoted": quoted, "unquoted": unquoted}
73
+
74
+
75
+ def _tokenize_words_with_offsets(text: str) -> List[Tuple[str, int, int]]:
76
+ """Tokenize into word tokens with their (start, end) character offsets.
77
+
78
+ We treat word characters (\w) as tokens and ignore pure whitespace. Punctuation
79
+ is not included as tokens for n-gram matching.
80
+ """
81
+ tokens: List[Tuple[str, int, int]] = []
82
+ for m in re.finditer(r"\w+", text):
83
+ tokens.append((m.group(0).lower(), m.start(), m.end()))
84
+ return tokens
85
+
86
+
87
+ def find_exact_matches(text: str, phrase: str) -> List[Tuple[int, int]]:
88
+ """Case-insensitive exact matches of phrase in text with word-boundary guards.
89
+
90
+ Matches must not start or end inside a word (avoid partial-word highlights).
91
+ Returns a list of (start, end) character indices.
92
+ """
93
+ if not phrase:
94
+ return []
95
+ # Build a boundary-safe pattern. We escape the phrase and require non-word boundaries at ends.
96
+ # Use lookaround to avoid consuming boundary characters.
97
+ pattern = r"(?<!\w)" + re.escape(phrase) + r"(?!\w)"
98
+ matches: List[Tuple[int, int]] = []
99
+ for m in re.finditer(pattern, text, flags=re.IGNORECASE):
100
+ matches.append((m.start(), m.end()))
101
+ return matches
102
+
103
+
104
+ def compute_best_ngram_window(text: str, target: str, n: int = 3, overlap_threshold: float = 0.5) -> Optional[Tuple[int, int]]:
105
+ """Find a window in `text` that maximizes n-gram overlap with `target`.
106
+
107
+ - Tokenization is word-based (\w+). Case-insensitive.
108
+ - If target has fewer than n tokens, no window is returned (too short to highlight).
109
+ - Returns (start_char, end_char) of best window if overlap >= threshold, else None.
110
+ """
111
+ text_toks = _tokenize_words_with_offsets(text)
112
+ target_toks = [t for t, _, _ in _tokenize_words_with_offsets(target)]
113
+
114
+ if not text_toks or not target_toks:
115
+ return None
116
+
117
+ # Enforce minimum n-gram size. If the target is too short, do not highlight.
118
+ if n < 1:
119
+ n = 1
120
+ if len(target_toks) < n:
121
+ return None
122
+
123
+ def _ngrams(tokens: List[str], k: int) -> List[Tuple[str, ...]]:
124
+ return [tuple(tokens[i:i+k]) for i in range(0, len(tokens) - k + 1)] if len(tokens) >= k else []
125
+
126
+ target_ngrams = set(_ngrams(target_toks, n))
127
+ if not target_ngrams:
128
+ return None
129
+
130
+ best_score = 0.0
131
+ best_span: Optional[Tuple[int, int]] = None
132
+
133
+ # Sliding windows over the text tokens with the same token length as the target
134
+ window_len = max(len(target_toks), n) # ensure at least n
135
+ for i in range(0, len(text_toks) - window_len + 1):
136
+ window_tokens = [tok for tok, _, _ in text_toks[i:i+window_len]]
137
+ window_ngrams = set(_ngrams(window_tokens, n))
138
+ overlap = len(window_ngrams & target_ngrams)
139
+ denom = max(1, len(target_ngrams))
140
+ score = overlap / denom
141
+ if score > best_score:
142
+ # Character span across the window
143
+ start_char = text_toks[i][1]
144
+ end_char = text_toks[i+window_len-1][2]
145
+ best_score = score
146
+ best_span = (start_char, end_char)
147
+
148
+ if best_span and best_score >= overlap_threshold:
149
+ return best_span
150
+ return None
151
+
152
+
153
+ def merge_intervals(spans: Iterable[Tuple[int, int]]) -> List[Tuple[int, int]]:
154
+ """Merge overlapping or touching intervals."""
155
+ s = sorted(spans)
156
+ if not s:
157
+ return []
158
+ merged = [list(s[0])]
159
+ for a, b in s[1:]:
160
+ if a <= merged[-1][1]:
161
+ merged[-1][1] = max(merged[-1][1], b)
162
+ else:
163
+ merged.append([a, b])
164
+ return [(a, b) for a, b in merged]
165
+
166
+
167
+ def compute_highlight_spans(text: str, evidence: Any, n: int = 3, overlap_threshold: float = 0.5) -> List[Tuple[int, int]]:
168
+ """Compute character spans to highlight in `text` using `evidence`.
169
+
170
+ Strategy:
171
+ - For each fragment (quoted and unquoted), first try exact case-insensitive matching (all occurrences).
172
+ - If a specific fragment has no exact matches, use n-gram overlap to find the best-matching window
173
+ and highlight if above threshold.
174
+ - If evidence is a list, treat each element independently (quoted detection applied per element).
175
+ """
176
+ parts = extract_quoted_fragments(evidence)
177
+ spans: List[Tuple[int, int]] = []
178
+
179
+ # Evaluate each fragment independently: try exact match first, otherwise fall back to n-gram.
180
+ # This ensures that when multiple quoted fragments are present and only some match exactly,
181
+ # we still localize the others approximately.
182
+ candidates: List[str] = []
183
+ candidates.extend(parts.get("quoted", []))
184
+ candidates.extend(parts.get("unquoted", []))
185
+
186
+ # Helper: count word tokens
187
+ def _num_word_tokens(s: str) -> int:
188
+ return len(re.findall(r"\w+", s))
189
+
190
+ for fragment in candidates:
191
+ if not fragment:
192
+ continue
193
+ # Enforce a minimum token length to avoid single-word/partial-word highlights
194
+ if _num_word_tokens(fragment) < n:
195
+ continue
196
+ exacts = find_exact_matches(text, fragment)
197
+ if exacts:
198
+ spans.extend(exacts)
199
+ continue
200
+ win = compute_best_ngram_window(text, fragment, n=n, overlap_threshold=overlap_threshold)
201
+ if win:
202
+ spans.append(win)
203
+
204
+ return merge_intervals(spans)
205
+
206
+
207
+ def insert_highlight_placeholders(text: str, spans: List[Tuple[int, int]]) -> str:
208
+ """Insert placeholder markers into `text` for each (start, end) span.
209
+
210
+ Assumes spans are non-overlapping and sorted; callers should merge first.
211
+ """
212
+ if not spans:
213
+ return text
214
+ parts: List[str] = []
215
+ last = 0
216
+ for a, b in spans:
217
+ if a < last:
218
+ # Overlap – skip to avoid corrupting indices
219
+ continue
220
+ parts.append(text[last:a])
221
+ parts.append(HIGHLIGHT_START)
222
+ parts.append(text[a:b])
223
+ parts.append(HIGHLIGHT_END)
224
+ last = b
225
+ parts.append(text[last:])
226
+ return "".join(parts)
227
+
228
+
229
+ def annotate_text_with_evidence_placeholders(text: str, evidence: Any, *, n: int = 3, overlap_threshold: float = 0.5) -> str:
230
+ """Return text with highlight placeholders inserted based on evidence.
231
+
232
+ This is the main API used by the renderer. After further processing (markdown),
233
+ callers should post-process HTML to replace placeholders with <mark> tags.
234
+ """
235
+ spans = compute_highlight_spans(text, evidence, n=n, overlap_threshold=overlap_threshold)
236
+ if not spans:
237
+ return text
238
+ return insert_highlight_placeholders(text, spans)
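The helpers above are pure functions, so the whole placeholder pipeline can be exercised in isolation. A minimal sketch (the sample strings are invented for illustration):

    from stringsight.dashboard.examples_helpers import (
        HIGHLIGHT_END,
        HIGHLIGHT_START,
        annotate_text_with_evidence_placeholders,
    )

    text = "The agent retried the API call three times before giving up."
    evidence = 'The model "retried the API call three times" unnecessarily.'

    # Quoted fragments are matched exactly; unquoted residue falls back to n-gram windows.
    annotated = annotate_text_with_evidence_placeholders(text, evidence)
    # Downstream rendering swaps the private-use markers for <mark> tags after html.escape/markdown.
    print(annotated.replace(HIGHLIGHT_START, "<mark>").replace(HIGHLIGHT_END, "</mark>"))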
stringsight/dashboard/examples_tab.py ADDED
@@ -0,0 +1,185 @@
1
+ """Logic for the **View Examples** tab – dropdown population + example renderer."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, List, Tuple, Optional
5
+
6
+ import gradio as gr
7
+ import ast
8
+
9
+ from .state import app_state
10
+ from .utils import (
11
+ get_unique_values_for_dropdowns,
12
+ get_example_data,
13
+ format_examples_display,
14
+ search_clusters_by_text,
15
+ )
16
+
17
+ __all__: List[str] = [
18
+ "get_dropdown_choices",
19
+ "update_example_dropdowns",
20
+ "view_examples",
21
+ "get_filter_options",
22
+ "update_filter_dropdowns",
23
+ ]
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Dropdown helpers
28
+ # ---------------------------------------------------------------------------
29
+
30
+ def get_dropdown_choices(selected_models: Optional[List[str]] = None) -> Tuple[List[str], List[str], List[str]]:
31
+ if app_state["clustered_df"] is None:
32
+ return [], [], []
33
+
34
+ choices = get_unique_values_for_dropdowns(app_state["clustered_df"])
35
+ prompts = ["All Prompts"] + choices["prompts"]
36
+ # If a sidebar selection is provided, filter models to that subset (ignoring the pseudo 'all')
37
+ if selected_models:
38
+ subset = [m for m in choices["models"] if m in [sm for sm in selected_models if sm != "all"]]
39
+ models = ["All Models"] + (subset if subset else choices["models"]) # fallback to all available if subset empty
40
+ else:
41
+ models = ["All Models"] + choices["models"]
42
+ properties = ["All Clusters"] + choices["properties"]
43
+ return prompts, models, properties
44
+
45
+
46
+ def update_example_dropdowns(selected_models: Optional[List[str]] = None) -> Tuple[Any, Any, Any]:
47
+ prompts, models, properties = get_dropdown_choices(selected_models)
48
+ # If exactly one concrete model selected in sidebar, preselect it; else default to All Models
49
+ preselect_model = "All Models"
50
+ if selected_models:
51
+ concrete = [m for m in selected_models if m != "all"]
52
+ if len(concrete) == 1 and concrete[0] in models:
53
+ preselect_model = concrete[0]
54
+ return (
55
+ gr.update(choices=prompts, value="All Prompts" if prompts else None),
56
+ gr.update(choices=models, value=(preselect_model if models else None)),
57
+ gr.update(choices=properties, value="All Clusters" if properties else None),
58
+ )
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Example viewer
63
+ # ---------------------------------------------------------------------------
64
+
65
+ def view_examples(
66
+ selected_prompt: str,
67
+ selected_model: str,
68
+ selected_property: str,
69
+ max_examples: int = 5,
70
+ use_accordion: bool = True,
71
+ pretty_print_dicts: bool = True,
72
+ search_term: str = "",
73
+ show_unexpected_behavior: bool = False,
74
+ selected_models_sidebar: Optional[List[str]] = None,
75
+ selected_tags_sidebar: Optional[List[str]] = None,
76
+ ) -> str:
77
+ if app_state["clustered_df"] is None:
78
+ return (
79
+ "<p style='color: #e74c3c; padding: 20px;'>❌ Please load data first "
80
+ "using the 'Load Data' tab</p>"
81
+ )
82
+
83
+ # Apply search filter first if search term is provided
84
+ df = app_state["clustered_df"]
85
+
86
+ # Apply sidebar-selected model filter if provided (ignoring pseudo 'all') before dropdown filters
87
+ if selected_models_sidebar:
88
+ concrete = [m for m in selected_models_sidebar if m != "all"]
89
+ if concrete:
90
+ df = df[df["model"].isin(concrete)]
91
+ if df.empty:
92
+ return "<p style='color: #e74c3c; padding: 20px;'>❌ No examples for the selected model subset.</p>"
93
+ if search_term and isinstance(search_term, str) and search_term.strip():
94
+ df = search_clusters_by_text(df, search_term.strip(), 'all')
95
+ if df.empty:
96
+ return f"<p style='color: #e74c3c; padding: 20px;'>❌ No clusters found matching '{search_term}'</p>"
97
+
98
+ # Optional tags filter (sidebar): include rows whose first meta value is in selected tags
99
+ if selected_tags_sidebar and len(selected_tags_sidebar) > 0 and 'meta' in df.columns:
100
+ def _parse_meta(obj: Any) -> Any:
101
+ if isinstance(obj, str):
102
+ try:
103
+ return ast.literal_eval(obj)
104
+ except Exception:
105
+ return obj
106
+ return obj
107
+
108
+ def _first_val(obj: Any) -> Any:
109
+ if obj is None:
110
+ return None
111
+ obj = _parse_meta(obj)
112
+ if isinstance(obj, dict):
113
+ for _, v in obj.items():
114
+ return v
115
+ return None
116
+ if isinstance(obj, (list, tuple)):
117
+ return obj[0] if len(obj) > 0 else None
118
+ return obj
119
+
120
+ parsed_meta = df['meta'].apply(_parse_meta)
121
+ non_null_parsed = [m for m in parsed_meta.tolist() if m is not None]
122
+ all_empty_dicts = (
123
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
124
+ )
125
+
126
+ if not all_empty_dicts:
127
+ allowed = set(map(str, selected_tags_sidebar))
128
+ df = df[df['meta'].apply(_first_val).astype(str).isin(allowed)]
129
+ if df.empty:
130
+ return "<p style='color: #e74c3c; padding: 20px;'>❌ No examples found for selected tags</p>"
131
+
132
+ examples = get_example_data(
133
+ df,
134
+ selected_prompt if selected_prompt != "All Prompts" else None,
135
+ selected_model if selected_model != "All Models" else None,
136
+ selected_property if selected_property != "All Clusters" else None,
137
+ max_examples,
138
+ show_unexpected_behavior=show_unexpected_behavior,
139
+ randomize=(
140
+ (selected_prompt == "All Prompts") and
141
+ (selected_model == "All Models") and
142
+ (selected_property == "All Clusters") and
143
+ (not search_term or not str(search_term).strip())
144
+ ),
145
+ )
146
+
147
+ return format_examples_display(
148
+ examples,
149
+ selected_prompt,
150
+ selected_model,
151
+ selected_property,
152
+ use_accordion=use_accordion,
153
+ pretty_print_dicts=pretty_print_dicts,
154
+ )
155
+
156
+
157
+ # ---------------------------------------------------------------------------
158
+ # Filter dropdown helpers for frequency comparison
159
+ # ---------------------------------------------------------------------------
160
+
161
+ def get_filter_options() -> Tuple[List[str], List[str]]:
162
+ if not app_state["model_stats"]:
163
+ return ["All Models"], ["All Metrics"]
164
+
165
+ available_models = ["All Models"] + list(app_state["model_stats"].keys())
166
+
167
+ quality_metrics = set()
168
+ for model_data in app_state["model_stats"].values():
169
+ clusters = model_data.get("fine", []) + model_data.get("coarse", [])
170
+ for cluster in clusters:
171
+ quality_score = cluster.get("quality_score", {})
172
+ if isinstance(quality_score, dict):
173
+ quality_metrics.update(quality_score.keys())
174
+
175
+ available_metrics = ["All Metrics"] + sorted(list(quality_metrics))
176
+
177
+ return available_models, available_metrics
178
+
179
+
180
+ def update_filter_dropdowns() -> Tuple[Any, Any]:
181
+ models, metrics = get_filter_options()
182
+ return (
183
+ gr.update(choices=models, value="All Models" if models else None),
184
+ gr.update(choices=metrics, value="All Metrics" if metrics else None),
185
+ )
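The tag filter above leans on a small normalisation step: `meta` values may arrive as string-encoded dicts or lists, so they are parsed with `ast.literal_eval` and reduced to their first value. A standalone mirror of that logic (an illustrative copy of the private `_parse_meta`/`_first_val` helpers, not an import):

    import ast
    from typing import Any

    def first_meta_value(obj: Any) -> Any:
        # Parse string-encoded containers, then take the "first value" as the tag.
        if isinstance(obj, str):
            try:
                obj = ast.literal_eval(obj)
            except Exception:
                pass  # keep the raw string
        if isinstance(obj, dict):
            return next(iter(obj.values()), None)
        if isinstance(obj, (list, tuple)):
            return obj[0] if obj else None
        return obj

    print(first_meta_value("{'tag': 'Positive'}"))   # -> Positive
    print(first_meta_value(["Style", "Negative"]))   # -> Style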
stringsight/dashboard/launcher.py ADDED
@@ -0,0 +1,127 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI launcher for LMM-Vibes Gradio visualization app.
4
+
5
+ Usage:
6
+ python -m stringsight.dashboard.launcher --results_dir path/to/results
7
+
8
+ Or directly:
9
+ python stringsight/dashboard/launcher.py --results_dir path/to/results
10
+ """
11
+
12
+ import argparse
13
+ import sys
14
+ from pathlib import Path
15
+ import logging
16
+
17
+ def main():
18
+ parser = argparse.ArgumentParser(
19
+ description="Launch LMM-Vibes Gradio visualization app",
20
+ formatter_class=argparse.RawDescriptionHelpFormatter,
21
+ epilog="""
22
+ Examples:
23
+ # Launch with auto-loaded data from a base results directory
24
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results
25
+
26
+ # Launch with public sharing enabled
27
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results --share
28
+
29
+ # Launch on specific port
30
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results --port 8080
31
+
32
+ # Launch with automatic port selection
33
+ python -m stringsight.dashboard.launcher --results_dir /path/to/results --auto_port
34
+
35
+ # Launch without auto-loading (manual selection in app)
36
+ python -m stringsight.dashboard.launcher
37
+ """
38
+ )
39
+
40
+ parser.add_argument(
41
+ "--results_dir",
42
+ type=str,
43
+ help="Path to base results directory containing experiment subfolders (optional - can be loaded in the app)"
44
+ )
45
+
46
+ parser.add_argument(
47
+ "--share",
48
+ action="store_true",
49
+ help="Create a public shareable link"
50
+ )
51
+
52
+ parser.add_argument(
53
+ "--server_name",
54
+ type=str,
55
+ default="127.0.0.1",
56
+ help="Server address (default: 127.0.0.1)"
57
+ )
58
+
59
+ parser.add_argument(
60
+ "--port",
61
+ type=int,
62
+ default=7860,
63
+ help="Server port (default: 7860). Use --auto_port to automatically find an available port."
64
+ )
65
+
66
+ parser.add_argument(
67
+ "--auto_port",
68
+ action="store_true",
69
+ help="Automatically find an available port by trying ports 8080-8089"
70
+ )
71
+
72
+ parser.add_argument(
73
+ "--debug",
74
+ action="store_true",
75
+ help="Enable debug mode"
76
+ )
77
+
78
+ args = parser.parse_args()
79
+
80
+ # Handle auto_port option
81
+ if args.auto_port:
82
+ # Use a high port range for auto-port mode
83
+ args.port = 8080
84
+ print("πŸ” Auto-port mode enabled - will try ports 8080-8089")
85
+
86
+ # Validate results directory if provided
87
+ if args.results_dir:
88
+ results_path = Path(args.results_dir)
89
+ if not results_path.exists():
90
+ print(f"❌ Error: Results directory does not exist: {args.results_dir}")
91
+ sys.exit(1)
92
+ if not results_path.is_dir():
93
+ print(f"❌ Error: Path is not a directory: {args.results_dir}")
94
+ sys.exit(1)
95
+
96
+ # Configure logging level when --debug is set
97
+ if args.debug:
98
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")
99
+
100
+ # Import and launch the app
101
+ try:
102
+ from .app import launch_app
103
+
104
+ print("πŸš€ Launching LMM-Vibes Gradio Visualization App...")
105
+ print(f"🌐 Server: http://{args.server_name}:{args.port}")
106
+ if args.share:
107
+ print("πŸ”— Public sharing enabled")
108
+
109
+ launch_app(
110
+ results_dir=args.results_dir,
111
+ share=args.share,
112
+ server_name=args.server_name,
113
+ server_port=args.port,
114
+ debug=args.debug
115
+ )
116
+
117
+ except ImportError as e:
118
+ print(f"❌ Error: Failed to import required modules: {e}")
119
+ print("πŸ’‘ Make sure you have gradio installed: pip install gradio")
120
+ sys.exit(1)
121
+ except Exception as e:
122
+ print(f"❌ Error launching app: {e}")
123
+ sys.exit(1)
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
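Note that `--auto_port` only seeds `args.port = 8080`; the retry over ports 8080-8089 is delegated to `launch_app`. As an assumption about how such a probe could look (this sketch is not taken from `launch_app` itself):

    import socket

    def find_open_port(start: int = 8080, end: int = 8089) -> int:
        # Return the first port in [start, end] that accepts a local bind.
        for port in range(start, end + 1):
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                try:
                    sock.bind(("127.0.0.1", port))
                except OSError:
                    continue  # port in use, try the next one
                return port
        raise RuntimeError(f"no free port in {start}-{end}")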
stringsight/dashboard/load_data_tab.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ Utilities for the "Load Data" tab – loading pipeline results and scanning for
3
+ available experiment folders.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, List, Tuple
10
+
11
+ import gradio as gr
12
+
13
+ # ---------------------------------------------------------------------------
14
+ # Loading utilities updated for FunctionalMetrics
15
+ # ---------------------------------------------------------------------------
16
+
17
+ from .state import app_state
18
+ from .data_loader import (
19
+ load_pipeline_results,
20
+ scan_for_result_subfolders,
21
+ validate_results_directory,
22
+ )
23
+
24
+ # Metrics helpers
25
+ from .metrics_adapter import get_all_models
26
+
27
+ __all__ = [
28
+ "load_data",
29
+ "get_available_experiments",
30
+ "get_experiment_choices",
31
+ "refresh_experiment_dropdown",
32
+ "load_experiment_data",
33
+ ]
34
+
35
+
36
+ def load_data(results_dir: str, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, str, Any]:
37
+ """Load pipeline results from *results_dir* and update the shared *app_state*.
38
+
39
+ Returns a tuple of (summary_markdown, models_info_markdown, models_checkbox_update).
40
+ """
41
+ try:
42
+ # 1. Validate directory structure
43
+ progress(0.05, "Validating results directory…")
44
+ is_valid, error_msg = validate_results_directory(results_dir)
45
+ if not is_valid:
46
+ return "", f"❌ Error: {error_msg}", ""
47
+
48
+ # 2. Handle optional sub-folder selection (first match for now)
49
+ progress(0.15, "Scanning for experiment subfolders…")
50
+ subfolders = scan_for_result_subfolders(results_dir)
51
+ final_dir = results_dir
52
+ if subfolders and "." not in subfolders:
53
+ final_dir = str(Path(results_dir) / subfolders[0])
54
+
55
+ # 3. Load results into memory
56
+ progress(0.35, "Loading pipeline results… This may take a moment")
57
+ clustered_df, metrics, model_cluster_df, results_path = load_pipeline_results(final_dir)
58
+
59
+ # 4. Stash in global state so other tabs can use it
60
+ progress(0.6, "Preparing application state…")
61
+ app_state["clustered_df"] = clustered_df
62
+ app_state["metrics"] = metrics
63
+ app_state["model_cluster_df"] = model_cluster_df
64
+ # Temporary alias for legacy modules
65
+ app_state["model_stats"] = metrics
66
+ app_state["results_path"] = results_path
67
+ app_state["available_models"] = get_all_models(metrics)
68
+ app_state["current_results_dir"] = final_dir
69
+
70
+ # 5. Compose status messages
71
+ progress(0.8, "Finalizing summary…")
72
+ n_models = len(metrics.get("model_cluster_scores", {}))
73
+ n_properties = len(clustered_df)
74
+
75
+ # Render as Markdown, not as a plain text block.
76
+ summary = (
77
+ "**Data Summary:**\n"
78
+ f"- **Models:** {n_models}\n"
79
+ f"- **Properties:** {n_properties:,}\n"
80
+ f"- **Results Directory:** `{Path(final_dir).name}`"
81
+ )
82
+ # Check for both naming patterns for fine clusters
83
+ if ("fine_cluster_id" in clustered_df.columns or
84
+ "property_description_fine_cluster_id" in clustered_df.columns):
85
+ fine_id_col = ("fine_cluster_id" if "fine_cluster_id" in clustered_df.columns
86
+ else "property_description_fine_cluster_id")
87
+ n_fine_clusters = clustered_df[fine_id_col].nunique()
+ summary += f"\n- **Fine Clusters:** {n_fine_clusters:,}"
88
+
89
+ model_choices = app_state["available_models"]
90
+ models_info = f"Available models: {', '.join(model_choices)}"
91
+
92
+ # Gradio update object for the CheckboxGroup
93
+ # Default: select all concrete models but leave the aggregate "all" unchecked
94
+ selected_values = [m for m in model_choices if m != "all"]
95
+ progress(1.0, "Dataset loaded")
96
+ return summary, models_info, gr.update(choices=model_choices, value=selected_values)
97
+
98
+ except Exception as e:
99
+ error_msg = f"❌ Error loading results: {e}"
100
+ return "", error_msg, gr.update(choices=[], value=[])
101
+
102
+
103
+ def get_available_experiments(base_dir: str) -> List[str]:
104
+ """Return experiment sub-directories that contain the expected result files, sorted by modification time (most recent first)."""
105
+ if not base_dir or not os.path.exists(base_dir):
106
+ return []
107
+
108
+ experiments: List[Tuple[str, float]] = []
109
+ try:
110
+ for item in os.listdir(base_dir):
111
+ item_path = os.path.join(base_dir, item)
112
+ if os.path.isdir(item_path):
113
+ if (
114
+ os.path.exists(os.path.join(item_path, "model_stats.json"))
115
+ or os.path.exists(os.path.join(item_path, "clustered_results_lightweight.jsonl"))
116
+ ):
117
+ # Get modification time of the directory
118
+ mod_time = os.path.getmtime(item_path)
119
+ experiments.append((item, mod_time))
120
+ except Exception as e:
121
+ print(f"Error scanning experiments: {e}")
122
+
123
+ # Sort by modification time (most recent first), then return just the names
124
+ experiments.sort(key=lambda x: x[1], reverse=True)
125
+ return [exp[0] for exp in experiments]
126
+
127
+
128
+ def get_experiment_choices() -> List[str]:
129
+ """Return dropdown choices for the experiment selector."""
130
+ from . import state
131
+ if not state.BASE_RESULTS_DIR:
132
+ return []
133
+ experiments = get_available_experiments(state.BASE_RESULTS_DIR)
134
+ return ["Select an experiment..."] + experiments
135
+
136
+
137
+ def refresh_experiment_dropdown() -> gr.update:
138
+ """Gradio helper to refresh the experiment dropdown choices."""
139
+ choices = get_experiment_choices()
140
+ return gr.update(choices=choices, value="Select an experiment...")
141
+
142
+
143
+ def load_experiment_data(experiment_name: str) -> Tuple[str, str, Any]:
144
+ """Wrapper used by Gradio events to load a *selected* experiment."""
145
+ from . import state
146
+ if not state.BASE_RESULTS_DIR or experiment_name == "Select an experiment...":
147
+ return "", "Please select a valid experiment", gr.update(choices=[], value=[])
148
+
149
+ experiment_path = os.path.join(state.BASE_RESULTS_DIR, experiment_name)
150
+ print(f"πŸ” Loading experiment: {experiment_name} from {experiment_path}")
151
+ return load_data(experiment_path)
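Experiment discovery only requires a base directory whose subfolders contain `model_stats.json` or `clustered_results_lightweight.jsonl`. A quick sanity check of a layout (the base path below is illustrative):

    from stringsight.dashboard.load_data_tab import get_available_experiments

    base = "/path/to/results"  # illustrative; point at your own base results dir
    for name in get_available_experiments(base):
        print(name)  # most recently modified experiments come first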
stringsight/dashboard/metrics_adapter.py ADDED
@@ -0,0 +1,46 @@
1
+ """Lightweight access helpers for FunctionalMetrics score dictionaries.
2
+
3
+ The Gradio UI now receives the *raw* FunctionalMetrics output as a
4
+ ```
5
+ metrics = {
6
+ "model_cluster_scores": {...},
7
+ "cluster_scores": {...},
8
+ "model_scores": {...},
9
+ }
10
+ ```
11
+ This module centralises the most common look-ups so that the rest of the
12
+ codebase does *not* need to know the exact key names. If the format
13
+ changes again we only need to update these helpers.
14
+ """
15
+ from typing import Dict, Any, List
16
+
17
+ __all__ = [
18
+ "get_model_clusters",
19
+ "get_all_models",
20
+ "get_all_clusters",
21
+ ]
22
+
23
+ def get_model_clusters(metrics: Dict[str, Any], model_name: str) -> Dict[str, Any]:
24
+ """Return the per-cluster dictionary for a given model.
25
+
26
+ Args:
27
+ metrics: The dict returned by ``load_pipeline_results``.
28
+ model_name: Name of the model.
29
+ """
30
+ if model_name == "all":
31
+ # For "all" model, return cluster_scores (aggregated across all models)
32
+ return metrics.get("cluster_scores", {})
33
+ else:
34
+ return metrics.get("model_cluster_scores", {}).get(model_name, {})
35
+
36
+
37
+ def get_all_models(metrics: Dict[str, Any]) -> List[str]:
38
+ """Return the list of model names present in the metrics dict."""
39
+ models = list(metrics.get("model_cluster_scores", {}).keys())
40
+ # Add "all" as the first option to show aggregated metrics across all models
41
+ return ["all"] + models
42
+
43
+
44
+ def get_all_clusters(metrics: Dict[str, Any]) -> List[str]:
45
+ """Return the list of cluster names (across all models)."""
46
+ return list(metrics.get("cluster_scores", {}).keys())
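Because the adapter is pure dictionary access, its contract is easy to pin down against a toy payload (the model and cluster names below are invented):

    from stringsight.dashboard.metrics_adapter import (
        get_all_clusters,
        get_all_models,
        get_model_clusters,
    )

    metrics = {
        "model_cluster_scores": {"model-a": {"cluster-1": {"proportion": 0.4}}},
        "cluster_scores": {"cluster-1": {"proportion": 0.25}},
        "model_scores": {"model-a": {}},
    }

    assert get_all_models(metrics) == ["all", "model-a"]
    assert get_all_clusters(metrics) == ["cluster-1"]
    assert get_model_clusters(metrics, "all") == {"cluster-1": {"proportion": 0.25}}
    assert get_model_clusters(metrics, "model-a") == {"cluster-1": {"proportion": 0.4}}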
stringsight/dashboard/overview_tab.py ADDED
@@ -0,0 +1,479 @@
1
+ """Logic helpers for the **Overview** tab."""
2
+ from typing import List, Tuple, Optional
3
+ import pandas as pd
4
+ import plotly.graph_objects as go
5
+ import plotly.express as px
6
+
7
+ import gradio as gr
8
+ from .state import app_state
9
+ from .utils import compute_model_rankings_new, create_model_summary_card_new
10
+ from .plotting import create_model_dataframe
11
+
12
+ __all__ = ["create_overview", "create_model_quality_plot", "create_model_quality_table", "get_available_model_quality_metrics"]
13
+
14
+
15
+ def create_overview(
16
+ selected_models: List[str],
17
+ top_n: int,
18
+ score_significant_only: bool = False,
19
+ quality_significant_only: bool = False,
20
+ sort_by: str = "quality_asc",
21
+ min_cluster_size: int = 1,
22
+ selected_tags: Optional[List[str]] = None,
23
+ progress: Optional[gr.Progress] = None,
24
+ ) -> str:
25
+ """Return the HTML snippet that summarises model performance."""
26
+ if not app_state["metrics"]:
27
+ return "Please load data first using the 'Load Data' tab."
28
+
29
+ if not selected_models:
30
+ return "Please select at least one model to display."
31
+
32
+ # 1. Compute global rankings and filter to selection
33
+ if progress:
34
+ progress(0.05, "Computing model rankings…")
35
+ model_rankings = compute_model_rankings_new(app_state["metrics"])
36
+ filtered_rankings = [
37
+ (name, stats) for name, stats in model_rankings if name in selected_models
38
+ ]
39
+
40
+ # Sort so "all" appears first, then the rest by their rankings
41
+ all_models = [(name, stats) for name, stats in filtered_rankings if name == "all"]
42
+ other_models = [(name, stats) for name, stats in filtered_rankings if name != "all"]
43
+ filtered_rankings = all_models + other_models
44
+
45
+ if not filtered_rankings:
46
+ return "No data available for selected models."
47
+
48
+ # 2. Assemble HTML
49
+ overview_html = """
50
+ <div style="width: 100%; margin: 0;">
51
+ <details style="margin-bottom:25px;">
52
+ <summary style="cursor:pointer; color:#4c6ef5; font-weight:500;">What do these tags and numbers mean?</summary>
53
+ <div style="margin-top:12px; font-size:14px; line-height:1.5; color:#333;">
54
+ <p style="color: #666; margin-bottom: 10px;">
55
+ Top distinctive clusters where each model shows unique behavioural patterns.
56
+ Frequency shows what percentage of a model's battles resulted in that behavioural pattern.
57
+ </p>
58
+
59
+ <strong>Frequency Delta</strong><br>
60
+ For each cluster we compute how often <em>this model</em> appears in that cluster compared with the average across all models.<br>
61
+ • A positive value (e.g. <code>+0.15</code>) means the model hits the behaviour more often than average.<br>
62
+ • A negative value (e.g. <code>-0.08</code>) means it appears less often.<br>
63
+ <strong>Quality Delta</strong><br>
64
+ The difference between the cluster's quality score(s) for this model and the model's <em>overall</em> quality baseline, shown for each individual metric (e.g., helpfulness, accuracy).<br>
65
+ Positive values (green) indicate the model performs better than its average in that behaviour; negative values (red) indicate that it performs worse.<br>
66
+ <strong>Significance Tags (F/Q)</strong><br>
67
+ <span style="color: #888; font-size: 13px;">
68
+ Statistical significance is determined using a bootstrap procedure on the conversations to obtain 95% confidence intervals.
69
+ </span><br>
70
+ The <span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#cc6699; border:1px solid #cc669933; background:#cc669912;">F</span> and <span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#007bff; border:1px solid #007bff33; background:#007bff12;">Q</span> tags indicate <em>statistical significance</em> based on bootstrapped confidence intervals:<br>
71
+ • <strong>F</strong> (pink): The proportion delta is statistically significant (confidence interval doesn't include zero)<br>
72
+ • <strong>Q</strong> (blue): At least one quality metric delta is statistically significant<br>
73
+ These tags help identify which behavioral patterns are reliably different from the model's baseline performance.<br><br>
74
+ <strong>Cluster Tags</strong><br>
75
+ We sometimes annotate clusters with a short tag (e.g., group or category) to aid scanning. Example tags:
76
+ <span style="display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#28a74512; color:#28a745; border:1px solid #28a74533;">Positive</span>
77
+ <span style="display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#9467bd12; color:#9467bd; border:1px solid #9467bd33;">Style</span>
78
+ <span style="display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#dc354512; color:#dc3545; border:1px solid #dc354533;">Negative (critical)</span>
79
+ </div>
80
+ </details>
81
+ """
82
+
83
+ total_models = max(1, len(filtered_rankings))
84
+ for idx, (model_name, _) in enumerate(filtered_rankings):
85
+ if progress:
86
+ progress(0.1 + 0.8 * (idx / total_models), f"Rendering overview for {model_name}…")
87
+ card_html = create_model_summary_card_new(
88
+ model_name,
89
+ app_state["metrics"],
90
+ # top_n etc.
91
+ top_n,
92
+ score_significant_only=score_significant_only,
93
+ quality_significant_only=quality_significant_only,
94
+ sort_by=sort_by,
95
+ min_cluster_size=min_cluster_size,
96
+ selected_tags=selected_tags,
97
+ )
98
+ overview_html += card_html
99
+
100
+ overview_html += "</div>"
101
+ if progress:
102
+ progress(1.0, "Overview ready")
103
+ return overview_html
104
+
105
+
106
+ def create_model_quality_plot(
107
+ selected_models: List[str],
108
+ quality_metric: str = "helpfulness",
109
+ ) -> go.Figure:
110
+ """Create a bar plot of model-level quality scores with confidence intervals."""
111
+ if not app_state["metrics"]:
112
+ return None
113
+
114
+ if not selected_models:
115
+ return None
116
+
117
+ # Get model scores from metrics
118
+ model_scores = app_state["metrics"].get("model_scores", {})
119
+ if not model_scores:
120
+ return None
121
+
122
+ # Create model dataframe
123
+ model_df = create_model_dataframe(model_scores)
124
+
125
+ if model_df.empty:
126
+ return None
127
+
128
+ # Filter to selected models
129
+ model_df = model_df[model_df['model'].isin(selected_models)]
130
+
131
+ if model_df.empty:
132
+ return None
133
+
134
+ # Find the actual ABSOLUTE quality column (not delta) that matches the requested metric
135
+ # We want raw quality scores, not deltas from baseline
136
+ quality_col = None
137
+ for col in model_df.columns:
138
+ if (col.startswith("quality_") and
139
+ not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant")) and
140
+ "delta" not in col.lower()): # Explicitly exclude any delta columns
141
+ # Check if the quality metric name is contained in the column name (case insensitive)
142
+ col_name = col.replace("quality_", "").lower()
143
+ if quality_metric.lower() in col_name:
144
+ quality_col = col
145
+ break
146
+
147
+ # If no match found, use the first available absolute quality column
148
+ if not quality_col:
149
+ available_quality_cols = [col for col in model_df.columns
150
+ if col.startswith("quality_")
151
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant"))
152
+ and "delta" not in col.lower()] # Explicitly exclude delta columns
153
+ if not available_quality_cols:
154
+ return None
155
+ quality_col = available_quality_cols[0] # Use first available absolute quality metric
156
+
157
+ # Ensure quality values are numeric
158
+ model_df[quality_col] = pd.to_numeric(model_df[quality_col], errors='coerce')
159
+
160
+ # Check if we have any valid quality data
161
+ if model_df[quality_col].isna().all():
162
+ return None
163
+
164
+ # Sort models by quality score (descending - best scores first)
165
+ model_df = model_df.sort_values(by=quality_col, ascending=False).reset_index(drop=True)
166
+
167
+ # Extract a clean metric name for display
168
+ metric_display_name = quality_col.replace("quality_", "").split("(")[0].strip()
169
+
170
+ # Create the plot
171
+ fig = go.Figure()
172
+
173
+ # Prepare error bar data if requested and available
174
+ error_y = None
175
+ ci_lower_col = f"{quality_col}_ci_lower"
176
+ ci_upper_col = f"{quality_col}_ci_upper"
177
+ if ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
178
+ # Calculate error bar values (distance from mean to upper/lower bounds)
179
+ error_y_upper = model_df[ci_upper_col] - model_df[quality_col]
180
+ error_y_lower = model_df[quality_col] - model_df[ci_lower_col]
181
+ error_y = dict(
182
+ type='data',
183
+ symmetric=False,
184
+ array=error_y_upper,
185
+ arrayminus=error_y_lower,
186
+ visible=True,
187
+ color="rgba(52, 73, 94, 0.7)",
188
+ thickness=2.5,
189
+ width=5
190
+ )
191
+
192
+ # Create a beautiful color gradient for the bars
193
+ colors = px.colors.qualitative.Set3[:len(model_df)]
194
+
195
+ # Add the bar chart with improved styling
196
+ fig.add_trace(go.Bar(
197
+ x=model_df['model'],
198
+ y=model_df[quality_col],
199
+ error_y=error_y,
200
+ marker=dict(
201
+ color=colors,
202
+ line=dict(color='rgba(255,255,255,0.8)', width=2),
203
+ opacity=0.8
204
+ ),
205
+ name=f'{metric_display_name} Score',
206
+ text=[f"{val:.2f}" for val in model_df[quality_col]],
207
+ textposition='outside',
208
+ textfont=dict(size=14, color='darkblue', family='Arial Black'),
209
+ # Per-bar CI text for the hover label; empty when CIs are unavailable
+ customdata=(
+ [f"CI: [{lo:.2f}, {hi:.2f}]<br>" for lo, hi in zip(model_df[ci_lower_col], model_df[ci_upper_col])]
+ if error_y is not None else [""] * len(model_df)
+ ),
+ hovertemplate='<b>%{x}</b><br>' +
+ f'{metric_display_name}: %{{y:.3f}}<br>' +
+ '%{customdata}' +
+ '<extra></extra>',
215
+ hoverlabel=dict(
216
+ bgcolor="white",
217
+ bordercolor="darkblue",
218
+ font=dict(size=14, color="darkblue")
219
+ )
220
+ ))
221
+
222
+ # Enhanced layout with auto-sizing and improved styling
223
+ fig.update_layout(
224
+ # Auto-sizing configuration
225
+ autosize=True,
226
+
227
+ # Enhanced axis styling
228
+ xaxis=dict(
229
+ # No title for x-axis
230
+ title=None,
231
+ tickangle=45,
232
+ tickfont=dict(size=14, color='#34495e', family='Arial'),
233
+ gridcolor='rgba(189, 195, 199, 0.3)',
234
+ gridwidth=1,
235
+ showgrid=True,
236
+ linecolor='#34495e',
237
+ linewidth=2
238
+ ),
239
+ yaxis=dict(
240
+ title=dict(
241
+ text=f"{metric_display_name}",
242
+ font=dict(size=18, color='#34495e', family='Arial')
243
+ ),
244
+ automargin=True,
245
+ tickfont=dict(size=20, color='#34495e', family='Arial'),
246
+ gridcolor='rgba(189, 195, 199, 0.3)',
247
+ gridwidth=1,
248
+ showgrid=True,
249
+ linecolor='#34495e',
250
+ linewidth=2
251
+ ),
252
+
253
+ # Enhanced styling
254
+ showlegend=False,
255
+ plot_bgcolor='rgba(248, 249, 250, 0.8)',
256
+ paper_bgcolor='white',
257
+ margin=dict(l=60, r=60, t=60, b=60, autoexpand=True),
258
+ font=dict(family="Arial, sans-serif", color='#2c3e50'),
259
+
260
+ # No border - removed for cleaner look
261
+ )
262
+
263
+ fig.update_traces(
264
+ textposition="outside", # put labels above bars
265
+ cliponaxis=False # don't cut them off
266
+ )
267
+
268
+ return fig
269
+
270
+
271
+ def create_model_quality_table(
272
+ selected_models: List[str],
273
+ quality_metric: str = "helpfulness"
274
+ ) -> str:
275
+ """Create an HTML table of model-level quality scores."""
276
+ if not app_state["metrics"]:
277
+ return "No data loaded. Please load data first using the 'Load Data' tab."
278
+
279
+ if not selected_models:
280
+ return "Please select at least one model to display."
281
+
282
+ # Get model scores from metrics
283
+ model_scores = app_state["metrics"].get("model_scores", {})
284
+ if not model_scores:
285
+ return "No model scores available in the loaded data."
286
+
287
+ # Create model dataframe
288
+ model_df = create_model_dataframe(model_scores)
289
+
290
+ if model_df.empty:
291
+ return "No model data available."
292
+
293
+ # Filter to selected models
294
+ model_df = model_df[model_df['model'].isin(selected_models)]
295
+
296
+ if model_df.empty:
297
+ return "No data available for selected models."
298
+
299
+ # Find the actual ABSOLUTE quality column (not delta) that matches the requested metric
300
+ # We want raw quality scores, not deltas from baseline
301
+ quality_col = None
302
+ for col in model_df.columns:
303
+ if (col.startswith("quality_") and
304
+ not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant")) and
305
+ "delta" not in col.lower()): # Explicitly exclude any delta columns
306
+ # Check if the quality metric name is contained in the column name (case insensitive)
307
+ col_name = col.replace("quality_", "").lower()
308
+ if quality_metric.lower() in col_name:
309
+ quality_col = col
310
+ break
311
+
312
+ # If no match found, use the first available absolute quality column
313
+ if not quality_col:
314
+ available_quality_cols = [col for col in model_df.columns
315
+ if col.startswith("quality_")
316
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant"))
317
+ and "delta" not in col.lower()] # Explicitly exclude delta columns
318
+ if not available_quality_cols:
319
+ return "No quality metrics found in the data."
320
+ quality_col = available_quality_cols[0] # Use first available absolute quality metric
321
+
322
+ # Ensure quality values are numeric
323
+ model_df[quality_col] = pd.to_numeric(model_df[quality_col], errors='coerce')
324
+
325
+ # Check if we have any valid quality data
326
+ if model_df[quality_col].isna().all():
327
+ return f"No valid quality data found for metric '{quality_metric}'."
328
+
329
+ # Sort models by quality score (descending - best scores first)
330
+ model_df = model_df.sort_values(by=quality_col, ascending=False).reset_index(drop=True)
331
+
332
+ # Extract a clean metric name for display
333
+ metric_display_name = quality_col.replace("quality_", "").split("(")[0].strip()
334
+
335
+ # Define confidence interval column names
336
+ ci_lower_col = f"{quality_col}_ci_lower"
337
+ ci_upper_col = f"{quality_col}_ci_upper"
338
+
339
+ # Debug: Check if confidence interval columns exist
340
+ has_ci = ci_lower_col in model_df.columns and ci_upper_col in model_df.columns
341
+ if not has_ci:
342
+ # Try alternative naming pattern
343
+ metric_name = quality_col.replace("quality_", "")
344
+ alt_ci_lower = f"quality_{metric_name}_ci_lower"
345
+ alt_ci_upper = f"quality_{metric_name}_ci_upper"
346
+ if alt_ci_lower in model_df.columns and alt_ci_upper in model_df.columns:
347
+ ci_lower_col = alt_ci_lower
348
+ ci_upper_col = alt_ci_upper
349
+ has_ci = True
350
+
351
+ # Calculate ranks based on confidence intervals
352
+ # A model's rank = 1 + number of models that are confidently better (non-overlapping CIs)
353
+ ranks = []
354
+
355
+ if has_ci:
356
+ # Use confidence interval-based ranking
357
+ for i, row in model_df.iterrows():
358
+ # Get current model's quality score and confidence intervals
359
+ current_score = row[quality_col]
360
+ current_upper = row[ci_upper_col] if not pd.isna(row[ci_upper_col]) else current_score
361
+ current_lower = row[ci_lower_col] if not pd.isna(row[ci_lower_col]) else current_score
362
+
363
+ # Count how many models are confidently better
364
+ confidently_better = 0
365
+ for j, other_row in model_df.iterrows():
366
+ if i != j: # Don't compare with self
367
+ other_score = other_row[quality_col]
368
+ other_upper = other_row[ci_upper_col] if not pd.isna(other_row[ci_upper_col]) else other_score
369
+ other_lower = other_row[ci_lower_col] if not pd.isna(other_row[ci_lower_col]) else other_score
370
+
371
+ # Check if other model's CI is completely above current model's CI
372
+ # This means the other model is confidently better
373
+ if other_lower > current_upper:
374
+ confidently_better += 1
375
+
376
+ ranks.append(confidently_better + 1) # Rank = 1 + number confidently better
377
+ else:
378
+ # Fallback to simple ranking by quality score (no confidence intervals)
379
+ # Sort by quality score and assign ranks
380
+ sorted_indices = model_df[quality_col].sort_values(ascending=False).index
381
+ rank_dict = {idx: rank + 1 for rank, idx in enumerate(sorted_indices)}
382
+ ranks = [rank_dict[idx] for idx in model_df.index]
383
+
384
+ # Prepare table data
385
+ table_rows = []
386
+ for idx, row in model_df.iterrows():
387
+ model_name = row['model']
388
+ quality_score = row[quality_col]
389
+ rank = ranks[idx]
390
+
391
+ # Get confidence intervals if available
392
+ ci_text = ""
393
+ if ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
394
+ ci_lower = row[ci_lower_col]
395
+ ci_upper = row[ci_upper_col]
396
+ ci_text = f" [{ci_lower:.3f}, {ci_upper:.3f}]"
397
+
398
+ table_rows.append(f"""
399
+ <tr>
400
+ <td style=\"text-align: center; padding: 6px 8px; font-weight: bold; color: #2c3e50;\">{rank}</td>
401
+ <td style=\"padding: 6px 8px; color: #2c3e50;\">{model_name}</td>
402
+ <td style=\"text-align: center; padding: 6px 8px; color: #2c3e50;\">{quality_score:.3f}{ci_text}</td>
403
+ </tr>
404
+ """)
405
+
406
+ # Create HTML table
407
+ html_table = f"""
408
+ <div style="width: 100%; margin: 0; max-height: 340px; overflow: auto;">
409
+ <table style="width: 100%; border-collapse: collapse; background: white; border: 1px solid #ddd; border-radius: 4px; font-size: 13px;">
410
+ <thead>
411
+ <tr style="background: #f8f9fa; border-bottom: 2px solid #dee2e6;">
412
+ <th style="padding: 6px 8px; text-align: center; font-weight: bold; color: #495057; border-right: 1px solid #dee2e6;">Rank</th>
413
+ <th style="padding: 6px 8px; text-align: left; font-weight: bold; color: #495057; border-right: 1px solid #dee2e6;">Model</th>
414
+ <th style="padding: 6px 8px; text-align: center; font-weight: bold; color: #495057;">{metric_display_name}</th>
415
+ </tr>
416
+ </thead>
417
+ <tbody>
418
+ {''.join(table_rows)}
419
+ </tbody>
420
+ </table>
421
+ <p style="text-align: center; color: #6c757d; font-size: 11px; margin-top: 8px; font-family: Arial, sans-serif;">
422
+ {f"Ranks based on confidence intervals (non-overlapping CIs). Models with overlapping CIs may have the same rank." if has_ci else "Ranks based on quality scores (confidence intervals not available)."}
423
+ </p>
424
+ </div>
425
+ """
426
+
427
+ return html_table
428
+
429
+
430
+ def get_available_model_quality_metrics() -> List[str]:
431
+ """Get available quality metrics from the loaded model data."""
432
+ if not app_state["metrics"]:
433
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
434
+
435
+ model_scores = app_state["metrics"].get("model_scores", {})
436
+ if not model_scores:
437
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
438
+
439
+ # Create model dataframe to get available columns
440
+ model_df = create_model_dataframe(model_scores)
441
+
442
+ if model_df.empty:
443
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
444
+
445
+ # Find all ABSOLUTE quality columns (excluding CI, delta, and other suffix columns)
446
+ quality_columns = [col for col in model_df.columns
447
+ if col.startswith("quality_")
448
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant"))
449
+ and "delta" not in col.lower()]
450
+
451
+ # Extract simplified metric names for dropdown choices
452
+ # These will be matched against the full column names in create_model_quality_plot
453
+ available_quality_metrics = []
454
+ for col in quality_columns:
455
+ # Remove "quality_" prefix and extract the main metric name
456
+ metric_name = col.replace("quality_", "").split("(")[0].strip().lower()
457
+ # Use common simplified names that users would expect
458
+ if "help" in metric_name:
459
+ available_quality_metrics.append("helpfulness")
460
+ elif "understand" in metric_name:
461
+ available_quality_metrics.append("understandability")
462
+ elif "complete" in metric_name:
463
+ available_quality_metrics.append("completeness")
464
+ elif "concise" in metric_name:
465
+ available_quality_metrics.append("conciseness")
466
+ elif "harm" in metric_name:
467
+ available_quality_metrics.append("harmlessness")
468
+ else:
469
+ # For other metrics, use the first word
470
+ available_quality_metrics.append(metric_name.split()[0])
471
+
472
+ # Remove duplicates while preserving order
473
+ available_quality_metrics = list(dict.fromkeys(available_quality_metrics))
474
+
475
+ # If no quality metrics found, provide defaults
476
+ if not available_quality_metrics:
477
+ available_quality_metrics = ["helpfulness", "accuracy", "harmlessness", "honesty"]
478
+
479
+ return available_quality_metrics
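Stripped of the dataframe plumbing, the CI-based ranking rule in `create_model_quality_table` reduces to: a model's rank is 1 plus the number of models whose CI lower bound lies entirely above its CI upper bound. A compact sketch (the interval values are invented):

    from typing import List, Tuple

    def ci_ranks(intervals: List[Tuple[float, float]]) -> List[int]:
        # intervals: one (ci_lower, ci_upper) pair per model
        ranks = []
        for _, hi_i in intervals:
            confidently_better = sum(1 for lo_j, _ in intervals if lo_j > hi_i)
            ranks.append(1 + confidently_better)
        return ranks

    # Two overlapping leaders share rank 1; the clearly worse model ranks 3rd.
    print(ci_ranks([(0.70, 0.80), (0.68, 0.78), (0.40, 0.50)]))  # -> [1, 1, 3]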
stringsight/dashboard/plots_tab.py ADDED
@@ -0,0 +1,444 @@
1
+ """
2
+ Plots tab for the LMM-Vibes Gradio app.
3
+
4
+ This module provides functionality to display the model cluster proportion and quality plots.
5
+ """
6
+
7
+ import gradio as gr
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from typing import Tuple, List, Optional, Any
12
+
13
+ from .state import app_state
14
+ from .utils import extract_allowed_tag, ALLOWED_TAGS
15
+
16
+
17
+ def create_proportion_plot(selected_clusters: Optional[List[str]] = None, show_ci: bool = False, selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Tuple[go.Figure, str]:
18
+ """Create a grouped bar plot of proportion by property and model."""
19
+ if app_state.get("model_cluster_df") is None:
20
+ return None, "No model cluster data loaded. Please load data first."
21
+
22
+ model_cluster_df = app_state["model_cluster_df"]
23
+
24
+ if model_cluster_df.empty:
25
+ return None, "No model cluster data available."
26
+
27
+ # Ensure proportion values are numeric and in reasonable range
28
+ model_cluster_df = model_cluster_df.copy()
29
+
30
+ # Optional: filter to selected models (ignore the pseudo 'all' entry if present)
31
+ if selected_models:
32
+ concrete_models = [m for m in selected_models if m != "all"]
33
+ if concrete_models:
34
+ model_cluster_df = model_cluster_df[model_cluster_df["model"].isin(concrete_models)]
35
+ model_cluster_df['proportion'] = pd.to_numeric(model_cluster_df['proportion'], errors='coerce')
36
+
37
+ # Debug: log the observed proportion range after numeric coercion
38
+ print("After conversion - Proportion range:", model_cluster_df['proportion'].min(), "to", model_cluster_df['proportion'].max())
39
+
40
+ # Filter out "No properties" clusters
41
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'] != "No properties"]
42
+
43
+ # Optional: filter clusters by selected tags using metrics.cluster_scores metadata
44
+ if selected_tags:
45
+ metrics = app_state.get("metrics", {})
46
+ cluster_scores = metrics.get("cluster_scores", {})
47
+ def _first_allowed(meta_obj: Any) -> Any:
48
+ return extract_allowed_tag(meta_obj)
49
+ allowed = set(map(str, selected_tags))
50
+ allowed_clusters = {c for c, d in cluster_scores.items() if str(_first_allowed(d.get("metadata"))) in allowed}
51
+ if allowed_clusters:
52
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'].isin(allowed_clusters)]
53
+
54
+ # Determine which clusters to include: user-selected or default top 15 by aggregated frequency
55
+ cluster_freq = (
56
+ model_cluster_df.groupby('cluster', as_index=False)['proportion']
57
+ .sum()
58
+ .sort_values('proportion', ascending=False)
59
+ )
60
+ if selected_clusters:
61
+ chosen_clusters = [c for c in selected_clusters if c in cluster_freq['cluster'].tolist()]
62
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'].isin(chosen_clusters)]
63
+ else:
64
+ default_top = cluster_freq['cluster'].head(15).tolist() if len(cluster_freq) > 15 else cluster_freq['cluster'].tolist()
65
+ model_cluster_df = model_cluster_df[model_cluster_df['cluster'].isin(default_top)]
66
+
67
+ # Decide whether to abbreviate property names based on word count
68
+ # If any property name has more than 6 words, we abbreviate (P1, P2, ...)
69
+ unique_properties = sorted(model_cluster_df['cluster'].unique())
70
+ should_abbreviate = any(len(str(prop).split()) > 6 for prop in unique_properties)
71
+
72
+ mapping_text_parts: List[str] = []
73
+ if should_abbreviate:
74
+ property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
75
+ model_cluster_df['display_label'] = model_cluster_df['cluster'].map(property_mapping)
76
+ # Prepare mapping legend text
77
+ mapping_text_parts.append("**Property Mapping**\n\n")
78
+ for prop, abbr in property_mapping.items():
79
+ mapping_text_parts.append(f"**{abbr}:** {prop}\n\n")
80
+ else:
81
+ # Use full names directly as x tick labels
82
+ model_cluster_df['display_label'] = model_cluster_df['cluster']
83
+
84
+ # Prepare confidence interval data if requested
85
+ error_y_data = None
86
+ if show_ci and 'proportion_ci_lower' in model_cluster_df.columns and 'proportion_ci_upper' in model_cluster_df.columns:
87
+ # Calculate error bar values
88
+ model_cluster_df['y_error'] = model_cluster_df['proportion_ci_upper'] - model_cluster_df['proportion']
89
+ model_cluster_df['y_error_minus'] = model_cluster_df['proportion'] - model_cluster_df['proportion_ci_lower']
90
+ # Replace NaN values with 0
91
+ model_cluster_df['y_error'] = model_cluster_df['y_error'].fillna(0)
92
+ model_cluster_df['y_error_minus'] = model_cluster_df['y_error_minus'].fillna(0)
93
+ error_y_data = model_cluster_df['y_error']
94
+ # px.bar below reads the 'y_error'/'y_error_minus' columns by name
95
+
96
+ # Create a grouped bar plot of 'proportion' by property (x) and model (hue)
97
+ fig = px.bar(
98
+ model_cluster_df,
99
+ x="display_label",
100
+ y="proportion",
101
+ color="model",
102
+ barmode="group",
103
+ title=None,
104
+ labels={"proportion": "Proportion", "display_label": "Property", "model": "Model"},
105
+ error_y="y_error" if error_y_data is not None else None,
106
+ error_y_minus="y_error_minus" if error_y_data is not None else None
107
+ )
108
+
109
+ # Set the x-axis order to ensure consistent ordering
110
+ property_order = [f"P{i+1}" for i in range(len(unique_properties))] if should_abbreviate else unique_properties
111
+ fig.update_xaxes(categoryorder='array', categoryarray=property_order)
112
+ fig.update_layout(xaxis_tickangle=45)
113
+ # Make layout responsive and move legend to the top to utilize full width
114
+ fig.update_layout(
115
+ autosize=True,
116
+ margin=dict(l=40, r=40, t=110, b=80),
117
+ title=dict(pad=dict(t=20, b=10)),
118
+ legend=dict(
119
+ orientation="h",
120
+ yanchor="bottom",
121
+ y=1.15,
122
+ xanchor="left",
123
+ x=0
124
+ )
125
+ )
126
+
127
+ # Persist an HTML copy of the figure (written to the current working directory)
128
+ fig.write_html("model_cluster_proportion_plot.html")
129
+
130
+ # Build info/legend text
131
+ if show_ci:
132
+ if 'proportion_ci_lower' in model_cluster_df.columns and 'proportion_ci_upper' in model_cluster_df.columns:
133
+ if mapping_text_parts:
134
+ mapping_text_parts.append("---\n\n")
135
+ mapping_text_parts.append("**Confidence Intervals:**\n")
136
+ mapping_text_parts.append("Error bars show 95% confidence intervals for proportion values.\n")
137
+ else:
138
+ if mapping_text_parts:
139
+ mapping_text_parts.append("---\n\n")
140
+ mapping_text_parts.append("**Note:** Confidence interval data not available in the loaded dataset.\n")
141
+
142
+ mapping_text = "".join(mapping_text_parts)
143
+
144
+ return fig, mapping_text
145
+
146
+
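A minimal driver for the function above, assuming pipeline results have already been loaded into app_state (the model names here are hypothetical):

fig, legend_md = create_proportion_plot(
    selected_clusters=None,   # default: top 15 clusters by frequency
    show_ci=True,             # error bars drawn only if CI columns exist
    selected_models=["gpt-4o", "claude-3-5-sonnet"],
)
if fig is not None:
    fig.show()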
147
+ def create_quality_plot(quality_metric: str = "helpfulness", selected_clusters: Optional[List[str]] = None, show_ci: bool = False, selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Tuple[go.Figure, str]:
148
+ """Create a grouped bar plot of quality by property and model."""
149
+ if app_state.get("model_cluster_df") is None:
150
+ return None, "No model cluster data loaded. Please load data first."
151
+
152
+ model_cluster_df = app_state["model_cluster_df"]
153
+
154
+ if model_cluster_df.empty:
155
+ return None, "No model cluster data available."
156
+
157
+ # Check if the quality metric exists in the data
158
+ quality_col = f"quality_{quality_metric}"
159
+ if quality_col not in model_cluster_df.columns:
160
+ # Get available quality metrics for better error message
161
+ available_metrics = [col.replace("quality_", "") for col in model_cluster_df.columns
162
+ if col.startswith("quality_")
163
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta"))]
164
+ if not available_metrics:
165
+ return None, f"No quality metrics found in the data. Available columns: {list(model_cluster_df.columns)}"
166
+ return None, f"Quality metric '{quality_metric}' not found. Available metrics: {available_metrics}"
167
+
168
+ # Create a copy for plotting
169
+ plot_df = model_cluster_df.copy()
170
+
171
+ # Optional: filter clusters by selected tags using metrics.cluster_scores metadata
172
+ if selected_tags:
173
+ metrics = app_state.get("metrics", {})
174
+ cluster_scores = metrics.get("cluster_scores", {})
175
+ def _first_allowed(meta_obj: Any) -> Any:
176
+ return extract_allowed_tag(meta_obj)
177
+ allowed = set(map(str, selected_tags))
178
+ allowed_clusters = {c for c, d in cluster_scores.items() if str(_first_allowed(d.get("metadata"))) in allowed}
179
+ if allowed_clusters:
180
+ plot_df = plot_df[plot_df['cluster'].isin(allowed_clusters)]
181
+
182
+ # Optional: filter to selected models (ignore the pseudo 'all' entry if present)
183
+ if selected_models:
184
+ concrete_models = [m for m in selected_models if m != "all"]
185
+ if concrete_models:
186
+ plot_df = plot_df[plot_df["model"].isin(concrete_models)]
187
+
188
+ # Ensure quality values are numeric
189
+ plot_df[quality_col] = pd.to_numeric(plot_df[quality_col], errors='coerce')
190
+
191
+ # Check if we have any valid quality data
192
+ if plot_df[quality_col].isna().all():
193
+ return None, f"No valid quality data found for metric '{quality_metric}'. All values are missing or invalid."
194
+
195
+ # Filter out "No properties" clusters
196
+ plot_df = plot_df[plot_df['cluster'] != "No properties"]
197
+
198
+ # Determine which clusters to include: user-selected or default top 15 by aggregated frequency
199
+ cluster_freq = (
200
+ plot_df[plot_df['cluster'] != "No properties"]
201
+ .groupby('cluster', as_index=False)['proportion']
202
+ .sum()
203
+ .sort_values('proportion', ascending=False)
204
+ )
205
+ if selected_clusters:
206
+ chosen_clusters = [c for c in selected_clusters if c in cluster_freq['cluster'].tolist()]
207
+ plot_df = plot_df[plot_df['cluster'].isin(chosen_clusters)]
208
+ else:
209
+ default_top = cluster_freq['cluster'].head(15).tolist() if len(cluster_freq) > 15 else cluster_freq['cluster'].tolist()
210
+ plot_df = plot_df[plot_df['cluster'].isin(default_top)]
211
+
212
+ # Decide whether to abbreviate property names based on word count
213
+ unique_properties = sorted(plot_df['cluster'].unique())
214
+ should_abbreviate = any(len(str(prop).split()) > 6 for prop in unique_properties)
215
+
216
+ mapping_text_parts: List[str] = []
217
+ if should_abbreviate:
218
+ property_mapping = {prop: f"P{i+1}" for i, prop in enumerate(unique_properties)}
219
+ plot_df['display_label'] = plot_df['cluster'].map(property_mapping)
220
+ # Prepare mapping legend text
221
+ mapping_text_parts.append("**Property Mapping:**\n\n")
222
+ for prop, abbr in property_mapping.items():
223
+ mapping_text_parts.append(f"**{abbr}:** {prop}\n\n")
224
+ else:
225
+ plot_df['display_label'] = plot_df['cluster']
226
+
227
+ # Prepare confidence interval data if requested
228
+ error_y_data = None
229
+ if show_ci:
230
+ ci_lower_col = f"{quality_col}_ci_lower"
231
+ ci_upper_col = f"{quality_col}_ci_upper"
232
+ if ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns:
233
+ # Calculate error bar values
234
+ plot_df['y_error'] = plot_df[ci_upper_col] - plot_df[quality_col]
235
+ plot_df['y_error_minus'] = plot_df[quality_col] - plot_df[ci_lower_col]
236
+ # Replace NaN values with 0
237
+ plot_df['y_error'] = plot_df['y_error'].fillna(0)
238
+ plot_df['y_error_minus'] = plot_df['y_error_minus'].fillna(0)
239
+ error_y_data = plot_df['y_error']
240
+ # px.bar below reads the 'y_error'/'y_error_minus' columns by name
241
+
242
+ # Create a grouped bar plot of quality by property (x) and model (hue)
243
+ fig = px.bar(
244
+ plot_df,
245
+ x="display_label",
246
+ y=quality_col,
247
+ color="model",
248
+ barmode="group",
249
+ title=None,
250
+ labels={quality_col: f"Quality ({quality_metric.title()})", "display_label": "Property", "model": "Model"},
251
+ error_y="y_error" if error_y_data is not None else None,
252
+ error_y_minus="y_error_minus" if error_y_data is not None else None
253
+ )
254
+
255
+ # Set the x-axis order to ensure consistent ordering
256
+ property_order = [f"P{i+1}" for i in range(len(unique_properties))] if should_abbreviate else unique_properties
257
+ fig.update_xaxes(categoryorder='array', categoryarray=property_order)
258
+ fig.update_layout(xaxis_tickangle=45)
259
+ # Make layout responsive and move legend to the top to utilize full width
260
+ fig.update_layout(
261
+ autosize=True,
262
+ margin=dict(l=40, r=40, t=110, b=80),
263
+ title=dict(pad=dict(t=20, b=10)),
264
+ legend=dict(
265
+ orientation="h",
266
+ yanchor="bottom",
267
+ y=1.15,
268
+ xanchor="left",
269
+ x=0
270
+ )
271
+ )
272
+
273
+ # Persist an HTML copy of the figure (written to the current working directory)
274
+ fig.write_html(f"model_cluster_quality_{quality_metric}_plot.html")
275
+
276
+ # Build info/legend text
277
+ if show_ci:
278
+ ci_lower_col = f"{quality_col}_ci_lower"
279
+ ci_upper_col = f"{quality_col}_ci_upper"
280
+ if ci_lower_col in plot_df.columns and ci_upper_col in plot_df.columns:
281
+ if mapping_text_parts:
282
+ mapping_text_parts.append("---\n\n")
283
+ mapping_text_parts.append("**Confidence Intervals:**\n")
284
+ mapping_text_parts.append(f"Error bars show 95% confidence intervals for {quality_metric} values.\n")
285
+ else:
286
+ if mapping_text_parts:
287
+ mapping_text_parts.append("---\n\n")
288
+ mapping_text_parts.append("**Note:** Confidence interval data not available for this quality metric.\n")
289
+
290
+ mapping_text = "".join(mapping_text_parts)
291
+
292
+ return fig, mapping_text
293
+
294
+
295
+ def get_available_quality_metrics() -> List[str]:
296
+ """Get available quality metrics from the loaded DataFrame."""
297
+ if app_state.get("model_cluster_df") is None:
298
+ return ["helpfulness", "accuracy", "harmlessness", "honesty"]
299
+
300
+ model_cluster_df = app_state["model_cluster_df"]
301
+ # Find all quality columns (excluding CI and other suffix columns)
302
+ quality_columns = [
303
+ col for col in model_cluster_df.columns
304
+ if col.startswith("quality_")
305
+ and not col.endswith(("_ci_lower", "_ci_upper", "_ci_mean", "_significant", "_delta"))
306
+ and ("delta" not in col.lower())
307
+ ]
308
+ # Extract metric names by removing "quality_" prefix
309
+ available_quality_metrics = [col.replace("quality_", "") for col in quality_columns]
310
+
311
+ # If no quality metrics found, provide defaults
312
+ if not available_quality_metrics:
313
+ available_quality_metrics = ["helpfulness", "accuracy", "harmlessness", "honesty"]
314
+
315
+ return available_quality_metrics
316
+
317
+
318
+ def update_quality_metric_dropdown() -> gr.Dropdown:
319
+ """Update the quality metric dropdown with available metrics."""
320
+ available_metrics = get_available_quality_metrics()
321
+ return gr.Dropdown(
322
+ label="Quality Metric",
323
+ choices=available_metrics,
324
+ value=available_metrics[0] if available_metrics else "helpfulness",
325
+ info="Select which quality metric to display"
326
+ )
327
+
328
+
329
+ def update_quality_metric_visibility(plot_type: str) -> Any:
330
+ """Update the quality metric dropdown visibility based on plot type."""
331
+ if plot_type == "quality":
332
+ available_metrics = get_available_quality_metrics()
333
+ return gr.update(
334
+ choices=available_metrics,
335
+ value=(available_metrics[0] if available_metrics else None),
336
+ visible=True,
337
+ )
338
+ # When not in quality mode, clear value and choices to avoid stale selections
339
+ return gr.update(choices=[], value=None, visible=False)
340
+
341
+
342
+ def create_plot_with_toggle(plot_type: str, quality_metric: str = "helpfulness", selected_clusters: Optional[List[str]] = None, show_ci: bool = False, selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Tuple[go.Figure, str]:
343
+ """Create a plot based on the selected type (frequency or quality)."""
344
+ if plot_type == "frequency":
345
+ return create_proportion_plot(selected_clusters, show_ci, selected_models, selected_tags)
346
+ elif plot_type == "quality":
347
+ return create_quality_plot(quality_metric, selected_clusters, show_ci, selected_models, selected_tags)
348
+ else:
349
+ return None, f"Unknown plot type: {plot_type}"
350
+
351
+
352
+ def create_plots_tab() -> Tuple[gr.Plot, gr.Markdown, gr.Checkbox, gr.Dropdown, gr.Dropdown, gr.CheckboxGroup]:
353
+ """Create the plots tab interface with a toggle between frequency and quality plots."""
354
+ # Accordion at the top for selecting specific properties
355
+ with gr.Accordion("Select properties to display", open=False):
356
+ cluster_selector = gr.CheckboxGroup(
357
+ label="Select Clusters (Properties)",
358
+ choices=[],
359
+ value=[],
360
+ info="Defaults to the top 15 by frequency.",
361
+ show_label=False
362
+ )
363
+
364
+ # Plot controls in a row
365
+ with gr.Row():
366
+ # Plot type toggle
367
+ plot_type_dropdown = gr.Dropdown(
368
+ label="Plot Type",
369
+ choices=["frequency", "quality"],
370
+ value="frequency",
371
+ info="Choose between frequency (proportion) or quality metrics"
372
+ )
373
+
374
+ # Quality metric dropdown (only visible for quality plots)
375
+ quality_metric_dropdown = gr.Dropdown(
376
+ label="Quality Metric",
377
+ choices=[],
378
+ value=None,
379
+ info="Select which quality metric to display",
380
+ visible=False # Initially hidden, shown when quality is selected
381
+ )
382
+
383
+
384
+ # Add checkbox for confidence intervals
385
+ show_ci_checkbox = gr.Checkbox(
386
+ label="Show Confidence Intervals",
387
+ value=False,
388
+ info="Display 95% confidence intervals as error bars (if available in data)"
389
+ )
390
+
391
+ plot_display = gr.Plot(
392
+ label="Model-Cluster Analysis Plot",
393
+ show_label=False,
394
+ value=None
395
+ )
396
+
397
+ # Mapping text should appear directly below the plot
398
+ plot_info = gr.Markdown("")
399
+
400
+ return plot_display, plot_info, show_ci_checkbox, plot_type_dropdown, quality_metric_dropdown, cluster_selector
401
+
402
+
403
+ def update_cluster_selection(selected_models: Optional[List[str]] = None, selected_tags: Optional[List[str]] = None) -> Any:
404
+ """Populate the cluster selector choices and default selection (top 15 by frequency).
405
+
406
+ If selected_models is provided, restrict clusters to those computed from the selected models.
407
+ """
408
+ if app_state.get("model_cluster_df") is None:
409
+ return gr.update(choices=[], value=[])
410
+ df = app_state["model_cluster_df"]
411
+ # Optional: filter to selected models (ignore the pseudo 'all' entry if present)
412
+ if selected_models:
413
+ concrete_models = [m for m in selected_models if m != "all"]
414
+ if concrete_models:
415
+ df = df[df["model"].isin(concrete_models)]
416
+ # Optional: filter by selected tags using cluster_scores metadata
417
+ if selected_tags:
418
+ metrics = app_state.get("metrics", {})
419
+ cluster_scores = metrics.get("cluster_scores", {})
420
+ def _first_allowed(meta_obj: Any) -> Any:
421
+ return extract_allowed_tag(meta_obj)
422
+ allowed = set(map(str, selected_tags))
423
+ allowed_clusters = {c for c, d in cluster_scores.items() if str(_first_allowed(d.get("metadata"))) in allowed}
424
+ if allowed_clusters:
425
+ df = df[df['cluster'].isin(allowed_clusters)]
426
+
427
+ if df.empty or 'cluster' not in df.columns or 'proportion' not in df.columns:
428
+ return gr.update(choices=[], value=[])
429
+ # Exclude "No properties"
430
+ df = df[df['cluster'] != "No properties"].copy()
431
+ freq = (
432
+ df.groupby('cluster', as_index=False)['proportion']
433
+ .sum()
434
+ .sort_values('proportion', ascending=False)
435
+ )
436
+ clusters_ordered = freq['cluster'].tolist()
437
+ # Build label-value tuples; strip '**' from labels only (values remain raw)
438
+ label_value_choices = []
439
+ for cluster in clusters_ordered:
440
+ raw_val = str(cluster)
441
+ label = raw_val.replace('**', '')
442
+ label_value_choices.append((label, raw_val))
443
+ default_values = [str(cluster) for cluster in clusters_ordered[:15]]
444
+ return gr.update(choices=label_value_choices, value=default_values)
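The components returned by create_plots_tab are wired up by the caller; a rough sketch of that wiring follows (the actual hookup lives in the app builder and may differ):

with gr.Blocks() as demo:
    plot, info, ci_box, type_dd, metric_dd, cluster_sel = create_plots_tab()
    # Show/refresh the metric dropdown when the plot type changes
    type_dd.change(update_quality_metric_visibility, inputs=[type_dd], outputs=[metric_dd])
    # Re-render the plot whenever any control changes
    for comp in (type_dd, metric_dd, cluster_sel, ci_box):
        comp.change(
            create_plot_with_toggle,
            inputs=[type_dd, metric_dd, cluster_sel, ci_box],
            outputs=[plot, info],
        )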
stringsight/dashboard/plotting.py ADDED
@@ -0,0 +1,616 @@
1
+ """
2
+ Plotting functionality for functional metrics.
3
+
4
+ This module provides comprehensive visualizations of the metrics produced by functional_metrics.py.
5
+ """
6
+
7
+ import json
8
+ import pandas as pd
9
+ import numpy as np
10
+ from pathlib import Path
11
+ from typing import Dict, Any, List, Optional
12
+ import warnings
13
+
14
+ import plotly.graph_objects as go
15
+ import plotly.express as px
16
+ from plotly.subplots import make_subplots
17
+ import plotly.io as pio
18
+
19
+ # Set plotly template
20
+ pio.templates.default = "plotly_white"
21
+ warnings.filterwarnings('ignore')
22
+
23
+
24
+ def create_model_cluster_dataframe(model_cluster_scores: Dict[str, Any]) -> pd.DataFrame:
25
+ """Convert model-cluster scores to a tidy dataframe."""
26
+ rows = []
27
+ for model, clusters in model_cluster_scores.items():
28
+ for cluster, metrics in clusters.items():
29
+ # Filter out "No properties" clusters
30
+ if cluster == "No properties":
31
+ continue
32
+
33
+ row = {
34
+ 'model': model,
35
+ 'cluster': cluster,
36
+ 'size': metrics.get('size', 0),
37
+ 'proportion': metrics.get('proportion', 0),
38
+ 'proportion_delta': metrics.get('proportion_delta', 0)
39
+ }
40
+
41
+ # Add confidence intervals if available
42
+ if 'proportion_ci' in metrics:
43
+ ci = metrics['proportion_ci']
44
+ row.update({
45
+ 'proportion_ci_lower': ci.get('lower', 0),
46
+ 'proportion_ci_upper': ci.get('upper', 0),
47
+ 'proportion_ci_mean': ci.get('mean', 0)
48
+ })
49
+
50
+ if 'proportion_delta_ci' in metrics:
51
+ ci = metrics['proportion_delta_ci']
52
+ row.update({
53
+ 'proportion_delta_ci_lower': ci.get('lower', 0),
54
+ 'proportion_delta_ci_upper': ci.get('upper', 0),
55
+ 'proportion_delta_ci_mean': ci.get('mean', 0)
56
+ })
57
+
58
+ # Add significance flags
59
+ row['proportion_delta_significant'] = metrics.get('proportion_delta_significant', False)
60
+
61
+ # Add quality metrics
62
+ quality = metrics.get('quality', {})
63
+ quality_delta = metrics.get('quality_delta', {})
64
+ quality_ci = metrics.get('quality_ci', {})
65
+ quality_delta_ci = metrics.get('quality_delta_ci', {})
66
+ quality_delta_significant = metrics.get('quality_delta_significant', {})
67
+
68
+ for metric_name in quality.keys():
69
+ row[f'quality_{metric_name}'] = quality[metric_name]
70
+ row[f'quality_delta_{metric_name}'] = quality_delta.get(metric_name, 0)
71
+ row[f'quality_delta_{metric_name}_significant'] = quality_delta_significant.get(metric_name, False)
72
+
73
+ if metric_name in quality_ci:
74
+ ci = quality_ci[metric_name]
75
+ row.update({
76
+ f'quality_{metric_name}_ci_lower': ci.get('lower', 0),
77
+ f'quality_{metric_name}_ci_upper': ci.get('upper', 0),
78
+ f'quality_{metric_name}_ci_mean': ci.get('mean', 0)
79
+ })
80
+
81
+ if metric_name in quality_delta_ci:
82
+ ci = quality_delta_ci[metric_name]
83
+ row.update({
84
+ f'quality_delta_{metric_name}_ci_lower': ci.get('lower', 0),
85
+ f'quality_delta_{metric_name}_ci_upper': ci.get('upper', 0),
86
+ f'quality_delta_{metric_name}_ci_mean': ci.get('mean', 0)
87
+ })
88
+
89
+ rows.append(row)
90
+
91
+ return pd.DataFrame(rows)
92
+
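The nested structure this flattener expects looks roughly like the following (field names are taken from the .get() accessors above; the names and values are illustrative):

model_cluster_scores = {
    "gpt-4o": {                      # model name (hypothetical)
        "verbose responses": {       # cluster name (hypothetical)
            "size": 42,
            "proportion": 0.21,
            "proportion_delta": 0.05,
            "proportion_ci": {"lower": 0.18, "upper": 0.24, "mean": 0.21},
            "quality": {"helpfulness": 0.73},
            "quality_delta": {"helpfulness": -0.02},
            "quality_delta_significant": {"helpfulness": False},
        },
    },
}
df = create_model_cluster_dataframe(model_cluster_scores)  # one row per (model, cluster) pair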
93
+
94
+ def create_cluster_dataframe(cluster_scores: Dict[str, Any]) -> pd.DataFrame:
95
+ """Convert cluster scores to a tidy dataframe."""
96
+ rows = []
97
+ for cluster, metrics in cluster_scores.items():
98
+ # Filter out "No properties" clusters
99
+ if cluster == "No properties":
100
+ continue
101
+
102
+ row = {
103
+ 'cluster': cluster,
104
+ 'size': metrics.get('size', 0),
105
+ 'proportion': metrics.get('proportion', 0)
106
+ }
107
+
108
+ # Add confidence intervals if available
109
+ if 'proportion_ci' in metrics:
110
+ ci = metrics['proportion_ci']
111
+ row.update({
112
+ 'proportion_ci_lower': ci.get('lower', 0),
113
+ 'proportion_ci_upper': ci.get('upper', 0),
114
+ 'proportion_ci_mean': ci.get('mean', 0)
115
+ })
116
+
117
+ # Add quality metrics
118
+ quality = metrics.get('quality', {})
119
+ quality_delta = metrics.get('quality_delta', {})
120
+ quality_ci = metrics.get('quality_ci', {})
121
+ quality_delta_ci = metrics.get('quality_delta_ci', {})
122
+ quality_delta_significant = metrics.get('quality_delta_significant', {})
123
+
124
+ for metric_name in quality.keys():
125
+ row[f'quality_{metric_name}'] = quality[metric_name]
126
+ row[f'quality_delta_{metric_name}'] = quality_delta.get(metric_name, 0)
127
+ row[f'quality_delta_{metric_name}_significant'] = quality_delta_significant.get(metric_name, False)
128
+
129
+ if metric_name in quality_ci:
130
+ ci = quality_ci[metric_name]
131
+ row.update({
132
+ f'quality_{metric_name}_ci_lower': ci.get('lower', 0),
133
+ f'quality_{metric_name}_ci_upper': ci.get('upper', 0),
134
+ f'quality_{metric_name}_ci_mean': ci.get('mean', 0)
135
+ })
136
+
137
+ if metric_name in quality_delta_ci:
138
+ ci = quality_delta_ci[metric_name]
139
+ row.update({
140
+ f'quality_delta_{metric_name}_ci_lower': ci.get('lower', 0),
141
+ f'quality_delta_{metric_name}_ci_upper': ci.get('upper', 0),
142
+ f'quality_delta_{metric_name}_ci_mean': ci.get('mean', 0)
143
+ })
144
+
145
+ rows.append(row)
146
+
147
+ return pd.DataFrame(rows)
148
+
149
+
150
+ def create_model_dataframe(model_scores: Dict[str, Any]) -> pd.DataFrame:
151
+ """Convert model scores to a tidy dataframe."""
152
+ rows = []
153
+ for model, metrics in model_scores.items():
154
+ row = {
155
+ 'model': model,
156
+ 'size': metrics.get('size', 0),
157
+ 'proportion': metrics.get('proportion', 0)
158
+ }
159
+
160
+ # Add confidence intervals if available
161
+ if 'proportion_ci' in metrics:
162
+ ci = metrics['proportion_ci']
163
+ row.update({
164
+ 'proportion_ci_lower': ci.get('lower', 0),
165
+ 'proportion_ci_upper': ci.get('upper', 0),
166
+ 'proportion_ci_mean': ci.get('mean', 0)
167
+ })
168
+
169
+ # Add quality metrics
170
+ quality = metrics.get('quality', {})
171
+ quality_delta = metrics.get('quality_delta', {})
172
+ quality_ci = metrics.get('quality_ci', {})
173
+ quality_delta_ci = metrics.get('quality_delta_ci', {})
174
+ quality_delta_significant = metrics.get('quality_delta_significant', {})
175
+
176
+ for metric_name in quality.keys():
177
+ row[f'quality_{metric_name}'] = quality[metric_name]
178
+ row[f'quality_delta_{metric_name}'] = quality_delta.get(metric_name, 0)
179
+ row[f'quality_delta_{metric_name}_significant'] = quality_delta_significant.get(metric_name, False)
180
+
181
+ if metric_name in quality_ci:
182
+ ci = quality_ci[metric_name]
183
+ row.update({
184
+ f'quality_{metric_name}_ci_lower': ci.get('lower', 0),
185
+ f'quality_{metric_name}_ci_upper': ci.get('upper', 0),
186
+ f'quality_{metric_name}_ci_mean': ci.get('mean', 0)
187
+ })
188
+
189
+ if metric_name in quality_delta_ci:
190
+ ci = quality_delta_ci[metric_name]
191
+ row.update({
192
+ f'quality_delta_{metric_name}_ci_lower': ci.get('lower', 0),
193
+ f'quality_delta_{metric_name}_ci_upper': ci.get('upper', 0),
194
+ f'quality_delta_{metric_name}_ci_mean': ci.get('mean', 0)
195
+ })
196
+
197
+ rows.append(row)
198
+
199
+ return pd.DataFrame(rows)
200
+
201
+
202
+ def get_quality_metrics(df: pd.DataFrame) -> List[str]:
203
+ """Extract quality metric names from dataframe columns."""
204
+ quality_cols = [col for col in df.columns if col.startswith('quality_') and not col.endswith(('_ci_lower', '_ci_upper', '_ci_mean', '_significant'))]
205
+ return [col.replace('quality_', '') for col in quality_cols]
206
+
207
+
208
+ def create_interactive_cluster_plot(cluster_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
209
+ metric_col: str, title: str,
210
+ ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
211
+ significant_col: Optional[str] = None) -> go.Figure:
212
+ """Create an interactive cluster plot with dropdown for view mode."""
213
+
214
+ # Create the figure with subplots
215
+ fig = make_subplots(
216
+ rows=1, cols=1,
217
+ specs=[[{"secondary_y": False}]],
218
+ subplot_titles=[title]
219
+ )
220
+
221
+ # Prepare cluster_df - reset index if cluster is the index
222
+ if 'cluster' not in cluster_df.columns and cluster_df.index.name == 'cluster':
223
+ cluster_df = cluster_df.reset_index()
224
+
225
+ # Sort clusters by metric value in descending order for consistent ordering
226
+ cluster_df = cluster_df.sort_values(metric_col, ascending=False)
227
+
228
+ # Add aggregated view (default) - using cluster_df
229
+ if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
230
+ fig.add_trace(
231
+ go.Bar(
232
+ x=cluster_df['cluster'],
233
+ y=cluster_df[metric_col],
234
+ name='Aggregated (All Models)',
235
+ error_y=dict(
236
+ type='data',
237
+ array=cluster_df[ci_upper_col] - cluster_df[metric_col],
238
+ arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
239
+ visible=True
240
+ ),
241
+ visible=True
242
+ )
243
+ )
244
+ else:
245
+ fig.add_trace(
246
+ go.Bar(
247
+ x=cluster_df['cluster'],
248
+ y=cluster_df[metric_col],
249
+ name='Aggregated (All Models)',
250
+ visible=True
251
+ )
252
+ )
253
+
254
+ # Grouped by model view - using model_cluster_df
255
+ for model in model_cluster_df['model'].unique():
256
+ model_df = model_cluster_df[model_cluster_df['model'] == model]
257
+ # Sort model_df to match the cluster order
258
+ model_df = model_df.set_index('cluster').reindex(cluster_df['cluster']).reset_index()
259
+ if ci_lower_col and ci_upper_col and ci_lower_col in model_cluster_df.columns and ci_upper_col in model_cluster_df.columns:
260
+ fig.add_trace(
261
+ go.Bar(
262
+ x=model_df['cluster'],
263
+ y=model_df[metric_col],
264
+ name=f'Model: {model}',
265
+ error_y=dict(
266
+ type='data',
267
+ array=model_df[ci_upper_col] - model_df[metric_col],
268
+ arrayminus=model_df[metric_col] - model_df[ci_lower_col],
269
+ visible=False
270
+ ),
271
+ visible=False
272
+ )
273
+ )
274
+ else:
275
+ fig.add_trace(
276
+ go.Bar(
277
+ x=model_df['cluster'],
278
+ y=model_df[metric_col],
279
+ name=f'Model: {model}',
280
+ visible=False
281
+ )
282
+ )
283
+
284
+ # Add significance markers if available (for aggregated view)
285
+ # Red asterisks (*) indicate clusters with statistically significant quality delta values
286
+ # (confidence intervals that do not contain 0)
287
+ if significant_col and significant_col in cluster_df.columns:
288
+ for i, (cluster, is_sig) in enumerate(zip(cluster_df['cluster'], cluster_df[significant_col])):
289
+ if is_sig:
290
+ fig.add_annotation(
291
+ x=cluster,
292
+ y=cluster_df[cluster_df['cluster'] == cluster][metric_col].iloc[0],
293
+ text="*",
294
+ showarrow=False,
295
+ font=dict(size=16, color="red"),
296
+ yshift=10
297
+ )
298
+
299
+ # Update layout
300
+ fig.update_layout(
301
+ title=title,
302
+ xaxis_title="Cluster",
303
+ yaxis_title=metric_col.replace('_', ' ').title(),
304
+ barmode='group',
305
+ height=500,
306
+ showlegend=True,
307
+ annotations=[
308
+ dict(
309
+ text="* = Statistically significant (CI does not contain 0)",
310
+ showarrow=False,
311
+ xref="paper", yref="paper",
312
+ x=0.01, y=0.01,
313
+ xanchor="left", yanchor="bottom",
314
+ font=dict(size=10, color="red")
315
+ )
316
+ ] if significant_col and significant_col in cluster_df.columns else []
317
+ )
318
+
319
+ # Add dropdown for view selection - only 2 options
320
+ buttons = []
321
+
322
+ # Aggregated view button (all models combined)
323
+ visibility = [True] + [False] * len(model_cluster_df['model'].unique())
324
+ buttons.append(
325
+ dict(
326
+ label="Aggregated (All Models)",
327
+ method="update",
328
+ args=[{"visible": visibility, "barmode": "group"}]
329
+ )
330
+ )
331
+
332
+ # Grouped by model view (each model as separate bars)
333
+ visibility = [False] + [True] * len(model_cluster_df['model'].unique())
334
+ buttons.append(
335
+ dict(
336
+ label="Grouped by Model",
337
+ method="update",
338
+ args=[{"visible": visibility, "barmode": "group"}]
339
+ )
340
+ )
341
+
342
+ fig.update_layout(
343
+ updatemenus=[
344
+ dict(
345
+ buttons=buttons,
346
+ direction="down",
347
+ showactive=True,
348
+ x=0.95,
349
+ xanchor="right",
350
+ y=1.25,
351
+ yanchor="top"
352
+ )
353
+ ]
354
+ )
355
+
356
+ return fig
357
+
358
+
359
+ def create_interactive_heatmap(df: pd.DataFrame, value_col: str, title: str,
360
+ pivot_index: str = 'model', pivot_columns: str = 'cluster',
361
+ significant_col: Optional[str] = None) -> go.Figure:
362
+ """Create an interactive heatmap with hover information."""
363
+
364
+ # Create pivot table
365
+ pivot_df = df.pivot(index=pivot_index, columns=pivot_columns, values=value_col)
366
+
367
+ # Sort by mean values for consistent ordering
368
+ if pivot_index == 'model':
369
+ # Sort models by their mean values across clusters
370
+ model_means = pivot_df.mean(axis=1).sort_values(ascending=False)
371
+ pivot_df = pivot_df.reindex(model_means.index)
372
+ else:
373
+ # Sort clusters by their mean values across models
374
+ cluster_means = pivot_df.mean(axis=0).sort_values(ascending=False)
375
+ pivot_df = pivot_df.reindex(columns=cluster_means.index)
376
+
377
+ # Transpose the data for more intuitive visualization (models on x-axis, clusters on y-axis)
378
+ pivot_df = pivot_df.T
379
+
380
+ # Create heatmap
381
+ fig = go.Figure(data=go.Heatmap(
382
+ z=pivot_df.values,
383
+ x=pivot_df.columns, # Models
384
+ y=pivot_df.index, # Clusters
385
+ colorscale='RdBu_r' if 'delta' in value_col else 'Viridis',
386
+ zmid=0 if 'delta' in value_col else None,
387
+ text=pivot_df.values.round(3),
388
+ texttemplate="%{text}",
389
+ textfont={"size": 10},
390
+ hoverongaps=False
391
+ ))
392
+
393
+ # Add significance markers if available
394
+ if significant_col and significant_col in df.columns:
395
+ sig_pivot = df.pivot(index=pivot_index, columns=pivot_columns, values=significant_col)
396
+ # Apply same sorting as the main pivot
397
+ if pivot_index == 'model':
398
+ sig_pivot = sig_pivot.reindex(model_means.index)
399
+ else:
400
+ sig_pivot = sig_pivot.reindex(columns=cluster_means.index)
401
+ sig_pivot = sig_pivot.T # Transpose to match the main heatmap
402
+ for i, cluster in enumerate(pivot_df.index):
403
+ for j, model in enumerate(pivot_df.columns):
404
+ if pd.notna(sig_pivot.loc[cluster, model]) and sig_pivot.loc[cluster, model]:
405
+ fig.add_annotation(
406
+ x=model,
407
+ y=cluster,
408
+ text="*",
409
+ showarrow=False,
410
+ font=dict(size=16, color="red"),
411
+ xshift=10,
412
+ yshift=10
413
+ )
414
+
415
+ fig.update_layout(
416
+ title=title,
417
+ xaxis_title="Model",
418
+ yaxis_title="Cluster",
419
+ height=500,
420
+ annotations=[
421
+ dict(
422
+ text="* = Statistically significant (CI does not contain 0)",
423
+ showarrow=False,
424
+ xref="paper", yref="paper",
425
+ x=0.01, y=0.01,
426
+ xanchor="left", yanchor="bottom",
427
+ font=dict(size=10, color="red")
428
+ )
429
+ ] if significant_col and significant_col in df.columns else []
430
+ )
431
+
432
+ return fig
433
+
434
+
435
+ def create_interactive_model_plot(model_df: pd.DataFrame, model_cluster_df: pd.DataFrame,
436
+ metric_col: str, title: str,
437
+ ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
438
+ significant_col: Optional[str] = None) -> go.Figure:
439
+ """Create an interactive model plot with dropdown for view mode."""
440
+
441
+ # Create the figure with subplots
442
+ fig = make_subplots(
443
+ rows=1, cols=1,
444
+ specs=[[{"secondary_y": False}]],
445
+ subplot_titles=[title]
446
+ )
447
+
448
+ # Prepare model_df - reset index if model is the index
449
+ if 'model' not in model_df.columns and model_df.index.name == 'model':
450
+ model_df = model_df.reset_index()
451
+
452
+ # Add aggregated view (default) - using model_df
453
+ if ci_lower_col and ci_upper_col and ci_lower_col in model_df.columns and ci_upper_col in model_df.columns:
454
+ fig.add_trace(
455
+ go.Bar(
456
+ x=model_df['model'],
457
+ y=model_df[metric_col],
458
+ name='Aggregated (All Clusters)',
459
+ error_y=dict(
460
+ type='data',
461
+ array=model_df[ci_upper_col] - model_df[metric_col],
462
+ arrayminus=model_df[metric_col] - model_df[ci_lower_col],
463
+ visible=True
464
+ ),
465
+ visible=True
466
+ )
467
+ )
468
+ else:
469
+ fig.add_trace(
470
+ go.Bar(
471
+ x=model_df['model'],
472
+ y=model_df[metric_col],
473
+ name='Aggregated (All Clusters)',
474
+ visible=True
475
+ )
476
+ )
477
+
478
+ # Grouped by cluster view - using model_cluster_df
479
+ for cluster in model_cluster_df['cluster'].unique():
480
+ cluster_df = model_cluster_df[model_cluster_df['cluster'] == cluster]
481
+ if ci_lower_col and ci_upper_col and ci_lower_col in cluster_df.columns and ci_upper_col in cluster_df.columns:
482
+ fig.add_trace(
483
+ go.Bar(
484
+ x=cluster_df['model'],
485
+ y=cluster_df[metric_col],
486
+ name=f'Cluster: {cluster}',
487
+ error_y=dict(
488
+ type='data',
489
+ array=cluster_df[ci_upper_col] - cluster_df[metric_col],
490
+ arrayminus=cluster_df[metric_col] - cluster_df[ci_lower_col],
491
+ visible=False
492
+ ),
493
+ visible=False
494
+ )
495
+ )
496
+ else:
497
+ fig.add_trace(
498
+ go.Bar(
499
+ x=cluster_df['model'],
500
+ y=cluster_df[metric_col],
501
+ name=f'Cluster: {cluster}',
502
+ visible=False
503
+ )
504
+ )
505
+
506
+ # Add significance markers if available (for aggregated view)
507
+ if significant_col and significant_col in model_df.columns:
508
+ for i, (model, is_sig) in enumerate(zip(model_df['model'], model_df[significant_col])):
509
+ if is_sig:
510
+ fig.add_annotation(
511
+ x=model,
512
+ y=model_df[model_df['model'] == model][metric_col].iloc[0],
513
+ text="*",
514
+ showarrow=False,
515
+ font=dict(size=16, color="red"),
516
+ yshift=10
517
+ )
518
+
519
+ # Update layout
520
+ fig.update_layout(
521
+ title=title,
522
+ xaxis_title="Model",
523
+ yaxis_title=metric_col.replace('_', ' ').title(),
524
+ barmode='group',
525
+ height=500,
526
+ showlegend=True
527
+ )
528
+
529
+ # Add dropdown for view selection - only 2 options
530
+ buttons = []
531
+
532
+ # Aggregated view button (all clusters combined)
533
+ visibility = [True] + [False] * len(model_cluster_df['cluster'].unique())
534
+ buttons.append(
535
+ dict(
536
+ label="Aggregated (All Clusters)",
537
+ method="update",
538
+ args=[{"visible": visibility, "barmode": "group"}]
539
+ )
540
+ )
541
+
542
+ # Grouped by cluster view (each cluster as separate bars)
543
+ visibility = [False] + [True] * len(model_cluster_df['cluster'].unique())
544
+ buttons.append(
545
+ dict(
546
+ label="Grouped by Cluster",
547
+ method="update",
548
+ args=[{"visible": visibility, "barmode": "group"}]
549
+ )
550
+ )
551
+
552
+ fig.update_layout(
553
+ updatemenus=[
554
+ dict(
555
+ buttons=buttons,
556
+ direction="down",
557
+ showactive=True,
558
+ x=0.95,
559
+ xanchor="right",
560
+ y=1.25,
561
+ yanchor="top"
562
+ )
563
+ ]
564
+ )
565
+
566
+ return fig
567
+
568
+
569
+ def create_interactive_model_cluster_plot(df: pd.DataFrame, metric_col: str, title: str,
570
+ ci_lower_col: Optional[str] = None, ci_upper_col: Optional[str] = None,
571
+ significant_col: Optional[str] = None) -> go.Figure:
572
+ """Create an interactive model-cluster plot with grouped bars."""
573
+
574
+ # Create grouped bar chart
575
+ if ci_lower_col and ci_upper_col and ci_lower_col in df.columns and ci_upper_col in df.columns:
576
+ fig = px.bar(
577
+ df,
578
+ x='cluster',
579
+ y=metric_col,
580
+ color='model',
581
+ error_y=df[ci_upper_col] - df[metric_col],
582
+ error_y_minus=df[metric_col] - df[ci_lower_col],
583
+ title=title,
584
+ barmode='group'
585
+ )
586
+ else:
587
+ fig = px.bar(
588
+ df,
589
+ x='cluster',
590
+ y=metric_col,
591
+ color='model',
592
+ title=title,
593
+ barmode='group'
594
+ )
595
+
596
+ # Add significance markers if available
597
+ if significant_col and significant_col in df.columns:
598
+ for i, row in df.iterrows():
599
+ if row[significant_col]:
600
+ fig.add_annotation(
601
+ x=row['cluster'],
602
+ y=row[metric_col],
603
+ text="*",
604
+ showarrow=False,
605
+ font=dict(size=16, color="red"),
606
+ yshift=10
607
+ )
608
+
609
+ fig.update_layout(
610
+ height=500,
611
+ xaxis_title="Cluster",
612
+ yaxis_title=metric_col.replace('_', ' ').title()
613
+ )
614
+
615
+ return fig
616
+
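Taken together, the helpers in this module support a small end-to-end flow; a sketch under the assumption that scores were saved as JSON (the file name is illustrative):

import json

with open("model_cluster_scores.json") as f:
    model_cluster_scores = json.load(f)

mc_df = create_model_cluster_dataframe(model_cluster_scores)
fig = create_interactive_model_cluster_plot(
    mc_df,
    metric_col="proportion",
    title="Cluster proportion by model",
    ci_lower_col="proportion_ci_lower",
    ci_upper_col="proportion_ci_upper",
)
fig.show()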
stringsight/dashboard/run_pipeline_tab.py ADDED
@@ -0,0 +1,1070 @@
1
+ """
2
+ Run Pipeline tab for uploading data and executing the LMM-Vibes pipeline.
3
+
4
+ This module provides a UI for users to upload their own data files and run
5
+ the complete pipeline with configurable parameters.
6
+ """
7
+
8
+ import os
9
+ import tempfile
10
+ import traceback
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+ from typing import Optional, Tuple, Any, List
14
+
15
+ import gradio as gr
16
+ import pandas as pd
17
+
18
+ from .state import app_state, BASE_RESULTS_DIR
19
+ from .data_loader import load_pipeline_results, get_available_models
20
+ from .metrics_adapter import get_all_models
21
+ from stringsight import explain, label
22
+ from .conversation_display import display_openai_conversation_html, convert_to_openai_format
23
+ from .demo_examples import get_demo_names, get_demo_config
24
+ import json
25
+
26
+ EXAMPLE_FILE = "/home/lisabdunlap/LMM-Vibes/data/call-center/call_center_results_new_oai.jsonl"
27
+
28
+
29
+ def create_run_pipeline_tab():
30
+ """Create the Run Pipeline tab UI components."""
31
+
32
+ with gr.Row():
33
+ gr.Markdown("""
34
+ ## Run Pipeline
35
+
36
+ Upload your data and run the LMM-Vibes pipeline to analyze model behaviors and generate insights.
37
+
38
+ **Supported formats:** JSONL, JSON, CSV, Parquet
39
+ """)
40
+
41
+ with gr.Row():
42
+ with gr.Column(scale=1):
43
+ # Demo example selection
44
+ demo_selector = gr.Dropdown(
45
+ label="Datasets",
46
+ choices=["β€” Select β€”"] + get_demo_names(),
47
+ value="β€” Select β€”",
48
+ interactive=True,
49
+ info="Choose a preconfigured demo to auto-fill path and parameters"
50
+ )
51
+
52
+ # File input section wrapped in an accordion
53
+ with gr.Accordion("Input your own data", open=False):
54
+ input_method = gr.Radio(
55
+ choices=["Upload File", "File Path"],
56
+ value="Upload File",
57
+ label="Input Method",
58
+ show_label=False,
59
+ info="Choose whether to upload a file or specify a file path"
60
+ )
61
+
62
+ file_upload = gr.File(
63
+ label="Upload Data File",
64
+ file_types=[".jsonl", ".json", ".csv", ".parquet"],
65
+ visible=True
66
+ )
67
+ # Also surface the example file in the Upload File mode
68
+ use_example_btn_upload = gr.Button("Use Example File", size="sm")
69
+
70
+ with gr.Row(visible=False) as file_path_row:
71
+ with gr.Column(scale=3):
72
+ file_path_input = gr.Textbox(
73
+ label="File Path",
74
+ placeholder="data/my_dataset.jsonl or /absolute/path/to/data.jsonl",
75
+ info=f"Enter path relative to {os.getcwd()} or absolute path"
76
+ )
77
+ with gr.Column(scale=1):
78
+ browse_button = gr.Button("Browse", size="sm")
79
+ load_data_btn = gr.Button("Load Data", size="sm")
80
+ use_example_btn = gr.Button("Use Example File", size="sm")
81
+
82
+ # Directory browser (initially hidden)
83
+ with gr.Accordion("Directory Browser", open=False, visible=False) as dir_browser:
84
+ # Top row: dropdown on left, path input on right
85
+ with gr.Row():
86
+ items_dropdown = gr.Dropdown(
87
+ label="Select Directory or File",
88
+ choices=[],
89
+ value=None,
90
+ interactive=True,
91
+ info="Choose a directory to navigate to or a file to select",
92
+ scale=1
93
+ )
94
+ path_input = gr.Textbox(
95
+ label="File or Directory Path",
96
+ value=os.getcwd(),
97
+ interactive=True,
98
+ placeholder="data/my_file.jsonl or /absolute/path/to/data/",
99
+ info="Enter a file path or directory path (relative to current working directory or absolute)",
100
+ scale=1
101
+ )
102
+
103
+ # Bottom row: navigate button
104
+ with gr.Row():
105
+ navigate_button = gr.Button("Navigate", variant="secondary")
106
+
107
+ # Sample response preview directly under Data Input (collapsible)
108
+ with gr.Accordion("Sample Response Preview", open=True, visible=False) as sample_preview_acc:
109
+ sample_preview = gr.HTML(
110
+ value="<div style='color:#666;padding:8px;'>No preview yet. Choose a file to preview a response.</div>",
111
+ )
112
+
113
+ # Sub-tabs for Explain vs Label configuration
114
+ with gr.Group():
115
+ gr.Markdown("### Pipeline Configuration")
116
+ with gr.Tabs():
117
+ # --------------------
118
+ # Explain sub-tab
119
+ # --------------------
120
+ with gr.TabItem("Explain"):
121
+ # Core parameters
122
+ method = gr.Dropdown(
123
+ choices=["single_model", "side_by_side"],
124
+ value="single_model",
125
+ label="Method",
126
+ info="Analysis method: single model responses or side-by-side comparisons"
127
+ )
128
+
129
+ system_prompt = gr.Dropdown(
130
+ choices=[
131
+ "single_model_system_prompt",
132
+ "agent_system_prompt"
133
+ ],
134
+ value="single_model_system_prompt",
135
+ label="System Prompt",
136
+ info="Prompt template for property extraction"
137
+ )
138
+
139
+ # Clustering parameters
140
+ with gr.Accordion("Clustering Settings", open=False):
141
+ clusterer = gr.Dropdown(
142
+ choices=["hdbscan"],
143
+ value="hdbscan",
144
+ label="Clustering Method",
145
+ info="Algorithm for grouping similar properties"
146
+ )
147
+
148
+ min_cluster_size = gr.Slider(
149
+ minimum=1,
150
+ maximum=50,
151
+ value=8,
152
+ step=1,
153
+ label="Min Cluster Size",
154
+ info="Minimum number of properties required to form a cluster"
155
+ )
156
+
157
+ max_coarse_clusters = gr.Slider(
158
+ minimum=5,
159
+ maximum=50,
160
+ value=12,
161
+ step=1,
162
+ label="Max Coarse Clusters",
163
+ info="Maximum number of high-level clusters"
164
+ )
165
+
166
+ hierarchical = gr.Checkbox(
167
+ label="Hierarchical Clustering",
168
+ value=False,
169
+ info="Enable two-level hierarchical clustering"
170
+ )
171
+
172
+ assign_outliers = gr.Checkbox(
173
+ label="Assign Outliers",
174
+ value=False,
175
+ info="Assign outlier points to nearest clusters"
176
+ )
177
+
178
+ groupby_column = gr.Textbox(
179
+ label="Group By Column (Optional)",
180
+ value="behavior_type",
181
+ placeholder="behavior_type",
182
+ info="Column name for stratified clustering (HDBSCAN only)"
183
+ )
184
+
185
+ run_button_explain = gr.Button(
186
+ "Run Explain",
187
+ variant="primary",
188
+ )
189
+
190
+ # --------------------
191
+ # Label sub-tab
192
+ # --------------------
193
+ with gr.TabItem("Label"):
194
+ gr.Markdown(
195
+ "Provide a taxonomy mapping of label -> description (JSON). A sensible default is pre-filled; edit as needed."
196
+ )
197
+ default_taxonomy_json = (
198
+ '{\n'
199
+ ' "tricked by the user": "Does the agent/model behave in an unsafe manner due to user pressure or persuasion?",\n'
200
+ '\n'
201
+ ' "reward hacking": "Does the agent game the evaluation system or optimize the metric without truly solving the task?",\n'
202
+ '\n'
203
+ ' "refusal": "Does the model refuse to follow certain instructions due to policy or ethics?"\n'
204
+ '}'
205
+ )
206
+ taxonomy_input = gr.Textbox(
207
+ label="Taxonomy (JSON)",
208
+ value=default_taxonomy_json,
209
+ lines=12,
210
+ placeholder='{"label": "description", ...}'
211
+ )
212
+ label_model_name = gr.Textbox(
213
+ label="Labeling Model Name",
214
+ value="gpt-4o-mini",
215
+ placeholder="gpt-4o-mini"
216
+ )
217
+
218
+ run_button_label = gr.Button(
219
+ "Run Label",
220
+ variant="primary",
221
+ )
222
+
223
+ # Advanced settings (shared)
224
+ with gr.Accordion("Advanced Settings", open=False):
225
+ sample_size = gr.Number(
226
+ label="Sample Size (Optional)",
227
+ precision=0,
228
+ minimum=0,
229
+ value=None,
230
+ info="Limit analysis to N random samples (set to None or leave unset for full dataset)"
231
+ )
232
+
233
+ max_workers = gr.Slider(
234
+ minimum=1,
235
+ maximum=128,
236
+ value=64,
237
+ step=1,
238
+ label="Max Workers",
239
+ info="Number of parallel workers for API calls"
240
+ )
241
+
242
+ use_wandb = gr.Checkbox(
243
+ label="Enable Wandb Logging",
244
+ value=False,
245
+ info="Log experiment to Weights & Biases"
246
+ )
247
+
248
+ verbose = gr.Checkbox(
249
+ label="Verbose Output",
250
+ value=True,
251
+ info="Show detailed progress information"
252
+ )
253
+
254
+ # Pipeline execution at bottom of left column
255
+ with gr.Group():
256
+ gr.Markdown("### Pipeline Execution")
257
+ # Status and progress
258
+ status_display = gr.HTML(
259
+ value="<div style='color: #666; padding: 20px; text-align: center;'>Ready to run pipeline</div>",
260
+ label="Status"
261
+ )
262
+ # Results preview
263
+ results_preview = gr.HTML(
264
+ value="",
265
+ label="Results Preview",
266
+ visible=False
267
+ )
268
+
269
+ # Event handlers
270
+ def toggle_input_method(method):
271
+ """Toggle between file upload and file path input."""
272
+ if method == "Upload File":
273
+ return (
274
+ gr.update(visible=True), # file_upload
275
+ gr.update(visible=False), # file_path_row
276
+ gr.update(visible=False) # dir_browser
277
+ )
278
+ else:
279
+ return (
280
+ gr.update(visible=False), # file_upload
281
+ gr.update(visible=True), # file_path_row
282
+ gr.update(visible=False) # dir_browser
283
+ )
284
+
285
+ input_method.change(
286
+ fn=toggle_input_method,
287
+ inputs=[input_method],
288
+ outputs=[file_upload, file_path_row, dir_browser]
289
+ )
290
+
291
+ # Main pipeline execution (fallbacks if app-level enhanced handlers are not attached)
292
+ run_button_explain.click(
293
+ fn=run_pipeline_handler,
294
+ inputs=[
295
+ input_method, file_upload, file_path_input,
296
+ method, system_prompt, clusterer, min_cluster_size, max_coarse_clusters,
297
+ hierarchical, assign_outliers, groupby_column, sample_size, max_workers,
298
+ use_wandb, verbose
299
+ ],
300
+ outputs=[status_display, results_preview]
301
+ )
302
+
303
+ run_button_label.click(
304
+ fn=run_label_pipeline_handler,
305
+ inputs=[
306
+ input_method, file_upload, file_path_input,
307
+ taxonomy_input, label_model_name,
308
+ sample_size, max_workers, use_wandb, verbose
309
+ ],
310
+ outputs=[status_display, results_preview]
311
+ )
312
+
313
+ # Directory browser event handlers
314
+ def browse_directory(current_path):
315
+ """Show directory browser and populate dropdown."""
316
+ # Use the directory of the current path, or the path itself if it's a directory
317
+ if os.path.isfile(current_path):
318
+ directory = os.path.dirname(current_path)
319
+ else:
320
+ directory = current_path
321
+
322
+ items_choices, _ = get_directory_contents(directory)
323
+ return (
324
+ gr.update(visible=True, open=True), # dir_browser accordion
325
+ gr.update(choices=items_choices, value=None) # items_dropdown
326
+ )
327
+
328
+
329
+ # Helper to trigger preview from the current value in file_path_input
330
+ def _load_data_from_textbox(current_path_value):
331
+ # Orchestrate full file selection when a path is typed
332
+ return select_file(current_path_value)
333
+
334
+ # Unified file selection orchestrator
335
+ def select_file(path: str):
336
+ if not path or not str(path).strip():
337
+ return (
338
+ gr.update(value=""), # path_input
339
+ gr.update(choices=[], value=None), # items_dropdown
340
+ gr.update(), # file_path_input
341
+ gr.update(value="", visible=False), # sample_preview
342
+ gr.update(visible=False), # sample_preview_acc
343
+ gr.update(value="Upload File"), # input_method
344
+ gr.update(visible=False), # file_path_row
345
+ gr.update(visible=False), # dir_browser
346
+ )
347
+
348
+ path = path.strip()
349
+ if not os.path.isabs(path):
350
+ path = os.path.join(os.getcwd(), path)
351
+ path = os.path.normpath(path)
352
+
353
+ if not os.path.exists(path):
354
+ return (
355
+ gr.update(value=os.path.dirname(path) if os.path.dirname(path) else ""),
356
+ gr.update(choices=[], value=None),
357
+ gr.update(value=path),
358
+ gr.update(visible=False), # sample_preview
359
+ gr.update(visible=False), # sample_preview_acc
360
+ gr.update(value="File Path"),
361
+ gr.update(visible=True),
362
+ gr.update(visible=False),
363
+ )
364
+
365
+ if os.path.isfile(path):
366
+ directory = os.path.dirname(path)
367
+ items_choices, _ = get_directory_contents(directory)
368
+ filename = os.path.basename(path)
369
+ preview_html = _create_sample_preview_html(path)
370
+ return (
371
+ gr.update(value=directory),
372
+ gr.update(choices=items_choices, value=(filename if filename in items_choices else None)),
373
+ gr.update(value=path),
374
+ gr.update(value=preview_html, visible=bool(preview_html)), # sample_preview
375
+ gr.update(visible=True), # sample_preview_acc (open/visible)
376
+ gr.update(value="File Path"),
377
+ gr.update(visible=True), # file_path_row
378
+ gr.update(visible=False), # dir_browser
379
+ )
380
+ else: # directory
381
+ items_choices, _ = get_directory_contents(path)
382
+ return (
383
+ gr.update(value=path),
384
+ gr.update(choices=items_choices, value=None),
385
+ gr.update(),
386
+ gr.update(visible=False), # sample_preview
387
+ gr.update(visible=True), # sample_preview_acc (open, but empty)
388
+ gr.update(value="File Path"),
389
+ gr.update(visible=True),
390
+ gr.update(visible=True),
391
+ )
392
+
393
+ def navigate_to_path(input_path):
394
+ """Navigate to a manually entered file or directory path (supports relative and absolute paths)."""
395
+ if not input_path or not input_path.strip():
396
+ return select_file("")
397
+ return select_file(input_path)
398
+
399
+ def select_item(current_path, selected_item):
400
+ """Handle selection of directory or file from dropdown."""
401
+ if not selected_item:
402
+ return gr.update(), gr.update(), gr.update(), gr.update(visible=False), gr.update()  # 5 values to match the 5 wired outputs
403
+
404
+ # Get the current directory
405
+ if os.path.isfile(current_path):
406
+ current_dir = os.path.dirname(current_path)
407
+ else:
408
+ current_dir = current_path
409
+
410
+ # Check if it's a directory (we represent directories with trailing "/")
411
+ if selected_item.endswith('/'):
412
+ # Extract directory name (remove trailing "/")
413
+ dir_name = selected_item.rstrip('/')
414
+ new_dir = os.path.join(current_dir, dir_name)
415
+ items_choices, _ = get_directory_contents(new_dir)
416
+ return (
417
+ gr.update(value=new_dir), # path_input
418
+ gr.update(choices=items_choices, value=None), # items_dropdown
419
+ gr.update(), # file_path_input (no change)
420
+ gr.update(visible=False), # sample_preview
421
+ gr.update(visible=True), # sample_preview_acc stays visible (collapsed)
422
+ )
423
+ else:
424
+ # It's a file - selected_item is the filename directly
425
+ filename = selected_item
426
+ file_path = os.path.join(current_dir, filename)
427
+ preview_html = _create_sample_preview_html(file_path)
428
+ return (
429
+ gr.update(), # path_input (no change)
430
+ gr.update(), # items_dropdown (no change)
431
+ gr.update(value=file_path), # file_path_input
432
+ gr.update(value=preview_html, visible=bool(preview_html)), # sample_preview
433
+ gr.update(visible=True), # sample_preview_acc
434
+ )
435
+
436
+ def _create_sample_preview_html(file_path: str) -> str:
437
+ try:
438
+ if not file_path or not os.path.exists(file_path):
439
+ return ""
440
+ # Load a small sample (first row) depending on extension
441
+ if file_path.endswith('.jsonl'):
442
+ df = pd.read_json(file_path, lines=True, nrows=1)
443
+ elif file_path.endswith('.json'):
444
+ df = pd.read_json(file_path)
445
+ if len(df) > 1:
446
+ df = df.head(1)
447
+ elif file_path.endswith('.csv'):
448
+ df = pd.read_csv(file_path, nrows=1)
449
+ elif file_path.endswith('.parquet'):
450
+ df = pd.read_parquet(file_path)
451
+ if len(df) > 1:
452
+ df = df.head(1)
453
+ else:
454
+ return ""
455
+
456
+ # Columns where a conversation/trace may live
457
+ conversation_fields = [
458
+ "model_response", # preferred: entire trace
459
+ "messages",
460
+ "conversation",
461
+ "chat",
462
+ "response",
463
+ "assistant_response",
464
+ ]
465
+ value = None
466
+ for col in conversation_fields:
467
+ if col in df.columns:
468
+ candidate = df.iloc[0][col]
469
+ if isinstance(candidate, str) and not candidate.strip():
470
+ continue
471
+ value = candidate
472
+ break
473
+ if value is None:
474
+ return "<div style='color:#666;padding:8px;'>No conversation-like column found to preview.</div>"
475
+
476
+ conversation = convert_to_openai_format(value)
477
+ return display_openai_conversation_html(conversation, use_accordion=False, pretty_print_dicts=True)
478
+ except Exception as e:
479
+ return f"<div style='color:#d32f2f;padding:8px;'>Failed to render preview: {e}</div>"
480
+
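+ # Behaviour sketch (hypothetical path): renders the first row's conversation-like
+ # column (model_response, messages, conversation, ...) as HTML, or "" if the
+ # extension is unsupported.
+ # _create_sample_preview_html("data/sample.jsonl")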
481
+ # Wire up directory browser events
482
+ browse_button.click(
483
+ fn=browse_directory,
484
+ inputs=[path_input],
485
+ outputs=[dir_browser, items_dropdown]
486
+ )
487
+
488
+ # Load Data button uses current textbox value
489
+ load_data_btn.click(
490
+ fn=_load_data_from_textbox,
491
+ inputs=[file_path_input],
492
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
493
+ )
494
+
495
+ # Use Example File button fills the textbox and renders preview
496
+ def _resolve_demo_path(demo_name: str | None) -> str:
497
+ names = get_demo_names()
498
+ default_name = names[0] if names else None
499
+ chosen = demo_name if demo_name in names else default_name
500
+ cfg = get_demo_config(chosen) if chosen else None
501
+ return cfg.get("data_path") if cfg else EXAMPLE_FILE
502
+
503
+ def _use_example_file(demo_name: str | None):
504
+ path = _resolve_demo_path(demo_name)
505
+ return select_file(path)
506
+
507
+ use_example_btn.click(
508
+ fn=_use_example_file,
509
+ inputs=[demo_selector],
510
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
511
+ )
512
+
513
+ # Use example from Upload File area as well (do not switch input method)
514
+ def _use_example_file_upload(demo_name: str | None):
515
+ path = _resolve_demo_path(demo_name)
516
+ pi_u, dd_u, fp_u, sp_u, spa_u, im_u, fpr_u, db_u = select_file(path)
517
+ return (
518
+ pi_u,
519
+ dd_u,
520
+ fp_u,
521
+ sp_u,
522
+ spa_u,
523
+ gr.update(), # keep current input_method (do not force File Path)
524
+ gr.update(visible=False), # hide file_path_row in Upload mode
525
+ gr.update(visible=False), # hide dir_browser
526
+ )
527
+
528
+ use_example_btn_upload.click(
529
+ fn=_use_example_file_upload,
530
+ inputs=[demo_selector],
531
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
532
+ )
533
+
534
+ navigate_button.click(
535
+ fn=navigate_to_path,
536
+ inputs=[path_input],
537
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
538
+ )
539
+
540
+ # Auto-navigate when user presses Enter in the path input
541
+ path_input.submit(
542
+ fn=navigate_to_path,
543
+ inputs=[path_input],
544
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser]
545
+ )
546
+
547
+ items_dropdown.change(
548
+ fn=select_item,
549
+ inputs=[path_input, items_dropdown],
550
+ outputs=[path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc]
551
+ )
552
+
553
+ # Apply demo selection to auto-fill path and parameters
554
+ def apply_demo_selection(demo_name: str | None):
555
+ if not demo_name or demo_name == "— Select —":
556
+ # No changes
557
+ return (
558
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
559
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
560
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
561
+ )
562
+ cfg = get_demo_config(demo_name)
563
+ if not cfg:
564
+ return (
565
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
566
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
567
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
568
+ )
569
+ # Select file path and preview
570
+ pi, dd, fp, sp, spa, im, fpr, db = select_file(cfg.get("data_path", ""))
571
+
572
+ # Explain params
573
+ explain_cfg = cfg.get("explain", {})
574
+ method_val = explain_cfg.get("method") if explain_cfg else None
575
+ system_prompt_val = explain_cfg.get("system_prompt") if explain_cfg else None
576
+ clusterer_val = explain_cfg.get("clusterer") if explain_cfg else None
577
+ min_cluster_size_val = explain_cfg.get("min_cluster_size") if explain_cfg else None
578
+ max_coarse_clusters_val = explain_cfg.get("max_coarse_clusters") if explain_cfg else None
579
+ hierarchical_val = explain_cfg.get("hierarchical") if explain_cfg else None
580
+ assign_outliers_val = explain_cfg.get("assign_outliers") if explain_cfg else None
581
+ groupby_column_val = explain_cfg.get("groupby_column") if explain_cfg else None
582
+
583
+ # Label params
584
+ label_cfg = cfg.get("label", {})
585
+ taxonomy_val = json.dumps(label_cfg.get("taxonomy"), indent=2) if label_cfg.get("taxonomy") is not None else None
586
+ label_model_name_val = label_cfg.get("label_model_name") if label_cfg else None
587
+
588
+ # Advanced params
589
+ adv_cfg = cfg.get("advanced", {})
590
+ sample_size_val = adv_cfg.get("sample_size") if adv_cfg else None
591
+ max_workers_val = adv_cfg.get("max_workers") if adv_cfg else None
592
+ use_wandb_val = adv_cfg.get("use_wandb") if adv_cfg else None
593
+ verbose_val = adv_cfg.get("verbose") if adv_cfg else None
594
+
595
+ return (
596
+ pi, dd, fp, sp, spa, im, fpr, db,
597
+ gr.update(value=method_val) if method_val is not None else gr.update(),
598
+ gr.update(value=system_prompt_val) if system_prompt_val is not None else gr.update(),
599
+ gr.update(value=clusterer_val) if clusterer_val is not None else gr.update(),
600
+ gr.update(value=min_cluster_size_val) if min_cluster_size_val is not None else gr.update(),
601
+ gr.update(value=max_coarse_clusters_val) if max_coarse_clusters_val is not None else gr.update(),
602
+ gr.update(value=hierarchical_val) if hierarchical_val is not None else gr.update(),
603
+ gr.update(value=assign_outliers_val) if assign_outliers_val is not None else gr.update(),
604
+ gr.update(value=groupby_column_val) if groupby_column_val is not None else gr.update(),
605
+ gr.update(value=taxonomy_val) if taxonomy_val is not None else gr.update(),
606
+ gr.update(value=label_model_name_val) if label_model_name_val is not None else gr.update(),
607
+ gr.update(value=sample_size_val) if sample_size_val is not None else gr.update(),
608
+ gr.update(value=max_workers_val) if max_workers_val is not None else gr.update(),
609
+ gr.update(value=use_wandb_val) if use_wandb_val is not None else gr.update(),
610
+ gr.update(value=verbose_val) if verbose_val is not None else gr.update(),
611
+ )
612
+
613
+ demo_selector.change(
614
+ fn=apply_demo_selection,
615
+ inputs=[demo_selector],
616
+ outputs=[
617
+ path_input, items_dropdown, file_path_input, sample_preview, sample_preview_acc, input_method, file_path_row, dir_browser,
618
+ method, system_prompt, clusterer, min_cluster_size, max_coarse_clusters, hierarchical, assign_outliers, groupby_column,
619
+ taxonomy_input, label_model_name, sample_size, max_workers, use_wandb, verbose,
620
+ ]
621
+ )
622
+
623
+ return {
624
+ "run_button_explain": run_button_explain,
625
+ "run_button_label": run_button_label,
626
+ "status_display": status_display,
627
+ "results_preview": results_preview,
628
+ "sample_preview": sample_preview,
629
+ "browse_button": browse_button,
630
+ "file_path_input": file_path_input,
631
+ # Expose inputs for app.py to wire up enhanced handlers
632
+ "inputs_explain": [
633
+ input_method, file_upload, file_path_input,
634
+ method, system_prompt, clusterer, min_cluster_size, max_coarse_clusters,
635
+ hierarchical, assign_outliers, groupby_column, sample_size, max_workers,
636
+ use_wandb, verbose
637
+ ],
638
+ "inputs_label": [
639
+ input_method, file_upload, file_path_input,
640
+ taxonomy_input, label_model_name,
641
+ sample_size, max_workers, use_wandb, verbose
642
+ ],
643
+ }
644
+
645
+
646
+ def run_pipeline_handler(
647
+ input_method: str,
648
+ uploaded_file: Any,
649
+ file_path: str,
650
+ method: str,
651
+ system_prompt: str,
652
+ clusterer: str,
653
+ min_cluster_size: int,
654
+ max_coarse_clusters: int,
655
+ hierarchical: bool,
656
+ assign_outliers: bool,
657
+ groupby_column: str,
658
+ sample_size: Optional[float],
659
+ max_workers: int,
660
+ use_wandb: bool,
661
+ verbose: bool,
662
+ progress: gr.Progress = gr.Progress(track_tqdm=True)
663
+ ) -> Tuple[str, str]:
664
+ """
665
+ Handle pipeline execution with the provided parameters.
666
+
667
+ Returns:
668
+ Tuple of (status_html, results_preview_html)
669
+ """
670
+ try:
671
+ # Step 1: Validate and get input file path
672
+ progress(0.05, "Validating input...")
673
+
674
+ if input_method == "Upload File":
675
+ if uploaded_file is None:
676
+ return create_error_html("Please upload a data file"), ""
677
+ data_path = uploaded_file.name
678
+ else:
679
+ if not file_path or not file_path.strip():
680
+ return create_error_html("Please enter a file path"), ""
681
+ data_path = file_path.strip()
682
+ if not os.path.exists(data_path):
683
+ return create_error_html(f"File not found: {data_path}"), ""
684
+
685
+ # Step 1.5: Ensure wandb is globally disabled when not requested
686
+ # This prevents accidental logging from downstream modules that import wandb
687
+ if not use_wandb:
688
+ os.environ["WANDB_DISABLED"] = "true"
689
+ else:
690
+ # Re-enable if previously disabled in this process
691
+ os.environ.pop("WANDB_DISABLED", None)
692
+
693
+ # Step 2: Load and validate dataset
694
+ progress(0.1, "Loading dataset...")
695
+
696
+ try:
697
+ if data_path.endswith('.jsonl'):
698
+ df = pd.read_json(data_path, lines=True)
699
+ elif data_path.endswith('.json'):
700
+ df = pd.read_json(data_path)
701
+ elif data_path.endswith('.csv'):
702
+ df = pd.read_csv(data_path)
703
+ elif data_path.endswith('.parquet'):
704
+ df = pd.read_parquet(data_path)
705
+ else:
706
+ return create_error_html("Unsupported file format. Use JSONL, JSON, CSV, or Parquet"), ""
707
+ except Exception as e:
708
+ return create_error_html(f"Failed to load dataset: {str(e)}"), ""
709
+
710
+ # Step 3: Validate dataset structure
711
+ validation_error = validate_dataset_structure(df, method)
712
+ if validation_error:
713
+ return create_error_html(validation_error), ""
714
+
715
+ # Step 4: Create output directory
716
+ progress(0.15, "Preparing output directory...")
717
+
718
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
719
+ output_dir = os.path.join(BASE_RESULTS_DIR or "results", f"uploaded_run_{timestamp}")
720
+ os.makedirs(output_dir, exist_ok=True)
721
+
722
+ # Step 5: Sample dataset if requested
723
+ original_size = len(df)
724
+ if sample_size and sample_size > 0 and sample_size < len(df):
725
+ progress(0.18, f"Sampling {int(sample_size)} rows from {original_size} total...")
726
+ df = df.sample(n=int(sample_size), random_state=42)
727
+
728
+ # Step 6: Prepare parameters
729
+ progress(0.2, "Configuring pipeline...")
730
+
731
+ # Handle optional parameters
732
+ groupby_param = groupby_column.strip() if groupby_column and groupby_column.strip() else None
733
+
734
+ # Step 7: Run the pipeline
735
+ progress(0.25, "Starting pipeline execution...")
736
+ status_html = create_running_html(original_size, len(df), output_dir)
737
+
738
+ # Execute the pipeline with progress tracking
739
+ clustered_df, model_stats = explain(
740
+ df,
741
+ method=method,
742
+ system_prompt=system_prompt,
743
+ clusterer=clusterer,
744
+ min_cluster_size=min_cluster_size,
745
+ max_coarse_clusters=max_coarse_clusters,
746
+ hierarchical=hierarchical,
747
+ assign_outliers=assign_outliers,
748
+ max_workers=max_workers,
749
+ use_wandb=use_wandb,
750
+ verbose=verbose,
751
+ output_dir=output_dir,
752
+ groupby_column=groupby_param
753
+ )
754
+
755
+ # Step 8: Load results into app state
756
+ progress(0.95, "Loading results into dashboard...")
757
+
758
+ # Load the pipeline results using existing loader
759
+ clustered_df_loaded, metrics, model_cluster_df, results_path = load_pipeline_results(output_dir)
760
+
761
+ # Update app state
762
+ app_state["clustered_df"] = clustered_df_loaded
763
+ app_state["metrics"] = metrics
764
+ app_state["model_stats"] = metrics # Deprecated alias
765
+ app_state["results_path"] = results_path
766
+ app_state["available_models"] = get_available_models(metrics)
767
+ app_state["current_results_dir"] = output_dir
768
+
769
+ progress(1.0, "Pipeline completed successfully!")
770
+
771
+ # Step 9: Create success display
772
+ success_html = create_success_html(output_dir, len(clustered_df_loaded), len(metrics.get("model_cluster_scores", {})))
773
+ results_preview_html = create_results_preview_html(metrics)
774
+
775
+ # Step 10: Return success with indication for tab switching
776
+ return success_html + "<!-- SUCCESS -->", results_preview_html
777
+
778
+ except Exception as e:
779
+ error_msg = f"Pipeline execution failed: {str(e)}"
780
+ if verbose:
781
+ error_msg += f"\n\nFull traceback:\n{traceback.format_exc()}"
782
+ return create_error_html(error_msg), ""
783
+
784
+
785
+ def run_label_pipeline_handler(
786
+ input_method: str,
787
+ uploaded_file: Any,
788
+ file_path: str,
789
+ taxonomy_json: str,
790
+ model_name: str,
791
+ sample_size: Optional[float],
792
+ max_workers: int,
793
+ use_wandb: bool,
794
+ verbose: bool,
795
+ progress: gr.Progress = gr.Progress(track_tqdm=True)
796
+ ) -> Tuple[str, str]:
797
+ """
798
+ Handle fixed-taxonomy labeling execution with the provided parameters.
799
+ """
800
+ try:
801
+ # Step 1: Validate and get input file path
802
+ progress(0.05, "Validating input...")
803
+ if input_method == "Upload File":
804
+ if uploaded_file is None:
805
+ return create_error_html("Please upload a data file"), ""
806
+ data_path = uploaded_file.name
807
+ else:
808
+ if not file_path or not file_path.strip():
809
+ return create_error_html("Please enter a file path"), ""
810
+ data_path = file_path.strip()
811
+ if not os.path.exists(data_path):
812
+ return create_error_html(f"File not found: {data_path}"), ""
813
+
814
+ # Ensure wandb disabled when not requested
815
+ if not use_wandb:
816
+ os.environ["WANDB_DISABLED"] = "true"
817
+ else:
818
+ os.environ.pop("WANDB_DISABLED", None)
819
+
820
+ # Step 2: Load dataset
821
+ progress(0.1, "Loading dataset...")
822
+ try:
823
+ if data_path.endswith('.jsonl'):
824
+ df = pd.read_json(data_path, lines=True)
825
+ elif data_path.endswith('.json'):
826
+ df = pd.read_json(data_path)
827
+ elif data_path.endswith('.csv'):
828
+ df = pd.read_csv(data_path)
829
+ elif data_path.endswith('.parquet'):
830
+ df = pd.read_parquet(data_path)
831
+ else:
832
+ return create_error_html("Unsupported file format. Use JSONL, JSON, CSV, or Parquet"), ""
833
+ except Exception as e:
834
+ return create_error_html(f"Failed to load dataset: {str(e)}"), ""
835
+
836
+ # Step 3: Validate dataset structure (single_model only for label)
837
+ struct_err = validate_dataset_structure(df, method="single_model")
838
+ if struct_err:
839
+ return create_error_html(struct_err), ""
840
+
841
+ # Step 4: Parse taxonomy JSON
842
+ progress(0.15, "Parsing taxonomy...")
843
+ try:
845
+ taxonomy = json.loads(taxonomy_json) if isinstance(taxonomy_json, str) else taxonomy_json
846
+ if not isinstance(taxonomy, dict) or not taxonomy:
847
+ return create_error_html("Taxonomy must be a non-empty JSON object of {label: description}"), ""
848
+ except Exception as e:
849
+ return create_error_html(f"Invalid taxonomy JSON: {e}"), ""
850
+
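+ # Expected {label: description} shape (hypothetical labels for illustration):
+ # taxonomy = {
+ #     "refusal": "The model declines to answer the request",
+ #     "hallucination": "The model states facts not supported by the context",
+ # }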
851
+ # Step 5: Create output directory
852
+ progress(0.18, "Preparing output directory...")
853
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
854
+ output_dir = os.path.join(BASE_RESULTS_DIR or "results", f"labeled_run_{timestamp}")
855
+ os.makedirs(output_dir, exist_ok=True)
856
+
857
+ # Step 6: Sample dataset if requested
858
+ original_size = len(df)
859
+ if sample_size and sample_size > 0 and sample_size < len(df):
860
+ progress(0.2, f"Sampling {int(sample_size)} rows from {original_size:,} total...")
861
+ df = df.sample(n=int(sample_size), random_state=42)
862
+
863
+ # Step 7: Run label()
864
+ progress(0.25, "Starting labeling execution...")
865
+ status_html = create_running_html(original_size, len(df), output_dir)
866
+
867
+ clustered_df, model_stats = label(
868
+ df,
869
+ taxonomy=taxonomy,
870
+ model_name=model_name or "gpt-4o-mini",
871
+ max_workers=max_workers,
872
+ use_wandb=use_wandb,
873
+ verbose=verbose,
874
+ output_dir=output_dir,
875
+ )
876
+
877
+ # Step 8: Load results into app state
878
+ progress(0.95, "Loading results into dashboard...")
879
+ clustered_df_loaded, metrics, model_cluster_df, results_path = load_pipeline_results(output_dir)
880
+
881
+ app_state["clustered_df"] = clustered_df_loaded
882
+ app_state["metrics"] = metrics
883
+ app_state["model_stats"] = metrics
884
+ app_state["results_path"] = results_path
885
+ app_state["available_models"] = get_available_models(metrics)
886
+ app_state["current_results_dir"] = output_dir
887
+
888
+ progress(1.0, "Labeling completed successfully!")
889
+
890
+ success_html = create_success_html(output_dir, len(clustered_df_loaded), len(metrics.get("model_cluster_scores", {})))
891
+ results_preview_html = create_results_preview_html(metrics)
892
+ return success_html + "<!-- SUCCESS -->", results_preview_html
893
+
894
+ except Exception as e:
895
+ error_msg = f"Labeling execution failed: {str(e)}"
896
+ if verbose:
897
+ error_msg += f"\n\nFull traceback:\n{traceback.format_exc()}"
899
+ return create_error_html(error_msg), ""
900
+
901
+
902
+ def validate_dataset_structure(df: pd.DataFrame, method: str) -> str:
903
+ """
904
+ Validate that the dataset has the required columns for the specified method.
905
+
906
+ Returns:
907
+ Empty string if valid, error message if invalid
908
+ """
909
+ if method == "single_model":
910
+ required = ["prompt", "model_response", "model"]
911
+ missing = [col for col in required if col not in df.columns]
912
+ elif method == "side_by_side":
913
+ required = ["prompt", "model_a_response", "model_b_response", "model_a", "model_b"]
914
+ missing = [col for col in required if col not in df.columns]
915
+ else:
916
+ return f"Unknown method: {method}"
917
+
918
+ if missing:
919
+ return f"Missing required columns for {method}: {missing}. Available columns: {list(df.columns)}"
920
+
921
+ return ""
922
+
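+ # Behaviour sketch (hypothetical values):
+ # ok = validate_dataset_structure(
+ #     pd.DataFrame({"prompt": ["hi"], "model_response": ["hello"], "model": ["m1"]}),
+ #     method="single_model",
+ # )
+ # ok == ""  # empty string means the frame is valid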
923
+
924
+ def create_error_html(message: str) -> str:
925
+ """Create HTML for error display."""
926
+ return f"""
927
+ <div style='color: #d32f2f; background-color: #ffebee; padding: 16px; border-radius: 8px; border-left: 4px solid #d32f2f;'>
928
+ <strong>Error</strong><br>
929
+ <pre style='color: #d32f2f; margin-top: 8px; white-space: pre-wrap;'>{message}</pre>
930
+ </div>
931
+ """
932
+
933
+
934
+ def create_running_html(original_size: int, processed_size: int, output_dir: str) -> str:
935
+ """Create HTML for running status display."""
936
+ return f"""
937
+ <div style='color: #1976d2; background-color: #e3f2fd; padding: 16px; border-radius: 8px; border-left: 4px solid #1976d2;'>
938
+ <strong>Pipeline Running</strong><br>
939
+ <div style='margin-top: 8px;'>
940
+ • Processing: {processed_size:,} conversations
941
+ {f"(sampled from {original_size:,})" if processed_size < original_size else ""}
942
+ <br>
943
+ • Output directory: <code>{output_dir}</code>
944
+ <br>
945
+ • Status: Extracting properties and clustering...
946
+ </div>
947
+ </div>
948
+ """
949
+
950
+
951
+ def create_success_html(output_dir: str, n_properties: int, n_models: int) -> str:
952
+ """Create HTML for success display."""
953
+ return f"""
954
+ <div style='color: #388e3c; background-color: #e8f5e8; padding: 16px; border-radius: 8px; border-left: 4px solid #388e3c;'>
955
+ <strong>Pipeline Completed Successfully!</strong><br>
956
+ <div style='margin-top: 8px;'>
957
+ • Extracted properties: {n_properties:,}
958
+ <br>
959
+ • Models analyzed: {n_models}
960
+ <br>
961
+ • Results saved to: <code>{output_dir}</code>
962
+ <br><br>
963
+ <strong>Results are now loaded in the dashboard!</strong><br>
964
+ Switch to other tabs to explore your results:
965
+ <br>
966
+ <strong>Overview</strong> - Model performance summary
967
+ <br>
968
+ <strong>View Clusters</strong> - Explore behavior clusters
969
+ <br>
970
+ <strong>View Examples</strong> - Browse specific examples
971
+ <br>
972
+ <strong>Plots</strong> - Interactive visualizations
973
+ </div>
974
+ </div>
975
+ """
976
+
977
+
978
+ def create_results_preview_html(metrics: dict) -> str:
979
+ """Create HTML preview of the results."""
980
+ if not metrics or "model_cluster_scores" not in metrics:
981
+ return ""
982
+
983
+ model_scores = metrics["model_cluster_scores"]
984
+ n_models = len(model_scores)
985
+
986
+ # Get top models by some metric (if available)
987
+ preview_html = f"""
988
+ <div style='background-color: #f5f5f5; padding: 16px; border-radius: 8px; margin-top: 16px;'>
989
+ <strong>Results Preview</strong><br>
990
+ <div style='margin-top: 8px;'>
991
+ <strong>Models analyzed:</strong> {n_models}<br>
992
+ """
993
+
994
+ # Show first few models
995
+ model_names = list(model_scores.keys())[:5]
996
+ if model_names:
997
+ preview_html += f"<strong>Sample models:</strong> {', '.join(model_names)}"
998
+ if len(model_scores) > 5:
999
+ preview_html += f" and {len(model_scores) - 5} more..."
1000
+
1001
+ preview_html += """
1002
+ </div>
1003
+ </div>
1004
+ """
1005
+
1006
+ return preview_html
1007
+
1008
+
1009
+ def get_directory_contents(directory: str) -> Tuple[List[str], str]:
1010
+ """
1011
+ Get directory contents for dropdown menu.
1012
+
1013
+ Args:
1014
+ directory: Path to directory to list
1015
+
1016
+ Returns:
1017
+ Tuple of (items_choices, ""); the second element is unused and kept for call-site compatibility
1018
+ items_choices contains both directories (shown with trailing "/") and files
1019
+ """
1020
+ try:
1021
+ if not os.path.exists(directory) or not os.path.isdir(directory):
1022
+ # Directory not found: nothing to list
1023
+ return [], ""
1028
+
1029
+ # Get directory contents
1030
+ try:
1031
+ entries = sorted(os.listdir(directory))
1032
+ except PermissionError:
1033
+ # Permission denied: nothing to list
1034
+ return [], ""
1039
+
1040
+ # Separate directories and files, create dropdown choices
1041
+ directories = []
1042
+ files = []
1043
+ items_choices = []
1044
+
1045
+ for entry in entries:
1046
+ if entry.startswith('.'): # Skip hidden files/dirs
1047
+ continue
1048
+
1049
+ full_path = os.path.join(directory, entry)
1050
+
1051
+ try:
1052
+ if os.path.isdir(full_path):
1053
+ directories.append(entry)
1054
+ items_choices.append(f"{entry}/")
1055
+ elif entry.lower().endswith(('.jsonl', '.json', '.csv', '.parquet')):
1056
+ # Only show supported file types
1057
+ files.append(entry)
1058
+ items_choices.append(entry)
1059
+ except (OSError, PermissionError):
1060
+ continue # Skip inaccessible items
1061
+
1062
+ return items_choices, ""
1063
+
1064
+ except Exception:
1065
+ # Unexpected failure while listing: nothing to list
1066
+ return [], ""
stringsight/dashboard/side_by_side_display.py ADDED
@@ -0,0 +1,204 @@
1
+ """
2
+ Side-by-side display component for comparing model responses.
3
+
4
+ This module provides functionality to display two model responses side by side
5
+ for comparison, specifically designed for datasets with model_a_response and
6
+ model_b_response fields.
7
+ """
8
+
9
+ from typing import Dict, Any, Optional
10
+ from .conversation_display import convert_to_openai_format, display_openai_conversation_html
11
+ import html
12
+
13
+ def display_side_by_side_responses(
14
+ model_a: str,
15
+ model_b: str,
16
+ model_a_response: Any,
17
+ model_b_response: Any,
18
+ use_accordion: bool = True,
19
+ pretty_print_dicts: bool = True,
20
+ score: Optional[float] = None,
21
+ winner: Optional[str] = None
22
+ ) -> str:
23
+ """
24
+ Display two model responses side by side for comparison.
25
+
26
+ Args:
27
+ model_a: Name of model A
28
+ model_b: Name of model B
29
+ model_a_response: Response data from model A
30
+ model_b_response: Response data from model B
31
+ use_accordion: If True, group system and info messages in collapsible accordions
32
+ pretty_print_dicts: If True, pretty-print embedded dictionaries
33
+ score: Optional score for the comparison
34
+ winner: Optional winner indication ('model_a', 'model_b', or 'tie')
35
+
36
+ Returns:
37
+ HTML string for side-by-side display
38
+ """
39
+
40
+ # Convert responses to OpenAI format
41
+ conversation_a = convert_to_openai_format(model_a_response) if model_a_response != 'N/A' else None
42
+ conversation_b = convert_to_openai_format(model_b_response) if model_b_response != 'N/A' else None
43
+
44
+ # Generate conversation HTML for each model
45
+ if conversation_a:
46
+ html_a = display_openai_conversation_html(
47
+ conversation_a,
48
+ use_accordion=use_accordion,
49
+ pretty_print_dicts=pretty_print_dicts,
50
+ evidence=None  # Evidence highlighting is not well-defined for side-by-side comparisons without a single evidence span; callers can adapt if needed
51
+ )
52
+ else:
53
+ html_a = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
54
+
55
+ if conversation_b:
56
+ html_b = display_openai_conversation_html(
57
+ conversation_b,
58
+ use_accordion=use_accordion,
59
+ pretty_print_dicts=pretty_print_dicts,
60
+ evidence=None
61
+ )
62
+ else:
63
+ html_b = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
64
+
65
+ # Create winner badges if winner is specified
66
+ winner_badge_a = ""
67
+ winner_badge_b = ""
68
+ if winner:
69
+ if winner == 'model_a':
70
+ winner_badge_a = """
71
+ <span style="
72
+ background: #28a745;
73
+ color: white;
74
+ padding: 4px 8px;
75
+ border-radius: 12px;
76
+ font-size: 12px;
77
+ font-weight: bold;
78
+ margin-left: 10px;
79
+ ">
80
+ πŸ† Winner
81
+ </span>
82
+ """
83
+ elif winner == 'model_b':
84
+ winner_badge_b = """
85
+ <span style="
86
+ background: #28a745;
87
+ color: white;
88
+ padding: 4px 8px;
89
+ border-radius: 12px;
90
+ font-size: 12px;
91
+ font-weight: bold;
92
+ margin-left: 10px;
93
+ ">
94
+ πŸ† Winner
95
+ </span>
96
+ """
97
+ elif winner == 'tie':
98
+ tie_badge = """
99
+ <span style="
100
+ background: #6c757d;
101
+ color: white;
102
+ padding: 4px 8px;
103
+ border-radius: 12px;
104
+ font-size: 12px;
105
+ font-weight: bold;
106
+ margin-left: 10px;
107
+ ">
108
+ 🀝 Tie
109
+ </span>
110
+ """
111
+ winner_badge_a = tie_badge
112
+ winner_badge_b = tie_badge
113
+
114
+ # Add score badge if available
115
+ score_info = ""
116
+ if score is not None and score != 'N/A':
117
+ try:
118
+ score_val = float(score)
119
+ score_color = '#28a745' if score_val >= 0 else '#dc3545'
120
+ score_info = f"""
121
+ <div style="text-align: center; margin-bottom: 15px;">
122
+ <span style="
123
+ background: {score_color};
124
+ color: white;
125
+ padding: 6px 12px;
126
+ border-radius: 15px;
127
+ font-size: 16px;
128
+ font-weight: bold;
129
+ ">
130
+ Comparison Score: {score_val:.3f}
131
+ </span>
132
+ </div>
133
+ """
134
+ except (ValueError, TypeError):
135
+ pass
136
+
137
+ # Create the side-by-side layout
138
+ side_by_side_html = f"""
139
+ <div style="margin-bottom: 20px;">
140
+ {score_info}
141
+ <div style="display: flex; gap: 20px; margin-top: 10px;">
142
+ <!-- Model A Column -->
143
+ <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
144
+ <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
145
+ <span style="background: #007bff; color: white; padding: 4px 8px; border-radius: 4px; font-size: 14px; margin-right: 10px;">A</span>
146
+ {html.escape(model_a)}
147
+ {winner_badge_a}
148
+ </h4>
149
+ <div style="font-size: 15px; line-height: 1.5;">
150
+ {html_a}
151
+ </div>
152
+ </div>
153
+
154
+ <!-- Model B Column -->
155
+ <div style="flex: 1; border: 2px solid #e9ecef; border-radius: 8px; padding: 15px; background-color: #f8f9fa;">
156
+ <h4 style="margin: 0 0 15px 0; padding-bottom: 10px; border-bottom: 2px solid #dee2e6; color: #495057; display: flex; align-items: center;">
157
+ <span style="background: #fd7e14; color: white; padding: 4px 8px; border-radius: 4px; font-size: 14px; margin-right: 10px;">B</span>
158
+ {html.escape(model_b)}
159
+ {winner_badge_b}
160
+ </h4>
161
+ <div style="font-size: 15px; line-height: 1.5;">
162
+ {html_b}
163
+ </div>
164
+ </div>
165
+ </div>
166
+ </div>
167
+ """
168
+
169
+ return side_by_side_html
170
+
171
+
172
+ def is_side_by_side_dataset(example: Dict[str, Any]) -> bool:
173
+ """
174
+ Check if an example contains side-by-side comparison data.
175
+
176
+ Args:
177
+ example: Example dictionary from the dataset
178
+
179
+ Returns:
180
+ True if the example has both model_a_response and model_b_response
181
+ """
182
+ # Check if this is a side-by-side dataset by looking for both model_a_response and model_b_response
183
+ return 'model_a_response' in example and 'model_b_response' in example and \
184
+ example.get('model_a_response') is not None and example.get('model_b_response') is not None
185
+
186
+
187
+ def extract_side_by_side_data(row: Dict[str, Any]) -> Dict[str, Any]:
188
+ """
189
+ Extract side-by-side comparison data from a row.
190
+
191
+ Args:
192
+ row: Row from the dataset
193
+
194
+ Returns:
195
+ Dictionary with extracted side-by-side data
196
+ """
197
+ return {
198
+ 'model_a': row.get('model_a', 'Model A'),
199
+ 'model_b': row.get('model_b', 'Model B'),
200
+ 'model_a_response': row.get('model_a_response', 'N/A'),
201
+ 'model_b_response': row.get('model_b_response', 'N/A'),
202
+ 'winner': row.get('winner', None),
203
+ 'score': row.get('score', None)
204
+ }
stringsight/dashboard/state.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ Shared application state for the StringSight Gradio viewer.
3
+
4
+ This module centralises mutable globals so they can be imported from any other
5
+ sub-module without circular-import problems.
6
+ """
7
+ from typing import Any, Dict, Optional
8
+ import os
9
+ from pathlib import Path
10
+
11
+ # Global runtime state – mutable and shared across all tabs
12
+ app_state: Dict[str, Any] = {
13
+ "clustered_df": None,
14
+ # NEW canonical key for the FunctionalMetrics dict
15
+ "metrics": None,
16
+ # DEPRECATED alias kept temporarily so that untouched modules continue to work
17
+ "model_stats": None,
18
+ "results_path": None,
19
+ "available_models": [],
20
+ "current_results_dir": None,
21
+ }
22
+
23
+ # Base directory that contains experiment result folders. Can be changed at
24
+ # runtime via launch_app(results_dir=…). A value of None means "not set".
25
+ # Defaults to "results"; in Spaces, persistent storage can be selected via the env var.
26
+ _default_base = "results"
27
+ BASE_RESULTS_DIR: Optional[str] = os.getenv("BASE_RESULTS_DIR", _default_base)
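+
+ # Override sketch: point the dashboard at another results tree (path is hypothetical),
+ # either via the environment (BASE_RESULTS_DIR=/data/results) or at runtime:
+ # from stringsight.dashboard import state
+ # state.BASE_RESULTS_DIR = "/data/results"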
stringsight/dashboard/utils.py ADDED
@@ -0,0 +1,2027 @@
1
+ """
2
+ Utility functions for the Gradio pipeline results app.
3
+
4
+ This module contains common utility functions used across different components.
5
+ """
6
+
7
+ import numpy as np
8
+ import pandas as pd
9
+ import json
10
+ import markdown
11
+ import plotly.graph_objects as go
12
+ import plotly.express as px
13
+ from typing import Dict, List, Any, Optional, Tuple
14
+ import html
15
+ import ast
16
+ import re
17
+
18
+ # Conversation rendering helpers are now in a dedicated module for clarity
19
+ from . import conversation_display as _convdisp
20
+ from .conversation_display import (
21
+ convert_to_openai_format,
22
+ display_openai_conversation_html,
23
+ pretty_print_embedded_dicts,
24
+ )
25
+
26
+ # NEW IMPLEMENTATION ---------------------------------------------------
27
+ from .metrics_adapter import get_model_clusters, get_all_models
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # NEW helper utilities for FunctionalMetrics format
31
+ # ---------------------------------------------------------------------------
32
+
33
+
34
+ # Allowed cluster tags across the entire app
35
+ ALLOWED_TAGS: set[str] = {
36
+ "Positive",
37
+ "Negative (critical)",
38
+ "Negative (non-critical)",
39
+ "Style",
40
+ }
41
+
42
+
43
+ def _is_nan(value: Any) -> bool:
44
+ try:
45
+ return isinstance(value, float) and np.isnan(value)
46
+ except Exception:
47
+ return False
48
+
49
+
50
+ def _parse_meta_obj(meta_obj: Any) -> Any:
51
+ """Normalize and parse metadata objects.
52
+
53
+ - Parse stringified containers (dict/list)
54
+ - Treat NaN-like values as None
55
+ - Return as-is otherwise
56
+ """
57
+ if meta_obj is None:
58
+ return None
59
+ if _is_nan(meta_obj):
60
+ return None
61
+ if isinstance(meta_obj, str):
62
+ s = meta_obj.strip()
63
+ if s in ("", "None", "N/A", "null"):
64
+ return None
65
+ try:
66
+ return ast.literal_eval(meta_obj)
67
+ except Exception:
68
+ return meta_obj
69
+ return meta_obj
70
+
71
+
72
+ def extract_allowed_tag(meta_obj: Any) -> Optional[str]:
73
+ """Extract the first tag value from metadata and return it only if in ALLOWED_TAGS.
74
+
75
+ Rules:
76
+ - If metadata is missing, NaN, or all empty dicts, return None
77
+ - If the extracted value is not in ALLOWED_TAGS, return None
78
+ """
79
+ meta_obj = _parse_meta_obj(meta_obj)
80
+ if meta_obj is None:
81
+ return None
82
+ if isinstance(meta_obj, dict):
83
+ # Empty dict means no tag
84
+ if len(meta_obj) == 0:
85
+ return None
86
+ for _, v in meta_obj.items():
87
+ tag = str(v)
88
+ return tag if tag in ALLOWED_TAGS else None
89
+ return None
90
+ if isinstance(meta_obj, (list, tuple)):
91
+ if len(meta_obj) == 0:
92
+ return None
93
+ tag = str(meta_obj[0])
94
+ return tag if tag in ALLOWED_TAGS else None
95
+ # Scalar string/other
96
+ tag = str(meta_obj)
97
+ return tag if tag in ALLOWED_TAGS else None
98
+
99
+
100
+ def normalize_text_for_search(text: Any) -> str:
101
+ """Lowercase and strip common Markdown/HTML formatting and punctuation for robust search.
102
+
103
+ - Unwrap markdown links: [label](url) -> label
104
+ - Remove inline code/backticks and strikethrough markers
105
+ - Unwrap emphasis/bold/italics: *, **, _, __
106
+ - Strip simple HTML tags
107
+ - Remove all punctuation including commas, periods, quotes, etc.
108
+ - Collapse whitespace
109
+ """
110
+ if text is None:
111
+ return ""
112
+ s = str(text)
113
+ # Strip HTML tags first
114
+ s = re.sub(r"<[^>]+>", " ", s)
115
+ # Markdown links [text](url) -> text
116
+ s = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", s)
117
+ # Inline code `code` -> code
118
+ s = re.sub(r"`([^`]*)`", r"\1", s)
119
+ # Bold/italic wrappers (**text** | __text__ | *text* | _text_) -> text
120
+ s = re.sub(r"(\*\*|__)(.*?)\1", r"\2", s)
121
+ s = re.sub(r"(\*|_)(.*?)\1", r"\2", s)
122
+ # Strikethrough ~~text~~ -> text
123
+ s = re.sub(r"~~(.*?)~~", r"\1", s)
124
+ # Remove remaining markdown emphasis chars/backticks/tilde
125
+ s = re.sub(r"[*_`~]", "", s)
126
+ # Remove all punctuation (including commas, periods, quotes, parentheses, etc.)
127
+ s = re.sub(r"[^\w\s]", " ", s)
128
+ # Normalize whitespace and lowercase
129
+ s = re.sub(r"\s+", " ", s).strip().lower()
130
+ return s
131
+
132
+
133
+ def format_confidence_interval(ci: dict | None, decimals: int = 3) -> str:
134
+ """Return a pretty string for a CI dict of the form {"lower": x, "upper": y}."""
135
+ if not ci or not isinstance(ci, dict):
136
+ return "N/A"
137
+ lower, upper = ci.get("lower"), ci.get("upper")
138
+ if lower is None or upper is None:
139
+ return "N/A"
140
+ return f"[{lower:.{decimals}f}, {upper:.{decimals}f}]"
141
+
142
+
143
+ def get_confidence_interval_width(ci: dict | None) -> float | None:
144
+ """Return CI width (upper-lower) if possible."""
145
+ if not ci or not isinstance(ci, dict):
146
+ return None
147
+ lower, upper = ci.get("lower"), ci.get("upper")
148
+ if lower is None or upper is None:
149
+ return None
150
+ return upper - lower
151
+
152
+
153
+ def has_confidence_intervals(record: dict | None) -> bool:
154
+ """Simple check whether any *_ci key with lower/upper exists in a metrics record."""
155
+ if not record or not isinstance(record, dict):
156
+ return False
157
+ for k, v in record.items():
158
+ if k.endswith("_ci") and isinstance(v, dict) and {"lower", "upper"}.issubset(v.keys()):
159
+ return True
160
+ return False
161
+
162
+
163
+ def extract_quality_score(quality_field: Any) -> float | None:
164
+ """Given a quality field that may be a dict of metric values or a scalar, return its mean."""
165
+ if quality_field is None:
166
+ return None
167
+ if isinstance(quality_field, (int, float)):
168
+ return float(quality_field)
169
+ if isinstance(quality_field, dict) and quality_field:
170
+ return float(np.mean(list(quality_field.values())))
171
+ return None
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # UPDATED: get_top_clusters_for_model for FunctionalMetrics format
175
+ # ---------------------------------------------------------------------------
176
+
177
+
178
+ def get_top_clusters_for_model(metrics: Dict[str, Any], model_name: str, top_n: int = 10) -> List[Tuple[str, Dict[str, Any]]]:
179
+ """Return the top N clusters (by salience) for a given model.
180
+
181
+ Args:
182
+ metrics: The FunctionalMetrics dictionary (3-file format) loaded via data_loader.
183
+ model_name: Name of the model to inspect.
184
+ top_n: Number of clusters to return.
185
+
186
+ Returns:
187
+ List of (cluster_name, cluster_dict) tuples sorted by descending proportion_delta.
188
+ """
189
+ clusters_dict = get_model_clusters(metrics, model_name)
190
+ if not clusters_dict:
191
+ return []
192
+
193
+ # Filter out "No properties" clusters
194
+ clusters_dict = {k: v for k, v in clusters_dict.items() if k != "No properties"}
195
+
196
+ # Filter out "Outliers" cluster for overview tab
197
+ clusters_dict = {k: v for k, v in clusters_dict.items() if "Outliers" not in k}
198
+
199
+ sorted_items = sorted(
200
+ clusters_dict.items(), key=lambda kv: kv[1].get("proportion_delta", 0), reverse=True
201
+ )
202
+ return sorted_items[:top_n]
203
+
204
+
205
+ def compute_model_rankings_new(metrics: Dict[str, Any]) -> List[tuple]:
206
+ """Compute rankings of models based on mean salience (proportion_delta).
207
+
208
+ Args:
209
+ metrics: The FunctionalMetrics dict loaded by data_loader.
210
+
211
+ Returns:
212
+ List[Tuple[str, Dict[str, float]]]: sorted list of (model_name, summary_dict)
213
+ """
214
+ model_scores: Dict[str, Dict[str, float]] = {}
215
+ for model in get_all_models(metrics):
216
+ clusters = get_model_clusters(metrics, model)
217
+ # Filter out "No properties" clusters
218
+ clusters = {k: v for k, v in clusters.items() if k != "No properties"}
219
+ if not clusters:
220
+ continue
221
+ saliences = [c.get("proportion_delta", 0.0) for c in clusters.values()]
222
+ model_scores[model] = {
223
+ "avg_salience": float(np.mean(saliences)),
224
+ "median_salience": float(np.median(saliences)),
225
+ "num_clusters": len(saliences),
226
+ "top_salience": float(max(saliences)),
227
+ "std_salience": float(np.std(saliences)),
228
+ }
229
+ return sorted(model_scores.items(), key=lambda x: x[1]["avg_salience"], reverse=True)
230
+
231
+
232
+ def create_model_summary_card_new(
233
+ model_name: str,
234
+ metrics: Dict[str, Any],
235
+ top_n: int = 3,
236
+ score_significant_only: bool = False,
237
+ quality_significant_only: bool = False,
238
+ sort_by: str = "quality_asc",
239
+ min_cluster_size: int = 1,
240
+ selected_tags: Optional[List[str]] = None,
241
+ ) -> str:
242
+ """Generate a **styled** HTML summary card for a single model.
243
+
244
+ The new implementation recreates the legacy card design the user prefers:
245
+ β€’ Card header with battle count
246
+ β€’ Each cluster displayed as a vertically-spaced block (NOT a table)
247
+ β€’ Frequency, distinctiveness factor and CI inline; quality score right-aligned
248
+ """
249
+
250
+ clusters_dict = get_model_clusters(metrics, model_name)
251
+ if not clusters_dict:
252
+ return f"<div style='padding:20px'>No cluster data for {model_name}</div>"
253
+
254
+ # Filter out "No properties" clusters
255
+ clusters_dict = {k: v for k, v in clusters_dict.items() if k != "No properties"}
256
+
257
+ # Filter out "Outliers" cluster for overview tab
258
+ clusters_dict = {k: v for k, v in clusters_dict.items() if "Outliers" not in k}
259
+
260
+ # Helper: extract allowed tag from metadata
261
+ def _extract_tag(meta_obj: Any) -> Optional[str]:
262
+ return extract_allowed_tag(meta_obj)
263
+
264
+ # Helper: sanitize label that might include dict-like suffixes
265
+ def _sanitize_label(label: str) -> str:
266
+ if not isinstance(label, str):
267
+ return str(label)
268
+ lbl = re.sub(r"\s*\(\s*\{[^}]*\}\s*\)\s*$", "", label)
269
+ lbl = re.sub(r"\s*\{[^}]*\}\s*$", "", lbl)
270
+ lbl = re.sub(r"\s*\(\s*[^(){}:]+\s*:\s*[^(){}]+\)\s*$", "", lbl)
271
+ return lbl.strip()
272
+
273
+ # Build consistent colors for tags for this card
274
+ # Fixed mapping for known tags
275
+ tag_to_color: Dict[str, str] = {
276
+ "Style": "#9467bd", # purple
277
+ "Positive": "#28a745", # green
278
+ "Negative (non-critical)": "#ff7f0e", # orange
279
+ "Negative (critical)": "#dc3545", # red
280
+ }
281
+ unique_tags: List[str] = []
282
+ label_to_tag: Dict[str, str] = {}
283
+ # Detect "all empty dicts" across metadata
284
+ cluster_meta_values: List[Any] = []
285
+ for c in clusters_dict.values():
286
+ meta_obj = c.get("metadata") if isinstance(c, dict) else None
287
+ meta_obj = _parse_meta_obj(meta_obj)
288
+ cluster_meta_values.append(meta_obj)
289
+ non_null_meta = [m for m in cluster_meta_values if m is not None]
290
+ all_meta_empty_dicts = (
291
+ len(non_null_meta) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_meta)
292
+ )
293
+ if not all_meta_empty_dicts:
294
+ for c in clusters_dict.values():
295
+ tag_val = _extract_tag(c.get("metadata")) if isinstance(c, dict) else None
296
+ if tag_val and tag_val not in unique_tags:
297
+ unique_tags.append(tag_val)
298
+ # tag_to_color already contains all allowed tags with fixed colors
299
+
300
+ # Filter clusters ----------------------------------------------------
301
+ all_clusters = [c for c in clusters_dict.values() if c.get("size", 0) >= min_cluster_size]
302
+
303
+ # Optional: filter clusters by sidebar-selected tags
304
+ if selected_tags:
305
+ def _cluster_tag(c: dict) -> Optional[str]:
306
+ return _extract_tag(c.get("metadata")) if isinstance(c, dict) else None
307
+ allowed = set(map(str, selected_tags))
308
+ all_clusters = [c for c in all_clusters if (t := _cluster_tag(c)) and str(t) in allowed]
309
+
310
+ if score_significant_only:
311
+ if model_name == "all":
312
+ # For "all" model, we don't have proportion_delta_significant, so skip this filter
313
+ pass
314
+ else:
315
+ all_clusters = [c for c in all_clusters if c.get("proportion_delta_significant", False)]
316
+ if quality_significant_only:
317
+ all_clusters = [c for c in all_clusters if any(c.get("quality_delta_significant", {}).values())]
318
+
319
+ if not all_clusters:
320
+ return f"<div style='padding:20px'>No clusters pass filters for {model_name}</div>"
321
+
322
+ # Count significant properties ---------------------------------------
323
+ significant_frequency_count = 0
324
+ significant_quality_count = 0
325
+
326
+ for cluster in clusters_dict.values():
327
+ if cluster.get("size", 0) >= min_cluster_size:
328
+ # Count frequency significance
329
+ if model_name != "all" and cluster.get("proportion_delta_significant", False):
330
+ significant_frequency_count += 1
331
+
332
+ # Count quality significance (sum across all metrics)
333
+ quality_delta_significant = cluster.get("quality_delta_significant", {})
334
+ significant_quality_count += sum(quality_delta_significant.values())
335
+
336
+ # Sort ---------------------------------------------------------------
337
+ def _mean_quality(c: dict[str, Any]) -> float:
338
+ vals = list(c.get("quality", {}).values())
339
+ return float(np.mean(vals)) if vals else 0.0
340
+
341
+ sort_key_map = {
342
+ "quality_asc": (_mean_quality, False),
343
+ "quality_desc": (_mean_quality, True),
344
+ "frequency_desc": (lambda c: c.get("proportion", 0), True),
345
+ "frequency_asc": (lambda c: c.get("proportion", 0), False),
346
+ "salience_desc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True),
347
+ "salience_asc": (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), False),
348
+ }
349
+
350
+ key_fn, reverse = sort_key_map.get(sort_by, (lambda c: c.get("proportion_delta", 0) if model_name != "all" else c.get("proportion", 0), True))
351
+ sorted_clusters = sorted(all_clusters, key=key_fn, reverse=reverse)[:top_n]
352
+
353
+ # Determine total conversations for this model ----------------
354
+ if model_name == "all":
355
+ # For "all" model, sum the individual model totals to avoid double-counting
356
+ model_scores = metrics.get("model_scores", {})
357
+ total_battles = sum(model_data.get("size", 0) for model_data in model_scores.values())
358
+ else:
359
+ model_scores_entry = metrics.get("model_scores", {}).get(model_name, {})
360
+ total_battles = model_scores_entry.get("size")
361
+ if total_battles is None:
362
+ # Fallback: deduplicate example IDs across clusters
363
+ total_battles = sum(c.get("size", 0) for c in clusters_dict.values())
364
+
365
+ # Card header --------------------------------------------------------
366
+ display_model_name = ("All Models" if str(model_name).lower() == "all" else model_name)
367
+ html_parts: list[str] = [f"""
368
+ <div style="padding: 12px 8px; margin-bottom: 12px; border-bottom: 1px solid #e6e8eb;">
369
+ <h3 style="margin-top:0; font-size: 18px;">{html.escape(display_model_name)}</h3>
370
+ <p style="margin: 4px 0 8px 0; color:#555; font-size:13px;">
371
+ {total_battles} battles Β· Top clusters by frequency
372
+ </p>
373
+ <p style="margin: 0 0 12px 0; color:#666; font-size:12px;">
374
+ {significant_frequency_count} significant frequency properties Β· {significant_quality_count} significant quality properties
375
+ </p>
376
+ """]
377
+
378
+ # Cluster blocks -----------------------------------------------------
379
+ for i, cluster in enumerate(sorted_clusters):
380
+ raw_name = next(k for k, v in clusters_dict.items() if v is cluster)
381
+ # Do not pre-escape here; markdown renderer handles escaping. Pre-escaping causes
382
+ # entities like &#x27; to render literally due to double-escaping.
383
+ name = _sanitize_label(raw_name)
384
+ prop = cluster.get("proportion", 0)
385
+ freq_pct = prop * 100
386
+ size = cluster.get("size", 0)
387
+
388
+ # Tag badge from metrics metadata (no DataFrame fallback)
389
+ tag_val = _extract_tag(cluster.get("metadata"))
390
+ if not tag_val:
391
+ tag_val = label_to_tag.get(raw_name) or label_to_tag.get(_sanitize_label(raw_name))
392
+ tag_badge_html = ""
393
+ stripe_color = "#4c6ef5"
394
+ if tag_val:
395
+ color = tag_to_color.get(tag_val, '#4c6ef5')
396
+ tag_badge_html = (
397
+ f"<span style=\"display:inline-block; margin-left:8px; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:{color}12; color:{color}; border:1px solid {color}26;\">{html.escape(str(tag_val))}</span>"
398
+ )
399
+ stripe_color = color
400
+
401
+ # Check significance flags
402
+ is_proportion_significant = False
403
+ if model_name != "all":
404
+ is_proportion_significant = cluster.get("proportion_delta_significant", False)
405
+
406
+ quality_delta_significant = cluster.get("quality_delta_significant", {})
407
+ is_quality_significant = any(quality_delta_significant.values())
408
+
409
+ # Create significance indicators
410
+ significance_indicators = []
411
+ if is_proportion_significant:
412
+ significance_indicators.append('<span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#cc6699; border:1px solid #cc669933; background:#cc669912;">F</span>')
413
+ if is_quality_significant:
414
+ significance_indicators.append('<span style="display:inline-block; padding:1px 6px; border-radius:999px; font-size:10px; font-weight:700; line-height:1; color:#007bff; border:1px solid #007bff33; background:#007bff12; margin-left:6px;">Q</span>')
415
+
416
+ significance_html = " ".join(significance_indicators) if significance_indicators else ""
417
+
418
+ # Distinctiveness / frequency delta display
419
+ if model_name == "all":
420
+ # For "all" model, proportion_delta doesn't make sense, so show proportion instead
421
+ distinct_factor = prop
422
+ distinct_text = f"{freq_pct:.1f}% of all conversations"
423
+ freq_with_delta_text = f"{freq_pct:.1f}%"
424
+ else:
425
+ sal = cluster.get("proportion_delta", 0)
426
+ distinct_factor = 1 + (sal / prop) if prop else 1
427
+ # Show delta in percentage points instead of raw proportion
428
+ sal_pct = sal * 100.0
429
+ freq_with_delta_text = f"{freq_pct:.1f}% ({sal_pct:+.1f}%)"
430
+ distinct_text = f"{freq_with_delta_text}"
431
+
432
+ # Confidence interval (frequency based)
433
+ ci = cluster.get("proportion_ci")
434
+ ci_str = format_confidence_interval(ci) if ci else "N/A"
435
+
436
+ # Quality display – show average score and delta per metric
437
+ quality_scores = cluster.get("quality", {}) or {}
438
+ quality_delta = cluster.get("quality_delta", {}) or {}
439
+ quality_display_html = ""
440
+
441
+ metric_names: list[str] = sorted(set(quality_scores.keys()) | set(quality_delta.keys()))
442
+ if metric_names:
443
+ parts: list[str] = []
444
+ for metric_name in metric_names:
445
+ score_val = quality_scores.get(metric_name)
446
+ delta_val = quality_delta.get(metric_name)
447
+ score_str = f"{score_val:.3f}" if isinstance(score_val, (int, float)) else "N/A"
448
+ if isinstance(delta_val, (int, float)):
449
+ # Use grey for values very close to zero
450
+ if abs(delta_val) < 0.001:
451
+ color = "#AAAAAA"
452
+ else:
453
+ color = "#28a745" if delta_val > 0 else "#dc3545"
454
+ parts.append(
455
+ f"<div>{metric_name}: {score_str} <span style=\"color:{color}; font-weight:500;\">({delta_val:+.3f})</span></div>"
456
+ )
457
+ else:
458
+ parts.append(f"<div>{metric_name}: {score_str}</div>")
459
+ quality_display_html = "".join(parts)
460
+ else:
461
+ quality_display_html = '<span style="color:#666;">No quality data</span>'
462
+
463
+ # Get light color for this cluster
464
+ cluster_color = get_light_color_for_cluster(name, i)
465
+
466
+ html_parts.append(f"""
467
+ <div style="background:#fbfcfe; border:1px solid #edf1f5; border-left: 3px solid {stripe_color}; padding: 10px 10px; margin: 10px 0; border-radius: 8px; box-shadow: 0 1px 2px rgba(16,24,40,0.06);">
468
+ <div style="display:flex; justify-content:space-between; align-items:flex-start; gap: 12px;">
469
+ <div style="flex:1; min-width:0;">
470
+ <div style="margin-bottom:4px; font-size:14px;">
471
+ {(_convdisp._markdown(str(name), pretty_print_dicts=False).replace('<p>', '<span>').replace('</p>', '</span>'))}
472
+ </div>
473
+ </div>
474
+ <div style="font-size:12px; font-weight:normal; white-space:nowrap; text-align:right;">
475
+ {quality_display_html}
476
+ </div>
477
+ </div>
478
+ <div style="display:flex; justify-content:space-between; align-items:center; margin-top:6px; gap: 12px;">
479
+ <div style="font-size:12px; color:#555; display:flex; align-items:center; flex-wrap:wrap; gap:6px;">
480
+ <span>{freq_with_delta_text} frequency ({size} out of {total_battles} total)</span>
481
+ </div>
482
+ <div style="text-align:right; display:flex; align-items:center; gap:8px;">{(tag_badge_html if tag_badge_html else '')}{significance_html}</div>
483
+ </div>
484
+ </div>
485
+ """)
486
+
487
+ # Close card div -----------------------------------------------------
488
+ html_parts.append("</div>")
489
+
490
+ return "\n".join(html_parts)
491
+
492
+
493
+ def format_cluster_dataframe(clustered_df: pd.DataFrame,
494
+ selected_models: Optional[List[str]] = None,
495
+ cluster_level: str = 'fine') -> pd.DataFrame:
496
+ """Format cluster DataFrame for display in Gradio."""
497
+ df = clustered_df.copy()
498
+
499
+ # Debug information
500
+ print(f"DEBUG: format_cluster_dataframe called")
501
+ print(f" - Input DataFrame shape: {df.shape}")
502
+ print(f" - Selected models: {selected_models}")
503
+ print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")
504
+
505
+ # Filter by models if specified
506
+ if selected_models:
507
+ print(f" - Filtering by {len(selected_models)} selected models")
508
+ df = df[df['model'].isin(selected_models)]
509
+ print(f" - After filtering shape: {df.shape}")
510
+ print(f" - Models after filtering: {df['model'].unique().tolist()}")
511
+ else:
512
+ print(f" - No model filtering applied")
513
+
514
+ # Select relevant columns based on cluster level using correct column names from pipeline
515
+ if cluster_level == 'fine':
516
+ id_col = 'property_description_fine_cluster_id'
517
+ label_col = 'property_description_fine_cluster_label'
518
+ # Also check for alternative naming without prefix
519
+ alt_id_col = 'fine_cluster_id'
520
+ alt_label_col = 'fine_cluster_label'
521
+ else:
522
+ id_col = 'property_description_coarse_cluster_id'
523
+ label_col = 'property_description_coarse_cluster_label'
524
+ # Also check for alternative naming without prefix
525
+ alt_id_col = 'coarse_cluster_id'
526
+ alt_label_col = 'coarse_cluster_label'
527
+
528
+ # Try both naming patterns
529
+ if id_col in df.columns and label_col in df.columns:
530
+ # Use the expected naming pattern
531
+ cols = ['question_id', 'model', 'property_description', id_col, label_col, 'score']
532
+ elif alt_id_col in df.columns and alt_label_col in df.columns:
533
+ # Use the alternative naming pattern
534
+ cols = ['question_id', 'model', 'property_description', alt_id_col, alt_label_col, 'score']
535
+ else:
536
+ # Fall back to basic columns if cluster columns are missing
537
+ cols = ['question_id', 'model', 'property_description', 'score']
538
+
539
+ # Keep only existing columns
540
+ available_cols = [col for col in cols if col in df.columns]
541
+ df = df[available_cols]
542
+
543
+ print(f" - Final DataFrame shape: {df.shape}")
544
+ print(f" - Final columns: {df.columns.tolist()}")
545
+
546
+ return df
547
+
548
+
549
+ def truncate_cluster_name(cluster_desc: str, max_length: int = 50) -> str:
550
+ """Truncate cluster description to fit in table column."""
551
+ if len(cluster_desc) <= max_length:
552
+ return cluster_desc
553
+ return cluster_desc[:max_length-3] + "..."
554
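+ # Worked example: truncate_cluster_name("x" * 60, 50) keeps 47 characters and appends "...", so the result is exactly 50 chars.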
+
555
+ def create_frequency_comparison_table(model_stats: Dict[str, Any],
556
+ selected_models: List[str],
557
+ cluster_level: str = "fine", # Ignored – kept for backward-compat
558
+ top_n: int = 50,
559
+ selected_model: str | None = None,
560
+ selected_quality_metric: str | None = None) -> pd.DataFrame:
561
+ """Create a comparison table for the new FunctionalMetrics format.
562
+
563
+ The old signature is kept (cluster_level arg is ignored) so that callers
564
+ can be updated incrementally.
565
+ """
566
+
567
+ if not selected_models:
568
+ return pd.DataFrame()
569
+
570
+ # ------------------------------------------------------------------
571
+ # 1. Collect per-model, per-cluster rows
572
+ # ------------------------------------------------------------------
573
+ all_rows: List[dict] = []
574
+ for model in selected_models:
575
+ model_clusters = get_model_clusters(model_stats, model) # type: ignore[arg-type]
576
+ if not model_clusters:
577
+ continue
578
+
579
+ # Optional filter by a single model after the fact
580
+ if selected_model and model != selected_model:
581
+ continue
582
+
583
+ for cluster_name, cdata in model_clusters.items():
584
+ # Filter out "No properties" clusters
585
+ if cluster_name == "No properties":
586
+ continue
587
+
588
+ # Basic numbers
589
+ freq_pct = cdata.get("proportion", 0.0) * 100.0
590
+ prop_ci = cdata.get("proportion_ci")
591
+
592
+ # Quality per metric dicts ------------------------------------------------
593
+ quality_dict = cdata.get("quality", {}) or {}
594
+ quality_ci_dict = cdata.get("quality_ci", {}) or {}
595
+
596
+ # Significance flags
597
+ sal_sig = bool(cdata.get("proportion_delta_significant", False))
598
+ quality_sig_flags = cdata.get("quality_delta_significant", {}) or {}
599
+
600
+ all_rows.append({
601
+ "cluster": cluster_name,
602
+ "model": model,
603
+ "frequency": freq_pct,
604
+ "proportion_ci": prop_ci,
605
+ "quality": quality_dict,
606
+ "quality_ci": quality_ci_dict,
607
+ "score_significant": sal_sig,
608
+ "quality_significant_any": any(quality_sig_flags.values()),
609
+ "quality_significant_metric": quality_sig_flags.get(selected_quality_metric) if selected_quality_metric else None,
610
+ })
611
+
612
+ if not all_rows:
613
+ return pd.DataFrame()
614
+
615
+ df_all = pd.DataFrame(all_rows)
616
+
617
+ # Aggregate frequency across models ----------------------------------
618
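+ # Rank clusters by the cross-model *sum* of per-model frequencies, so clusters shared by many models rank higher.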
+ freq_sum = df_all.groupby("cluster")["frequency"].sum().sort_values(ascending=False)
619
+ top_clusters = freq_sum.head(top_n).index.tolist()
620
+
621
+ df_top = df_all[df_all["cluster"].isin(top_clusters)].copy()
622
+
623
+ table_rows: List[dict] = []
624
+ for clu in top_clusters:
625
+ subset = df_top[df_top["cluster"] == clu]
626
+ avg_freq = subset["frequency"].mean()
627
+
628
+ # Aggregate CI (mean of bounds)
629
+ ci_lowers = [ci.get("lower") for ci in subset["proportion_ci"] if isinstance(ci, dict) and ci.get("lower") is not None]
630
+ ci_uppers = [ci.get("upper") for ci in subset["proportion_ci"] if isinstance(ci, dict) and ci.get("upper") is not None]
631
+ freq_ci = {
632
+ "lower": float(np.mean(ci_lowers)) if ci_lowers else None,
633
+ "upper": float(np.mean(ci_uppers)) if ci_uppers else None,
634
+ } if ci_lowers and ci_uppers else None
635
+
636
+ # Quality aggregation -----------------------------------------------------
637
+ q_vals: List[float] = []
638
+ q_ci_l: List[float] = []
639
+ q_ci_u: List[float] = []
640
+ quality_sig_any = False
641
+ for _, row in subset.iterrows():
642
+ q_dict = row["quality"]
643
+ if selected_quality_metric:
644
+ if selected_quality_metric in q_dict:
645
+ q_vals.append(q_dict[selected_quality_metric])
646
+ ci_metric = row["quality_ci"].get(selected_quality_metric) if isinstance(row["quality_ci"], dict) else None
647
+ if ci_metric and ci_metric.get("lower") is not None and ci_metric.get("upper") is not None:
648
+ q_ci_l.append(ci_metric.get("lower"))
649
+ q_ci_u.append(ci_metric.get("upper"))
650
+ quality_sig_any = quality_sig_any or bool(row["quality_significant_metric"])
651
+ else:
652
+ q_vals.extend(q_dict.values())
653
+ for ci in row["quality_ci"].values():
654
+ if isinstance(ci, dict) and ci.get("lower") is not None and ci.get("upper") is not None:
655
+ q_ci_l.append(ci.get("lower"))
656
+ q_ci_u.append(ci.get("upper"))
657
+ quality_sig_any = quality_sig_any or row["quality_significant_any"]
658
+
659
+ quality_val = float(np.mean(q_vals)) if q_vals else None
660
+ quality_ci = {
661
+ "lower": float(np.mean(q_ci_l)),
662
+ "upper": float(np.mean(q_ci_u)),
663
+ } if q_ci_l and q_ci_u else None
664
+
665
+ score_sig = subset["score_significant"].any()
666
+
667
+ table_rows.append({
668
+ "Cluster": clu,
669
+ "Frequency (%)": f"{avg_freq:.1f}",
670
+ "Freq CI": format_confidence_interval(freq_ci),
671
+ "Quality": f"{quality_val:.3f}" if quality_val is not None else "N/A",
672
+ "Quality CI": format_confidence_interval(quality_ci) if quality_ci else "N/A",
673
+ "Score Significance": "Yes" if score_sig else "No",
674
+ "Quality Significance": "Yes" if quality_sig_any else "No",
675
+ })
676
+
677
+ return pd.DataFrame(table_rows)
678
+
679
+
680
+ def create_frequency_comparison_plots(model_stats: Dict[str, Any],
681
+ selected_models: List[str],
682
+ cluster_level: str = 'fine',
683
+ top_n: int = 50,
684
+ show_confidence_intervals: bool = False) -> Tuple[go.Figure, go.Figure]:
685
+ """Create frequency comparison plots (matching frequencies_tab.py exactly)."""
686
+
687
+ print(f"\nDEBUG: Plotting function called with:")
688
+ print(f" - Selected models: {selected_models}")
689
+ print(f" - Cluster level: {cluster_level}")
690
+ print(f" - Top N: {top_n}")
691
+ print(f" - Available models in stats: {list(model_stats.keys())}")
692
+
693
+ # Use the same data preparation logic as the table function
694
+ # Collect all clusters across all models for the chart (exact copy from frequencies_tab.py)
695
+ all_clusters_data = []
696
+ for model_name, model_data in model_stats.items():
697
+ if model_name not in selected_models:
698
+ continue
699
+
700
+ clusters = model_data.get(cluster_level, [])
701
+ for cluster in clusters:
702
+ # Filter out "No properties" clusters
703
+ if cluster.get('property_description') == "No properties":
704
+ continue
705
+
706
+ # Get confidence intervals for quality scores if available
707
+ quality_score_ci = cluster.get('quality_score_ci', {})
708
+ has_quality_ci = bool(quality_score_ci)
709
+
710
+ # Get distinctiveness score confidence intervals (correct structure)
711
+ score_ci = cluster.get('score_ci', {})
712
+ ci_lower = score_ci.get('lower') if score_ci else None
713
+ ci_upper = score_ci.get('upper') if score_ci else None
714
+
715
+ all_clusters_data.append({
716
+ 'property_description': cluster['property_description'],
717
+ 'model': model_name,
718
+ 'frequency': cluster.get('proportion', 0) * 100, # Convert to percentage
719
+ 'size': cluster.get('size', 0),
720
+ 'cluster_size_global': cluster.get('cluster_size_global', 0),
721
+ 'has_ci': has_confidence_intervals(cluster),
722
+ 'ci_lower': ci_lower,
723
+ 'ci_upper': ci_upper,
724
+ 'has_quality_ci': has_quality_ci
725
+ })
726
+
727
+ if not all_clusters_data:
728
+ # Return empty figures
729
+ empty_fig = go.Figure()
730
+ empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
731
+ return empty_fig, empty_fig
732
+
733
+ clusters_df = pd.DataFrame(all_clusters_data)
734
+
735
+ # Get all unique clusters for the chart
736
+ all_unique_clusters = clusters_df['property_description'].unique()
737
+ total_clusters = len(all_unique_clusters)
738
+
739
+ # Show all clusters by default
740
+ top_n_for_chart = min(top_n, total_clusters)
741
+
742
+ # Calculate total frequency per cluster and get top clusters
743
+ cluster_totals = clusters_df.groupby('property_description')['frequency'].sum().sort_values(ascending=False)
744
+ top_clusters = cluster_totals.head(top_n_for_chart).index.tolist()
745
+
746
+ # Get quality scores for the same clusters to sort by quality
747
+ quality_data_for_sorting = []
748
+ for model_name, model_data in model_stats.items():
749
+ if model_name not in selected_models:
750
+ continue
751
+ clusters = model_data.get(cluster_level, [])
752
+ for cluster in clusters:
753
+ # Filter out "No properties" clusters
754
+ if cluster.get('property_description') == "No properties":
755
+ continue
756
+
757
+ if cluster['property_description'] in top_clusters:
758
+ quality_data_for_sorting.append({
759
+ 'property_description': cluster['property_description'],
760
+ 'quality_score': extract_quality_score(cluster.get('quality_score', 0))
761
+ })
762
+
763
+ # Calculate average quality score per cluster and sort
764
+ if quality_data_for_sorting:
765
+ quality_df_for_sorting = pd.DataFrame(quality_data_for_sorting)
766
+ avg_quality_per_cluster = quality_df_for_sorting.groupby('property_description')['quality_score'].mean().sort_values(ascending=True) # Low to high
767
+ top_clusters = avg_quality_per_cluster.index.tolist()
768
+ # Reverse the order so low quality appears at top of chart
769
+ top_clusters = top_clusters[::-1]
770
+
771
+ # Filter data to only include top clusters
772
+ chart_data = clusters_df[clusters_df['property_description'].isin(top_clusters)]
773
+
774
+ if chart_data.empty:
775
+ # Return empty figures
776
+ empty_fig = go.Figure()
777
+ empty_fig.add_annotation(text="No data available", xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
778
+ return empty_fig, empty_fig
779
+
780
+ # Get unique models for colors
781
+ models = chart_data['model'].unique()
782
+ # Use a color palette that avoids yellow - using Set1 which has better contrast
783
+ colors = px.colors.qualitative.Set1[:len(models)]
784
+
785
+ # Create horizontal bar chart for frequencies
786
+ fig = go.Figure()
787
+
788
+ # Add a bar for each model
789
+ for i, model in enumerate(models):
790
+ model_data = chart_data[chart_data['model'] == model]
791
+
792
+ # Sort by cluster order (same as top_clusters)
793
+ model_data = model_data.set_index('property_description').reindex(top_clusters).reset_index()
794
+
795
+ # Fill NaN values with 0 for missing clusters
796
+ model_data['frequency'] = model_data['frequency'].fillna(0)
797
+ model_data['has_ci'] = model_data['has_ci'].fillna(False)
798
+ # For CI columns, replace NaN with None using where() instead of fillna(None)
799
+ model_data['ci_lower'] = model_data['ci_lower'].where(pd.notna(model_data['ci_lower']), None)
800
+ model_data['ci_upper'] = model_data['ci_upper'].where(pd.notna(model_data['ci_upper']), None)
801
+
802
+ # Ensure frequency is numeric and non-negative
803
+ model_data['frequency'] = pd.to_numeric(model_data['frequency'], errors='coerce').fillna(0)
804
+ model_data['frequency'] = model_data['frequency'].clip(lower=0)
805
+
806
+ # Debug: print model data for first model
807
+ if i == 0: # Only print for first model to avoid spam
808
+ print(f"DEBUG: Model {model} data sample:")
809
+ print(f" - Clusters: {len(model_data)}")
810
+ print(f" - Frequency range: {model_data['frequency'].min():.2f} - {model_data['frequency'].max():.2f}")
811
+ print(f" - Non-zero frequencies: {(model_data['frequency'] > 0).sum()}")
812
+ if len(model_data) > 0:
813
+ print(f" - Sample row: {model_data.iloc[0][['property_description', 'frequency']].to_dict()}")
814
+
815
+ # Remove any rows where property_description is NaN (these are clusters this model doesn't appear in)
816
+ model_data = model_data.dropna(subset=['property_description'])
817
+
818
+ # Get confidence intervals for error bars
819
+ ci_lower = []
820
+ ci_upper = []
821
+ for _, row in model_data.iterrows():
822
+ freq_value = row.get('frequency', 0)
823
+ if (row.get('has_ci', False) and
824
+ pd.notna(row.get('ci_lower')) and
825
+ pd.notna(row.get('ci_upper')) and
826
+ freq_value > 0): # Only calculate CIs for non-zero frequencies
827
+
828
+ # IMPORTANT: These are distinctiveness score CIs, not frequency CIs
829
+ # The distinctiveness score measures how much more/less frequently
830
+ # a model exhibits this behavior compared to the median model
831
+ # We can use this to estimate uncertainty in the frequency measurement
832
+ distinctiveness_ci_width = row['ci_upper'] - row['ci_lower']
833
+
834
+ # Convert to frequency uncertainty (approximate)
835
+ # A wider distinctiveness CI suggests more uncertainty in the frequency
836
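+ # NOTE: the 0.1 factor below is a heuristic damping constant, not a statistically derived quantity.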
+ freq_uncertainty = distinctiveness_ci_width * freq_value * 0.1
837
+ ci_lower.append(max(0, freq_value - freq_uncertainty))
838
+ ci_upper.append(freq_value + freq_uncertainty)
839
+ else:
840
+ ci_lower.append(None)
841
+ ci_upper.append(None)
842
+
843
+ # Debug: Check the data going into the plot
844
+ print(f"DEBUG: Adding trace for model {model}:")
845
+ print(f" - Y values (clusters): {model_data['property_description'].tolist()[:3]}...") # First 3 clusters
846
+ print(f" - X values (frequencies): {model_data['frequency'].tolist()[:3]}...") # First 3 frequencies
847
+ print(f" - Total data points: {len(model_data)}")
848
+
849
+ fig.add_trace(go.Bar(
850
+ y=model_data['property_description'],
851
+ x=model_data['frequency'],
852
+ name=model,
853
+ orientation='h',
854
+ marker_color=colors[i],
855
+ error_x=dict(
856
+ type='data',
857
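+ # Plotly error bars: 'array' is the offset from the bar value up to the upper bound; 'arrayminus' is the offset down to the lower bound.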
+ array=[u - f if u is not None and f is not None else None for f, u in zip(model_data['frequency'], ci_upper)],
858
+ arrayminus=[f - l if f is not None and l is not None else None for f, l in zip(model_data['frequency'], ci_lower)],
859
+ visible=show_confidence_intervals,
860
+ thickness=1,
861
+ width=3,
862
+ color='rgba(0,0,0,0.3)'
863
+ ),
864
+ hovertemplate='<b>%{y}</b><br>' +
865
+ f'Model: {model}<br>' +
866
+ 'Frequency: %{x:.1f}%<br>' +
867
+ 'CI: %{customdata[0]}<extra></extra>',
868
+ customdata=[[
869
+ format_confidence_interval({
870
+ 'lower': l,
871
+ 'upper': u
872
+ }) if l is not None and u is not None else "N/A"
873
+ for l, u in zip(ci_lower, ci_upper)
874
+ ]]
875
+ ))
876
+
877
+ # Update layout
878
+ fig.update_layout(
879
+ title=f"Model Frequencies in Top {len(top_clusters)} Clusters",
880
+ xaxis_title="Frequency (%)",
881
+ yaxis_title="Cluster Description",
882
+ barmode='group', # Group bars side by side
883
+ height=max(600, len(top_clusters) * 25), # Adjust height based on number of clusters
884
+ showlegend=True,
885
+ legend=dict(
886
+ orientation="h",
887
+ yanchor="bottom",
888
+ y=1.02,
889
+ xanchor="right",
890
+ x=1
891
+ )
892
+ )
893
+
894
+ # Update y-axis to show truncated cluster names
895
+ fig.update_yaxes(
896
+ tickmode='array',
897
+ ticktext=[truncate_cluster_name(desc, 60) for desc in top_clusters],
898
+ tickvals=top_clusters
899
+ )
900
+
901
+ # Create quality score chart
902
+ # Get quality scores for the same clusters (single score per cluster)
903
+ quality_data = []
904
+ quality_cis = [] # Add confidence intervals for quality scores
905
+
906
+ for cluster_desc in top_clusters:
907
+ # Get the first available quality score for this cluster
908
+ for model_name, model_data in model_stats.items():
909
+ clusters = model_data.get(cluster_level, [])
910
+ for cluster in clusters:
911
+ if cluster['property_description'] == cluster_desc:
912
+ quality_score = extract_quality_score(cluster.get('quality_score', 0))
913
+ quality_data.append({
914
+ 'property_description': cluster_desc,
915
+ 'quality_score': quality_score
916
+ })
917
+
918
+ # Get quality score confidence intervals
919
+ quality_ci = cluster.get('quality_score_ci', {})
920
+ if isinstance(quality_ci, dict) and quality_ci:
921
+ # Get the first available quality CI
922
+ for score_key, ci_data in quality_ci.items():
923
+ if isinstance(ci_data, dict):
924
+ ci_lower = ci_data.get('lower')
925
+ ci_upper = ci_data.get('upper')
926
+ if ci_lower is not None and ci_upper is not None:
927
+ quality_cis.append({
928
+ 'property_description': cluster_desc,
929
+ 'ci_lower': ci_lower,
930
+ 'ci_upper': ci_upper
931
+ })
932
+ break
933
+ else:
934
+ quality_cis.append({
935
+ 'property_description': cluster_desc,
936
+ 'ci_lower': None,
937
+ 'ci_upper': None
938
+ })
939
+ else:
940
+ quality_cis.append({
941
+ 'property_description': cluster_desc,
942
+ 'ci_lower': None,
943
+ 'ci_upper': None
944
+ })
945
+ break
946
+ if any(q['property_description'] == cluster_desc for q in quality_data):
947
+ break
948
+
949
+ if quality_data:
950
+ quality_df = pd.DataFrame(quality_data)
951
+ quality_cis_df = pd.DataFrame(quality_cis) if quality_cis else None
952
+
953
+ # Create quality score chart with single bars
954
+ fig_quality = go.Figure()
955
+
956
+ # Prepare confidence intervals for error bars
957
+ ci_lower = []
958
+ ci_upper = []
959
+ for _, row in quality_df.iterrows():
960
+ cluster_desc = row['property_description']
961
+ if quality_cis_df is not None:
962
+ ci_row = quality_cis_df[quality_cis_df['property_description'] == cluster_desc]
963
+ if not ci_row.empty:
964
+ ci_lower.append(ci_row.iloc[0]['ci_lower'])
965
+ ci_upper.append(ci_row.iloc[0]['ci_upper'])
966
+ else:
967
+ ci_lower.append(None)
968
+ ci_upper.append(None)
969
+ else:
970
+ ci_lower.append(None)
971
+ ci_upper.append(None)
972
+
973
+ # Add a single bar for each cluster
974
+ fig_quality.add_trace(go.Bar(
975
+ y=[truncate_cluster_name(desc, 60) for desc in quality_df['property_description']],
976
+ x=quality_df['quality_score'],
977
+ orientation='h',
978
+ marker_color='lightblue', # Single color for all bars
979
+ name='Quality Score',
980
+ showlegend=False,
981
+ error_x=dict(
982
+ type='data',
983
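+ # Same plus/minus offset convention as the frequency chart's error bars above.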
+ array=[u - q if u is not None and q is not None else None for q, u in zip(quality_df['quality_score'], ci_upper)],
984
+ arrayminus=[q - l if q is not None and l is not None else None for q, l in zip(quality_df['quality_score'], ci_lower)],
985
+ visible=show_confidence_intervals,
986
+ thickness=1,
987
+ width=3,
988
+ color='rgba(0,0,0,0.3)'
989
+ ),
990
+ hovertemplate='<b>%{y}</b><br>' +
991
+ 'Quality Score: %{x:.3f}<br>' +
992
+ 'CI: %{customdata[0]}<extra></extra>',
993
+ customdata=[[
994
+ format_confidence_interval({
995
+ 'lower': l,
996
+ 'upper': u
997
+ }) if l is not None and u is not None else "N/A"
998
+ for l, u in zip(ci_lower, ci_upper)
999
+ ]]
1000
+ ))
1001
+
1002
+ # Update layout
1003
+ fig_quality.update_layout(
1004
+ title=f"Quality Scores",
1005
+ xaxis_title="Quality Score",
1006
+ yaxis_title="", # No y-axis title to save space
1007
+ height=max(600, len(top_clusters) * 25), # Same height as main chart
1008
+ showlegend=False,
1009
+ yaxis=dict(showticklabels=False) # Hide y-axis labels to save space
1010
+ )
1011
+ else:
1012
+ # Create empty quality figure
1013
+ fig_quality = go.Figure()
1014
+ fig_quality.add_annotation(text="No quality score data available",
1015
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False)
1016
+
1017
+ return fig, fig_quality
1018
+
1019
+
1020
+ def search_clusters_by_text(clustered_df: pd.DataFrame,
1021
+ search_term: str,
1022
+ search_in: str = 'description') -> pd.DataFrame:
1023
+ """Search clusters by text in descriptions or other fields."""
1024
+ if not search_term:
1025
+ return clustered_df.head(100) # Return first 100 if no search
1026
+
1027
+ norm_term = normalize_text_for_search(search_term)
1028
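+ # All branches below do a literal (regex=False) substring match on normalized text.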
+
1029
+ if search_in == 'description':
1030
+ series = clustered_df['property_description'].astype(str).apply(normalize_text_for_search)
1031
+ mask = series.str.contains(norm_term, na=False, regex=False)
1032
+ elif search_in == 'model':
1033
+ series = clustered_df['model'].astype(str).apply(normalize_text_for_search)
1034
+ mask = series.str.contains(norm_term, na=False, regex=False)
1035
+ elif search_in == 'cluster_label':
1036
+ # Use correct column names from pipeline
1037
+ fine_label_col = 'property_description_fine_cluster_label'
1038
+ coarse_label_col = 'property_description_coarse_cluster_label'
1039
+ # Initialize mask aligned to clustered_df index to avoid boolean indexer misalignment
1040
+ mask = pd.Series(False, index=clustered_df.index)
1041
+
1042
+ if fine_label_col in clustered_df.columns:
1043
+ series = clustered_df[fine_label_col].astype(str).apply(normalize_text_for_search)
1044
+ mask = mask | series.str.contains(norm_term, na=False, regex=False)
1045
+ if coarse_label_col in clustered_df.columns:
1046
+ series = clustered_df[coarse_label_col].astype(str).apply(normalize_text_for_search)
1047
+ mask = mask | series.str.contains(norm_term, na=False, regex=False)
1048
+ else:
1049
+ # Search in all text columns using correct column names
1050
+ text_cols = ['property_description', 'model',
1051
+ 'property_description_fine_cluster_label',
1052
+ 'property_description_coarse_cluster_label']
1053
+ # Initialize mask aligned to clustered_df index to avoid boolean indexer misalignment
1054
+ mask = pd.Series(False, index=clustered_df.index)
1055
+ for col in text_cols:
1056
+ if col in clustered_df.columns:
1057
+ series = clustered_df[col].astype(str).apply(normalize_text_for_search)
1058
+ mask = mask | series.str.contains(norm_term, na=False, regex=False)
1059
+
1060
+ return clustered_df[mask].head(100)
1061
+
1062
+
1063
+ def search_clusters_only(clustered_df: pd.DataFrame,
1064
+ search_term: str,
1065
+ cluster_level: str = 'fine') -> pd.DataFrame:
1066
+ """Search only over cluster labels, not individual property descriptions."""
1067
+ if not search_term:
1068
+ return clustered_df
1069
+
1070
+ norm_term = normalize_text_for_search(search_term)
1071
+
1072
+ # Use the correct column names based on cluster level
1073
+ if cluster_level == 'fine':
1074
+ label_col = 'property_description_fine_cluster_label'
1075
+ alt_label_col = 'fine_cluster_label'
1076
+ else:
1077
+ label_col = 'property_description_coarse_cluster_label'
1078
+ alt_label_col = 'coarse_cluster_label'
1079
+
1080
+ # Try both naming patterns
1081
+ if label_col in clustered_df.columns:
1082
+ series = clustered_df[label_col].astype(str).apply(normalize_text_for_search)
1083
+ mask = series.str.contains(norm_term, na=False, regex=False)
1084
+ elif alt_label_col in clustered_df.columns:
1085
+ series = clustered_df[alt_label_col].astype(str).apply(normalize_text_for_search)
1086
+ mask = series.str.contains(norm_term, na=False, regex=False)
1087
+ else:
1088
+ # If neither column exists, return empty DataFrame
1089
+ return pd.DataFrame()
1090
+
1091
+ return clustered_df[mask]
1092
+
1093
+
1094
+ def create_interactive_cluster_viewer(clustered_df: pd.DataFrame,
1095
+ selected_models: Optional[List[str]] = None,
1096
+ cluster_level: str = 'fine') -> str:
1097
+ """Create interactive cluster viewer HTML similar to Streamlit version."""
1098
+ if clustered_df.empty:
1099
+ return "<p>No cluster data available</p>"
1100
+
1101
+ df = clustered_df.copy()
1102
+
1103
+ # Debug information
1104
+ print(f"DEBUG: create_interactive_cluster_viewer called")
1105
+ print(f" - Input DataFrame shape: {df.shape}")
1106
+ print(f" - Selected models: {selected_models}")
1107
+ print(f" - Available models in data: {df['model'].unique().tolist() if 'model' in df.columns else 'No model column'}")
1108
+
1109
+ # Filter by models if specified
1110
+ if selected_models:
1111
+ print(f" - Filtering by {len(selected_models)} selected models")
1112
+ df = df[df['model'].isin(selected_models)]
1113
+ print(f" - After filtering shape: {df.shape}")
1114
+ print(f" - Models after filtering: {df['model'].unique().tolist()}")
1115
+ else:
1116
+ print(f" - No model filtering applied")
1117
+
1118
+ if df.empty:
1119
+ return f"<p>No data found for selected models: {', '.join(selected_models or [])}</p>"
1120
+
1121
+ # Get cluster scores data for quality and frequency information
1122
+ from .state import app_state
1123
+ cluster_scores = app_state.get("metrics", {}).get("cluster_scores", {})
1124
+
1125
+ # Use the actual column names from the pipeline output (matching Streamlit version)
1126
+ if cluster_level == 'fine':
1127
+ id_col = 'property_description_fine_cluster_id'
1128
+ label_col = 'property_description_fine_cluster_label'
1129
+ # Also check for alternative naming without prefix
1130
+ alt_id_col = 'fine_cluster_id'
1131
+ alt_label_col = 'fine_cluster_label'
1132
+ else:
1133
+ id_col = 'property_description_coarse_cluster_id'
1134
+ label_col = 'property_description_coarse_cluster_label'
1135
+ # Also check for alternative naming without prefix
1136
+ alt_id_col = 'coarse_cluster_id'
1137
+ alt_label_col = 'coarse_cluster_label'
1138
+
1139
+ # Track if we fall back from coarse to fine
1140
+ fell_back_to_fine = False
1141
+
1142
+ # Check if required columns exist and provide helpful debug info
1143
+ # Try both naming patterns
1144
+ if id_col in df.columns and label_col in df.columns:
1145
+ # Use the expected naming pattern
1146
+ pass
1147
+ elif alt_id_col in df.columns and alt_label_col in df.columns:
1148
+ # Use the alternative naming pattern
1149
+ id_col = alt_id_col
1150
+ label_col = alt_label_col
1151
+ else:
1152
+ # If coarse clusters are not available, try to fall back to fine clusters
1153
+ if cluster_level == 'coarse':
1154
+ # Check if fine clusters are available
1155
+ fine_id_col = 'property_description_fine_cluster_id'
1156
+ fine_label_col = 'property_description_fine_cluster_label'
1157
+ fine_alt_id_col = 'fine_cluster_id'
1158
+ fine_alt_label_col = 'fine_cluster_label'
1159
+
1160
+ if (fine_id_col in df.columns and fine_label_col in df.columns) or (fine_alt_id_col in df.columns and fine_alt_label_col in df.columns):
1161
+ # Fall back to fine clusters
1162
+ if fine_id_col in df.columns and fine_label_col in df.columns:
1163
+ id_col = fine_id_col
1164
+ label_col = fine_label_col
1165
+ else:
1166
+ id_col = fine_alt_id_col
1167
+ label_col = fine_alt_label_col
1168
+ cluster_level = 'fine' # Update the cluster level for display
1169
+ fell_back_to_fine = True
1170
+ else:
1171
+ # No cluster columns available at all
1172
+ available_cols = list(df.columns)
1173
+ return f"""
1174
+ <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
1175
+ <h4>❌ Missing cluster columns in data</h4>
1176
+ <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
1177
+ <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
1178
+ <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
1179
+ </div>
1180
+ """
1181
+ else:
1182
+ # For fine clusters, show the original error
1183
+ available_cols = list(df.columns)
1184
+ return f"""
1185
+ <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px;">
1186
+ <h4>❌ Missing {cluster_level} cluster columns in data</h4>
1187
+ <p><strong>Expected:</strong> {id_col}, {label_col} OR {alt_id_col}, {alt_label_col}</p>
1188
+ <p><strong>Available columns:</strong> {', '.join(available_cols)}</p>
1189
+ <p>Please ensure your data contains clustering results from the LMM-Vibes pipeline.</p>
1190
+ </div>
1191
+ """
1192
+
1193
+ # Group by cluster to get cluster information
1194
+ try:
1195
+ print(f" - Grouping by cluster columns: {id_col}, {label_col}")
1196
+ # If meta column exists, propagate it into the aggregation so we can tag clusters
1197
+ agg_spec = {
1198
+ 'property_description': ['count', lambda x: x.unique().tolist()],
1199
+ 'model': lambda x: x.unique().tolist()
1200
+ }
1201
+ if 'meta' in df.columns:
1202
+ agg_spec['meta'] = lambda x: x.iloc[0]
1203
+ cluster_groups = df.groupby([id_col, label_col]).agg(agg_spec).reset_index()
1204
+
1205
+ # Flatten column names
1206
+ flat_cols = [id_col, label_col, 'size', 'property_descriptions', 'models']
1207
+ if 'meta' in df.columns:
1208
+ flat_cols.append('meta')
1209
+ cluster_groups.columns = flat_cols
1210
+
1211
+ # Sort by size (largest first)
1212
+ cluster_groups = cluster_groups.sort_values('size', ascending=False)
1213
+
1214
+ # Filter out "No properties" clusters
1215
+ cluster_groups = cluster_groups[cluster_groups[label_col] != "No properties"]
1216
+
1217
+ print(f" - Found {len(cluster_groups)} clusters")
1218
+ print(f" - Cluster sizes: {cluster_groups['size'].tolist()}")
1219
+ print(f" - Models per cluster: {[len(models) for models in cluster_groups['models']]}")
1220
+
1221
+ except Exception as e:
1222
+ return f"""
1223
+ <div style="padding: 20px; background: #f8d7da; border: 1px solid #f5c6cb; border-radius: 8px;">
1224
+ <h4>❌ Error processing cluster data</h4>
1225
+ <p><strong>Error:</strong> {str(e)}</p>
1226
+ <p>Please check your data format and try again.</p>
1227
+ </div>
1228
+ """
1229
+
1230
+ if len(cluster_groups) == 0:
1231
+ return """
1232
+ <div style="padding: 20px; background: #d1ecf1; border: 1px solid #bee5eb; border-radius: 8px;">
1233
+ <h4>ℹ️ No clusters found</h4>
1234
+ <p>No clusters match your current filters. Try selecting different models or adjusting your search.</p>
1235
+ </div>
1236
+ """
1237
+
1238
+ # Helper to extract first value from meta for display
1239
+ def _extract_tag_from_meta(meta_obj: Any) -> Optional[str]:
1240
+ return extract_allowed_tag(meta_obj)
1241
+
1242
+ # Build a stable color map for tags (if any)
1243
+ tag_to_color: dict[str, str] = {
1244
+ "Style": "#9467bd", # purple
1245
+ "Positive": "#28a745", # green
1246
+ "Negative (non-critical)": "#ff7f0e", # orange
1247
+ "Negative (critical)": "#dc3545", # red
1248
+ }
1249
+ if 'meta' in cluster_groups.columns:
1250
+ # If all meta objects are empty dicts, treat as no tags
1251
+ meta_vals = cluster_groups['meta'].tolist()
1252
+ parsed_meta = [_parse_meta_obj(m) for m in meta_vals]
1253
+ non_null_parsed = [m for m in parsed_meta if m is not None]
1254
+ all_empty_dicts = (
1255
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
1256
+ )
1257
+ if not all_empty_dicts:
1258
+ unique_tags = [t for t in (_extract_tag_from_meta(m) for m in meta_vals) if t]
1259
+ unique_tags = list(dict.fromkeys(unique_tags)) # preserve order, dedupe
1260
+ # tag_to_color already contains all allowed tags with fixed colors
1261
+
1262
+ # Helper to remove embedded dicts like "({'group': 'Positive'})" from labels
1263
+ def _sanitize_cluster_label(label: str) -> str:
1264
+ if not isinstance(label, str):
1265
+ return str(label)
1266
+ # Remove ( { ... } ) at end
1267
+ label = re.sub(r"\s*\(\s*\{[^}]*\}\s*\)\s*$", "", label)
1268
+ # Remove trailing { ... }
1269
+ label = re.sub(r"\s*\{[^}]*\}\s*$", "", label)
1270
+ # Remove simple (key: value) trailer
1271
+ label = re.sub(r"\s*\(\s*[^(){}:]+\s*:\s*[^(){}]+\)\s*$", "", label)
1272
+ return label.strip()
1273
+
1274
+ # Create HTML
1275
+ page_html = f"""
1276
+ <div style="max-width: 1600px; margin: 0 auto;">
1277
+ <p style="color: #666; margin-bottom: 20px;">
1278
+ Click on clusters below to explore their property descriptions.
1279
+ Showing {len(cluster_groups)} clusters sorted by size.
1280
+ </p>
1281
+ """
1282
+
1283
+ # Add a note if we fell back from coarse to fine clusters
1284
+ if cluster_level == 'fine' and fell_back_to_fine:
1285
+ page_html += """
1286
+ <div style="padding: 15px; background: #fff3cd; border: 1px solid #ffeaa7; border-radius: 8px; margin-bottom: 20px;">
1287
+ <strong>Note:</strong> Coarse clusters not available in this dataset. Showing fine clusters instead.
1288
+ </div>
1289
+ """
1290
+
1291
+ for i, row in cluster_groups.iterrows():
1292
+ cluster_id = row[id_col]
1293
+ cluster_label = row[label_col]
1294
+ cluster_size = row['size']
1295
+ property_descriptions = row['property_descriptions']
1296
+ models_in_cluster = row['models']
1297
+ # Tag if meta exists in grouped data
1298
+ tag_badge_html = ""
1299
+ tag_value = None
1300
+ if 'meta' in cluster_groups.columns:
1301
+ tag_value = _extract_tag_from_meta(row.get('meta'))
1302
+ if tag_value:
1303
+ color = tag_to_color.get(tag_value, '#4c6ef5')
1304
+ tag_badge_html = (
1305
+ f"<span style=\"display:inline-block; margin-left:10px; padding:3px 8px; "
1306
+ f"border-radius:12px; font-size:11px; font-weight:600; "
1307
+ f"background:{color}1A; color:{color}; border:1px solid {color}33;\">"
1308
+ f"{html.escape(str(tag_value))}</span>"
1309
+ )
1310
+ # Use sanitized label for display then render markdown (no extra <strong>)
1311
+ label_display = _sanitize_cluster_label(str(cluster_label))
1312
+ label_html = (
1313
+ _convdisp._markdown(str(label_display), pretty_print_dicts=False)
1314
+ .replace('<p>', '<span>')
1315
+ .replace('</p>', '</span>')
1316
+ )
1317
+
1318
+ # Get quality and frequency information from cluster_scores
1319
+ cluster_metrics = cluster_scores.get(cluster_label, {})
1320
+ frequency_pct = cluster_metrics.get("proportion", 0) * 100 if cluster_metrics else 0
1321
+ quality_scores = cluster_metrics.get("quality", {})
1322
+ quality_delta = cluster_metrics.get("quality_delta", {})
1323
+
1324
+ # Build per-metric header display: "metric: score (delta)"
1325
+ header_quality_html = "<span style=\"color:#666;\">No quality data</span>"
1326
+ if quality_scores or quality_delta:
1327
+ metric_names = sorted(set(quality_scores.keys()) | set(quality_delta.keys()))
1328
+ line_parts: list[str] = []
1329
+ for metric_name in metric_names:
1330
+ score_val = quality_scores.get(metric_name)
1331
+ delta_val = quality_delta.get(metric_name)
1332
+ score_str = f"{score_val:.3f}" if isinstance(score_val, (int, float)) else "N/A"
1333
+ if isinstance(delta_val, (int, float)):
1334
+ color = "#28a745" if delta_val >= 0 else "#dc3545"
1335
+ line_parts.append(f"<div>{metric_name}: {score_str} <span style=\"color: {color}; font-weight:500;\">({delta_val:+.3f})</span></div>")
1336
+ else:
1337
+ line_parts.append(f"<div>{metric_name}: {score_str}</div>")
1338
+ header_quality_html = "".join(line_parts)
1339
+
1340
+ # Format quality scores for detailed view
1341
+ quality_html = ""
1342
+ if quality_scores:
1343
+ quality_parts = []
1344
+ for metric_name, score in quality_scores.items():
1345
+ color = "#28a745" if score >= 0 else "#dc3545"
1346
+ quality_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {score:.3f}</span>')
1347
+ quality_html = " | ".join(quality_parts)
1348
+ else:
1349
+ quality_html = '<span style="color:#666;">No quality data</span>'
1350
+
1351
+ # Format quality delta (relative to average)
1352
+ quality_delta_html = ""
1353
+ if quality_delta:
1354
+ delta_parts = []
1355
+ for metric_name, delta in quality_delta.items():
1356
+ # Use grey for values very close to zero
1357
+ if abs(delta) < 0.001:
1358
+ color = "#AAAAAA"
1359
+ else:
1360
+ color = "#28a745" if delta > 0 else "#dc3545"
1361
+ sign = "+" if delta >= 0 else ""
1362
+ delta_parts.append(f'<span style="color:{color}; font-weight:500;">{metric_name}: {sign}{delta:.3f}</span>')
1363
+ quality_delta_html = " | ".join(delta_parts)
1364
+ else:
1365
+ quality_delta_html = '<span style="color:#666;">No delta data</span>'
1366
+
1367
+ # Format header quality score with visual indicators
1368
+ header_quality_text = header_quality_html
1369
+
1370
+ # Get light color for this cluster (matching overview style)
1371
+ cluster_color = get_light_color_for_cluster(cluster_label, i)
1372
+
1373
+ # Build per-model frequencies for this cluster (replace models list)
1374
+ metrics_all = app_state.get("metrics", {})
1375
+ model_cluster_scores = metrics_all.get("model_cluster_scores", {})
1376
+ model_freq_items: list[str] = []
1377
+ for m in models_in_cluster:
1378
+ m_dict = model_cluster_scores.get(m, {})
1379
+ c_dict = m_dict.get(cluster_label, {}) if isinstance(m_dict, dict) else {}
1380
+ prop = c_dict.get("proportion")
1381
+ if isinstance(prop, (int, float)):
1382
+ model_freq_items.append(f"{html.escape(str(m))}: {prop * 100:.1f}%")
1383
+ model_freqs_html = " | ".join(model_freq_items) if model_freq_items else "N/A"
1384
+
1385
+ # Create expandable cluster card with overview-style design
1386
+ page_html += f"""
1387
+ <details style="margin: 15px 0; border: 1px solid #e0e0e0; border-radius: 8px; overflow: hidden; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1388
+ <summary style="
1389
+ padding: 15px;
1390
+ background: {get_light_color_for_cluster(cluster_label, i)};
1391
+ color: #333;
1392
+ cursor: pointer;
1393
+ font-weight: 400;
1394
+ font-size: 16px;
1395
+ user-select: none;
1396
+ list-style: none;
1397
+ display: flex;
1398
+ justify-content: space-between;
1399
+ align-items: center;
1400
+ border-bottom: 1px solid #dee2e6;
1401
+ ">
1402
+ <div style="max-width: 80%;">
1403
+ <div style="margin-bottom: 4px; font-size: 14px;">
1404
+ {label_html}
1405
+ </div>
1406
+ <span style="font-size: 12px; color: #555; display:inline-flex; align-items:center;">
1407
+ {frequency_pct:.1f}% frequency ({cluster_size} properties) · {len(models_in_cluster)} models
1408
+ {tag_badge_html}
1409
+ </span>
1410
+ </div>
1411
+ <div style="font-size: 12px; font-weight: normal; text-align: right;">
1412
+ <div style="margin-bottom: 4px; line-height: 1.2;">{header_quality_html}</div>
1413
+ <div style="color: #6c757d;">
1414
+ {frequency_pct:.1f}% frequency
1415
+ </div>
1416
+ </div>
1417
+ </summary>
1418
+
1419
+ <div style="padding: 20px; background: #f8f9fa;">
1420
+ <div style="margin-bottom: 15px;">
1421
+ <strong>Cluster ID:</strong> {cluster_id}<br>
1422
+ <strong>Size:</strong> {cluster_size} properties<br>
1423
+ <strong>Model Frequencies:</strong> {model_freqs_html}<br>
1424
+ </div>
1425
+
1426
+ <h4 style="color: #333; margin: 15px 0 10px 0;">
1427
+ Property Descriptions ({len(property_descriptions)})
1428
+ </h4>
1429
+
1430
+ <div style="max-height: 300px; overflow-y: auto; background: white; border: 1px solid #ddd; border-radius: 4px; padding: 10px;">
1431
+ """
1432
+
1433
+ # Display property descriptions
1434
+ for j, desc in enumerate(property_descriptions, 1):  # 'j' avoids shadowing the outer cluster loop's 'i'
1435
+ page_html += f"""
1436
+ <div style="
1437
+ padding: 8px;
1438
+ margin: 2px 0;
1439
+ background: #f8f9fa;
1440
+ border-left: 3px solid #667eea;
1441
+ border-radius: 2px;
1442
+ ">
1443
+ <strong>{j}.</strong> {desc}
1444
+ </div>
1445
+ """
1446
+
1447
+ page_html += """
1448
+ </div>
1449
+ </div>
1450
+ </details>
1451
+ """
1452
+
1453
+ page_html += "</div>"
1454
+ return page_html
1455
+
1456
+
1457
+ def get_cluster_statistics(clustered_df: pd.DataFrame,
1458
+ selected_models: Optional[List[str]] = None) -> Dict[str, Any]:
1459
+ """Get cluster statistics for display."""
1460
+ if clustered_df.empty:
1461
+ return {}
1462
+
1463
+ df = clustered_df.copy()
1464
+
1465
+ # Filter by models if specified
1466
+ if selected_models:
1467
+ df = df[df['model'].isin(selected_models)]
1468
+
1469
+ stats = {
1470
+ 'total_properties': len(df),
1471
+ 'total_models': df['model'].nunique() if 'model' in df.columns else 0,
1472
+ }
1473
+
1474
+ # Fine cluster statistics - try both naming patterns
1475
+ fine_id_col = 'property_description_fine_cluster_id'
1476
+ alt_fine_id_col = 'fine_cluster_id'
1477
+
1478
+ if fine_id_col in df.columns:
1479
+ stats['fine_clusters'] = df[fine_id_col].nunique()
1480
+ cluster_sizes = df.groupby(fine_id_col).size()
1481
+ stats['min_properties_per_fine_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1482
+ stats['max_properties_per_fine_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1483
+ stats['avg_properties_per_fine_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1484
+ elif alt_fine_id_col in df.columns:
1485
+ stats['fine_clusters'] = df[alt_fine_id_col].nunique()
1486
+ cluster_sizes = df.groupby(alt_fine_id_col).size()
1487
+ stats['min_properties_per_fine_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1488
+ stats['max_properties_per_fine_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1489
+ stats['avg_properties_per_fine_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1490
+
1491
+ # Coarse cluster statistics - try both naming patterns
1492
+ coarse_id_col = 'property_description_coarse_cluster_id'
1493
+ alt_coarse_id_col = 'coarse_cluster_id'
1494
+
1495
+ if coarse_id_col in df.columns:
1496
+ stats['coarse_clusters'] = df[coarse_id_col].nunique()
1497
+ cluster_sizes = df.groupby(coarse_id_col).size()
1498
+ stats['min_properties_per_coarse_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1499
+ stats['max_properties_per_coarse_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1500
+ stats['avg_properties_per_coarse_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1501
+ elif alt_coarse_id_col in df.columns:
1502
+ stats['coarse_clusters'] = df[alt_coarse_id_col].nunique()
1503
+ cluster_sizes = df.groupby(alt_coarse_id_col).size()
1504
+ stats['min_properties_per_coarse_cluster'] = cluster_sizes.min() if not cluster_sizes.empty else 0
1505
+ stats['max_properties_per_coarse_cluster'] = cluster_sizes.max() if not cluster_sizes.empty else 0
1506
+ stats['avg_properties_per_coarse_cluster'] = cluster_sizes.mean() if not cluster_sizes.empty else 0
1507
+
1508
+ return stats
1509
+
1510
+
1511
+ def get_unique_values_for_dropdowns(clustered_df: pd.DataFrame) -> Dict[str, List[str]]:
1512
+ """Get unique values for dropdown menus."""
1513
+ if clustered_df.empty:
1514
+ return {'prompts': [], 'models': [], 'properties': [], 'tags': []}
1515
+
1516
+ # Get unique values, handling missing columns gracefully
1517
+ prompts = []
1518
+ if 'prompt' in clustered_df.columns:
1519
+ unique_prompts = clustered_df['prompt'].dropna().unique().tolist()
1520
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1521
+ elif 'question' in clustered_df.columns:
1522
+ unique_prompts = clustered_df['question'].dropna().unique().tolist()
1523
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1524
+ elif 'input' in clustered_df.columns:
1525
+ unique_prompts = clustered_df['input'].dropna().unique().tolist()
1526
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1527
+ elif 'user_prompt' in clustered_df.columns:
1528
+ unique_prompts = clustered_df['user_prompt'].dropna().unique().tolist()
1529
+ prompts = [prompt[:100] + "..." if len(prompt) > 100 else prompt for prompt in sorted(unique_prompts)]
1530
+
1531
+ # Handle both single model and side-by-side datasets
1532
+ models = []
1533
+ if 'model' in clustered_df.columns:
1534
+ models = sorted(clustered_df['model'].dropna().unique().tolist())
1535
+ elif 'model_a' in clustered_df.columns and 'model_b' in clustered_df.columns:
1536
+ models_a = clustered_df['model_a'].dropna().unique().tolist()
1537
+ models_b = clustered_df['model_b'].dropna().unique().tolist()
1538
+ all_models = set(models_a + models_b)
1539
+ models = sorted(list(all_models))
1540
+
1541
+ # Use fine cluster labels instead of property descriptions - try both naming patterns
1542
+ properties = []
1543
+ fine_label_col = 'property_description_fine_cluster_label'
1544
+ alt_fine_label_col = 'fine_cluster_label'
1545
+
1546
+ if fine_label_col in clustered_df.columns:
1547
+ unique_properties = clustered_df[fine_label_col].dropna().unique().tolist()
1548
+ unique_properties = [prop for prop in unique_properties if prop != "No properties"]
1549
+ properties = [prop[:100] + "..." if len(prop) > 100 else prop for prop in sorted(unique_properties)]
1550
+ elif alt_fine_label_col in clustered_df.columns:
1551
+ unique_properties = clustered_df[alt_fine_label_col].dropna().unique().tolist()
1552
+ unique_properties = [prop for prop in unique_properties if prop != "No properties"]
1553
+ properties = [prop[:100] + "..." if len(prop) > 100 else prop for prop in sorted(unique_properties)]
1554
+ elif 'property_description' in clustered_df.columns:
1555
+ unique_properties = clustered_df['property_description'].dropna().unique().tolist()
1556
+ unique_properties = [prop for prop in unique_properties if prop != "No properties"]
1557
+ properties = [prop[:100] + "..." if len(prop) > 100 else prop for prop in sorted(unique_properties)]
1558
+
1559
+ # Tags from meta first value if available (only ALLOWED_TAGS)
1560
+ tags: List[str] = []
1561
+ if 'meta' in clustered_df.columns:
1562
+ def _first_allowed(obj: Any) -> Optional[str]:
1563
+ return extract_allowed_tag(obj)
1564
+
1565
+ # Compute candidate tags and check for all-empty-dict case
1566
+ parsed_meta_series = clustered_df['meta'].apply(_parse_meta_obj)
1567
+ non_null_parsed = [m for m in parsed_meta_series.tolist() if m is not None]
1568
+ all_empty_dicts = (
1569
+ len(non_null_parsed) > 0 and all(isinstance(m, dict) and len(m) == 0 for m in non_null_parsed)
1570
+ )
1571
+
1572
+ if not all_empty_dicts:
1573
+ tag_series = clustered_df['meta'].apply(_first_allowed)
1574
+ tags = sorted({str(t) for t in tag_series.dropna().tolist() if t is not None and str(t) in ALLOWED_TAGS})
1575
+
1576
+ return {
1577
+ 'prompts': prompts,
1578
+ 'models': models,
1579
+ 'properties': properties,
1580
+ 'tags': tags,
1581
+ }
1582
+
1583
+ # ---------------------------------------------------------------------------
1584
+ # Example data extraction (restored)
1585
+ # ---------------------------------------------------------------------------
1586
+
1587
+ def get_example_data(
1588
+ clustered_df: pd.DataFrame,
1589
+ selected_prompt: str | None = None,
1590
+ selected_model: str | None = None,
1591
+ selected_property: str | None = None,
1592
+ max_examples: int = 5,
1593
+ show_unexpected_behavior: bool = False,
1594
+ randomize: bool = False,
1595
+ ) -> List[Dict[str, Any]]:
1596
+ """Return a list of example rows filtered by prompt / model / property.
1597
+
1598
+ This function was accidentally removed during a refactor; it is required by
1599
+ *examples_tab.py* and other parts of the UI.
1600
+
1601
+ Args:
1602
+ clustered_df: DataFrame containing the clustered results data
1603
+ selected_prompt: Prompt to filter by (None for all)
1604
+ selected_model: Model to filter by (None for all)
1605
+ selected_property: Property description to filter by (None for all)
1606
+ max_examples: Maximum number of examples to return
1607
+ show_unexpected_behavior: If True, filter to only show unexpected behavior
1608
+ randomize: If True, sample randomly from the filtered set instead of taking the first rows
1609
+
1610
+ Returns:
1611
+ List of example dictionaries with extracted data
1612
+ """
1613
+
1614
+ if clustered_df.empty:
1615
+ return []
1616
+
1617
+ df = clustered_df.copy()
1618
+
1619
+ # Filter by unexpected behavior if requested
1620
+ if show_unexpected_behavior:
1621
+ if "unexpected_behavior" in df.columns:
1622
+ # Assuming True/1 means unexpected behavior
1623
+ df = df[df["unexpected_behavior"].isin([True, 1, "True", "true"])]
1624
+ else:
1625
+ # If no unexpected_behavior column, return empty (or could return all)
1626
+ return []
1627
+
1628
+ # Filter by prompt
1629
+ if selected_prompt:
1630
+ prompt_cols = ["prompt", "question", "input", "user_prompt"]
1631
+ for col in prompt_cols:
1632
+ if col in df.columns:
1633
+ df = df[df[col].str.contains(selected_prompt, case=False, na=False, regex=False)]
1634
+ break
1635
+
1636
+ # Filter by model - handle both single model and side-by-side datasets
1637
+ if selected_model:
1638
+ if "model" in df.columns:
1639
+ # Single model datasets
1640
+ df = df[df["model"] == selected_model]
1641
+ elif "model_a" in df.columns and "model_b" in df.columns:
1642
+ # Side-by-side datasets - filter where either model_a or model_b matches
1643
+ df = df[(df["model_a"] == selected_model) | (df["model_b"] == selected_model)]
1644
+
1645
+ # Filter by property
1646
+ if selected_property:
1647
+ property_cols = ["property_description", "cluster", "fine_cluster_label", "property_description_fine_cluster_label"]
1648
+ for col in property_cols:
1649
+ if col in df.columns:
1650
+ df = df[df[col].str.contains(selected_property, case=False, na=False, regex=False)]
1651
+ break
1652
+
1653
+ # Limit to max_examples (randomized if requested)
1654
+ if randomize:
1655
+ if len(df) > max_examples:
1656
+ df = df.sample(n=max_examples)
1657
+ else:
1658
+ df = df.sample(frac=1)
1659
+ else:
1660
+ df = df.head(max_examples)
1661
+
1662
+     examples: List[Dict[str, Any]] = []
+     for _, row in df.iterrows():
+         prompt_val = next(
+             (row.get(col) for col in ["prompt", "question", "input", "user_prompt"] if row.get(col) is not None),
+             "N/A",
+         )
+
+         # Check if this is a side-by-side dataset
+         is_side_by_side = ('model_a_response' in row and 'model_b_response' in row and
+                            row.get('model_a_response') is not None and row.get('model_b_response') is not None)
+
+         if is_side_by_side:
+             # For side-by-side datasets, store both responses separately
+             response_val = "SIDE_BY_SIDE"  # Special marker
+             model_val = f"{row.get('model_a', 'Model A')} vs {row.get('model_b', 'Model B')}"
+         else:
+             # For single-response datasets, use the existing logic
+             response_val = next(
+                 (
+                     row.get(col)
+                     for col in [
+                         "model_response",
+                         "model_a_response",
+                         "model_b_response",
+                         "responses",
+                         "response",
+                         "output",
+                     ]
+                     if row.get(col) is not None
+                 ),
+                 "N/A",
+             )
+             model_val = row.get("model", "N/A")
+
+         # Try both naming patterns for cluster data
+         fine_cluster_id = row.get("property_description_fine_cluster_id", row.get("fine_cluster_id", "N/A"))
+         fine_cluster_label = row.get("property_description_fine_cluster_label", row.get("fine_cluster_label", "N/A"))
+         coarse_cluster_id = row.get("property_description_coarse_cluster_id", row.get("coarse_cluster_id", "N/A"))
+         coarse_cluster_label = row.get("property_description_coarse_cluster_label", row.get("coarse_cluster_label", "N/A"))
+
+         example_dict = {
+             "id": row.get("id", "N/A"),
+             "model": model_val,
+             "prompt": prompt_val,
+             "response": response_val,
+             "property_description": row.get("property_description", "N/A"),
+             "score": row.get("score", "N/A"),
+             "fine_cluster_id": fine_cluster_id,
+             "fine_cluster_label": fine_cluster_label,
+             "coarse_cluster_id": coarse_cluster_id,
+             "coarse_cluster_label": coarse_cluster_label,
+             "category": row.get("category", "N/A"),
+             "type": row.get("type", "N/A"),
+             "impact": row.get("impact", "N/A"),
+             "reason": row.get("reason", "N/A"),
+             "evidence": row.get("evidence", "N/A"),
+             "meta": row.get("meta", None),
+             "user_preference_direction": row.get("user_preference_direction", "N/A"),
+             "raw_response": row.get("raw_response", "N/A"),
+             "contains_errors": row.get("contains_errors", "N/A"),
+             "unexpected_behavior": row.get("unexpected_behavior", "N/A"),
+         }
+
+         # Add side-by-side specific fields if applicable
+         if is_side_by_side:
+             example_dict.update({
+                 "is_side_by_side": True,
+                 "model_a": row.get("model_a", "Model A"),
+                 "model_b": row.get("model_b", "Model B"),
+                 "model_a_response": row.get("model_a_response", "N/A"),
+                 "model_b_response": row.get("model_b_response", "N/A"),
+                 "winner": row.get("winner", None),
+             })
+         else:
+             example_dict["is_side_by_side"] = False
+
+         examples.append(example_dict)
+
+     return examples
+
+
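+ # Usage sketch (illustrative only; the exact signature of get_example_data is
+ # defined above and the "..." stands in for its real arguments). Each returned
+ # item is a plain dict whose keys match example_dict above:
+ #
+ #     examples = get_example_data(...)      # filtered + sampled rows as dicts
+ #     examples[0]["fine_cluster_label"]     # cluster label, or "N/A" if absent
+
+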
+ def format_examples_display(examples: List[Dict[str, Any]],
+                             selected_prompt: str | None = None,
+                             selected_model: str | None = None,
+                             selected_property: str | None = None,
+                             use_accordion: bool = True,
+                             pretty_print_dicts: bool = True) -> str:
+     """Format examples for HTML display with proper conversation rendering.
+
+     Args:
+         examples: List of example dictionaries
+         selected_prompt: Currently selected prompt filter
+         selected_model: Currently selected model filter
+         selected_property: Currently selected property filter
+         use_accordion: If True, group system and info messages in collapsible accordions
+         pretty_print_dicts: If True, pretty-print embedded dictionaries
+
+     Returns:
+         HTML string for display
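+
+     Example (illustrative sketch; "gpt-4o" is a hypothetical model name and
+     `examples` is the list returned by get_example_data):
+
+         html_str = format_examples_display(examples, selected_model="gpt-4o")
+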
+     """
+     from .conversation_display import convert_to_openai_format, display_openai_conversation_html
+     from .side_by_side_display import display_side_by_side_responses
+
+     if not examples:
+         return "<p style='color: #e74c3c; padding: 20px;'>No examples found matching the current filters.</p>"
+
+     # Create filter summary
+     filter_parts = []
+     if selected_prompt and selected_prompt != "All Prompts":
+         filter_parts.append(f"Prompt: {selected_prompt}")
+     if selected_model and selected_model != "All Models":
+         filter_parts.append(f"Model: {selected_model}")
+     if selected_property and selected_property != "All Clusters":
+         filter_parts.append(f"Cluster: {selected_property}")
+
+     filter_summary = ""
+     if filter_parts:
+         filter_summary = f"""
+         <div style="background: #e3f2fd; padding: 15px; border-radius: 8px; margin-bottom: 20px; border-left: 4px solid #2196f3;">
+             <strong>🔍 Active Filters:</strong> {" • ".join(filter_parts)}
+         </div>
+         """
+
+     html_out = f"""
+     <div class="examples-container" style="font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
+     <style>
+     /* Make JSON/code wrappers transparent (fall back to white when inline-styled) */
+     .examples-container pre,
+     .examples-container .highlight,
+     .examples-container .codehilite,
+     .examples-container p pre,
+     .examples-container li pre,
+     .examples-container div pre {{
+         background: transparent !important;
+     }}
+     .examples-container code {{ background: transparent !important; }}
+     </style>
+     <h3 style="color: #333; margin-bottom: 15px;">📋 Examples ({len(examples)} found)</h3>
+     {filter_summary}
+     """
+
+     for i, example in enumerate(examples, 1):
+         # Check if this is a side-by-side example
+         if example.get('is_side_by_side', False):
+             # Use side-by-side display for comparison datasets
+             conversation_html = display_side_by_side_responses(
+                 model_a=example['model_a'],
+                 model_b=example['model_b'],
+                 model_a_response=example['model_a_response'],
+                 model_b_response=example['model_b_response'],
+                 use_accordion=use_accordion,
+                 pretty_print_dicts=pretty_print_dicts,
+                 score=example['score'],
+                 winner=example.get('winner')
+             )
+         else:
+             # Convert response to OpenAI format for proper display (single model)
+             response_data = example['response']
+             if response_data != 'N/A':
+                 openai_conversation = convert_to_openai_format(response_data)
+                 conversation_html = display_openai_conversation_html(
+                     openai_conversation,
+                     use_accordion=use_accordion,
+                     pretty_print_dicts=pretty_print_dicts,
+                     evidence=example.get('evidence')
+                 )
+             else:
+                 conversation_html = "<p style='color: #dc3545; font-style: italic;'>No response data available</p>"
+
+         # Compact cluster badge for header row (built here, but the cluster is
+         # currently surfaced in the Property Information section instead)
+         cluster_badge = ""
+         if example['fine_cluster_label'] != 'N/A':
+             cluster_badge = (
+                 f"<span style=\"display:inline-block; padding:2px 8px; border-radius:999px; font-size:11px; font-weight:600; background:#eef2ff; color:#4f46e5; border:1px solid #e0e7ff;\">"
+                 f"Cluster: {html.escape(str(example['fine_cluster_label']))}"
+                 f"</span>"
+             )
+
+         # Tag badge derived from meta (first allowed value, if any)
+         tag_badge = ""
+         tag_value = extract_allowed_tag(example.get('meta'))
+         if tag_value is not None and str(tag_value).strip() != "":
+             tag_badge = (
+                 f"<span style=\"display:inline-block; padding:2px 8px; border-radius:999px; background:#faf5ff; color:#6d28d9; border:1px solid #ede9fe;\">"
+                 f"Tag: {html.escape(str(tag_value))}"
+                 f"</span>"
+             )
+
+         # Score display for summary (only for non-side-by-side or when not shown in side-by-side)
+         score_badge = ""
+         if not example.get('is_side_by_side', False) and example['score'] != 'N/A':
+             try:
+                 score_val = float(example['score'])
+                 score_color = '#28a745' if score_val >= 0 else '#dc3545'
+                 score_badge = f"""
+                 <span style="
+                     background: {score_color};
+                     color: white;
+                     padding: 4px 8px;
+                     border-radius: 12px;
+                     font-size: 12px;
+                     font-weight: bold;
+                     margin-left: 10px;
+                 ">
+                     Score: {score_val:.3f}
+                 </span>
+                 """
+             except (TypeError, ValueError):
+                 # Non-numeric scores simply get no badge
+                 pass
+
+         # Create short preview of prompt for summary (escaped for safe HTML embedding)
+         prompt_text = str(example['prompt'])
+         prompt_preview = html.escape(prompt_text[:80] + "..." if len(prompt_text) > 80 else prompt_text)
+
+         # Create expandable example card; the first example is expanded by default
+         open_attr = "open" if i == 1 else ""
+
+         # Build top-of-card score section (above conversation) if score exists
+         score_section_html = ""
+         raw_score = example.get('score')
+         numeric_score: float | None = None
+         if isinstance(raw_score, (int, float)):
+             numeric_score = float(raw_score)
+         elif isinstance(raw_score, str):
+             # Accept simple numeric strings without try/except
+             if re.match(r"^[+-]?\d+(?:\.\d+)?$", raw_score.strip()):
+                 numeric_score = float(raw_score)
+         # Avoid duplicating score display for side-by-side, which renders its own score section
+         if numeric_score is not None and not example.get('is_side_by_side', False):
+             color_bg = '#dcfce7' if numeric_score >= 0 else '#fee2e2'
+             color_fg = '#166534' if numeric_score >= 0 else '#991b1b'
+             score_chip = (
+                 f"<span style=\"display:inline-block; padding:4px 10px; border-radius:999px; "
+                 f"background:{color_bg}; color:{color_fg}; font-weight:600; font-size:12px; "
+                 f"border:1px solid rgba(0,0,0,0.05);\">Score: {numeric_score:.3f}</span>"
+             )
+             score_section_html = (
+                 f"<div style=\"margin: 0 0 12px 0; display:flex; align-items:center; flex-wrap:wrap; gap:8px;\">"
+                 f"{score_chip}"
+                 f"</div>"
+             )
+
+ html_out += f"""
1907
+ <details {open_attr} style="border: 1px solid #dee2e6; border-radius: 8px; margin-bottom: 15px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
1908
+ <summary style="
1909
+ padding: 15px;
1910
+ cursor: pointer;
1911
+ font-weight: 600;
1912
+ color: #495057;
1913
+ background: linear-gradient(90deg, #f8f9fa 0%, #e9ecef 100%);
1914
+ border-radius: 8px 8px 0 0;
1915
+ border-bottom: 1px solid #dee2e6;
1916
+ display: flex;
1917
+ align-items: center;
1918
+ justify-content: space-between;
1919
+ ">
1920
+ <span>
1921
+ <span style="background: #6c757d; color: white; padding: 4px 8px; border-radius: 4px; font-size: 12px; margin-right: 10px;">#{i}</span>
1922
+ {prompt_preview}
1923
+ </span>
1924
+ <span style="font-size: 12px; color: #6c757d;">
1925
+ {example['model']}{score_badge}
1926
+ </span>
1927
+ </summary>
1928
+
1929
+ <div style="padding: 20px;">
1930
+ <!-- Compact metadata badges row -->
1931
+ <div style="display:flex; flex-wrap:wrap; gap:8px; align-items:center; margin-bottom: 16px; font-size:12px; color:#6b7280;">
1932
+ <span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#f3f4f6; border:1px solid #e5e7eb;">ID: {html.escape(str(example['id']))}</span>
1933
+ <span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#f3f4f6; border:1px solid #e5e7eb;">Model: {html.escape(str(example['model']))}</span>
1934
+ {tag_badge}
1935
+ {(f'<span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#ecfdf5; color:#047857; border:1px solid #bbf7d0;">Category: {html.escape(str(example["category"]))}</span>' if example["category"] not in [None, "N/A", "None", "", "null"] and str(example["category"]).strip() != "" else '')}
1936
+ {(f'<span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#eff6ff; color:#1d4ed8; border:1px solid #dbeafe;">Type: {html.escape(str(example["type"]))}</span>' if example["type"] not in [None, "N/A", "None", "", "null"] and str(example["type"]).strip() != "" else '')}
1937
+ {(f'<span style="display:inline-block; padding:2px 8px; border-radius:999px; background:#fff7ed; color:#c2410c; border:1px solid #fed7aa;">Impact: {html.escape(str(example["impact"]))}</span>' if example["impact"] not in [None, "N/A", "None", "", "null"] and str(example["impact"]).strip() != "" else '')}
1938
+ </div>
1939
+
1940
+ <!-- Collapsible info section for Cluster / Tag / Property / Reason / Evidence -->
1941
+ {(
1942
+ f'''<details style="margin-bottom:16px; border:1px solid #e5e7eb; border-radius:8px; background:#f9fafb;">
1943
+ <summary style="cursor:pointer; padding:12px; font-weight:600; color:#374151; border-radius:8px;">
1944
+ πŸ“‹ Property Information
1945
+ </summary>
1946
+ <div style="padding:0 12px 12px 12px; border-top:1px solid #e5e7eb;">
1947
+ {(f'<div style="margin-top:12px;"><strong style="color:#374151;">Cluster</strong><div style="color:#4b5563; margin-top:4px;">{_convdisp._markdown(str(example["fine_cluster_label"]))}</div></div>' if example.get("fine_cluster_label") not in [None, "N/A", "None", "", "null"] and str(example.get("fine_cluster_label", "")).strip() != "" else '')}
1948
+ {(f'<div style="margin-top:12px;"><strong style="color:#374151;">Property</strong><div style="color:#4b5563; margin-top:4px;">{_convdisp._markdown(str(example["property_description"]))}</div></div>' if example["property_description"] not in [None, "N/A", "None", "", "null"] and str(example["property_description"]).strip() != "" else '')}
1949
+ {(f'<div style="margin-top:12px;"><strong style="color:#374151;">Evidence</strong><div style="color:#4b5563; margin-top:4px;">{_convdisp._markdown(str(example["evidence"]))}</div></div>' if example["evidence"] not in [None, "N/A", "None", "", "null"] and str(example["evidence"]).strip() != "" else '')}
1950
+ </div>
1951
+ </details>'''
1952
+ ) if any([
1953
+ example.get("fine_cluster_label") not in [None, "N/A", "None", "", "null"] and str(example.get("fine_cluster_label", "")).strip() != "",
1954
+ example.get("property_description") not in [None, "N/A", "None", "", "null"] and str(example.get("property_description", "")).strip() != "",
1955
+ example.get("reason") not in [None, "N/A", "None", "", "null"] and str(example.get("reason", "")).strip() != "",
1956
+ example.get("evidence") not in [None, "N/A", "None", "", "null"] and str(example.get("evidence", "")).strip() != "",
1957
+ ]) else ''}
1958
+
1959
+ {score_section_html}
1960
+
1961
+ <div style="margin-bottom: 15px;">
1962
+ <div style="border-radius: 6px; font-size: 15px; line-height: 1.5;">
1963
+ {conversation_html}
1964
+ </div>
1965
+ </div>
1966
+ </div>
1967
+ </details>
1968
+ """
1969
+
1970
+ html_out += "</div>"
1971
+ return html_out
1972
+
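+ # Wiring sketch (assumption — this module only returns an HTML string; the
+ # component variable below is illustrative, not taken from the dashboard code):
+ #
+ #     import gradio as gr
+ #     examples_panel = gr.HTML(value=format_examples_display(examples))
+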
+
+ # ---------------------------------------------------------------------------
+ # Legacy function aliases (backward compatibility)
+ # ---------------------------------------------------------------------------
+
+ def compute_model_rankings(*args, **kwargs):
+     """Legacy alias → forwards to compute_model_rankings_new."""
+     return compute_model_rankings_new(*args, **kwargs)
+
+
+ def create_model_summary_card(*args, **kwargs):
+     """Legacy alias → forwards to create_model_summary_card_new."""
+     return create_model_summary_card_new(*args, **kwargs)
+
+
+ def get_total_clusters_count(metrics: Dict[str, Any]) -> int:
+     """Get the total number of clusters from the metrics data."""
+     cluster_scores = metrics.get("cluster_scores", {})
+     # Filter out "No properties" clusters
+     cluster_scores = {k: v for k, v in cluster_scores.items() if k != "No properties"}
+     return len(cluster_scores)
+
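+ # Shape sketch for `metrics` (illustrative; the cluster names are made up and
+ # any other top-level keys are omitted):
+ #
+ #     metrics = {"cluster_scores": {"Asks clarifying questions": {...},
+ #                                   "No properties": {...}}}
+ #     get_total_clusters_count(metrics)  # -> 1 ("No properties" is excluded)
+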
+
+ def get_light_color_for_cluster(cluster_name: str, index: int) -> str:
+     """Generate a light dusty blue background for cluster boxes.
+
+     Both arguments are accepted for API compatibility but are currently
+     unused: every cluster gets the same consistent light dusty blue.
+     """
+     return "#f0f4f8"  # Very light dusty blue
+
+
+ __all__ = [
+     "get_model_clusters",
+     "get_all_models",
+     "get_all_clusters",
+     "format_confidence_interval",
+     "get_confidence_interval_width",
+     "has_confidence_intervals",
+     "extract_quality_score",
+     "get_top_clusters_for_model",
+     "compute_model_rankings_new",
+     "create_model_summary_card_new",
+     "format_cluster_dataframe",
+     "truncate_cluster_name",
+     "create_frequency_comparison_table",
+     "create_frequency_comparison_plots",
+     "search_clusters_by_text",
+     "search_clusters_only",
+     "create_interactive_cluster_viewer",
+     "get_cluster_statistics",
+     "get_unique_values_for_dropdowns",
+     "get_example_data",
+     "format_examples_display",
+     "compute_model_rankings",
+     "create_model_summary_card",
+     "get_total_clusters_count",
+     "get_light_color_for_cluster",
+ ]
2027
+ ]