import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt

# ─── 1. BENCHMARK DATA ──────────────────────────────────────────────────────────
# Nested dict: Domain → { Model Name → {metric_name: value, …, "SOTA_": value } }
# Keys prefixed "SOTA_" hold the published state-of-the-art figure for the
# matching metric; the plotting code strips that prefix to pair each SOTA value
# with the model's own score. NOTE(review): values mix percentages, unit-less
# scores and error magnitudes — they share a single y-axis in the plots.
benchmark_data = {
    "Protein Folding": {
        # Secondary-structure predictor (sequence → Q3/Q8).
        "Nexa Bio1 (Secondary)": {
            "Accuracy (%)": 71,
            "Q3 (%)": 65,
            "Q8 (%)": 55,
            "TM-score": 0.60,
            "SOTA_Accuracy (%)": 85,
            "SOTA_TM-score": 0.75
        },
        # Tertiary-structure (full 3D fold) confidence model.
        "Nexa Bio2 (Tertiary)": {
            "Confidence (%)": 90,
            "GDT_TS": 0.82,
            "Entropy Threshold (%)": 80,
            "SOTA_Confidence (%)": 92,
            "SOTA_GDT_TS": 0.85
        },
    },
    "Astrophysics": {
        "Nexa Astro": {
            "Accuracy (%)": 97,
            "Macro-F1 (%)": 96,
            "ROC-AUC": 0.98,
            "SOTA_Accuracy (%)": 96,
            "SOTA_ROC-AUC": 0.97
        },
    },
    "Materials Science": {
        # Lower is better for MAE/RMSE; higher for bandgap accuracy.
        "Nexa MatSci": {
            "MAE (eV)": 0.02,
            "RMSE (eV)": 0.03,
            "Bandgap Accuracy (%)": 98,
            "SOTA_MAE (eV)": 0.03,
            "SOTA_Bandgap Accuracy (%)": 95
        },
    },
    "Quantum State Tomography": {
        "Nexa QST": {
            "Fidelity": 0.80,
            "Purity": 1.00,
            "Trace Distance": 0.15,
            "SOTA_Fidelity": 0.83,
            "SOTA_Trace Distance": 0.12
        },
    },
    "Computational Fluid Dynamics": {
        "Nexa CFD": {
            "Relative L2 Error": 0.015,
            "Energy Conservation Loss": 0.005,
            "PSNR": 30,
            "SSIM": 0.88,
            "SOTA_Relative L2 Error": 0.020,
            "SOTA_SSIM": 0.85
        },
    },
    "High-Energy Physics": {
        "Nexa HEP": {
            "ROC-AUC": 0.92,
            "Event Accuracy (%)": 90,
            "Jet Tagging (%)": 88,
            "SOTA_ROC-AUC": 0.93,
            "SOTA_Event Accuracy (%)": 89
        },
    },
    "LLM Hypothesis & Methodology": {
        # NOTE(review): "Expert-Rated SOTA (1–10)" does not start with "SOTA_",
        # so the plotting code treats it as a regular metric, not a baseline.
        "Nexa MOE": {
            "Coherence (1–10)": 9.1,
            "Novelty (1–10)": 8.6,
            "Utility (1–10)": 8.8,
            "Expert-Rated SOTA (1–10)": 9.0
        },
    },
}

# ─── 2. SECTION DESCRIPTIONS ───────────────────────────────────────────────────
# Markdown blurb rendered above the table/plot for each domain. Keys must match
# benchmark_data exactly — show_eval() indexes both dicts with the same name.
section_descriptions = {
    "Protein Folding": """**Protein Folding** Benchmarks for secondary (Q3/Q8) and tertiary (TM-score) structure prediction. Nexa Bio1 handles sequence→secondary, Nexa Bio2 handles full 3D fold confidence.""",
    "Astrophysics": """**Astrophysics** Stellar classification and redshift estimation. 
Metrics: Accuracy, F1, ROC-AUC against SDSS-Net and astroML baselines.""",
    "Materials Science": """**Materials Science** Property prediction for novel materials (e.g., bandgap, formation energy). Metrics: MAE/RMSE, bandgap‐prediction accuracy vs. CGCNN, ALIGNN.""",
    "Quantum State Tomography": """**Quantum State Tomography** Reconstruct quantum states from measurement data. Metrics: Fidelity, Purity, Trace Distance against PINNs and QuNet.""",
    "Computational Fluid Dynamics": """**CFD** Flow field prediction (Navier–Stokes). Metrics: Relative L2 Error, PSNR/SSIM, Energy Conservation Loss vs. FNO.""",
    "High-Energy Physics": """**High-Energy Physics** Particle classification and signal/background separation. Metrics: ROC-AUC, event reconstruction accuracy, jet-tagging efficiency.""",
    "LLM Hypothesis & Methodology": """**LLM-Based Scientific Reasoning** Hypothesis and methodology generation. Metrics scored 1–10 by expert rubric on Coherence, Novelty, and Utility; compared to top academic LLM baselines."""
}

# ─── 3. PLOTTING FUNCTION ────────────────────────────────────────────────────
def plot_comparison(category):
    """Bar-chart a domain's model metrics next to their SOTA baselines.

    Args:
        category: Key into the module-level ``benchmark_data`` dict.

    Returns:
        A matplotlib Figure with one grouped bar cluster per model: the
        model's own metrics first, then (semi-transparent) the SOTA values.
    """
    data = benchmark_data[category]
    fig, ax = plt.subplots(figsize=(7, 4))
    bar_width = 0.4
    labels = list(data.keys())
    tick_positions = []  # one x-tick per model, centred on its metric bars

    for i, model in enumerate(labels):
        metrics = data[model]
        # Split the flat metric dict: the model's own scores vs. the
        # "SOTA_"-prefixed baselines (prefix stripped for display pairing).
        non_sota = {k: v for k, v in metrics.items() if not k.startswith("SOTA")}
        sota = {k.replace("SOTA_", ""): v for k, v in metrics.items() if k.startswith("SOTA")}
        pos = i * 2  # 2-unit pitch leaves a visual gap between model groups
        ax.bar([pos + j * bar_width for j in range(len(non_sota))],
               list(non_sota.values()),
               width=bar_width, label=f"{model} Metrics")
        if sota:
            ax.bar([pos + bar_width * len(non_sota) + j * bar_width for j in range(len(sota))],
                   list(sota.values()),
                   width=bar_width, alpha=0.7, label=f"{model} SOTA")
        # BUG FIX: the original computed every tick position after the loop
        # from the leaked loop variable `non_sota`, i.e. the *last* model's
        # metric count — mis-centring ticks whenever models have different
        # numbers of metrics (e.g. Protein Folding: 4 vs. 3) and raising
        # NameError for an empty domain. Compute each tick per model instead.
        tick_positions.append(pos + bar_width * (len(non_sota) / 2))

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_ylabel("Value / Score")
    ax.set_title(f"{category} — Nexa vs. SOTA")
    ax.legend(loc="upper right")
    plt.tight_layout()
    return fig

# ─── 4. CALLBACK TO RENDER SECTION ─────────────────────────────────────────────
def show_eval(category):
    """Return the (markdown description, metrics table, comparison figure)
    triple that the Gradio outputs expect for the chosen *category*."""
    desc = section_descriptions[category]
    # Transpose so models are rows and metric names are columns.
    df = pd.DataFrame(benchmark_data[category]).T
    fig = plot_comparison(category)
    return desc, df, fig

# ─── 5. BUILD GRADIO APP ───────────────────────────────────────────────────────
with gr.Blocks(css="""
body { background-color: #f7f9fc; font-family: Arial, sans-serif; }
.gradio-container { max-width: 900px; margin: auto; }
h1, h2, h3 { color: #333; }
""") as app:
    gr.Markdown("# 🔬 Nexa Evals Dashboard")
    gr.Markdown("A **comprehensive** SciML benchmark framework. \nSelect a domain to view metrics, compare with SOTA, and explore detailed plots and tables.")
    with gr.Row():
        with gr.Column(scale=1):
            category = gr.Radio(
                choices=list(benchmark_data.keys()),
                value="Protein Folding",
                label="Select Domain / Model Group"
            )
        with gr.Column(scale=3):
            description = gr.Markdown("")
            # BUG FIX: the original passed headers=["Metric", "Value"], which
            # contradicts the transposed frame show_eval returns (models as
            # rows, one column per metric); let the DataFrame supply headers.
            table = gr.Dataframe(interactive=False)
            plot = gr.Plot()

    category.change(
        fn=show_eval,
        inputs=category,
        outputs=[description, table, plot]
    )

    # BUG FIX: the original seeded the UI with
    #   description.value, table.value, _ = show_eval("Protein Folding")
    # which mutates components after construction and discards the figure, so
    # the plot started empty. Run the same callback once on page load instead.
    app.load(fn=show_eval, inputs=category, outputs=[description, table, plot])

# Launch (on Hugging Face the config flags will be auto-managed)
app.launch()