import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt

# ─── 1. BENCHMARK DATA ──────────────────────────────────────────────────────────
# Nested dict: Domain → { Model Name → {metric_name: value, …, "SOTA_": value } }
# Keys prefixed "SOTA_" hold the published state-of-the-art figure for the
# matching metric; the plotting code strips that prefix to pair each SOTA value
# with the model's own score. NOTE(review): values mix percentages, unit-less
# scores and error magnitudes — they share a single y-axis in the plots.
benchmark_data = {
    "Protein Folding": {
        # Secondary-structure predictor (sequence → Q3/Q8).
        "Nexa Bio1 (Secondary)": {
            "Accuracy (%)": 71,
            "Q3 (%)": 65,
            "Q8 (%)": 55,
            "TM-score": 0.60,
            "SOTA_Accuracy (%)": 85,
            "SOTA_TM-score": 0.75
        },
        # Tertiary-structure (full 3D fold) confidence model.
        "Nexa Bio2 (Tertiary)": {
            "Confidence (%)": 90,
            "GDT_TS": 0.82,
            "Entropy Threshold (%)": 80,
            "SOTA_Confidence (%)": 92,
            "SOTA_GDT_TS": 0.85
        },
    },
    "Astrophysics": {
        "Nexa Astro": {
            "Accuracy (%)": 97,
            "Macro-F1 (%)": 96,
            "ROC-AUC": 0.98,
            "SOTA_Accuracy (%)": 96,
            "SOTA_ROC-AUC": 0.97
        },
    },
    "Materials Science": {
        # Lower is better for MAE/RMSE; higher for bandgap accuracy.
        "Nexa MatSci": {
            "MAE (eV)": 0.02,
            "RMSE (eV)": 0.03,
            "Bandgap Accuracy (%)": 98,
            "SOTA_MAE (eV)": 0.03,
            "SOTA_Bandgap Accuracy (%)": 95
        },
    },
    "Quantum State Tomography": {
        "Nexa QST": {
            "Fidelity": 0.80,
            "Purity": 1.00,
            "Trace Distance": 0.15,
            "SOTA_Fidelity": 0.83,
            "SOTA_Trace Distance": 0.12
        },
    },
    "Computational Fluid Dynamics": {
        "Nexa CFD": {
            "Relative L2 Error": 0.015,
            "Energy Conservation Loss": 0.005,
            "PSNR": 30,
            "SSIM": 0.88,
            "SOTA_Relative L2 Error": 0.020,
            "SOTA_SSIM": 0.85
        },
    },
    "High-Energy Physics": {
        "Nexa HEP": {
            "ROC-AUC": 0.92,
            "Event Accuracy (%)": 90,
            "Jet Tagging (%)": 88,
            "SOTA_ROC-AUC": 0.93,
            "SOTA_Event Accuracy (%)": 89
        },
    },
    "LLM Hypothesis & Methodology": {
        # NOTE(review): "Expert-Rated SOTA (1–10)" does not start with "SOTA_",
        # so the plotting code treats it as a regular metric, not a baseline.
        "Nexa MOE": {
            "Coherence (1–10)": 9.1,
            "Novelty (1–10)": 8.6,
            "Utility (1–10)": 8.8,
            "Expert-Rated SOTA (1–10)": 9.0
        },
    },
}

# ─── 2. SECTION DESCRIPTIONS ───────────────────────────────────────────────────
# Markdown blurb rendered above the table/plot for each domain. Keys must match
# benchmark_data exactly — show_eval() indexes both dicts with the same name.
section_descriptions = {
    "Protein Folding": """**Protein Folding** Benchmarks for secondary (Q3/Q8) and tertiary (TM-score) structure prediction. Nexa Bio1 handles sequence→secondary, Nexa Bio2 handles full 3D fold confidence.""",
    "Astrophysics": """**Astrophysics** Stellar classification and redshift estimation. 
Metrics: Accuracy, F1, ROC-AUC against SDSS-Net and astroML baselines.""",
    "Materials Science": """**Materials Science** Property prediction for novel materials (e.g., bandgap, formation energy). Metrics: MAE/RMSE, bandgap‐prediction accuracy vs. CGCNN, ALIGNN.""",
    "Quantum State Tomography": """**Quantum State Tomography** Reconstruct quantum states from measurement data. Metrics: Fidelity, Purity, Trace Distance against PINNs and QuNet.""",
    "Computational Fluid Dynamics": """**CFD** Flow field prediction (Navier–Stokes). Metrics: Relative L2 Error, PSNR/SSIM, Energy Conservation Loss vs. FNO.""",
    "High-Energy Physics": """**High-Energy Physics** Particle classification and signal/background separation. Metrics: ROC-AUC, event reconstruction accuracy, jet-tagging efficiency.""",
    "LLM Hypothesis & Methodology": """**LLM-Based Scientific Reasoning** Hypothesis and methodology generation. Metrics scored 1–10 by expert rubric on Coherence, Novelty, and Utility; compared to top academic LLM baselines."""
}

# ─── 3. PLOTTING FUNCTION ────────────────────────────────────────────────────
def plot_comparison(category):
    """Bar-chart a domain's model metrics next to their SOTA baselines.

    Args:
        category: Key into the module-level ``benchmark_data`` dict.

    Returns:
        A matplotlib Figure with one grouped bar cluster per model: the
        model's own metrics first, then (semi-transparent) the SOTA values.
    """
    data = benchmark_data[category]
    fig, ax = plt.subplots(figsize=(7, 4))
    bar_width = 0.4
    labels = list(data.keys())
    tick_positions = []  # one x-tick per model, centred on its metric bars

    for i, model in enumerate(labels):
        metrics = data[model]
        # Split the flat metric dict: the model's own scores vs. the
        # "SOTA_"-prefixed baselines (prefix stripped for display pairing).
        non_sota = {k: v for k, v in metrics.items() if not k.startswith("SOTA")}
        sota = {k.replace("SOTA_", ""): v for k, v in metrics.items() if k.startswith("SOTA")}
        pos = i * 2  # 2-unit pitch leaves a visual gap between model groups
        ax.bar([pos + j * bar_width for j in range(len(non_sota))],
               list(non_sota.values()),
               width=bar_width, label=f"{model} Metrics")
        if sota:
            ax.bar([pos + bar_width * len(non_sota) + j * bar_width for j in range(len(sota))],
                   list(sota.values()),
                   width=bar_width, alpha=0.7, label=f"{model} SOTA")
        # BUG FIX: the original computed every tick position after the loop
        # from the leaked loop variable `non_sota`, i.e. the *last* model's
        # metric count — mis-centring ticks whenever models have different
        # numbers of metrics (e.g. Protein Folding: 4 vs. 3) and raising
        # NameError for an empty domain. Compute each tick per model instead.
        tick_positions.append(pos + bar_width * (len(non_sota) / 2))

    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_ylabel("Value / Score")
    ax.set_title(f"{category} — Nexa vs. SOTA")
    ax.legend(loc="upper right")
    plt.tight_layout()
    return fig

# ─── 4. CALLBACK TO RENDER SECTION ─────────────────────────────────────────────
def show_eval(category):
    """Return the (markdown description, metrics table, comparison figure)
    triple that the Gradio outputs expect for the chosen *category*."""
    desc = section_descriptions[category]
    # Transpose so models are rows and metric names are columns.
    df = pd.DataFrame(benchmark_data[category]).T
    fig = plot_comparison(category)
    return desc, df, fig

# ─── 5. BUILD GRADIO APP ───────────────────────────────────────────────────────
with gr.Blocks(css="""
body { background-color: #f7f9fc; font-family: Arial, sans-serif; }
.gradio-container { max-width: 900px; margin: auto; }
h1, h2, h3 { color: #333; }
""") as app:
    gr.Markdown("# 🔬 Nexa Evals Dashboard")
    gr.Markdown("A **comprehensive** SciML benchmark framework. \nSelect a domain to view metrics, compare with SOTA, and explore detailed plots and tables.")
    with gr.Row():
        with gr.Column(scale=1):
            category = gr.Radio(
                choices=list(benchmark_data.keys()),
                value="Protein Folding",
                label="Select Domain / Model Group"
            )
        with gr.Column(scale=3):
            description = gr.Markdown("")
            # BUG FIX: the original passed headers=["Metric", "Value"], which
            # contradicts the transposed frame show_eval returns (models as
            # rows, one column per metric); let the DataFrame supply headers.
            table = gr.Dataframe(interactive=False)
            plot = gr.Plot()

    category.change(
        fn=show_eval,
        inputs=category,
        outputs=[description, table, plot]
    )

    # BUG FIX: the original seeded the UI with
    #   description.value, table.value, _ = show_eval("Protein Folding")
    # which mutates components after construction and discards the figure, so
    # the plot started empty. Run the same callback once on page load instead.
    app.load(fn=show_eval, inputs=category, outputs=[description, table, plot])

# Launch (on Hugging Face the config flags will be auto-managed)
app.launch()