Spaces:

Allanatrix
/

NexaEvals

Sleeping

App Files Files Community

Allanatrix committed on Jun 26

Commit

31a042b

verified ·

1 Parent(s): f931469

Create app.py

Browse files

Files changed (1) hide show

app.py +378 -0

app.py ADDED Viewed

	@@ -0,0 +1,378 @@

+import gradio as gr
+import plotly.graph_objs as go
+import plotly.express as px
+import pandas as pd
+import json
# Benchmark scores per scientific domain: domain name -> {model name: score}.
# The domain bar chart plots these on a fixed 0-1 y-axis.
MODEL_EVALS = {
    "Proteins": {
        "Nexa Bio1 (Secondary)": 0.71,
        "Porter6 (Secondary)": 0.8456,
        "DeepCNF (Secondary)": 0.85,
        "AlphaFold2 (Tertiary GDT-TS)": 0.924,
        "Nexa Bio2 (Tertiary)": 0.90,
    },
    "Astro": {"Nexa Astro": 0.97, "Baseline CNN": 0.89},
    "Materials": {"Nexa Materials": 0.9999, "Random Forest Baseline": 0.92},
    "QST": {"Nexa PIN Model": 0.80, "Quantum TomoNet": 0.85},
    "HEP": {"Nexa HEP Model": 0.91, "CMSNet": 0.94},
    "CFD": {"Nexa CFD Model": 0.92, "FlowNet": 0.89},
}
# SCIEVAL/OSIR scores: model name -> evaluation track -> {metric name: score}.
# Every track uses the same seven metrics, so the names are declared once and
# paired positionally with each track's scores.
_OSIR_METRIC_NAMES = (
    "Entropy / Novelty",
    "Internal Consistency",
    "Hypothesis Framing",
    "Thematic Grounding",
    "Citation & Structure",
    "Symbolism & Math Logic",
    "Scientific Utility",
)


def _osir_scores(*scores):
    """Pair positional scores with the OSIR metric names, in declaration order."""
    return dict(zip(_OSIR_METRIC_NAMES, scores))


SCIEVAL_METRICS = {
    "Nexa Mistral Sci-7B": {
        "OSIR (General)": _osir_scores(6.2, 8.5, 6.8, 7.9, 7.3, 6.1, 7.6),
        "OSIR-Field (Physics)": _osir_scores(7.1, 8.9, 7.4, 8.2, 6.5, 7.8, 8.3),
    },
    # The entries below this line are demo data.
    "GPT-4 Scientific": {
        "OSIR (General)": _osir_scores(7.8, 8.2, 8.1, 8.4, 8.9, 7.4, 8.1),
        "OSIR-Field (Physics)": _osir_scores(7.2, 8.6, 8.3, 8.7, 9.1, 8.2, 8.4),
    },
    "Claude Scientific": {
        "OSIR (General)": _osir_scores(7.5, 9.1, 7.9, 8.8, 8.7, 7.8, 8.3),
        "OSIR-Field (Physics)": _osir_scores(7.4, 9.2, 8.1, 8.9, 8.5, 8.4, 8.6),
    },
}
def plot_domain_benchmark(domain):
    """Create a bar chart of the benchmark scores stored for one domain.

    Args:
        domain: Key of ``MODEL_EVALS`` naming the scientific domain to plot.

    Returns:
        A Plotly figure with one bar per model in that domain. When
        ``domain`` is not a key of ``MODEL_EVALS`` an empty figure is
        returned instead of raising ``KeyError``, matching how
        ``plot_scieval_comparison`` treats an unknown model.
    """
    if domain not in MODEL_EVALS:
        return go.Figure()

    domain_scores = MODEL_EVALS[domain]
    models = list(domain_scores)
    scores = list(domain_scores.values())

    # Orange marks models with "Nexa" in their name; every other model is blue.
    colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=models,
        y=scores,
        marker_color=colors,
        # Four decimals: the stored scores carry up to four (0.8456, 0.9999),
        # and three decimals would print 0.9999 as a perfect "1.000".
        text=[f'{score:.4f}' for score in scores],
        textposition='auto'
    ))
    fig.update_layout(
        title=f"Model Benchmark Scores — {domain}",
        xaxis_title="Model",
        yaxis_title="Score",
        yaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        showlegend=False
    )
    return fig
def plot_scieval_comparison(model_name):
    """Build a grouped bar chart of one model's two SCIEVAL tracks.

    Args:
        model_name: Key of ``SCIEVAL_METRICS``.

    Returns:
        A Plotly figure with one bar group per metric and one bar per track
        ("OSIR (General)" and "OSIR-Field (Physics)"). An empty figure is
        returned when ``model_name`` is not in ``SCIEVAL_METRICS``.
    """
    fig = go.Figure()
    if model_name not in SCIEVAL_METRICS:
        return fig

    evaluations = SCIEVAL_METRICS[model_name]
    # The x-axis labels come from the general track; the physics track's
    # scores are plotted against them positionally.
    metric_names = list(evaluations["OSIR (General)"].keys())

    track_colors = (
        ('OSIR (General)', '#FFD700'),
        ('OSIR-Field (Physics)', '#FF6B35'),
    )
    for track, color in track_colors:
        track_scores = list(evaluations[track].values())
        fig.add_trace(go.Bar(
            name=track,
            x=metric_names,
            y=track_scores,
            marker_color=color,
            text=[f'{value:.1f}' for value in track_scores],
            textposition='auto'
        ))

    layout = dict(
        title=f"SCIEVAL Metrics Comparison — {model_name}",
        xaxis_title="Metric",
        yaxis_title="Score (1-10)",
        yaxis_range=[0, 10],
        template="plotly_white",
        height=500,
        barmode='group',
        xaxis_tickangle=-45,
    )
    fig.update_layout(**layout)
    return fig
def create_leaderboard():
    """Build the leaderboard table of top-scoring models.

    One row is produced per domain in ``MODEL_EVALS`` (the model with the
    highest score) and one row per SCIEVAL track (the model in
    ``SCIEVAL_METRICS`` with the highest mean score across that track's
    metrics). Domains or tracks with no models are skipped.

    Returns:
        A ``pandas.DataFrame`` with the string columns "Domain",
        "Best Model", "Score" and "Metric Type".
    """
    rows = []

    # Domain benchmark leaders.
    for domain, models in MODEL_EVALS.items():
        if not models:
            continue
        best_name, best_score = max(models.items(), key=lambda item: item[1])
        rows.append({
            "Domain": domain,
            "Best Model": best_name,
            # Four decimals so a stored 0.9999 is not shown as "1.000".
            "Score": f"{best_score:.4f}",
            "Metric Type": "Domain Benchmark"
        })

    # SCIEVAL leaders: only the best model per track belongs in a
    # "Best Model" column, so rank the per-model averages and keep the top one.
    scieval_tracks = (
        ("OSIR (General)", "OSIR General"),
        ("OSIR-Field (Physics)", "OSIR Physics"),
    )
    for track, label in scieval_tracks:
        averages = {
            model: sum(evaluations[track].values()) / len(evaluations[track])
            for model, evaluations in SCIEVAL_METRICS.items()
        }
        if not averages:
            continue
        leader = max(averages, key=averages.get)
        rows.append({
            "Domain": label,
            "Best Model": leader,
            "Score": f"{averages[leader]:.2f}",
            "Metric Type": "SCIEVAL"
        })

    return pd.DataFrame(rows)
def get_model_details(domain):
    """Return the scores stored for ``domain`` in ``MODEL_EVALS`` as JSON text.

    The output is indented by two spaces. A ``KeyError`` is raised when
    ``domain`` is not a key of ``MODEL_EVALS``.
    """
    domain_scores = MODEL_EVALS[domain]
    return json.dumps(domain_scores, indent=2)
def display_domain_eval(domain):
    """Return the chart and raw JSON scores for one benchmark domain.

    Args:
        domain: Key of ``MODEL_EVALS``.

    Returns:
        A ``(figure, text)`` tuple. For a known domain, ``figure`` is the
        domain bar chart and ``text`` is the JSON dump of its scores. For an
        unknown domain an empty figure and a short message are returned
        rather than raising ``KeyError``, mirroring ``display_scieval``.
    """
    if domain not in MODEL_EVALS:
        return go.Figure(), "Domain not found in benchmark database"
    return plot_domain_benchmark(domain), get_model_details(domain)
def display_scieval(model_name):
    """Return the SCIEVAL chart and detail text for one model.

    Returns:
        A ``(figure, text)`` tuple. ``text`` is the JSON dump of the model's
        entry in ``SCIEVAL_METRICS``, or a "not found" message when the model
        has no entry.
    """
    figure = plot_scieval_comparison(model_name)
    if model_name not in SCIEVAL_METRICS:
        return figure, "Model not found in SCIEVAL database"
    return figure, json.dumps(SCIEVAL_METRICS[model_name], indent=2)
# Gradio interface: four tabs (domain benchmarks, SCIEVAL metrics, leaderboard,
# about). The two interactive tabs each pair a dropdown and a button with a
# plot pane and a JSON pane.
with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔬 Scientific ML Benchmark Suite
    ### Comprehensive evaluation framework for scientific machine learning models
    This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
    comprehensive assessment of ML models across scientific disciplines.
    """)
    with gr.Tabs():
        # Domain Benchmarks Tab
        with gr.TabItem("🧪 Domain Benchmarks"):
            gr.Markdown("""
            ### Domain-Specific Model Evaluations
            Compare models across scientific domains including Proteins, Astronomy, Materials Science,
            Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
            """)
            with gr.Row():
                domain_dropdown = gr.Dropdown(
                    choices=list(MODEL_EVALS.keys()),
                    label="Select Scientific Domain",
                    value="Proteins"
                )
                domain_btn = gr.Button("Run Domain Evaluation", variant="primary")
            with gr.Row():
                domain_plot = gr.Plot(label="Domain Benchmark Results")
                domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")
            domain_btn.click(
                display_domain_eval,
                inputs=domain_dropdown,
                outputs=[domain_plot, domain_metrics]
            )
        # SCIEVAL Tab
        with gr.TabItem("📊 SCIEVAL Metrics"):
            gr.Markdown("""
            ### SCIEVAL: Scientific Reasoning Evaluation
            Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.
            **Metrics evaluated:**
            - **Entropy/Novelty**: Originality and information richness
            - **Internal Consistency**: Logical structure and argument continuity
            - **Hypothesis Framing**: Research aim clarity
            - **Thematic Grounding**: Domain focus and relevance
            - **Citation & Structure**: Scientific formatting
            - **Symbolism & Math Logic**: Mathematical rigor
            - **Scientific Utility**: Real-world research value
            """)
            with gr.Row():
                scieval_dropdown = gr.Dropdown(
                    choices=list(SCIEVAL_METRICS.keys()),
                    label="Select Model for SCIEVAL",
                    value="Nexa Mistral Sci-7B"
                )
                scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")
            with gr.Row():
                scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
                scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")
            scieval_btn.click(
                display_scieval,
                inputs=scieval_dropdown,
                outputs=[scieval_plot, scieval_metrics]
            )
        # Leaderboard Tab
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("""
            ### Scientific ML Model Leaderboard
            Current best-performing models across all evaluated domains and metrics.
            """)
            # The table is computed once, while the interface is being built.
            leaderboard_df = create_leaderboard()
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                label="Current Leaders by Domain",
                interactive=False
            )
        # About Tab
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ### About the Scientific ML Benchmark Suite
            This comprehensive evaluation framework combines two powerful assessment methodologies:
            #### 🎯 Domain Benchmarks
            - **Proteins**: Secondary/tertiary structure prediction accuracy
            - **Astronomy**: Object classification and detection
            - **Materials**: Property prediction and discovery
            - **QST**: Quantum state tomography reconstruction
            - **HEP**: High energy physics event classification
            - **CFD**: Computational fluid dynamics modeling
            #### 🔬 SCIEVAL Framework
            SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
            - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
            - **Domain Adaptation**: Field-specific evaluation extensions
            - **Research Utility**: Assessment of real-world scientific value
            **OSIR-Field Extensions:**
            - `osir-field-physics`: Physics-specific reasoning evaluation
            - `osir-field-bio`: Biological sciences assessment
            - `osir-field-chem`: Chemistry domain evaluation
            - `osir-field-cs`: Computer science applications
            #### 📈 Scoring System
            - **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
            - **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions
            #### 🤝 Contributing
            This is an open framework welcoming contributions:
            - New domain-specific test sets
            - Additional evaluation metrics
            - Model submissions for benchmarking
            #### 📄 Citation
            ```
            @misc{scieval2024,
              title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
              author={NEXA Research},
              year={2025},
              url={https://huggingface.co/spaces/osir/scieval}
            }
            ```
            ---
            **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
            """)

    # Fill both result panes on page load by running the same handlers the
    # buttons use, fed from the dropdowns. This keeps the initial view tied to
    # each dropdown's `value=` instead of repeating the defaults here.
    demo.load(
        display_domain_eval,
        inputs=domain_dropdown,
        outputs=[domain_plot, domain_metrics]
    )
    demo.load(
        display_scieval,
        inputs=scieval_dropdown,
        outputs=[scieval_plot, scieval_metrics]
    )


if __name__ == "__main__":
    demo.launch()