Allanatrix committed
Commit 31a042b · verified · 1 Parent(s): f931469

Create app.py

Files changed (1)
app.py (+378, -0)
app.py ADDED
@@ -0,0 +1,378 @@
+ import gradio as gr
+ import plotly.graph_objs as go
+ import plotly.express as px
+ import pandas as pd
+ import json
+
+ # Domain-specific model evaluations
+ MODEL_EVALS = {
+     "Proteins": {
+         "Nexa Bio1 (Secondary)": 0.71,
+         "Porter6 (Secondary)": 0.8456,
+         "DeepCNF (Secondary)": 0.85,
+         "AlphaFold2 (Tertiary GDT-TS)": 0.924,
+         "Nexa Bio2 (Tertiary)": 0.90,
+     },
+     "Astro": {
+         "Nexa Astro": 0.97,
+         "Baseline CNN": 0.89,
+     },
+     "Materials": {
+         "Nexa Materials": 0.9999,
+         "Random Forest Baseline": 0.92,
+     },
+     "QST": {
+         "Nexa PIN Model": 0.80,
+         "Quantum TomoNet": 0.85,
+     },
+     "HEP": {
+         "Nexa HEP Model": 0.91,
+         "CMSNet": 0.94,
+     },
+     "CFD": {
+         "Nexa CFD Model": 0.92,
+         "FlowNet": 0.89,
+     },
+ }
+
+ # SCIEVAL/OSIR metrics data
+ SCIEVAL_METRICS = {
+     "Nexa Mistral Sci-7B": {
+         "OSIR (General)": {
+             "Entropy / Novelty": 6.2,
+             "Internal Consistency": 8.5,
+             "Hypothesis Framing": 6.8,
+             "Thematic Grounding": 7.9,
+             "Citation & Structure": 7.3,
+             "Symbolism & Math Logic": 6.1,
+             "Scientific Utility": 7.6
+         },
+         "OSIR-Field (Physics)": {
+             "Entropy / Novelty": 7.1,
+             "Internal Consistency": 8.9,
+             "Hypothesis Framing": 7.4,
+             "Thematic Grounding": 8.2,
+             "Citation & Structure": 6.5,
+             "Symbolism & Math Logic": 7.8,
+             "Scientific Utility": 8.3
+         }
+     },
+     # (Data below here is demo data ⬇️)
+     "GPT-4 Scientific": {
+         "OSIR (General)": {
+             "Entropy / Novelty": 7.8,
+             "Internal Consistency": 8.2,
+             "Hypothesis Framing": 8.1,
+             "Thematic Grounding": 8.4,
+             "Citation & Structure": 8.9,
+             "Symbolism & Math Logic": 7.4,
+             "Scientific Utility": 8.1
+         },
+         "OSIR-Field (Physics)": {
+             "Entropy / Novelty": 7.2,
+             "Internal Consistency": 8.6,
+             "Hypothesis Framing": 8.3,
+             "Thematic Grounding": 8.7,
+             "Citation & Structure": 9.1,
+             "Symbolism & Math Logic": 8.2,
+             "Scientific Utility": 8.4
+         }
+     },
+     "Claude Scientific": {
+         "OSIR (General)": {
+             "Entropy / Novelty": 7.5,
+             "Internal Consistency": 9.1,
+             "Hypothesis Framing": 7.9,
+             "Thematic Grounding": 8.8,
+             "Citation & Structure": 8.7,
+             "Symbolism & Math Logic": 7.8,
+             "Scientific Utility": 8.3
+         },
+         "OSIR-Field (Physics)": {
+             "Entropy / Novelty": 7.4,
+             "Internal Consistency": 9.2,
+             "Hypothesis Framing": 8.1,
+             "Thematic Grounding": 8.9,
+             "Citation & Structure": 8.5,
+             "Symbolism & Math Logic": 8.4,
+             "Scientific Utility": 8.6
+         }
+     }
+ }
+
+ def plot_domain_benchmark(domain):
+     """Create bar chart for domain-specific benchmarks"""
+     models = list(MODEL_EVALS[domain].keys())
+     scores = list(MODEL_EVALS[domain].values())
+
+     # Color coding for Nexa models vs others
+     colors = ['#FF6B35' if 'Nexa' in model else '#4A90E2' for model in models]
+
+     fig = go.Figure()
+     fig.add_trace(go.Bar(
+         x=models,
+         y=scores,
+         marker_color=colors,
+         text=[f'{score:.3f}' for score in scores],
+         textposition='auto'
+     ))
+
+     fig.update_layout(
+         title=f"Model Benchmark Scores — {domain}",
+         xaxis_title="Model",
+         yaxis_title="Score",
+         yaxis_range=[0, 1.0],
+         template="plotly_white",
+         height=500,
+         showlegend=False
+     )
+     return fig
+
+ def plot_scieval_comparison(model_name):
+     """Create comparison chart for SCIEVAL metrics"""
+     if model_name not in SCIEVAL_METRICS:
+         return go.Figure()
+
+     metrics = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].keys())
+     osir_scores = list(SCIEVAL_METRICS[model_name]["OSIR (General)"].values())
+     field_scores = list(SCIEVAL_METRICS[model_name]["OSIR-Field (Physics)"].values())
+
+     fig = go.Figure()
+
+     fig.add_trace(go.Bar(
+         name='OSIR (General)',
+         x=metrics,
+         y=osir_scores,
+         marker_color='#FFD700',
+         text=[f'{score:.1f}' for score in osir_scores],
+         textposition='auto'
+     ))
+
+     fig.add_trace(go.Bar(
+         name='OSIR-Field (Physics)',
+         x=metrics,
+         y=field_scores,
+         marker_color='#FF6B35',
+         text=[f'{score:.1f}' for score in field_scores],
+         textposition='auto'
+     ))
+
+     fig.update_layout(
+         title=f"SCIEVAL Metrics Comparison — {model_name}",
+         xaxis_title="Metric",
+         yaxis_title="Score (1-10)",
+         yaxis_range=[0, 10],
+         template="plotly_white",
+         height=500,
+         barmode='group',
+         xaxis_tickangle=-45
+     )
+     return fig
+
+ def create_leaderboard():
+     """Create leaderboard table"""
+     leaderboard_data = []
+
+     # Add domain benchmark leaders
+     for domain, models in MODEL_EVALS.items():
+         best_model = max(models.items(), key=lambda x: x[1])
+         leaderboard_data.append({
+             "Domain": domain,
+             "Best Model": best_model[0],
+             "Score": f"{best_model[1]:.3f}",
+             "Metric Type": "Domain Benchmark"
+         })
+
+     # Add SCIEVAL averages (one General and one Physics row per evaluated model)
+     for model, evaluations in SCIEVAL_METRICS.items():
+         avg_osir = sum(evaluations["OSIR (General)"].values()) / len(evaluations["OSIR (General)"])
+         avg_field = sum(evaluations["OSIR-Field (Physics)"].values()) / len(evaluations["OSIR-Field (Physics)"])
+
+         leaderboard_data.append({
+             "Domain": "OSIR General",
+             "Best Model": model,
+             "Score": f"{avg_osir:.2f}",
+             "Metric Type": "SCIEVAL"
+         })
+
+         leaderboard_data.append({
+             "Domain": "OSIR Physics",
+             "Best Model": model,
+             "Score": f"{avg_field:.2f}",
+             "Metric Type": "SCIEVAL"
+         })
+
+     df = pd.DataFrame(leaderboard_data)
+     return df
+
+ def get_model_details(domain):
+     """Get JSON details for domain models"""
+     return json.dumps(MODEL_EVALS[domain], indent=2)
+
+ def display_domain_eval(domain):
+     """Display domain evaluation results"""
+     plot = plot_domain_benchmark(domain)
+     details = get_model_details(domain)
+     return plot, details
+
+ def display_scieval(model_name):
+     """Display SCIEVAL results"""
+     plot = plot_scieval_comparison(model_name)
+     if model_name in SCIEVAL_METRICS:
+         details = json.dumps(SCIEVAL_METRICS[model_name], indent=2)
+     else:
+         details = "Model not found in SCIEVAL database"
+     return plot, details
+
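+ # The display_* helpers above each return a Plotly figure plus a JSON (or message)
+ # string; they are wired to the buttons of the Gradio UI defined below.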
+ # Create Gradio interface
+ with gr.Blocks(title="Scientific ML Benchmark Suite", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+     # 🔬 Scientific ML Benchmark Suite
+     ### Comprehensive evaluation framework for scientific machine learning models
+
+     This suite combines domain-specific benchmarks with SCIEVAL (Scientific Evaluation) metrics to provide
+     a comprehensive assessment of ML models across scientific disciplines.
+     """)
+
+     with gr.Tabs():
+         # Domain Benchmarks Tab
+         with gr.TabItem("🧪 Domain Benchmarks"):
+             gr.Markdown("""
+             ### Domain-Specific Model Evaluations
+             Compare models across scientific domains including Proteins, Astronomy, Materials Science,
+             Quantum State Tomography (QST), High Energy Physics (HEP), and Computational Fluid Dynamics (CFD).
+             """)
+
+             with gr.Row():
+                 domain_dropdown = gr.Dropdown(
+                     choices=list(MODEL_EVALS.keys()),
+                     label="Select Scientific Domain",
+                     value="Proteins"
+                 )
+                 domain_btn = gr.Button("Run Domain Evaluation", variant="primary")
+
+             with gr.Row():
+                 domain_plot = gr.Plot(label="Domain Benchmark Results")
+                 domain_metrics = gr.Code(label="Raw Scores (JSON)", language="json")
+
+             domain_btn.click(
+                 display_domain_eval,
+                 inputs=domain_dropdown,
+                 outputs=[domain_plot, domain_metrics]
+             )
+
+         # SCIEVAL Tab
+         with gr.TabItem("📊 SCIEVAL Metrics"):
+             gr.Markdown("""
+             ### SCIEVAL: Scientific Reasoning Evaluation
+             Assess models on scientific reasoning capabilities using the OSIR (Open Scientific Intelligence & Reasoning) framework.
+
+             **Metrics evaluated:**
+             - **Entropy/Novelty**: Originality and information richness
+             - **Internal Consistency**: Logical structure and argument continuity
+             - **Hypothesis Framing**: Research aim clarity
+             - **Thematic Grounding**: Domain focus and relevance
+             - **Citation & Structure**: Scientific formatting
+             - **Symbolism & Math Logic**: Mathematical rigor
+             - **Scientific Utility**: Real-world research value
+             """)
+
+             with gr.Row():
+                 scieval_dropdown = gr.Dropdown(
+                     choices=list(SCIEVAL_METRICS.keys()),
+                     label="Select Model for SCIEVAL",
+                     value="Nexa Mistral Sci-7B"
+                 )
+                 scieval_btn = gr.Button("Run SCIEVAL Analysis", variant="primary")
+
+             with gr.Row():
+                 scieval_plot = gr.Plot(label="SCIEVAL Metrics Comparison")
+                 scieval_metrics = gr.Code(label="Detailed Scores (JSON)", language="json")
+
+             scieval_btn.click(
+                 display_scieval,
+                 inputs=scieval_dropdown,
+                 outputs=[scieval_plot, scieval_metrics]
+             )
+
+         # Leaderboard Tab
+         with gr.TabItem("🏆 Leaderboard"):
+             gr.Markdown("""
+             ### Scientific ML Model Leaderboard
+             Current best-performing models across all evaluated domains and metrics.
+             """)
+
+             leaderboard_df = create_leaderboard()
+             leaderboard_table = gr.Dataframe(
+                 value=leaderboard_df,
+                 label="Current Leaders by Domain",
+                 interactive=False
+             )
+
+         # About Tab
+         with gr.TabItem("ℹ️ About"):
+             gr.Markdown("""
+             ### About the Scientific ML Benchmark Suite
+
+             This comprehensive evaluation framework combines two powerful assessment methodologies:
+
+             #### 🎯 Domain Benchmarks
+             - **Proteins**: Secondary/tertiary structure prediction accuracy
+             - **Astronomy**: Object classification and detection
+             - **Materials**: Property prediction and discovery
+             - **QST**: Quantum state tomography reconstruction
+             - **HEP**: High energy physics event classification
+             - **CFD**: Computational fluid dynamics modeling
+
+             #### 🔬 SCIEVAL Framework
+             SCIEVAL is part of the OSIR (Open Scientific Intelligence & Reasoning) initiative, providing:
+
+             - **Standardized Evaluation**: Reproducible metrics for scientific LLMs
+             - **Domain Adaptation**: Field-specific evaluation extensions
+             - **Research Utility**: Assessment of real-world scientific value
+
+             **OSIR-Field Extensions:**
+             - `osir-field-physics`: Physics-specific reasoning evaluation
+             - `osir-field-bio`: Biological sciences assessment
+             - `osir-field-chem`: Chemistry domain evaluation
+             - `osir-field-cs`: Computer science applications
+
+             #### 📈 Scoring System
+             - **Domain Benchmarks**: 0.0 - 1.0 scale (higher is better)
+             - **SCIEVAL Metrics**: 1 - 10 scale across seven dimensions
+
+             #### 🤝 Contributing
+             This is an open framework welcoming contributions:
+             - New domain-specific test sets
+             - Additional evaluation metrics
+             - Model submissions for benchmarking
+
+             #### 📄 Citation
+             ```
+             @misc{scieval2024,
+                 title={SCIEVAL: A Benchmark for Evaluating Scientific Reasoning in Language Models},
+                 author={NEXA Research},
+                 year={2025},
+                 url={https://huggingface.co/spaces/osir/scieval}
+             }
+             ```
+
+             ---
+
+             **License**: Apache 2.0 | **Framework**: OSIR Initiative | **Platform**: Gradio + Plotly
+             """)
+
+     # Initialize with default values
+     demo.load(
+         lambda: (plot_domain_benchmark("Proteins"), get_model_details("Proteins")),
+         outputs=[domain_plot, domain_metrics]
+     )
+
+     demo.load(
+         lambda: (plot_scieval_comparison("Nexa Mistral Sci-7B"),
+                  json.dumps(SCIEVAL_METRICS["Nexa Mistral Sci-7B"], indent=2)),
+         outputs=[scieval_plot, scieval_metrics]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
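
For reference, a minimal sketch of exercising the helpers in this file outside the Gradio UI, assuming the file above is saved locally as `app.py` and `gradio`, `plotly`, and `pandas` are installed (importing the module builds the Blocks interface but does not launch it):

```python
# Hypothetical local usage; the function names come from the app.py shown above.
from app import create_leaderboard, plot_domain_benchmark

print(create_leaderboard().to_string(index=False))  # leaderboard as a pandas DataFrame
fig = plot_domain_benchmark("Proteins")             # Plotly bar chart for one domain
fig.write_html("proteins_benchmark.html")           # inspect the chart without launching the app
```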