NexaEvals / app.py
Allanatrix's picture
Create app.py
0bbd367 verified
raw
history blame
3.23 kB
import gradio as gr
import plotly.graph_objects as go
import os
# ============ Leaderboard Data ============
# Static leaderboard scores, keyed first by evaluation domain and then by
# model name. Each value is that model's overall score; the chart below plots
# these on a fixed 0.0-1.0 axis. The order of the outer keys is the order in
# which the domains are offered in the UI dropdown.
MODEL_EVALS = {
    # General-purpose models.
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 0.61,
        "Llama-3-8B-Instruct": 0.39,
        "Mixtral-8x7B-Instruct-v0.1": 0.41,
        "Claude-3-Sonnet": 0.64,
        "GPT-4-Turbo": 0.68,
        "GPT-4o": 0.71,
    },
    # Domain-tuned adapters and field-specific variants.
    "LLM (Field-Specific OSIR)": {
        "Nexa Bio Adapter": 0.66,
        "Nexa Astro Adapter": 0.70,
        "GPT-4o (Biomed)": 0.69,
        "Claude-3-Opus (Bio)": 0.67,
        "Llama-3-8B-Bio": 0.42,
        "Mixtral-8x7B-BioTune": 0.43,
    },
}
# ============ Plotting Function ============
def plot_domain(domain):
    """Build a horizontal bar chart ranking the models of one evaluation domain.

    Args:
        domain: Key into ``MODEL_EVALS`` naming the domain to plot. An unknown
            key (including ``None``) or a domain without entries produces an
            empty chart rather than raising, so a UI callback never fails on it.

    Returns:
        A ``plotly.graph_objects.Figure`` with one bar per model, ordered from
        the highest score (top) to the lowest (bottom).
    """
    entries = MODEL_EVALS.get(domain, {})
    ranked = sorted(entries.items(), key=lambda item: item[1], reverse=True)
    # zip(*[]) yields nothing to unpack, so the empty case is spelled out.
    models, scores = zip(*ranked) if ranked else ((), ())

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color='lightblue',
    ))
    fig.update_layout(
        title=f"Model vs. Overall Score — {domain}",
        xaxis_title="Scientific Utility Score",
        yaxis_title="Model",
        # Scores are fractions, so pin the axis to make domains comparable.
        xaxis_range=[0, 1.0],
        # Plotly places the first category at the bottom of a horizontal bar
        # chart; reversing the axis puts the best-scoring model at the top.
        yaxis_autorange="reversed",
        template="plotly_white",
        height=500,
        margin=dict(l=120, r=20, t=40, b=40),
    )
    return fig
# ============ Upload Handling (for later use) ============
def handle_upload(file):
    """Return a status message describing the uploaded file.

    Args:
        file: The value delivered by the upload component. This may be
            ``None`` (nothing uploaded), a filesystem path given as ``str``,
            or an object exposing the path through a ``name`` attribute
            (e.g. a temporary-file wrapper).

    Returns:
        ``"Uploaded: <name>"`` when a file is present, otherwise
        ``"No file uploaded."``.
    """
    if file is None:
        return "No file uploaded."
    # A plain path string has no ``.name``; use the string itself in that case.
    name = file if isinstance(file, str) else file.name
    return f"Uploaded: {name}"
# ============ Gradio UI ============
# Page-wide styling passed to gr.Blocks.
_PAGE_CSS = "body {font-family: 'Inter', sans-serif; background-color: #fafafa;}"

# Markdown snippets are kept at module level so their text stays flush-left
# regardless of how deeply the layout below is nested. Blank lines separate
# headings, paragraphs and lists so each renders as its own element.
_INTRO_MD = """
# 🧠 SciEval | OSIR Leaderboard

Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance across the **SciEval** benchmark.
"""

_UPLOAD_MD = """
### 📄 Upload Model Output

Upload a generated scientific paper or abstract (PDF or TXT).
"""

_ABOUT_MD = """
---

### ℹ️ About

**SciEval** is a model-agnostic benchmark to evaluate the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. We score models based on:

- Information entropy & novelty
- Internal consistency
- Hypothesis framing
- Domain grounding & math logic
- Scientific utility (overall use to researchers)

This leaderboard includes Nexa's adapters and comparisons to general-purpose LLMs like GPT-4o, Claude 3, and open-source Mistral / LLaMA.
"""

with gr.Blocks(css=_PAGE_CSS) as demo:
    gr.Markdown(_INTRO_MD)

    with gr.Row():
        # Left column: domain selector and the leaderboard chart.
        with gr.Column():
            domain_choice = gr.Dropdown(
                choices=list(MODEL_EVALS.keys()),
                label="Select Evaluation Domain",
                # Preselect the first domain ("LLM (General OSIR)").
                value=next(iter(MODEL_EVALS), None),
            )
            leaderboard_plot = gr.Plot()
            domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)

        # Right column: upload form that reports which file was received.
        with gr.Column():
            gr.Markdown(_UPLOAD_MD)
            upload = gr.File(file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Submit File")
            result = gr.Textbox(label="Upload Status")
            upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)

    gr.Markdown(_ABOUT_MD)

    # Draw the preselected domain's chart as soon as the page loads. The plot
    # component is already part of the layout above, so it must not be
    # rendered a second time; a load event fills it instead.
    demo.load(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)

demo.launch()