import gradio as gr
import plotly.graph_objects as go
# ============ Leaderboard Data ============
MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 0.61,
        "Llama-3-8B-Instruct": 0.39,
        "Mixtral-8x7B-Instruct-v0.1": 0.41,
        "Claude-3-Sonnet": 0.64,
        "GPT-4-Turbo": 0.68,
        "GPT-4o": 0.71,
    },
    "LLM (Field-Specific OSIR)": {
        "Nexa Bio Adapter": 0.66,
        "Nexa Astro Adapter": 0.70,
        "GPT-4o (Biomed)": 0.69,
        "Claude-3-Opus (Bio)": 0.67,
        "Llama-3-8B-Bio": 0.42,
        "Mixtral-8x7B-BioTune": 0.43,
    },
}
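
# Example lookup (value taken from the table above):
#     MODEL_EVALS["LLM (General OSIR)"]["GPT-4o"]  # -> 0.71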
# ============ Plotting Function ============
def plot_domain(domain):
    """Build a horizontal bar chart of model scores for the chosen domain."""
    # Sort models by score, highest first, so the leaderboard reads top-down.
    sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color='lightblue',
    ))
    fig.update_layout(
        title=f"Model vs. Overall Score – {domain}",
        xaxis_title="Scientific Utility Score",
        yaxis_title="Model",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        margin=dict(l=120, r=20, t=40, b=40),
    )
    return fig
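
# Example (standalone use outside the Gradio UI):
#     fig = plot_domain("LLM (General OSIR)")
#     fig.write_html("leaderboard.html")  # or fig.show() in a notebook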
# ============ Upload Handling (for later use) ============
def handle_upload(file):
    """Echo the uploaded file's name; scoring is not wired in yet."""
    if file is not None:
        return f"Uploaded: {file.name}"
    return "No file uploaded."
# ============ Gradio UI ============
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
    gr.Markdown("""
    # 🧠 SciEval | OSIR Leaderboard
    Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance on the **SciEval** benchmark.
    """)
    with gr.Row():
        with gr.Column():
            domain_choice = gr.Dropdown(
                choices=list(MODEL_EVALS.keys()),
                label="Select Evaluation Domain",
                value="LLM (General OSIR)",
            )
            # Seed the plot with the default domain so the chart is not blank on load.
            leaderboard_plot = gr.Plot(value=plot_domain("LLM (General OSIR)"))
            domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)
        with gr.Column():
            gr.Markdown("""
            ### 📄 Upload Model Output
            Upload a generated scientific paper or abstract (PDF or TXT).
            """)
            upload = gr.File(file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Submit File")
            result = gr.Textbox(label="Upload Status")
            upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)
    gr.Markdown("""
    ---
    ### ℹ️ About
    **SciEval** is a model-agnostic benchmark for evaluating the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. Models are scored on:
    - Information entropy & novelty
    - Internal consistency
    - Hypothesis framing
    - Domain grounding & mathematical logic
    - Scientific utility (overall usefulness to researchers)
    This leaderboard includes Nexa's adapters alongside general-purpose LLMs such as GPT-4o, Claude 3, and open-source Mistral / LLaMA models.
    """)

demo.launch()
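
# Usage note: Hugging Face Spaces executes this file as app.py and
# demo.launch() serves the interface; locally, run `python app.py` and
# open the printed URL.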