File size: 3,233 Bytes
0bbd367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import gradio as gr
import plotly.graph_objects as go
import os

# ============ Leaderboard Data ============
# Maps evaluation domain -> {model name: overall score}. Scores are floats
# in [0, 1]; plot_domain() labels the axis "Scientific Utility Score" and
# clamps its range to [0, 1] accordingly.
MODEL_EVALS = {
    # General-purpose models evaluated on the broad OSIR benchmark.
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 0.61,
        "Llama-3-8B-Instruct": 0.39,
        "Mixtral-8x7B-Instruct-v0.1": 0.41,
        "Claude-3-Sonnet": 0.64,
        "GPT-4-Turbo": 0.68,
        "GPT-4o": 0.71,
    },
    # Domain-adapted variants (bio/astro adapters and tunes).
    "LLM (Field-Specific OSIR)": {
        "Nexa Bio Adapter": 0.66,
        "Nexa Astro Adapter": 0.70,
        "GPT-4o (Biomed)": 0.69,
        "Claude-3-Opus (Bio)": 0.67,
        "Llama-3-8B-Bio": 0.42,
        "Mixtral-8x7B-BioTune": 0.43,
    },
}

# ============ Plotting Function ============
def plot_domain(domain):
    """Build a horizontal bar chart of model scores for one evaluation domain.

    Args:
        domain: A key of ``MODEL_EVALS`` (e.g. ``"LLM (General OSIR)"``).

    Returns:
        A ``plotly.graph_objects.Figure`` with one bar per model, sorted by
        descending score.

    Raises:
        KeyError: If *domain* is not a key of ``MODEL_EVALS``.
    """
    # Sort models best-first so the ranking is obvious at a glance.
    sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color='lightblue',
    ))

    fig.update_layout(
        # Fixed mojibake: "โ€”" was a UTF-8 em dash decoded with the wrong codec.
        title=f"Model vs. Overall Score — {domain}",
        xaxis_title="Scientific Utility Score",
        yaxis_title="Model",
        xaxis_range=[0, 1.0],  # scores are normalized to [0, 1]
        template="plotly_white",
        height=500,
        # Wide left margin so long model names are not clipped.
        margin=dict(l=120, r=20, t=40, b=40),
    )
    return fig

# ============ Upload Handling (for later use) ============
def handle_upload(file):
    """Return a status message naming the uploaded file, or a placeholder.

    Args:
        file: An uploaded-file object exposing a ``.name`` attribute, or
            ``None`` when nothing was uploaded.

    Returns:
        A human-readable status string for the upload textbox.
    """
    # Guard clause: nothing uploaded yet.
    if file is None:
        return "No file uploaded."
    return f"Uploaded: {file.name}"

# ============ Gradio UI ============
# Top-level UI: a two-column layout (leaderboard chart | file upload) plus
# an "About" footer, then the app is launched.
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
    # Header. Fixed mojibake: the emoji below were UTF-8 bytes decoded with
    # the wrong codec ("๐Ÿง " -> 🧠, etc.).
    gr.Markdown("""
    # 🧠 SciEval | OSIR Leaderboard
    Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance across the **SciEval** benchmark.
    """)

    with gr.Row():
        with gr.Column():
            # Leaderboard: dropdown selects the domain, the plot re-renders
            # on change.
            domain_choice = gr.Dropdown(choices=list(MODEL_EVALS.keys()), label="Select Evaluation Domain", value="LLM (General OSIR)")
            leaderboard_plot = gr.Plot()
            domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)

        with gr.Column():
            gr.Markdown("""
            ### 📄 Upload Model Output
            Upload a generated scientific paper or abstract (PDF or TXT).
            """)
            upload = gr.File(file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Submit File")
            result = gr.Textbox(label="Upload Status")
            upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)

    gr.Markdown("""
    ---
    ### ℹ️ About
    **SciEval** is a model-agnostic benchmark to evaluate the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. We score models based on:

    - Information entropy & novelty
    - Internal consistency
    - Hypothesis framing
    - Domain grounding & math logic
    - Scientific utility (overall use to researchers)

    This leaderboard includes Nexa's adapters and comparisons to general-purpose LLMs like GPT-4o, Claude 3, and open-source Mistral / LLaMA.
    """)

    # Populate the plot for the default dropdown value when the page loads;
    # previously the chart stayed blank until the user changed the dropdown.
    # (Also removed a stray `leaderboard_plot.render()`: re-rendering a
    # component already placed in the Blocks context raises
    # DuplicateBlockError in Gradio.)
    demo.load(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)

demo.launch()