import gradio as gr
import plotly.graph_objects as go
# ============ Leaderboard Data ============
MODEL_EVALS = {
    "LLM (General OSIR)": {
        "Nexa Mistral Sci-7B": 0.61,
        "Llama-3-8B-Instruct": 0.39,
        "Mixtral-8x7B-Instruct-v0.1": 0.41,
        "Claude-3-Sonnet": 0.64,
        "GPT-4-Turbo": 0.68,
        "GPT-4o": 0.71,
    },
    "LLM (Field-Specific OSIR)": {
        "Nexa Bio Adapter": 0.66,
        "Nexa Astro Adapter": 0.70,
        "GPT-4o (Biomed)": 0.69,
        "Claude-3-Opus (Bio)": 0.67,
        "Llama-3-8B-Bio": 0.42,
        "Mixtral-8x7B-BioTune": 0.43,
    },
}
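
# Example lookup (value taken from the table above):
#     MODEL_EVALS["LLM (General OSIR)"]["GPT-4o"]  # -> 0.71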
# ============ Plotting Function ============
def plot_domain(domain):
    """Build a horizontal bar chart of model scores for the chosen domain."""
    # Sort models by score, highest first, so the leaderboard reads top-down.
    sorted_items = sorted(MODEL_EVALS[domain].items(), key=lambda x: x[1], reverse=True)
    models, scores = zip(*sorted_items)
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=scores,
        y=models,
        orientation='h',
        marker_color='lightblue',
    ))
    fig.update_layout(
        title=f"Model vs. Overall Score – {domain}",
        xaxis_title="Scientific Utility Score",
        yaxis_title="Model",
        xaxis_range=[0, 1.0],
        template="plotly_white",
        height=500,
        margin=dict(l=120, r=20, t=40, b=40),
    )
    return fig
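
# Example (standalone use outside the Gradio UI):
#     fig = plot_domain("LLM (General OSIR)")
#     fig.write_html("leaderboard.html")  # or fig.show() in a notebook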
# ============ Upload Handling (for later use) ============
def handle_upload(file):
    """Echo the uploaded file's name; scoring is not wired in yet."""
    if file is not None:
        return f"Uploaded: {file.name}"
    return "No file uploaded."
# ============ Gradio UI ============
with gr.Blocks(css="body {font-family: 'Inter', sans-serif; background-color: #fafafa;}") as demo:
    gr.Markdown("""
    # 🧠 SciEval | OSIR Leaderboard
    Welcome to the **OSIR** benchmarking suite for evaluating scientific language models. This leaderboard tracks general-purpose and field-specific reasoning performance on the **SciEval** benchmark.
    """)
    with gr.Row():
        with gr.Column():
            domain_choice = gr.Dropdown(
                choices=list(MODEL_EVALS.keys()),
                label="Select Evaluation Domain",
                value="LLM (General OSIR)",
            )
            # Seed the plot with the default domain so the chart is not blank on load.
            leaderboard_plot = gr.Plot(value=plot_domain("LLM (General OSIR)"))
            domain_choice.change(fn=plot_domain, inputs=domain_choice, outputs=leaderboard_plot)
        with gr.Column():
            gr.Markdown("""
            ### 📄 Upload Model Output
            Upload a generated scientific paper or abstract (PDF or TXT).
            """)
            upload = gr.File(file_types=[".pdf", ".txt"])
            upload_btn = gr.Button("Submit File")
            result = gr.Textbox(label="Upload Status")
            upload_btn.click(fn=handle_upload, inputs=upload, outputs=result)
    gr.Markdown("""
    ---
    ### ℹ️ About
    **SciEval** is a model-agnostic benchmark for evaluating the scientific utility of LLMs under the **OSIR** (Open Scientific Intelligence & Reasoning) initiative. Models are scored on:
    - Information entropy & novelty
    - Internal consistency
    - Hypothesis framing
    - Domain grounding & mathematical logic
    - Scientific utility (overall usefulness to researchers)
    This leaderboard includes Nexa's adapters alongside general-purpose LLMs such as GPT-4o, Claude 3, and open-source Mistral / LLaMA models.
    """)

demo.launch()
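
# Usage note: Hugging Face Spaces executes this file as app.py and
# demo.launch() serves the interface; locally, run `python app.py` and
# open the printed URL.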