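"""Gradio app for a Hugging Face Space: runs head-to-head MMLU-Pro benchmark
evaluations with configurable subjects, few-shot examples, and question counts."""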
import gradio as gr
import os
from huggingface_hub import login
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
import spaces  # Hugging Face Spaces SDK (ZeroGPU support)
import pandas as pd
import time
import traceback
from dataset_previews import mmlupro_dataset_preview, format_preview_for_display

# Read the Hugging Face token from the environment and log in
hf_token = os.getenv("HF_READ_WRITE_TOKEN")
if hf_token:
    login(hf_token)
else:
    print("⚠️ No HF_READ_WRITE_TOKEN found in environment")

# ---------------------------------------------------------------------------
# 1. Model configuration
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"

# ---------------------------------------------------------------------------
# 2. MMLU-Pro Evaluation
# ---------------------------------------------------------------------------
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
    """
    Runs the MMLU-Pro evaluation with the specified parameters.

    Args:
        all_subjects (bool): Whether to evaluate all subjects
        num_subjects (int): Number of subjects to evaluate (1-14)
        num_shots (int): Number of few-shot examples (0-5)
        all_questions (bool): Whether to evaluate all questions per subject
        num_questions (int): Number of questions per subject (1-100, or all)
        progress (gr.Progress): Progress indicator

    Returns:
        A tuple of (report markdown, results DataFrame, and gr.update() objects
        that re-enable the configuration controls and toggle result visibility).
    """
    try:
        # A value of -1 tells the evaluator to use everything
        if all_subjects:
            num_subjects = -1
        if all_questions:
            num_questions = -1

        # Run the evaluation and time it
        start_time = time.time()
        results = evaluate_mmlu_pro(
            model_name,
            num_subjects=num_subjects,
            num_questions=num_questions,
            num_shots=num_shots,
        )
        elapsed_time = time.time() - start_time

        # Unpack the headline numbers
        overall_acc = results["overall_accuracy"]
        min_subject, min_acc = results["min_accuracy_subject"]
        max_subject, max_acc = results["max_accuracy_subject"]

        # Build the per-subject results table
        results_df = pd.DataFrame(results["full_accuracy_table"])

        # Totals for the overall summary row
        total_samples = results_df['Num_samples'].sum()
        total_correct = results_df['Num_correct'].sum()

        overall_row = pd.DataFrame({
            'Subject': ['**Overall**'],
            'Num_samples': [total_samples],
            'Num_correct': [total_correct],
            'Accuracy': [overall_acc]
        })

        # Put the overall row first, then the per-subject rows
        results_df = pd.concat([overall_row, results_df], ignore_index=True)

        # Format the summary report
        report = (
            f"### Overall Results\n"
            f"* Overall Accuracy: {overall_acc:.3f}\n"
            f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
            f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
            f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
        )

        # Re-enable UI components after completion
        return (report,                        # results_output
                results_df,                    # results_table
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # all_subjects_checkbox
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=True))       # results_table_container
    except Exception:
        # Report the error and re-enable UI components
        error_trace = traceback.format_exc()
        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
        return (error_message,                 # results_output
                None,                          # results_table
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # all_subjects_checkbox
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=False))      # results_table_container

def format_links_with_bullets(links_text):
    """Format links with bullet points for better readability."""
    lines = links_text.split('\n')
    return "• " + "\n• ".join(lines)


# Function to format the dataset preview for better display
def enhanced_format_preview_for_display(preview_data):
    """Format the preview data with improved readability."""
    # Build a newline-separated list of links, then add bullet points
    links_value = (
        f"Hugging Face: {preview_data['links']['huggingface']}\n"
        f"GitHub: {preview_data['links']['github']}\n"
        f"Paper: {preview_data['links']['paper']}"
    )
    links_formatted = format_links_with_bullets(links_value)

    # One row per dataset property, with descriptive column names
    rows = [
        {"Dataset Property": "Dataset Name", "Details": preview_data["dataset_name"]},
        {"Dataset Property": "Evaluation Type", "Details": preview_data["evaluation_type"]},
        {"Dataset Property": "Description", "Details": preview_data["description"]},
        {"Dataset Property": "Links", "Details": links_formatted},
        {"Dataset Property": "Organization", "Details": preview_data["organization"]},
        {"Dataset Property": "Number of Questions", "Details": preview_data["num_questions"]},
        {"Dataset Property": "Number of Input Tokens", "Details": preview_data["input_tokens"]},
        {"Dataset Property": "Estimated Evaluation Time", "Details": f"{preview_data['evaluation_time']['total_time_minutes']} minutes (for 2 models on A100)"}
    ]
    return pd.DataFrame(rows)

# ---------------------------------------------------------------------------
# 3. Gradio Interface
# ---------------------------------------------------------------------------
# Custom CSS for the dataset preview styling (must be passed to gr.Blocks, not launch())
css = """
#preview_header {
    margin-bottom: 10px;
    margin-top: 5px;
}
#preview_table {
    background-color: #f8f9fa;
    border-radius: 8px;
    padding: 10px;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# Head-to-Head Model Evaluation Comparator")
    gr.Markdown("""
    This demo evaluates two models (or one model with two different configs) on a benchmark dataset.

    Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)

    Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
    """)

    # Dataset Selection Section
    gr.Markdown("## (A) Select Dataset for evaluation")
    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["(Select Dataset)", "MMLU-Pro"],
            value="(Select Dataset)",
            label="Dataset",
            info="Select a dataset to perform the head-to-head evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
        )
        preview_toggle = gr.Button("Show Preview", interactive=False, variant="secondary")

    # Dataset Preview Container - initially hidden
    with gr.Column(visible=False) as dataset_preview_container:
        gr.Markdown("## Dataset Preview", elem_id="preview_header")
        preview_output = gr.DataFrame(
            interactive=False,
            wrap=True,
            elem_id="preview_table"
        )
        # Add vertical space after the preview
        gr.Markdown(" ")
        gr.Markdown(" ")

    # MMLU Config Container - initially hidden until a dataset is selected
    with gr.Column(visible=False) as mmlu_config_container:
        gr.Markdown("## (B) Select Dataset Configuration Options")
        with gr.Row():
            all_subjects_checkbox = gr.Checkbox(
                label="Evaluate All Subjects",
                value=False,
                info="When checked, evaluates all 14 MMLU-Pro subjects"
            )
            num_subjects_slider = gr.Slider(
                minimum=1,
                maximum=14,
                value=14,
                step=1,
                label="Number of Subjects",
                info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order.",
                interactive=True
            )
        with gr.Row():
            num_shots_slider = gr.Slider(
                minimum=0,
                maximum=5,
                value=5,
                step=1,
                label="Number of Few-shot Examples",
                info="Number of examples to use for few-shot learning (0-5)."
            )
        with gr.Row():
            all_questions_checkbox = gr.Checkbox(
                label="Evaluate All Questions",
                value=False,
                info="When checked, evaluates all available questions for each subject"
            )
            questions_info_text = gr.Markdown(visible=False, value="**All 12,032 questions across all subjects will be evaluated**")
        with gr.Row(elem_id="questions_selection_row"):
            with gr.Column(scale=1, elem_id="questions_slider_container") as questions_container:
                num_questions_slider = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=20,
                    step=1,
                    label="Questions per Subject",
                    info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
                    interactive=True
                )
        with gr.Row():
            with gr.Column(scale=1):
                eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
                cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)

    # Results Section - initially hidden
    with gr.Column(visible=False) as results_container:
        results_output = gr.Markdown(label="Evaluation Results")

    # Results table - initially hidden until evaluation completes
    with gr.Column(visible=False) as results_table_container:
        with gr.Row():
            results_table = gr.DataFrame(
                interactive=True,
                label="Detailed Results (Sortable)",
                visible=True
            )

    # Track evaluation state and preview visibility state
    evaluation_state = gr.State({"running": False})
    preview_state = gr.State({"visible": False})

    # Show/hide configuration based on the selected dataset
    def update_interface_based_on_dataset(dataset):
        if dataset == "MMLU-Pro":
            return (
                gr.update(visible=True),     # mmlu_config_container
                gr.update(visible=True),     # results_container
                gr.update(interactive=True)  # preview_toggle
            )
        else:
            return (
                gr.update(visible=False),     # mmlu_config_container
                gr.update(visible=False),     # results_container
                gr.update(interactive=False)  # preview_toggle
            )

    # Connect the dataset dropdown to show/hide the appropriate configuration
    dataset_dropdown.change(
        fn=update_interface_based_on_dataset,
        inputs=[dataset_dropdown],
        outputs=[mmlu_config_container, results_container, preview_toggle]
    )

    # Toggle dataset preview visibility
    def toggle_preview(state, dataset):
        # Flip the visibility state
        new_visible = not state["visible"]
        state["visible"] = new_visible

        if new_visible and dataset == "MMLU-Pro":
            # Becoming visible: fetch and format the preview data
            preview_data = mmlupro_dataset_preview()
            formatted_preview = enhanced_format_preview_for_display(preview_data)
            return state, gr.update(visible=True), formatted_preview, gr.update(value="Hide Preview")
        elif new_visible:
            # Other datasets (not implemented yet)
            return state, gr.update(visible=True), None, gr.update(value="Hide Preview")
        else:
            # Hiding the preview
            return state, gr.update(visible=False), None, gr.update(value="Show Preview")

    # Connect the preview toggle to show/hide dataset information
    preview_toggle.click(
        fn=toggle_preview,
        inputs=[preview_state, dataset_dropdown],
        outputs=[preview_state, dataset_preview_container, preview_output, preview_toggle]
    )

    # Disable the subjects slider when "Evaluate All Subjects" is checked
    def update_subjects_slider(checked):
        return gr.update(interactive=not checked)

    all_subjects_checkbox.change(
        fn=update_subjects_slider,
        inputs=[all_subjects_checkbox],
        outputs=[num_subjects_slider]
    )

    # Swap the questions slider for an info message when "Evaluate All Questions" is checked
    def update_questions_interface(checked):
        if checked:
            return gr.update(visible=False), gr.update(visible=True)
        else:
            return gr.update(visible=True), gr.update(visible=False)

    all_questions_checkbox.change(
        fn=update_questions_interface,
        inputs=[all_questions_checkbox],
        outputs=[questions_container, questions_info_text]
    )

    # Disable UI components while an evaluation is running
    def start_evaluation(state):
        if state["running"]:
            # An evaluation is already in progress; keep the controls locked
            return [
                state,
                gr.update(interactive=False),  # all_subjects_checkbox
                gr.update(interactive=False),  # num_subjects_slider
                gr.update(interactive=False),  # num_shots_slider
                gr.update(interactive=False),  # all_questions_checkbox
                gr.update(interactive=False),  # num_questions_slider
                gr.update(interactive=False),  # eval_mmlu_button
                gr.update(visible=True),       # cancel_mmlu_button
                "Evaluation already in progress. Please wait.",  # results_output
                None,                          # results_table
                gr.update(visible=False)       # results_table_container
            ]

        # Mark the evaluation as running and lock the controls
        state["running"] = True
        return [
            state,
            gr.update(interactive=False),  # all_subjects_checkbox
            gr.update(interactive=False),  # num_subjects_slider
            gr.update(interactive=False),  # num_shots_slider
            gr.update(interactive=False),  # all_questions_checkbox
            gr.update(interactive=False),  # num_questions_slider
            gr.update(interactive=False),  # eval_mmlu_button
            gr.update(visible=True),       # cancel_mmlu_button
            "Starting evaluation...",      # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]

    # Reset the running flag after evaluation
    def finish_evaluation(state):
        state["running"] = False
        return state

    # Handle the cancel button click
    def cancel_evaluation(state):
        # Note: this does not actually stop the evaluation process;
        # it only resets the UI state to appear canceled
        state["running"] = False
        return [
            state,
            gr.update(interactive=True),   # all_subjects_checkbox
            gr.update(interactive=True),   # num_subjects_slider
            gr.update(interactive=True),   # num_shots_slider
            gr.update(interactive=True),   # all_questions_checkbox
            gr.update(interactive=True),   # num_questions_slider
            gr.update(interactive=True),   # eval_mmlu_button
            gr.update(visible=False),      # cancel_mmlu_button
            "⚠️ Evaluation canceled by user (note: backend process may continue running)",  # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]

    # Connect the MMLU evaluation button with state tracking:
    # lock the UI, run the evaluation, then clear the running flag
    eval_mmlu_button.click(
        fn=start_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            all_subjects_checkbox,
            num_subjects_slider,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    ).then(
        fn=run_mmlu_evaluation,
        inputs=[
            all_subjects_checkbox,
            num_subjects_slider,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider
        ],
        outputs=[
            results_output,
            results_table,
            eval_mmlu_button,
            cancel_mmlu_button,
            all_subjects_checkbox,
            num_subjects_slider,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            results_table_container
        ]
    ).then(
        fn=finish_evaluation,
        inputs=[evaluation_state],
        outputs=[evaluation_state]
    )

    # Connect the cancel button
    cancel_mmlu_button.click(
        fn=cancel_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            all_subjects_checkbox,
            num_subjects_slider,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    )

# Launch the app (the custom CSS is supplied via gr.Blocks above)
demo.launch()