import gradio as gr
import os
from huggingface_hub import login
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
import spaces
import pandas as pd
import time
import traceback
from dataset_previews import mmlupro_dataset_preview, format_preview_for_display

# Read token and login
hf_token = os.getenv("HF_READ_WRITE_TOKEN")
if hf_token:
    login(hf_token)
else:
    print("⚠️ No HF_READ_WRITE_TOKEN found in environment")

# ---------------------------------------------------------------------------
# 1. Model configuration
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"

# ---------------------------------------------------------------------------
# 2. MMLU-Pro Evaluation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=240)
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
    """
    Runs the MMLU-Pro evaluation with the specified parameters.

    Args:
        all_subjects (bool): Whether to evaluate all subjects
        num_subjects (int): Number of subjects to evaluate (1-14)
        num_shots (int): Number of few-shot examples (0-5)
        all_questions (bool): Whether to evaluate all questions per subject
        num_questions (int): Number of questions per subject (1-100, or all)
        progress (gr.Progress): Progress indicator
    """
    try:
        # -1 is the sentinel value for "evaluate everything"
        if all_subjects:
            num_subjects = -1
        if all_questions:
            num_questions = -1

        # Run evaluation with timing
        start_time = time.time()
        results = evaluate_mmlu_pro(
            model_name,
            num_subjects=num_subjects,
            num_questions=num_questions,
            num_shots=num_shots,
        )
        elapsed_time = time.time() - start_time

        # Format results
        overall_acc = results["overall_accuracy"]
        min_subject, min_acc = results["min_accuracy_subject"]
        max_subject, max_acc = results["max_accuracy_subject"]

        # Create DataFrame from the per-subject results table
        results_df = pd.DataFrame(results["full_accuracy_table"])

        # Calculate totals for the overall row
        total_samples = results_df['Num_samples'].sum()
        total_correct = results_df['Num_correct'].sum()

        # Create overall row
        overall_row = pd.DataFrame({
            'Subject': ['**Overall**'],
            'Num_samples': [total_samples],
            'Num_correct': [total_correct],
            'Accuracy': [overall_acc]
        })

        # Prepend the overall row to the per-subject results
        results_df = pd.concat([overall_row, results_df], ignore_index=True)

        # Format the report
        report = (
            f"### Overall Results\n"
            f"* Overall Accuracy: {overall_acc:.3f}\n"
            f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
            f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
            f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
        )

        # Return values that re-enable UI components after completion
        return (report, results_df,
                gr.update(interactive=True), gr.update(visible=False),
                gr.update(interactive=True), gr.update(interactive=True),
                gr.update(interactive=True), gr.update(interactive=True),
                gr.update(interactive=True), gr.update(visible=True))

    except Exception as e:
        # Handle errors gracefully
        error_trace = traceback.format_exc()
        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"

        # Re-enable UI components on error
        return (error_message, None,
                gr.update(interactive=True), gr.update(visible=False),
                gr.update(interactive=True), gr.update(interactive=True),
                gr.update(interactive=True), gr.update(interactive=True),
                gr.update(interactive=True), gr.update(visible=False))
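
# A minimal sketch of the result dict this app expects `evaluate_mmlu_pro` to return,
# inferred from how the fields are consumed above (the actual adapter in
# mmlu_pro_eval_adapted may differ; the numbers below are placeholders only):
#
#   results = {
#       "overall_accuracy": 0.41,                       # float, accuracy over all questions
#       "min_accuracy_subject": ("law", 0.22),          # (subject, accuracy) tuple
#       "max_accuracy_subject": ("biology", 0.65),      # (subject, accuracy) tuple
#       "full_accuracy_table": [                        # one row per evaluated subject
#           {"Subject": "biology", "Num_samples": 20, "Num_correct": 13, "Accuracy": 0.65},
#           # ...
#       ],
#   }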
# ---------------------------------------------------------------------------
# 3. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(css="""
    #preview_header {
        margin-bottom: 10px;
        margin-top: 5px;
    }
    #preview_table {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 10px;
    }
    h1 {
        text-align: center;
    }
""") as demo:
    gr.Markdown("# Head-to-Head Model Evaluation Comparator")
    gr.Markdown("""
    This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.

    Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)

    Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
    """)

    # Dataset Selection Section
    gr.Markdown("## (A) Select Dataset for Evaluation")
    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["(Select Dataset)", "MMLU-Pro"],
            value="(Select Dataset)",
            label="Dataset",
            info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
        )
        preview_toggle = gr.Button("Show Preview", interactive=False, variant="secondary")

    # Dataset Preview Container - Initially hidden
    with gr.Column(visible=False) as dataset_preview_container:
        gr.Markdown("## Dataset Preview", elem_id="preview_header")
        preview_output = gr.DataFrame(
            interactive=False,
            wrap=True,
            elem_id="preview_table"
        )
        # Add vertical space after the preview
        gr.Markdown(" ")
        gr.Markdown(" ")

    # MMLU Config Container - Initially hidden until dataset is selected
    with gr.Column(visible=False) as mmlu_config_container:
        gr.Markdown("## (B) Select Dataset Configuration Options")

        with gr.Row():
            all_subjects_checkbox = gr.Checkbox(
                label="Evaluate All Subjects",
                value=False,
                info="When checked, evaluates all 14 MMLU-Pro subjects"
            )
            num_subjects_slider = gr.Slider(
                minimum=1,
                maximum=14,
                value=14,
                step=1,
                label="Number of Subjects",
                info="Number of subjects to evaluate (1-14). They will be loaded in alphabetical order.",
                interactive=True
            )

        with gr.Row():
            num_shots_slider = gr.Slider(
                minimum=0,
                maximum=5,
                value=5,
                step=1,
                label="Number of Few-shot Examples",
                info="Number of examples to use for few-shot learning (0-5)."
            )

        with gr.Row():
            all_questions_checkbox = gr.Checkbox(
                label="Evaluate All Questions",
                value=False,
                info="When checked, evaluates all available questions for each subject"
            )
            questions_info_text = gr.Markdown(
                visible=False,
                value="**All 12,032 questions across all subjects will be evaluated**"
            )

        with gr.Row(elem_id="questions_selection_row"):
            with gr.Column(scale=1, elem_id="questions_slider_container") as questions_container:
                num_questions_slider = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=20,
                    step=1,
                    label="Questions per Subject",
                    info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
                    interactive=True
                )
        with gr.Row():
            with gr.Column(scale=1):
                eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
                cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)

    # Results Section - Initially hidden
    with gr.Column(visible=False) as results_container:
        results_output = gr.Markdown(label="Evaluation Results")

    # Results table - Initially hidden until evaluation completes
    with gr.Column(visible=False) as results_table_container:
        with gr.Row():
            results_table = gr.DataFrame(
                interactive=True,
                label="Detailed Results (Sortable)",
                visible=True
            )

    # Track evaluation state
    evaluation_state = gr.State({"running": False})

    # Track preview visibility state
    preview_visibility = gr.State(False)

    # Function to show/hide configuration based on selected dataset
    def update_interface_based_on_dataset(dataset, current_visibility):
        if dataset == "MMLU-Pro":
            return (
                gr.update(visible=True),                             # mmlu_config_container
                gr.update(visible=True),                             # results_container
                gr.update(interactive=True, value="Show Preview"),   # preview_toggle - enable and reset label
                gr.update(visible=False),                            # dataset_preview_container - hide it initially
                False                                                # Reset preview_visibility to False
            )
        else:
            return (
                gr.update(visible=False),                            # mmlu_config_container
                gr.update(visible=False),                            # results_container
                gr.update(interactive=False, value="Show Preview"),  # preview_toggle - disable and reset label
                gr.update(visible=False),                            # dataset_preview_container - hide when no dataset
                False                                                # Reset preview_visibility to False
            )

    # Connect dataset dropdown to show/hide appropriate configuration
    dataset_dropdown.change(
        fn=update_interface_based_on_dataset,
        inputs=[dataset_dropdown, preview_visibility],
        outputs=[mmlu_config_container, results_container, preview_toggle,
                 dataset_preview_container, preview_visibility]
    )

    # Function to toggle dataset preview visibility
    def toggle_preview(dataset, preview_visibility):
        # Toggle the visibility state
        is_visible = not preview_visibility

        # Update button text based on new state
        button_text = "Hide Preview" if is_visible else "Show Preview"

        # Get preview data if becoming visible
        if is_visible and dataset == "MMLU-Pro":
            preview_data = mmlupro_dataset_preview()
            formatted_preview = format_preview_for_display(preview_data)
            return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
        elif is_visible:
            # For other datasets (not implemented yet)
            return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
        else:
            # Hiding the preview
            return is_visible, gr.update(visible=False), None, gr.update(value=button_text)

    # Connect preview toggle to show/hide dataset information
    preview_toggle.click(
        fn=toggle_preview,
        inputs=[dataset_dropdown, preview_visibility],
        outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
    )

    # Update num_subjects_slider interactivity based on all_subjects checkbox
    def update_subjects_slider(checked):
        return gr.update(interactive=not checked)

    all_subjects_checkbox.change(
        fn=update_subjects_slider,
        inputs=[all_subjects_checkbox],
        outputs=[num_subjects_slider]
    )

    # Update interface based on all_questions checkbox
    def update_questions_interface(checked):
        if checked:
            return gr.update(visible=False), gr.update(visible=True)
        else:
            return gr.update(visible=True), gr.update(visible=False)

    all_questions_checkbox.change(
        fn=update_questions_interface,
        inputs=[all_questions_checkbox],
        outputs=[questions_container, questions_info_text]
    )
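
    # The wiring above follows the standard Gradio pattern: each handler receives the
    # current values of its `inputs` and returns one value per component in `outputs`,
    # using gr.update(...) to change properties without recreating the component.
    # gr.State values are per-session and flow in and out like any other input/output.
    # A minimal sketch of the pattern (hypothetical names, not components of this app):
    #
    #   def toggle(checked, state):
    #       state["checked"] = checked
    #       return gr.update(visible=checked), state
    #
    #   some_checkbox.change(fn=toggle, inputs=[some_checkbox, some_state],
    #                        outputs=[some_column, some_state])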
    # Function to disable UI components during evaluation
    def start_evaluation(state):
        if state["running"]:
            return [
                state,
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(interactive=False),
                gr.update(visible=True),
                "Evaluation already in progress. Please wait.",
                None,
                gr.update(visible=False)
            ]

        # Update state to running
        state["running"] = True
        return [
            state,
            gr.update(interactive=False),  # all_subjects_checkbox
            gr.update(interactive=False),  # num_subjects_slider
            gr.update(interactive=False),  # num_shots_slider
            gr.update(interactive=False),  # all_questions_checkbox
            gr.update(interactive=False),  # num_questions_slider
            gr.update(interactive=False),  # eval_mmlu_button
            gr.update(visible=True),       # cancel_mmlu_button
            "Starting evaluation...",      # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]

    # Function to reset UI after evaluation
    def finish_evaluation(state):
        state["running"] = False
        return state

    # Function to handle cancel button click
    def cancel_evaluation(state):
        # Note: This doesn't actually stop the evaluation process
        # It only updates the UI state to appear canceled
        state["running"] = False
        return [
            state,
            gr.update(interactive=True),   # all_subjects_checkbox
            gr.update(interactive=True),   # num_subjects_slider
            gr.update(interactive=True),   # num_shots_slider
            gr.update(interactive=True),   # all_questions_checkbox
            gr.update(interactive=True),   # num_questions_slider
            gr.update(interactive=True),   # eval_mmlu_button
            gr.update(visible=False),      # cancel_mmlu_button
            "⚠️ Evaluation canceled by user (note: backend process may continue running)",  # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]

    # Connect MMLU evaluation button with state tracking
    eval_mmlu_button.click(
        fn=start_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            all_subjects_checkbox, num_subjects_slider, num_shots_slider,
            all_questions_checkbox, num_questions_slider,
            eval_mmlu_button, cancel_mmlu_button,
            results_output, results_table, results_table_container
        ]
    ).then(
        fn=run_mmlu_evaluation,
        inputs=[
            all_subjects_checkbox, num_subjects_slider, num_shots_slider,
            all_questions_checkbox, num_questions_slider
        ],
        outputs=[
            results_output, results_table,
            eval_mmlu_button, cancel_mmlu_button,
            all_subjects_checkbox, num_subjects_slider, num_shots_slider,
            all_questions_checkbox, num_questions_slider,
            results_table_container
        ]
    ).then(
        fn=finish_evaluation,
        inputs=[evaluation_state],
        outputs=[evaluation_state]
    )

    # Connect cancel button
    cancel_mmlu_button.click(
        fn=cancel_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            all_subjects_checkbox, num_subjects_slider, num_shots_slider,
            all_questions_checkbox, num_questions_slider,
            eval_mmlu_button, cancel_mmlu_button,
            results_output, results_table, results_table_container
        ]
    )

demo.launch()
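
# Note (assumption): gr.Progress-based tracking relies on Gradio's queue. Recent Gradio
# releases enable queuing by default in launch(); on older versions the call above may
# need to be demo.queue().launch() instead.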