import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from huggingface_hub import login
from toy_dataset_eval import evaluate_toy_dataset
from mmlu_eval_original import evaluate_mmlu
import spaces
import pandas as pd
# Read token and login
hf_token = os.getenv("HF_TOKEN_READ_WRITE")
if hf_token:
    login(hf_token)
else:
    print("⚠️ No HF_TOKEN_READ_WRITE found in environment")
# ---------------------------------------------------------------------------
# 1. Model and tokenizer setup and loading
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = None
model = None
model_loaded = False

@spaces.GPU
def load_model():
    """Loads the Mistral model and tokenizer and updates the load status."""
    global tokenizer, model, model_loaded
    try:
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        if model is None:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                token=hf_token,
                torch_dtype=torch.float16,
            )
            model.to('cuda')
        model_loaded = True
        return "✅ Model Loaded!"
    except Exception as e:
        model_loaded = False
        return f"❌ Model Load Failed: {str(e)}"
# ---------------------------------------------------------------------------
# 2. Toy Evaluation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)
def run_toy_evaluation():
    """Runs the toy dataset evaluation, loading the model on demand."""
    if not model_loaded:
        load_model()
    if not model_loaded:
        # Return a value for both output components (text and HTML)
        return "⚠️ Model not loaded. Please load the model first.", ""
    results = evaluate_toy_dataset(model, tokenizer)
    return results  # Expected to match the (toy_output, toy_plot) outputs below
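
# A minimal stand-in for toy_dataset_eval.evaluate_toy_dataset, handy for local
# smoke-testing without the real module. Its (summary, html) return shape is an
# assumption inferred from the [toy_output, toy_plot] outputs wired up below,
# not the module's documented interface.
def _fake_evaluate_toy_dataset(model, tokenizer):
    summary = "Toy eval: 3/4 correct (placeholder numbers)"
    html = "<p><b>Toy eval details would render here.</b></p>"
    return summary, html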
# ---------------------------------------------------------------------------
# 3. MMLU Evaluation call
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)  # Allow up to 2 minutes for a full evaluation
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, num_examples):
    """
    Runs the MMLU evaluation with the specified parameters.

    Args:
        all_subjects (bool): Whether to evaluate all subjects
        num_subjects (int): Number of subjects to evaluate (1-57)
        num_shots (int): Number of few-shot examples (0-5)
        num_examples (int): Number of examples per subject (1-10, or -1 for all)
    """
    if not model_loaded:
        load_model()
    if not model_loaded:
        return "⚠️ Model not loaded. Please load the model first."
    # Convert num_subjects to -1 (= all subjects) when the checkbox is ticked
    if all_subjects:
        num_subjects = -1
    # Run evaluation
    results = evaluate_mmlu(
        model,
        tokenizer,
        num_subjects=num_subjects,
        num_questions=num_examples,
        num_shots=num_shots,
    )
    # Format results
    overall_acc = results["overall_accuracy"]
    min_subject, min_acc = results["min_accuracy_subject"]
    max_subject, max_acc = results["max_accuracy_subject"]
    # Create a DataFrame from the per-subject results table
    results_df = pd.DataFrame(results["full_accuracy_table"])
    # Format the Markdown report
    report = (
        f"### Overall Results\n"
        f"* Overall Accuracy: {overall_acc:.3f}\n"
        f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n\n"
        f"### Detailed Results Table\n"
        f"{results_df.to_markdown()}\n"
    )
    return report
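
# Illustrative shape of the dict this handler expects back from evaluate_mmlu,
# inferred from the key accesses above (an assumption, not mmlu_eval_original's
# documented API):
#
# {
#     "overall_accuracy": 0.624,
#     "min_accuracy_subject": ("abstract_algebra", 0.30),
#     "max_accuracy_subject": ("marketing", 0.88),
#     "full_accuracy_table": [
#         {"subject": "abstract_algebra", "accuracy": 0.30},
#         ...
#     ],
# }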
# ---------------------------------------------------------------------------
# 4. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Mistral-7B on MMLU - Evaluation Demo")
    gr.Markdown("This demo evaluates Mistral-7B on the MMLU dataset.")

    # Load Model Section
    with gr.Row():
        load_button = gr.Button("Load Model", variant="primary")
        load_status = gr.Textbox(label="Model Status", interactive=False)

    # Toy Dataset Evaluation Section
    gr.Markdown("### Toy Dataset Evaluation")
    with gr.Row():
        eval_toy_button = gr.Button("Run Toy Evaluation", variant="primary")
        toy_output = gr.Textbox(label="Results")
        toy_plot = gr.HTML(label="Visualization and Details")

    # MMLU Evaluation Section
    gr.Markdown("### MMLU Evaluation")
    with gr.Row():
        all_subjects_checkbox = gr.Checkbox(
            label="Evaluate All Subjects",
            value=True,
            info="When checked, evaluates all 57 MMLU subjects",
        )
        num_subjects_slider = gr.Slider(
            minimum=1,
            maximum=57,
            value=57,
            step=1,
            label="Number of Subjects",
            info="Number of subjects to evaluate (1-57), taken in alphabetical order.",
            interactive=True,
        )
    with gr.Row():
        num_shots_slider = gr.Slider(
            minimum=0,
            maximum=5,
            value=5,
            step=1,
            label="Number of Few-shot Examples",
            info="Number of examples to use for few-shot prompting (0-5).",
        )
        num_examples_slider = gr.Slider(
            minimum=1,
            maximum=10,
            value=5,
            step=1,
            label="Examples per Subject",
            info="Number of test examples per subject (1-10).",
        )
    with gr.Row():
        eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
        results_output = gr.Markdown(label="Evaluation Results")

    # Connect model loading
    load_button.click(fn=load_model, inputs=None, outputs=load_status)

    # Connect toy evaluation
    eval_toy_button.click(
        fn=run_toy_evaluation,
        inputs=None,
        outputs=[toy_output, toy_plot],
    )

    # Enable/disable num_subjects_slider based on the all-subjects checkbox
    all_subjects_checkbox.change(
        fn=lambda x: gr.update(interactive=not x),
        inputs=[all_subjects_checkbox],
        outputs=[num_subjects_slider],
    )

    # Connect MMLU evaluation button
    eval_mmlu_button.click(
        fn=run_mmlu_evaluation,
        inputs=[
            all_subjects_checkbox,
            num_subjects_slider,
            num_shots_slider,
            num_examples_slider,
        ],
        outputs=results_output,
    )

demo.launch()
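
# Note: for long-running GPU evaluations on Spaces, enabling Gradio's request
# queue (demo.queue().launch()) is a common way to serialize concurrent clicks;
# it is left disabled here to preserve the original behavior.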