Spaces:
Sleeping
Sleeping
File size: 5,084 Bytes
c5224d3 3195f7f 4c36941 8e8ba80 ee60006 3f6f5f7 4c36941 e8d7a5b c5224d3 3195f7f a7f824f 3195f7f e8d7a5b 614dffd a7f824f 3195f7f 614dffd a7f824f 3195f7f a7f824f 3195f7f a7f824f 3195f7f a7f824f 614dffd a7f824f 3195f7f a7f824f 3195f7f 614dffd ee60006 a7f824f ee60006 9190bb9 ee60006 9190bb9 ee60006 9190bb9 ee60006 9190bb9 ee60006 3195f7f a7f824f 3195f7f e8d7a5b a7f824f e8d7a5b a7f824f e8d7a5b 3195f7f e8d7a5b c5224d3 a7f824f ee60006 bd3a12c a7f824f ee60006 a7f824f ee60006 e8d7a5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from huggingface_hub import login
from toy_dataset_eval import evaluate_toy_dataset
from mmlu_eval import evaluate_mmlu
import spaces
# ---------------------------------------------------------------------------
# 0. Hugging Face authentication
# ---------------------------------------------------------------------------
# Authenticate against the Hub when a token is present; otherwise warn and
# continue (gated-model downloads will fail later without it).
hf_token = os.getenv("HF_TOKEN_READ_WRITE")
if not hf_token:
    print("⚠️ No HF_TOKEN_READ_WRITE found in environment")
else:
    login(hf_token)
# ---------------------------------------------------------------------------
# 1. Model and tokenizer setup and Loading
# ---------------------------------------------------------------------------
# Hub repo id of the model evaluated by this demo.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# Lazily-populated globals: load_model() fills these on first use so the
# Space can start without holding the 7B weights in memory.
tokenizer = None
model = None
# Tracks whether load_model() completed successfully; checked by the
# evaluation entry points before running.
model_loaded = False
@spaces.GPU
def load_model():
    """Loads the Mistral model and tokenizer and updates the load status.

    Idempotent: the tokenizer and model are fetched only if they have not
    been loaded yet. Sets the module-level ``model_loaded`` flag and returns
    a human-readable status string for the Gradio status textbox.
    """
    global tokenizer, model, model_loaded
    try:
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
        if model is None:
            # fp16 halves the memory footprint; move to GPU inside the
            # spaces.GPU context where CUDA is available.
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                token=hf_token,
                torch_dtype=torch.float16,
            )
            model.to('cuda')
        model_loaded = True
        return "✅ Model Loaded!"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the Space.
        model_loaded = False
        return f"❌ Model Load Failed: {str(e)}"
# ---------------------------------------------------------------------------
# 2. Toy Evaluation
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# 2. Toy Evaluation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)  # consistent decorator call style (no space before parens)
def run_toy_evaluation():
    """Runs the toy dataset evaluation, lazily loading the model if needed.

    Returns:
        Whatever evaluate_toy_dataset produces, or a warning string when
        the model could not be loaded.
    """
    if not model_loaded:
        load_model()  # lazy load on first use
        if not model_loaded:
            return "⚠️ Model not loaded. Please load the model first."
    results = evaluate_toy_dataset(model, tokenizer)
    return results
# ---------------------------------------------------------------------------
# 3. MMLU Evaluation call
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
def run_mmlu_evaluation(num_questions):
    """
    Runs the MMLU evaluation with the specified number of questions per task.
    Also displays two correct and two incorrect examples.
    """
    # NOTE: the docstring above was previously placed after the model-load
    # guard, making it a dead string expression rather than the docstring.
    if not model_loaded:
        load_model()  # lazy load on first use
        if not model_loaded:
            return "⚠️ Model not loaded. Please load the model first."

    results = evaluate_mmlu(model, tokenizer, num_questions)
    overall_accuracy = results["overall_accuracy"]
    min_task, min_acc = results["min_accuracy_task"]
    max_task, max_acc = results["max_accuracy_task"]
    correct_examples = results["correct_examples"]
    incorrect_examples = results["incorrect_examples"]

    # Format a (task, question, model_output, correct_answer) tuple as Markdown.
    def format_example(example):
        task, question, model_output, correct_answer = example
        return f"**Task:** {task}\n**Question:** {question}\n**Model Output:** {model_output}\n**Correct Answer:** {correct_answer}\n"

    correct_text = "\n\n".join(format_example(ex) for ex in correct_examples)
    incorrect_text = "\n\n".join(format_example(ex) for ex in incorrect_examples)

    report = (
        f"### Overall Accuracy: {overall_accuracy:.2f}\n"
        f"**Min Accuracy:** {min_acc:.2f} on `{min_task}`\n"
        f"**Max Accuracy:** {max_acc:.2f} on `{max_task}`\n\n"
        f"---\n\n"
        f"### ✅ Correct Examples\n{correct_text if correct_examples else 'No correct examples available.'}\n\n"
        f"### ❌ Incorrect Examples\n{incorrect_text if incorrect_examples else 'No incorrect examples available.'}"
    )
    return report
# ---------------------------------------------------------------------------
# 4. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Mistral-7B Math Evaluation Demo")
    gr.Markdown("""
    This demo evaluates Mistral-7B on Various Datasets.
    """)

    # Load Model Button
    load_button = gr.Button("Load Model", variant="primary")
    load_status = gr.Textbox(label="Model Status", interactive=False)
    load_button.click(fn=load_model, inputs=None, outputs=load_status)

    # Toy Dataset Evaluation
    gr.Markdown("### Toy Dataset Evaluation")
    eval_button = gr.Button("Run Evaluation", variant="primary")
    output_text = gr.Textbox(label="Results")
    output_plot = gr.HTML(label="Visualization and Details")
    # NOTE(review): two outputs are wired here but run_toy_evaluation returns a
    # single value — confirm evaluate_toy_dataset returns a (text, html) pair.
    eval_button.click(fn=run_toy_evaluation, inputs=None, outputs=[output_text, output_plot])

    # MMLU Evaluation
    gr.Markdown("### MMLU Evaluation")
    num_questions_input = gr.Number(label="Questions per Task (Total of 57 tasks)", value=5, precision=0)
    eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
    mmlu_output = gr.Textbox(label="MMLU Evaluation Results")
    eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])

# Spaces runs this file as the entry point; launch the app directly.
demo.launch()