Spaces:
Sleeping
Sleeping
File size: 5,069 Bytes
c44c829 22c1e11 c44c829 22c1e11 c44c829 eb7de2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import os
import json
import pandas as pd
from typing import Dict, Any
def calculate_evaluation_time(
    num_questions: int,
    input_tokens: int,
    generated_tokens_per_question: int = 300,
    *,
    prompt_throughput: float = 5000,
    generation_throughput: float = 500,
    overhead_minutes: float = 2,
) -> float:
    """Calculate approximate evaluation time from token counts and throughput assumptions.

    Args:
        num_questions: Number of questions to evaluate.
        input_tokens: Total number of prompt (input) tokens across all questions.
        generated_tokens_per_question: Tokens assumed to be generated per question.
        prompt_throughput: Prompt processing speed, in tokens per second.
        generation_throughput: Generation speed, in tokens per second.
        overhead_minutes: Fixed overhead (model loading, etc.), in minutes.

    Returns:
        Estimated total evaluation time in minutes.

    Raises:
        ValueError: If either throughput is not strictly positive.
    """
    # Fail with a clear message instead of ZeroDivisionError or a negative time.
    if prompt_throughput <= 0 or generation_throughput <= 0:
        raise ValueError("prompt_throughput and generation_throughput must be positive")

    total_generated_tokens = num_questions * generated_tokens_per_question

    # Both components are in seconds.
    prompt_seconds = input_tokens / prompt_throughput
    generation_seconds = total_generated_tokens / generation_throughput

    # Convert the token-driven time to minutes, then add the fixed overhead.
    return (prompt_seconds + generation_seconds) / 60 + overhead_minutes
def mmlupro_dataset_preview() -> Dict[str, Any]:
    """
    Generate or retrieve the MMLU-Pro dataset preview information.

    The preview is cached in ``mmlu_pro_dataset_preview_table.json`` in the
    current working directory. If that file can be read and contains a JSON
    object, it is returned as-is. Otherwise the preview is rebuilt from the
    constants below and a best-effort attempt is made to write it back to
    the cache file. Read and write failures are reported with ``print`` and
    never raised.

    Returns:
        Dict[str, Any]: Dictionary containing dataset information
    """
    preview_file = "mmlu_pro_dataset_preview_table.json"

    # Open directly instead of checking existence first, so there is no
    # window between the check and the read.
    try:
        with open(preview_file, "r", encoding="utf-8") as f:
            cached = json.load(f)
    except FileNotFoundError:
        # No cache yet: fall through and generate it.
        pass
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading preview file: {e}")
    else:
        if isinstance(cached, dict):
            return cached
        # Valid JSON of the wrong shape would break callers; regenerate.
        print(
            "Error reading preview file: expected a JSON object, "
            f"got {type(cached).__name__}"
        )

    # Generate preview data if the file doesn't exist or couldn't be used.
    num_questions = 12032
    input_tokens = 12642105
    generated_tokens_per_question = 300

    eval_time_minutes = calculate_evaluation_time(
        num_questions,
        input_tokens,
        generated_tokens_per_question,
    )

    preview_data = {
        "dataset_name": "MMLU-Pro",
        "evaluation_type": "Multiple Choice",
        "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
        "links": {
            "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
            "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
            "paper": "https://arxiv.org/abs/2406.01574",
        },
        "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
        "num_questions": num_questions,
        "input_tokens": input_tokens,
        "evaluation_time": {
            "generated_tokens_per_question": generated_tokens_per_question,
            "total_generated_tokens": num_questions * generated_tokens_per_question,
            # NOTE(review): these two values are meant to mirror the
            # throughput assumptions used by calculate_evaluation_time;
            # keep them in sync if those change.
            "prompt_throughput": 5000,
            "generation_throughput": 500,
            "total_time_minutes": round(eval_time_minutes, 2),
        },
    }

    # Best-effort cache write: a failure here must not prevent returning data.
    try:
        with open(preview_file, "w", encoding="utf-8") as f:
            json.dump(preview_data, f, indent=2)
    except OSError as e:
        print(f"Error writing preview file: {e}")

    return preview_data
def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Format the preview data into a DataFrame for display in Gradio

    Args:
        preview_data (Dict[str, Any]): Dataset preview information. Must
            contain the keys ``dataset_name``, ``evaluation_type``,
            ``description``, ``links`` (with ``huggingface``, ``github``,
            ``paper``), ``organization``, ``num_questions``,
            ``input_tokens`` and ``evaluation_time`` (with
            ``total_time_minutes``).

    Returns:
        pd.DataFrame: Two-column table ("Key", "Value") with one row per
        displayed field.

    Raises:
        KeyError: If any of the required keys is missing.
    """
    links = preview_data["links"]
    # The three links share a single cell, one per line.
    links_text = (
        f"Hugging Face: {links['huggingface']}\n"
        f"GitHub: {links['github']}\n"
        f"Paper: {links['paper']}"
    )
    total_minutes = preview_data["evaluation_time"]["total_time_minutes"]

    # (label, value) pairs in display order.
    fields = [
        ("Dataset Name", preview_data["dataset_name"]),
        ("Evaluation Type", preview_data["evaluation_type"]),
        ("Description", preview_data["description"]),
        ("Links", links_text),
        ("Organization", preview_data["organization"]),
        ("Number of Questions", preview_data["num_questions"]),
        ("Number of Input Tokens", preview_data["input_tokens"]),
        (
            "Estimated Evaluation Time",
            f"{total_minutes} minutes (for 2 models on A100)",
        ),
    ]

    rows = [{"Key": label, "Value": value} for label, value in fields]
    return pd.DataFrame(rows)