import { Box, Typography } from "@mui/material";
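/**
 * Shared builder for every tooltip below: renders a short title followed by a
 * bulleted list of { label, description, subItems? } entries, with optional
 * subItems shown as a nested list.
 */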
const createTooltipContent = (title, items) => (
<Box sx={{ maxWidth: 400 }}>
<Typography variant="body2" paragraph sx={{ mb: 1, color: "inherit" }}>
{title}
</Typography>
<Box component="ul" sx={{ m: 0, pl: 2 }}>
{items.map(({ label, description, subItems }, index) => (
<li key={index}>
{/* component="div" keeps the markup valid: a nested <ul> may not sit inside the <p> that Typography renders by default */}
<Typography component="div" variant="body2" sx={{ mb: 0.5, color: "inherit" }}>
<b>{label}</b>: {description}
{subItems && (
<Box component="ul" sx={{ mt: 0.5, mb: 1 }}>
{subItems.map((item, subIndex) => (
<li key={subIndex}>
<Typography variant="body2" sx={{ color: "inherit" }}>
{item}
</Typography>
</li>
))}
</Box>
)}
</Typography>
</li>
))}
</Box>
</Box>
);
export const COLUMN_TOOLTIPS = {
AVERAGE: createTooltipContent("Average score across all benchmarks:", [
{
label: "Calculation",
description: "Weighted average of normalized scores from all benchmarks",
subItems: [
"Each benchmark is normalized to a 0-100 scale",
"All normalised benchmarks are then averaged together",
],
},
]),
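// Illustrative sketch only (an assumed formula, not the leaderboard's exact
// implementation): with raw scores in [0, 1] and a per-benchmark
// random-baseline `lowerBound`, normalization and averaging would look like:
//   const normalize = (raw, lowerBound) =>
//     Math.max(0, ((raw - lowerBound) / (1 - lowerBound)) * 100);
//   const average = (scores) =>
//     scores.reduce((sum, s) => sum + s, 0) / scores.length;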
IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
{
label: "Purpose",
description:
"Tests model's ability to follow explicit formatting instructions",
subItems: ["Instruction following", "Formatting", "Generation"],
},
{
label: "Scoring: Accuracy",
description: "Was the format asked for strictly respected.",
},
]),
BBH: createTooltipContent("Big Bench Hard (BBH):", [
{
label: "Overview",
description: "Collection of challenging for LLM tasks across domains, for example",
subItems: [
"Language understanding",
"Mathematical reasoning",
"Common sense and world knowledge",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]),
MATH: createTooltipContent(
"Mathematics Aptitude Test of Heuristics (MATH), level 5:",
[
{
label: "Content",
description: "High school level competitions mathematical problems",
subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
},
{
label: "Scoring: Exact match",
description:
"Was the solution generated correct and in the expected format",
},
]
),
GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
{
label: "Focus",
description: "PhD-level knowledge multiple choice questions in science",
subItems: [
"Chemistry",
"Biology",
"Physics",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]),
MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
{
label: "Scope",
description: "Reasoning and understanding on/of long texts",
subItems: [
"Language understanding",
"Reasoning capabilities",
"Long context reasoning",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]),
MMLU_PRO: createTooltipContent(
"Massive Multitask Language Understanding - Professional (MMLU-Pro):",
[
{
label: "Coverage",
description: "Expertly reviewed multichoice questions across domains, for example:",
subItems: [
"Medicine and healthcare",
"Law and ethics",
"Engineering",
"Mathematics",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]
),
ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
{
label: "Definition",
description: "The fundamental structure and design of the model",
subItems: [
"Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
"Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
"Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
"Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
"Merged: Combining multiple models through weights averaging or similar methods.",
"Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
],
},
{
label: "Impact",
description: "How architecture affects model capabilities",
subItems: [
"Base models are expected to perform less well on instruction following evaluations, like IFEval.",
"Fine-tuned and chat models can be more verbose and more chatty than base models.",
"Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
],
},
]),
PRECISION: createTooltipContent("Numerical Precision Format:", [
{
label: "Overview",
description:
"Data format used to store model weights and perform computations",
subItems: [
"bfloat16: Half precision (Brain Float format), good for stability",
"float16: Half precision",
"8bit/4bit: Quantized formats, for efficiency",
"GPTQ/AWQ: Quantized methods",
],
},
{
label: "Impact",
description: "How precision affects model deployment",
subItems: [
"Higher precision = better accuracy but more memory usage",
"Lower precision = faster inference and smaller size",
"Trade-off between model quality and resource usage",
],
},
]),
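// Back-of-the-envelope math behind the trade-off above (weights only,
// ignoring activations and KV cache; the figures are assumptions for
// illustration): bytes per parameter ≈ 2 for bfloat16/float16, 1 for 8bit,
// 0.5 for 4bit, so a 7B-parameter model needs roughly
// 7e9 * 2 / 1024 ** 3 ≈ 13 GiB in half precision:
//   const BYTES_PER_PARAM = { bfloat16: 2, float16: 2, "8bit": 1, "4bit": 0.5 };
//   const weightMemoryGiB = (paramsBillions, precision) =>
//     (paramsBillions * 1e9 * BYTES_PER_PARAM[precision]) / 1024 ** 3;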
REASONING: createTooltipContent("Model Reasoning Capabilities:", [
{
label: "Reasoning Models",
description: "Models that use reasoning capabilities to think through problems step by step",
subItems: [
"Can break down complex problems into smaller steps",
"Often show their thinking process in responses",
"May take longer to respond but provide more thorough answers",
],
},
{
label: "Non-reasoning Models",
description: "Traditional models that generate responses directly",
subItems: [
"Generate responses without explicit step-by-step reasoning",
"Typically faster response times",
"May still be highly capable but use implicit reasoning",
],
},
]),
PARAMETERS: createTooltipContent("Model Parameters:", [
{
label: "Measurement",
description: "Total number of trainable parameters in billions",
subItems: [
"Indicates model capacity and complexity",
"Correlates with computational requirements",
"Influences memory usage and inference speed",
],
},
]),
LICENSE: createTooltipContent("Model License Information:", [
{
label: "Importance",
description: "Legal terms governing model usage and distribution",
subItems: [
"Commercial vs non-commercial use",
"Attribution requirements",
"Modification and redistribution rights",
"Liability and warranty terms",
],
},
]),
CO2_COST: createTooltipContent("Carbon Dioxide Emissions:", [
{
label: "What is it?",
description: "CO₂ emissions of the model evaluation ",
subItems: [
"Only focuses on model inference for our specific setup",
"Considers data center location and energy mix",
"Allows equivalent comparision of models on our use case",
],
},
{
label: "Why it matters",
description: "Environmental impact of AI model training",
subItems: [
"Large models can have significant carbon footprints",
"Helps make informed choices about model selection",
],
},
{
label: "Learn more",
description:
"For detailed information about our CO₂ calculation methodology, visit:",
subItems: [
<a
href="https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Carbon Emissions Documentation ↗
</a>,
],
},
]),
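// Rough sketch of the standard energy-to-CO₂ conversion behind such
// estimates (an assumption here; the linked documentation describes the
// actual methodology): emissions = energy drawn (kWh) × grid carbon
// intensity (kgCO₂e per kWh), where the intensity depends on the data
// center's location and energy mix:
//   const co2Kg = (energyKWh, gridIntensityKgPerKWh) =>
//     energyKWh * gridIntensityKgPerKWh;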
// Icelandic benchmarks
WINOGRANDE_IS: createTooltipContent("WinoGrande-IS (3-shot):", [
{
label: "Description",
description: "Icelandic version of the WinoGrande task for coreference resolution",
subItems: [
"Human-translated and localized ~1000 test set examples",
"Each example has a sentence with a blank and two answer choices",
"Tests knowledge and common sense reasoning in Icelandic",
"Evaluation: 3-shot, exact match",
],
},
{
label: "Dataset",
description: "IceBERT paper describes this dataset in detail",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-winogrande"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
GED: createTooltipContent("Grammatical Error Detection:", [
{
label: "Description",
description: "Binary sentence-level Icelandic grammatical error detection",
subItems: [
"Adapted from the Icelandic Error Corpus (IEC)",
"Contains 200 examples",
"Task: predict whether sentence contains grammatical errors",
"Evaluation: exact match",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-sentences-gec"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
INFLECTION: createTooltipContent("Inflection (1-shot):", [
{
label: "Description",
description: "Tests ability to generate inflected forms of Icelandic words",
subItems: [
"300 Icelandic adjective-noun pairs",
"All four cases, singular and plural",
"Evaluation: 1-shot, exact match",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-inflection-all-flat"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
BELEBELE_IS: createTooltipContent("Belebele (IS):", [
{
label: "Description",
description: "Icelandic subset of the Belebele reading comprehension benchmark",
subItems: [
"900 examples of multiple-choice reading comprehension",
"Task: answer questions about given passages",
"Evaluation: exact match",
],
},
{
label: "Dataset",
description: "Part of the multilingual Belebele benchmark",
subItems: [
<a
href="https://huggingface.co/datasets/facebook/belebele"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
ARC_CHALLENGE_IS: createTooltipContent("ARC-Challenge-IS:", [
{
label: "Description",
description: "Machine-translated version of ARC-Challenge for Icelandic",
subItems: [
"Multiple-choice question-answering dataset",
"Test set contains 1.23k examples",
"Evaluation: exact match",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-arc-challenge"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
WIKIQA_IS: createTooltipContent("WikiQA-IS:", [
{
label: "Description",
description: "Icelandic Wikipedia question-answer pairs for cultural and historical knowledge",
subItems: [
"1.9k question-answer pairs from Icelandic Wikipedia",
"Tests knowledge of Icelandic culture and history",
"Generated by GPT-4o, manually verified and corrected",
"Evaluation: LLM judge scoring by GPT-4o (0=poor, 1=fair, 2=excellent)",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic_wiki_qa"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
};
export const UI_TOOLTIPS = {
COLUMN_SELECTOR: "Choose which columns to display in the table",
DISPLAY_OPTIONS: createTooltipContent("Table Display Options", [
{
label: "Overview",
description: "Configure how the table displays data and information",
subItems: [
"Row size and layout",
"Score display format",
"Ranking calculation",
"Average score computation",
],
},
]),
SEARCH_BAR: createTooltipContent("Advanced Model Search", [
{
label: "Name Search",
description: "Search directly by model name",
subItems: [
"Supports regular expressions (e.g., ^mistral.*7b)",
"Case sensitive",
],
},
{
label: "Field Search",
description: "Use @field:value syntax for precise filtering",
subItems: [
"@architecture:llama - Filter by architecture",
"@license:mit - Filter by license",
"@precision:float16 - Filter by precision",
"@type:chat - Filter by model type",
],
},
{
label: "Multiple Searches",
description: "Combine multiple criteria using semicolons",
subItems: [
"meta @license:mit; @architecture:llama",
"^mistral.*7b; @precision:float16",
],
},
]),
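// Illustrative sketch only (an assumed shape, not the search component's
// actual implementation) of how the syntax above could be parsed: semicolons
// split independent searches, `@field:value` tokens become filters, and
// whatever text remains is treated as a name regex:
//   const parseSearch = (query) =>
//     query.split(";").map((part) => {
//       const filters = {};
//       const name = part
//         .replace(/@(\w+):(\S+)/g, (_, field, value) => {
//           filters[field] = value;
//           return "";
//         })
//         .trim();
//       return { name, filters };
//     });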
QUICK_FILTERS: createTooltipContent(
"Filter models based on their size and applicable hardware:",
[
{
label: "Edge devices (Up to 3BB)",
description:
"Efficient models for edge devices, optimized for blazing fast inference.",
},
{
label: "Smol Models (3B-7B)",
description:
"Efficient models for consumer hardware, optimized for fast inference.",
},
{
label: "Mid-range models (7B-65B)",
description:
"A bit of everything here, with overall balanced performance and resource usage around 30B.",
},
{
label: "GPU-rich models (65B+)",
description:
"State-of-the-art performance for complex tasks, requires significant computing power.",
},
{
label: "Official Providers",
description:
"Models directly maintained by their original creators, ensuring reliability and up-to-date performance.",
},
]
),
ROW_SIZE: {
title: "Row Size",
description:
"Adjust the height of table rows. Compact is ideal for viewing more data at once, while Large provides better readability and touch targets.",
},
SCORE_DISPLAY: {
title: "Score Display",
description:
"Choose between normalized scores (0-100% scale for easy comparison) or raw scores (actual benchmark results). Normalized scores help compare performance across different benchmarks, while raw scores show actual benchmark outputs.",
},
RANKING_MODE: {
title: "Ranking Mode",
description:
"Choose between static ranking (original position in the full leaderboard) or dynamic ranking (position based on current filters and sorting).",
},
AVERAGE_SCORE: {
title: "Average Score Calculation",
description:
"Define how the average score is calculated. 'All Scores' uses all benchmarks, while 'Visible Only' calculates the average using only the visible benchmark columns.",
},
};
export const getTooltipStyle = {};
export const TABLE_TOOLTIPS = {
HUB_LINK: (modelName) => `View ${modelName} on Hugging Face Hub`,
EVAL_RESULTS: (modelName) =>
`View detailed evaluation results for ${modelName}`,
POSITION_CHANGE: (change) =>
`${Math.abs(change)} position${Math.abs(change) > 1 ? "s" : ""} ${
change > 0 ? "up" : "down"
}`,
METADATA: {
TYPE: (type) => type || "-",
ARCHITECTURE: (arch) => arch || "-",
PRECISION: (precision) => precision || "-",
LICENSE: (license) => license || "-",
UPLOAD_DATE: (date) => date || "-",
SUBMISSION_DATE: (date) => date || "-",
BASE_MODEL: (model) => model || "-",
},
};