File size: 17,501 Bytes
1d31670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4653c51
1d31670
4653c51
 
1d31670
4653c51
 
 
1d31670
 
 
4653c51
 
1d31670
4653c51
 
 
1d31670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4653c51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d31670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
import { Box, Typography } from "@mui/material";

const createTooltipContent = (title, items) => (
  <Box sx={{ maxWidth: 400 }}>
    <Typography variant="body2" paragraph sx={{ mb: 1, color: "inherit" }}>
      {title}
    </Typography>
    <Box component="ul" sx={{ m: 0, pl: 2 }}>
      {items.map(({ label, description, subItems }, index) => (
        <li key={index}>
          <Typography variant="body2" sx={{ mb: 0.5, color: "inherit" }}>
            <b>{label}</b>: {description}
            {subItems && (
              <Box component="ul" sx={{ mt: 0.5, mb: 1 }}>
                {subItems.map((item, subIndex) => (
                  <li key={subIndex}>
                    <Typography variant="body2" sx={{ color: "inherit" }}>
                      {item}
                    </Typography>
                  </li>
                ))}
              </Box>
            )}
          </Typography>
        </li>
      ))}
    </Box>
  </Box>
);

export const COLUMN_TOOLTIPS = {
  AVERAGE: createTooltipContent("Average score across all benchmarks:", [
    {
      label: "Calculation",
      description: "Weighted average of normalized scores from all benchmarks",
      subItems: [
        "Each benchmark is normalized to a 0-100 scale",
        "All normalised benchmarks are then averaged together",
      ],
    },
  ]),

  IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
    {
      label: "Purpose",
      description:
        "Tests model's ability to follow explicit formatting instructions",
      subItems: ["Instruction following", "Formatting", "Generation"],
    },
    {
      label: "Scoring: Accuracy",
      description: "Was the format asked for strictly respected.",
    },
  ]),

  BBH: createTooltipContent("Big Bench Hard (BBH):", [
    {
      label: "Overview",
      description: "Collection of challenging for LLM tasks across domains, for example",
      subItems: [
        "Language understanding",
        "Mathematical reasoning",
        "Common sense and world knowledge",
      ],
    },
    {
      label: "Scoring: Accuracy",
      description:
        "Was the correct choice selected among the options.",
    },
  ]),

  MATH: createTooltipContent(
    "Mathematics Aptitude Test of Heuristics (MATH), level 5:",
    [
      {
        label: "Content",
        description: "High school level competitions mathematical problems",
        subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
      },
      {
        label: "Scoring: Exact match",
        description:
          "Was the solution generated correct and in the expected format",
      },
    ]
  ),

  GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
    {
      label: "Focus",
      description: "PhD-level knowledge multiple choice questions in science",
      subItems: [
        "Chemistry",
        "Biology",
        "Physics",
      ],
    },
    {
      label: "Scoring: Accuracy",
      description:
        "Was the correct choice selected among the options.",
    },
  ]),

  MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
    {
      label: "Scope",
      description: "Reasoning and understanding on/of long texts",
      subItems: [
        "Language understanding",
        "Reasoning capabilities",
        "Long context reasoning",
      ],
    },
    {
      label: "Scoring: Accuracy",
      description:
        "Was the correct choice selected among the options.",
    },
  ]),

  MMLU_PRO: createTooltipContent(
    "Massive Multitask Language Understanding - Professional (MMLU-Pro):",
    [
      {
        label: "Coverage",
        description: "Expertly reviewed multichoice questions across domains, for example:",
        subItems: [
          "Medicine and healthcare",
          "Law and ethics",
          "Engineering",
          "Mathematics",
        ],
      },
      {
        label: "Scoring: Accuracy",
        description:
          "Was the correct choice selected among the options.",
      },
    ]
  ),

  ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
    {
      label: "Definition",
      description: "The fundamental structure and design of the model",
      subItems: [
        "Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
        "Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
        "Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
        "Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
        "Merged: Combining multiple models through weights averaging or similar methods.",
        "Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
      ],
    },
    {
      label: "Impact",
      description: "How architecture affects model capabilities",
      subItems: [
        "Base models are expected to perform less well on instruction following evaluations, like IFEval.",
        "Fine-tuned and chat models can be more verbose and more chatty than base models.",
        "Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
      ],
    },
  ]),

  PRECISION: createTooltipContent("Numerical Precision Format:", [
    {
      label: "Overview",
      description:
        "Data format used to store model weights and perform computations",
      subItems: [
        "bfloat16: Half precision (Brain Float format), good for stability",
        "float16: Half precision",
        "8bit/4bit: Quantized formats, for efficiency",
        "GPTQ/AWQ: Quantized methods",
      ],
    },
    {
      label: "Impact",
      description: "How precision affects model deployment",
      subItems: [
        "Higher precision = better accuracy but more memory usage",
        "Lower precision = faster inference and smaller size",
        "Trade-off between model quality and resource usage",
      ],
    },
  ]),

  REASONING: createTooltipContent("Model Reasoning Capabilities:", [
    {
      label: "Reasoning Models",
      description: "Models that use reasoning capabilities to think through problems step by step",
      subItems: [
        "Can break down complex problems into smaller steps",
        "Often show their thinking process in responses",
        "May take longer to respond but provide more thorough answers",
      ],
    },
    {
      label: "Non-reasoning Models",
      description: "Traditional models that generate responses directly",
      subItems: [
        "Generate responses without explicit step-by-step reasoning",
        "Typically faster response times",
        "May still be highly capable but use implicit reasoning",
      ],
    },
  ]),

  PARAMETERS: createTooltipContent("Model Parameters:", [
    {
      label: "Measurement",
      description: "Total number of trainable parameters in billions",
      subItems: [
        "Indicates model capacity and complexity",
        "Correlates with computational requirements",
        "Influences memory usage and inference speed",
      ],
    },
  ]),

  LICENSE: createTooltipContent("Model License Information:", [
    {
      label: "Importance",
      description: "Legal terms governing model usage and distribution",
      subItems: [
        "Commercial vs non-commercial use",
        "Attribution requirements",
        "Modification and redistribution rights",
        "Liability and warranty terms",
      ],
    },
  ]),

  CO2_COST: createTooltipContent("Carbon Dioxide Emissions:", [
    {
      label: "What is it?",
      description: "CO₂ emissions of the model evaluation ",
      subItems: [
        "Only focuses on model inference for our specific setup",
        "Considers data center location and energy mix",
        "Allows equivalent comparision of models on our use case",
      ],
    },
    {
      label: "Why it matters",
      description: "Environmental impact of AI model training",
      subItems: [
        "Large models can have significant carbon footprints",
        "Helps make informed choices about model selection",
      ],
    },
    {
      label: "Learn more",
      description:
        "For detailed information about our CO₂ calculation methodology, visit:",
      subItems: [
        <a
          href="https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions"
          target="_blank"
          rel="noopener noreferrer"
          style={{ color: "#90caf9" }}
        >
          Carbon Emissions Documentation ↗
        </a>,
      ],
    },
  ]),

  // Icelandic benchmarks
  WINOGRANDE_IS: createTooltipContent("WinoGrande-IS (3-shot):", [
    {
      label: "Description",
      description: "Icelandic version of the WinoGrande task for coreference resolution",
      subItems: [
        "Human-translated and localized ~1000 test set examples",
        "Each example has a sentence with a blank and two answer choices",
        "Tests knowledge and common sense reasoning in Icelandic",
        "Evaluation: 3-shot, exact match",
      ],
    },
    {
      label: "Dataset",
      description: "IceBERT paper describes this dataset in detail",
      subItems: [
        <a
          href="https://huggingface.co/datasets/mideind/icelandic-winogrande"
          target="_blank"
          rel="noopener noreferrer"
          style={{ color: "#90caf9" }}
        >
          Dataset ↗
        </a>,
      ],
    },
  ]),

  GED: createTooltipContent("Grammatical Error Detection:", [
    {
      label: "Description",
      description: "Binary sentence-level Icelandic grammatical error detection",
      subItems: [
        "Adapted from the Icelandic Error Corpus (IEC)",
        "Contains 200 examples",
        "Task: predict whether sentence contains grammatical errors",
        "Evaluation: exact match",
      ],
    },
    {
      label: "Dataset",
      description: "Available on Hugging Face",
      subItems: [
        <a
          href="https://huggingface.co/datasets/mideind/icelandic-sentences-gec"
          target="_blank"
          rel="noopener noreferrer"
          style={{ color: "#90caf9" }}
        >
          Dataset ↗
        </a>,
      ],
    },
  ]),

  INFLECTION: createTooltipContent("Inflection (1-shot):", [
    {
      label: "Description",
      description: "Tests ability to generate inflected forms of Icelandic words",
      subItems: [
        "300 Icelandic adjective-noun pairs",
        "All four cases, singular and plural",
        "Evaluation: 1-shot, exact match",
      ],
    },
    {
      label: "Dataset",
      description: "Available on Hugging Face",
      subItems: [
        <a
          href="https://huggingface.co/datasets/mideind/icelandic-inflection-all-flat"
          target="_blank"
          rel="noopener noreferrer"
          style={{ color: "#90caf9" }}
        >
          Dataset ↗
        </a>,
      ],
    },
  ]),

  BELEBELE_IS: createTooltipContent("Belebele (IS):", [
    {
      label: "Description",
      description: "Icelandic subset of the Belebele reading comprehension benchmark",
      subItems: [
        "900 examples of multiple-choice reading comprehension",
        "Task: answer questions about given passages",
        "Evaluation: exact match",
      ],
    },
    {
      label: "Dataset",
      description: "Part of the multilingual Belebele benchmark",
      subItems: [
        <a
          href="https://huggingface.co/datasets/facebook/belebele"
          target="_blank"
          rel="noopener noreferrer"
          style={{ color: "#90caf9" }}
        >
          Dataset ↗
        </a>,
      ],
    },
  ]),

  ARC_CHALLENGE_IS: createTooltipContent("ARC-Challenge-IS:", [
    {
      label: "Description",
      description: "Machine-translated version of ARC-Challenge for Icelandic",
      subItems: [
        "Multiple-choice question-answering dataset",
        "Test set contains 1.23k examples",
        "Evaluation: exact match",
      ],
    },
    {
      label: "Dataset",
      description: "Available on Hugging Face",
      subItems: [
        <a
          href="https://huggingface.co/datasets/mideind/icelandic-arc-challenge"
          target="_blank"
          rel="noopener noreferrer"
          style={{ color: "#90caf9" }}
        >
          Dataset ↗
        </a>,
      ],
    },
  ]),

  WIKIQA_IS: createTooltipContent("WikiQA-IS:", [
    {
      label: "Description",
      description: "Icelandic Wikipedia question-answer pairs for cultural and historical knowledge",
      subItems: [
        "1.9k question-answer pairs from Icelandic Wikipedia",
        "Tests knowledge of Icelandic culture and history",
        "Generated by GPT-4o, manually verified and corrected",
        "Evaluation: LLM judge scoring by GPT-4o (0=poor, 1=fair, 2=excellent)",
      ],
    },
    {
      label: "Dataset",
      description: "Available on Hugging Face",
      subItems: [
        <a
          href="https://huggingface.co/datasets/mideind/icelandic_wiki_qa"
          target="_blank"
          rel="noopener noreferrer"
          style={{ color: "#90caf9" }}
        >
          Dataset ↗
        </a>,
      ],
    },
  ]),
};

/**
 * Tooltip content for the controls around the table. Values come in three
 * shapes:
 *  - a plain string (`COLUMN_SELECTOR`),
 *  - an element built by `createTooltipContent` (`DISPLAY_OPTIONS`,
 *    `SEARCH_BAR`, `QUICK_FILTERS`),
 *  - a `{ title, description }` pair of strings (`ROW_SIZE`,
 *    `SCORE_DISPLAY`, `RANKING_MODE`, `AVERAGE_SCORE`).
 */
export const UI_TOOLTIPS = {
  COLUMN_SELECTOR: "Choose which columns to display in the table",
  DISPLAY_OPTIONS: createTooltipContent("Table Display Options", [
    {
      label: "Overview",
      description: "Configure how the table displays data and information",
      subItems: [
        "Row size and layout",
        "Score display format",
        "Ranking calculation",
        "Average score computation",
      ],
    },
  ]),
  SEARCH_BAR: createTooltipContent("Advanced Model Search", [
    {
      label: "Name Search",
      description: "Search directly by model name",
      subItems: [
        "Supports regular expressions (e.g., ^mistral.*7b)",
        "Case sensitive",
      ],
    },
    {
      label: "Field Search",
      description: "Use @field:value syntax for precise filtering",
      subItems: [
        "@architecture:llama - Filter by architecture",
        "@license:mit - Filter by license",
        "@precision:float16 - Filter by precision",
        "@type:chat - Filter by model type",
      ],
    },
    {
      label: "Multiple Searches",
      description: "Combine multiple criteria using semicolons",
      subItems: [
        "meta @license:mit; @architecture:llama",
        "^mistral.*7b; @precision:float16",
      ],
    },
  ]),
  QUICK_FILTERS: createTooltipContent(
    "Filter models based on their size and applicable hardware:",
    [
      {
        // Size ranges below use the "<n>B" suffix; this one read "3BB".
        label: "Edge devices (Up to 3B)",
        description:
          "Efficient models for edge devices, optimized for blazing fast inference.",
      },
      {
        label: "Smol Models (3B-7B)",
        description:
          "Efficient models for consumer hardware, optimized for fast inference.",
      },
      {
        label: "Mid-range models (7B-65B)",
        description:
          "A bit of everything here, with overall balanced performance and resource usage around 30B.",
      },
      {
        label: "GPU-rich models (65B+)",
        description:
          "State-of-the-art performance for complex tasks, requires significant computing power.",
      },
      {
        label: "Official Providers",
        description:
          "Models directly maintained by their original creators, ensuring reliability and up-to-date performance.",
      },
    ]
  ),
  ROW_SIZE: {
    title: "Row Size",
    description:
      "Adjust the height of table rows. Compact is ideal for viewing more data at once, while Large provides better readability and touch targets.",
  },
  SCORE_DISPLAY: {
    title: "Score Display",
    description:
      "Choose between normalized scores (0-100% scale for easy comparison) or raw scores (actual benchmark results). Normalized scores help compare performance across different benchmarks, while raw scores show actual benchmark outputs.",
  },
  RANKING_MODE: {
    title: "Ranking Mode",
    description:
      "Choose between static ranking (original position in the full leaderboard) or dynamic ranking (position based on current filters and sorting).",
  },
  AVERAGE_SCORE: {
    title: "Average Score Calculation",
    description:
      "Define how the average score is calculated. 'All Scores' uses all benchmarks, while 'Visible Only' calculates the average using only the visible benchmark columns.",
  },
};

// Exported as an empty object literal. Despite the `get` prefix this is a
// plain object, not a function, so it must not be called.
// NOTE(review): looks like a leftover placeholder — confirm whether any
// consumer still imports it before renaming or removing.
export const getTooltipStyle = {};

/**
 * Shared fallback for metadata tooltips.
 *
 * @param {*} value - Metadata value to display.
 * @returns {*} `value` when truthy, otherwise the placeholder "-".
 */
const formatMetadataValue = (value) => value || "-";

/**
 * Formatter functions producing tooltip strings for table cells.
 *
 * - `HUB_LINK(modelName)` / `EVAL_RESULTS(modelName)`: link captions.
 * - `POSITION_CHANGE(change)`: describes a rank movement; positive is "up",
 *   negative is "down", and zero yields "No position change".
 * - `METADATA.*(value)`: echo the value, or "-" when it is falsy.
 */
export const TABLE_TOOLTIPS = {
  HUB_LINK: (modelName) => `View ${modelName} on Hugging Face Hub`,
  EVAL_RESULTS: (modelName) =>
    `View detailed evaluation results for ${modelName}`,
  POSITION_CHANGE: (change) => {
    const magnitude = Math.abs(change);
    // Zero is neither up nor down; previously this read "0 position down".
    if (magnitude === 0) {
      return "No position change";
    }
    const noun = magnitude > 1 ? "positions" : "position";
    const direction = change > 0 ? "up" : "down";
    return `${magnitude} ${noun} ${direction}`;
  },
  METADATA: {
    TYPE: formatMetadataValue,
    ARCHITECTURE: formatMetadataValue,
    PRECISION: formatMetadataValue,
    LICENSE: formatMetadataValue,
    UPLOAD_DATE: formatMetadataValue,
    SUBMISSION_DATE: formatMetadataValue,
    BASE_MODEL: formatMetadataValue,
  },
};