import { Box, Typography } from "@mui/material";
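/**
 * Shared builder for every tooltip below: renders a short title followed by a
 * bulleted list of { label, description, subItems? } entries, with optional
 * subItems shown as a nested list.
 */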
const createTooltipContent = (title, items) => (
<Box sx={{ maxWidth: 400 }}>
<Typography variant="body2" paragraph sx={{ mb: 1, color: "inherit" }}>
{title}
</Typography>
<Box component="ul" sx={{ m: 0, pl: 2 }}>
{items.map(({ label, description, subItems }, index) => (
<li key={index}>
{/* component="div" keeps the markup valid: a nested <ul> may not sit inside the <p> that Typography renders by default */}
<Typography component="div" variant="body2" sx={{ mb: 0.5, color: "inherit" }}>
<b>{label}</b>: {description}
{subItems && (
<Box component="ul" sx={{ mt: 0.5, mb: 1 }}>
{subItems.map((item, subIndex) => (
<li key={subIndex}>
<Typography variant="body2" sx={{ color: "inherit" }}>
{item}
</Typography>
</li>
))}
</Box>
)}
</Typography>
</li>
))}
</Box>
</Box>
);
export const COLUMN_TOOLTIPS = {
AVERAGE: createTooltipContent("Average score across all benchmarks:", [
{
label: "Calculation",
description: "Weighted average of normalized scores from all benchmarks",
subItems: [
"Each benchmark is normalized to a 0-100 scale",
"All normalised benchmarks are then averaged together",
],
},
]),
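// Illustrative sketch only (an assumed formula, not the leaderboard's exact
// implementation): with raw scores in [0, 1] and a per-benchmark
// random-baseline `lowerBound`, normalization and averaging would look like:
//   const normalize = (raw, lowerBound) =>
//     Math.max(0, ((raw - lowerBound) / (1 - lowerBound)) * 100);
//   const average = (scores) =>
//     scores.reduce((sum, s) => sum + s, 0) / scores.length;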
IFEVAL: createTooltipContent("Instruction-Following Evaluation (IFEval):", [
{
label: "Purpose",
description:
"Tests model's ability to follow explicit formatting instructions",
subItems: ["Instruction following", "Formatting", "Generation"],
},
{
label: "Scoring: Accuracy",
description: "Was the format asked for strictly respected.",
},
]),
BBH: createTooltipContent("Big Bench Hard (BBH):", [
{
label: "Overview",
description: "Collection of challenging for LLM tasks across domains, for example",
subItems: [
"Language understanding",
"Mathematical reasoning",
"Common sense and world knowledge",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]),
MATH: createTooltipContent(
"Mathematics Aptitude Test of Heuristics (MATH), level 5:",
[
{
label: "Content",
description: "High school level competitions mathematical problems",
subItems: ["Complex algebra", "Geometry problems", "Advanced calculus"],
},
{
label: "Scoring: Exact match",
description:
"Was the solution generated correct and in the expected format",
},
]
),
GPQA: createTooltipContent("Graduate-Level Google-Proof Q&A (GPQA):", [
{
label: "Focus",
description: "PhD-level knowledge multiple choice questions in science",
subItems: [
"Chemistry",
"Biology",
"Physics",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]),
MUSR: createTooltipContent("Multistep Soft Reasoning (MuSR):", [
{
label: "Scope",
description: "Reasoning and understanding on/of long texts",
subItems: [
"Language understanding",
"Reasoning capabilities",
"Long context reasoning",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]),
MMLU_PRO: createTooltipContent(
"Massive Multitask Language Understanding - Professional (MMLU-Pro):",
[
{
label: "Coverage",
description: "Expertly reviewed multichoice questions across domains, for example:",
subItems: [
"Medicine and healthcare",
"Law and ethics",
"Engineering",
"Mathematics",
],
},
{
label: "Scoring: Accuracy",
description:
"Was the correct choice selected among the options.",
},
]
),
ARCHITECTURE: createTooltipContent("Model Architecture Information:", [
{
label: "Definition",
description: "The fundamental structure and design of the model",
subItems: [
"Pretrained: Foundational models, initially trained on large datasets without task-specific tuning, serving as a versatile base for further development.",
"Continuously Pretrained: Base models trained with a data mix evolving as the model is trained, with the addition of specialized data during the last training steps.",
"Fine-tuned: Base models, fine-tuned on specialised domain data (legal, medical, ...), and optimized for particular tasks.",
"Chat: Models fine-tuned with IFT, RLHF, DPO, and other techniques, to handle conversational contexts effectively.",
"Merged: Combining multiple models through weights averaging or similar methods.",
"Multimodal: Models which can handle several modalities (text & image/audio/video/...). We only evaluate the text capabilities.",
],
},
{
label: "Impact",
description: "How architecture affects model capabilities",
subItems: [
"Base models are expected to perform less well on instruction following evaluations, like IFEval.",
"Fine-tuned and chat models can be more verbose and more chatty than base models.",
"Merged models tend to exhibit good performance on benchmarks, which do not translate to real-world situations.",
],
},
]),
PRECISION: createTooltipContent("Numerical Precision Format:", [
{
label: "Overview",
description:
"Data format used to store model weights and perform computations",
subItems: [
"bfloat16: Half precision (Brain Float format), good for stability",
"float16: Half precision",
"8bit/4bit: Quantized formats, for efficiency",
"GPTQ/AWQ: Quantized methods",
],
},
{
label: "Impact",
description: "How precision affects model deployment",
subItems: [
"Higher precision = better accuracy but more memory usage",
"Lower precision = faster inference and smaller size",
"Trade-off between model quality and resource usage",
],
},
]),
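// Back-of-the-envelope math behind the trade-off above (weights only,
// ignoring activations and KV cache; the figures are assumptions for
// illustration): bytes per parameter ≈ 2 for bfloat16/float16, 1 for 8bit,
// 0.5 for 4bit, so a 7B-parameter model needs roughly
// 7e9 * 2 / 1024 ** 3 ≈ 13 GiB in half precision:
//   const BYTES_PER_PARAM = { bfloat16: 2, float16: 2, "8bit": 1, "4bit": 0.5 };
//   const weightMemoryGiB = (paramsBillions, precision) =>
//     (paramsBillions * 1e9 * BYTES_PER_PARAM[precision]) / 1024 ** 3;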
REASONING: createTooltipContent("Model Reasoning Capabilities:", [
{
label: "Reasoning Models",
description: "Models that use reasoning capabilities to think through problems step by step",
subItems: [
"Can break down complex problems into smaller steps",
"Often show their thinking process in responses",
"May take longer to respond but provide more thorough answers",
],
},
{
label: "Non-reasoning Models",
description: "Traditional models that generate responses directly",
subItems: [
"Generate responses without explicit step-by-step reasoning",
"Typically faster response times",
"May still be highly capable but use implicit reasoning",
],
},
]),
PARAMETERS: createTooltipContent("Model Parameters:", [
{
label: "Measurement",
description: "Total number of trainable parameters in billions",
subItems: [
"Indicates model capacity and complexity",
"Correlates with computational requirements",
"Influences memory usage and inference speed",
],
},
]),
LICENSE: createTooltipContent("Model License Information:", [
{
label: "Importance",
description: "Legal terms governing model usage and distribution",
subItems: [
"Commercial vs non-commercial use",
"Attribution requirements",
"Modification and redistribution rights",
"Liability and warranty terms",
],
},
]),
CO2_COST: createTooltipContent("Carbon Dioxide Emissions:", [
{
label: "What is it?",
description: "CO₂ emissions of the model evaluation ",
subItems: [
"Only focuses on model inference for our specific setup",
"Considers data center location and energy mix",
"Allows equivalent comparision of models on our use case",
],
},
{
label: "Why it matters",
description: "Environmental impact of AI model training",
subItems: [
"Large models can have significant carbon footprints",
"Helps make informed choices about model selection",
],
},
{
label: "Learn more",
description:
"For detailed information about our CO₂ calculation methodology, visit:",
subItems: [
<a
href="https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Carbon Emissions Documentation ↗
</a>,
],
},
]),
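// Rough sketch of the standard energy-to-CO₂ conversion behind such
// estimates (an assumption here; the linked documentation describes the
// actual methodology): emissions = energy drawn (kWh) × grid carbon
// intensity (kgCO₂e per kWh), where the intensity depends on the data
// center's location and energy mix:
//   const co2Kg = (energyKWh, gridIntensityKgPerKWh) =>
//     energyKWh * gridIntensityKgPerKWh;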
// Icelandic benchmarks
WINOGRANDE_IS: createTooltipContent("WinoGrande-IS (3-shot):", [
{
label: "Description",
description: "Icelandic version of the WinoGrande task for coreference resolution",
subItems: [
"Human-translated and localized ~1000 test set examples",
"Each example has a sentence with a blank and two answer choices",
"Tests knowledge and common sense reasoning in Icelandic",
"Evaluation: 3-shot, exact match",
],
},
{
label: "Dataset",
description: "IceBERT paper describes this dataset in detail",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-winogrande"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
GED: createTooltipContent("Grammatical Error Detection:", [
{
label: "Description",
description: "Binary sentence-level Icelandic grammatical error detection",
subItems: [
"Adapted from the Icelandic Error Corpus (IEC)",
"Contains 200 examples",
"Task: predict whether sentence contains grammatical errors",
"Evaluation: exact match",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-sentences-gec"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
INFLECTION: createTooltipContent("Inflection (1-shot):", [
{
label: "Description",
description: "Tests ability to generate inflected forms of Icelandic words",
subItems: [
"300 Icelandic adjective-noun pairs",
"All four cases, singular and plural",
"Evaluation: 1-shot, exact match",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-inflection-all-flat"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
BELEBELE_IS: createTooltipContent("Belebele (IS):", [
{
label: "Description",
description: "Icelandic subset of the Belebele reading comprehension benchmark",
subItems: [
"900 examples of multiple-choice reading comprehension",
"Task: answer questions about given passages",
"Evaluation: exact match",
],
},
{
label: "Dataset",
description: "Part of the multilingual Belebele benchmark",
subItems: [
<a
href="https://huggingface.co/datasets/facebook/belebele"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
ARC_CHALLENGE_IS: createTooltipContent("ARC-Challenge-IS:", [
{
label: "Description",
description: "Machine-translated version of ARC-Challenge for Icelandic",
subItems: [
"Multiple-choice question-answering dataset",
"Test set contains 1.23k examples",
"Evaluation: exact match",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic-arc-challenge"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
WIKIQA_IS: createTooltipContent("WikiQA-IS:", [
{
label: "Description",
description: "Icelandic Wikipedia question-answer pairs for cultural and historical knowledge",
subItems: [
"1.9k question-answer pairs from Icelandic Wikipedia",
"Tests knowledge of Icelandic culture and history",
"Generated by GPT-4o, manually verified and corrected",
"Evaluation: LLM judge scoring by GPT-4o (0=poor, 1=fair, 2=excellent)",
],
},
{
label: "Dataset",
description: "Available on Hugging Face",
subItems: [
<a
href="https://huggingface.co/datasets/mideind/icelandic_wiki_qa"
target="_blank"
rel="noopener noreferrer"
style={{ color: "#90caf9" }}
>
Dataset ↗
</a>,
],
},
]),
};
export const UI_TOOLTIPS = {
COLUMN_SELECTOR: "Choose which columns to display in the table",
DISPLAY_OPTIONS: createTooltipContent("Table Display Options", [
{
label: "Overview",
description: "Configure how the table displays data and information",
subItems: [
"Row size and layout",
"Score display format",
"Ranking calculation",
"Average score computation",
],
},
]),
SEARCH_BAR: createTooltipContent("Advanced Model Search", [
{
label: "Name Search",
description: "Search directly by model name",
subItems: [
"Supports regular expressions (e.g., ^mistral.*7b)",
"Case sensitive",
],
},
{
label: "Field Search",
description: "Use @field:value syntax for precise filtering",
subItems: [
"@architecture:llama - Filter by architecture",
"@license:mit - Filter by license",
"@precision:float16 - Filter by precision",
"@type:chat - Filter by model type",
],
},
{
label: "Multiple Searches",
description: "Combine multiple criteria using semicolons",
subItems: [
"meta @license:mit; @architecture:llama",
"^mistral.*7b; @precision:float16",
],
},
]),
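// Illustrative sketch only (an assumed shape, not the search component's
// actual implementation) of how the syntax above could be parsed: semicolons
// split independent searches, `@field:value` tokens become filters, and
// whatever text remains is treated as a name regex:
//   const parseSearch = (query) =>
//     query.split(";").map((part) => {
//       const filters = {};
//       const name = part
//         .replace(/@(\w+):(\S+)/g, (_, field, value) => {
//           filters[field] = value;
//           return "";
//         })
//         .trim();
//       return { name, filters };
//     });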
QUICK_FILTERS: createTooltipContent(
"Filter models based on their size and applicable hardware:",
[
{
label: "Edge devices (Up to 3BB)",
description:
"Efficient models for edge devices, optimized for blazing fast inference.",
},
{
label: "Smol Models (3B-7B)",
description:
"Efficient models for consumer hardware, optimized for fast inference.",
},
{
label: "Mid-range models (7B-65B)",
description:
"A bit of everything here, with overall balanced performance and resource usage around 30B.",
},
{
label: "GPU-rich models (65B+)",
description:
"State-of-the-art performance for complex tasks, requires significant computing power.",
},
{
label: "Official Providers",
description:
"Models directly maintained by their original creators, ensuring reliability and up-to-date performance.",
},
]
),
ROW_SIZE: {
title: "Row Size",
description:
"Adjust the height of table rows. Compact is ideal for viewing more data at once, while Large provides better readability and touch targets.",
},
SCORE_DISPLAY: {
title: "Score Display",
description:
"Choose between normalized scores (0-100% scale for easy comparison) or raw scores (actual benchmark results). Normalized scores help compare performance across different benchmarks, while raw scores show actual benchmark outputs.",
},
RANKING_MODE: {
title: "Ranking Mode",
description:
"Choose between static ranking (original position in the full leaderboard) or dynamic ranking (position based on current filters and sorting).",
},
AVERAGE_SCORE: {
title: "Average Score Calculation",
description:
"Define how the average score is calculated. 'All Scores' uses all benchmarks, while 'Visible Only' calculates the average using only the visible benchmark columns.",
},
};
export const getTooltipStyle = {};
export const TABLE_TOOLTIPS = {
HUB_LINK: (modelName) => `View ${modelName} on Hugging Face Hub`,
EVAL_RESULTS: (modelName) =>
`View detailed evaluation results for ${modelName}`,
POSITION_CHANGE: (change) =>
`${Math.abs(change)} position${Math.abs(change) > 1 ? "s" : ""} ${
change > 0 ? "up" : "down"
}`,
METADATA: {
TYPE: (type) => type || "-",
ARCHITECTURE: (arch) => arch || "-",
PRECISION: (precision) => precision || "-",
LICENSE: (license) => license || "-",
UPLOAD_DATE: (date) => date || "-",
SUBMISSION_DATE: (date) => date || "-",
BASE_MODEL: (model) => model || "-",
},
};