// tfrere's picture
// add hallucination to how to submit | fix show arena only behaviour
// de3d81e
// raw
// history blame
// 24.2 kB
import React from "react";
import {
Box,
Typography,
Paper,
Stack,
Divider,
alpha,
Link,
Grid,
InputLabel,
Tooltip,
IconButton,
} from "@mui/material";
import InfoOutlinedIcon from "@mui/icons-material/InfoOutlined";
import PageHeader from "../../components/PageHeader/PageHeader";
const StepNumber = ({ number }) => (
<Box
sx={{
width: 32,
height: 32,
borderRadius: "50%",
display: "flex",
alignItems: "center",
justifyContent: "center",
border: "1px solid",
borderColor: "primary.main",
color: "primary.main",
fontSize: "0.875rem",
fontWeight: 600,
flexShrink: 0,
bgcolor: "transparent",
}}
>
{number}
</Box>
);
const Section = ({ title, children }) => (
<Paper
elevation={0}
sx={{
border: "1px solid",
borderColor: "divider",
borderRadius: 1,
overflow: "hidden",
mb: 3,
}}
>
<Box
sx={{
px: 3,
py: 2,
borderBottom: "1px solid",
borderColor: "divider",
bgcolor: (theme) =>
theme.palette.mode === "dark"
? alpha(theme.palette.background.paper, 0.5)
: "grey.50",
}}
>
<Typography variant="h6" sx={{ fontWeight: 600, color: "text.primary" }}>
{title}
</Typography>
</Box>
<Box sx={{ p: 3, bgcolor: "background.paper" }}>{children}</Box>
</Paper>
);
const Tag = ({ children }) => (
<Box
component="span"
sx={{
display: "inline-block",
px: 1.5,
py: 0.5,
bgcolor: (theme) => alpha(theme.palette.primary.main, 0.1),
color: "primary.main",
borderRadius: 1,
fontSize: "0.875rem",
fontWeight: 600,
mr: 1,
mb: 1,
}}
>
{children}
</Box>
);
const TagCard = ({ title, description, tags, explanations }) => (
<Paper
elevation={1}
sx={{
p: 3,
height: "100%",
display: "flex",
flexDirection: "column",
borderRadius: 2,
border: "1px solid",
borderColor: "grey.200",
}}
>
<Typography variant="h6" sx={{ fontWeight: 600, mb: 2 }}>
{title}
</Typography>
{description && (
<Typography variant="body2" sx={{ mb: 2, color: "text.secondary" }}>
{description}
</Typography>
)}
<Box sx={{ flex: 1 }}>
{tags.map((tag, index) => (
<Box key={index} sx={{ mb: 2 }}>
<Tag>{tag}</Tag>
{explanations && explanations[index] && (
<Typography
variant="body2"
sx={{
color: "text.secondary",
mt: 1,
display: "block",
}}
dangerouslySetInnerHTML={{ __html: explanations[index] }}
/>
)}
</Box>
))}
</Box>
</Paper>
);
const CodeBlock = ({ children }) => (
<Box
sx={{
backgroundColor: (theme) =>
alpha(
theme.palette.primary.main,
theme.palette.mode === "dark" ? 0.15 : 0.05
),
px: 2,
py: 4,
borderRadius: 1,
fontFamily: "monospace",
mb: 2,
position: "relative",
"& .key": {
color: (theme) => theme.palette.primary.main,
},
"& .value": {
color: (theme) =>
theme.palette.mode === "dark"
? theme.palette.success.light
: theme.palette.success.dark,
},
"& .comment": {
color: (theme) => theme.palette.text.secondary,
},
"& .punctuation": {
color: (theme) => theme.palette.text.primary,
},
}}
>
<InputLabel
sx={{
position: "absolute",
right: 8,
top: 8,
fontSize: "0.75rem",
color: "text.secondary",
fontFamily: "monospace",
bgcolor: "background.paper",
px: 1,
py: 0.5,
borderRadius: 1,
border: "1px solid",
borderColor: "divider",
zIndex: 1,
}}
>
README.md
</InputLabel>
{children}
</Box>
);
// Emoji returned for any tag that has no dedicated entry.
const DEFAULT_TAG_EMOJI = "🏷️";

// Emoji lookup, keyed first by tag prefix ("judge") and then by tag name
// ("model"). Built once at module load instead of on every call.
// Keys must match the tag text after the colon exactly; the camelCase keys
// are kept for backward compatibility next to the spellings the page uses.
const TAG_EMOJIS = {
  submission: {
    automatic: "🤖",
    semiautomatic: "🔄",
    manual: "👨‍💻",
    closed: "🔒",
  },
  test: {
    public: "👀",
    mix: "🔀",
    private: "🔐",
    rolling: "🎲",
  },
  judge: {
    function: "⚙️",
    model: "🧠",
    humans: "👥",
    vibeCheck: "✨",
    "vibe check": "✨",
  },
  modality: {
    text: "📝",
    image: "🖼️",
    audio: "🎵",
    video: "🎥",
    tools: "🛠️",
    agent: "🛠️",
    artefacts: "🏺",
    embeddings: "🔤",
    "3d": "🧊",
  },
  eval: {
    generation: "✨",
    math: "🔢",
    code: "💻",
    reasoning: "🧠",
    performance: "⚡",
    safety: "🛡️",
    hallucination: "🌫️",
    rag: "🔍",
  },
  task: {
    rag: "🔍",
  },
  language: {
    english: "🇬🇧",
    french: "🇫🇷",
    yourOwnLanguage: "🌍",
    "your own language": "🌍",
  },
  domain: {
    financial: "💰",
    medical: "⚕️",
    legal: "⚖️",
    biology: "🧬",
    translation: "🔄",
    chemistry: "🧪",
    physics: "⚛️",
    commercial: "🏢",
  },
};

/**
 * Returns the emoji associated with a `prefix:name` tag.
 *
 * @param {string} tag - Tag such as "judge:model". Only the first two
 *   colon-separated parts are considered.
 * @returns {string} The matching emoji, or the generic label emoji when the
 *   tag is not a string, has no name part, or is not in the lookup table.
 */
const getTagEmoji = (tag) => {
  if (typeof tag !== "string") {
    return DEFAULT_TAG_EMOJI;
  }

  const [type, name] = tag.split(":");

  // Own-property checks keep inherited members (e.g. "constructor") from
  // being mistaken for table entries.
  const hasOwn = (object, key) =>
    Object.prototype.hasOwnProperty.call(object, key);

  if (!hasOwn(TAG_EMOJIS, type)) {
    return DEFAULT_TAG_EMOJI;
  }

  const group = TAG_EMOJIS[type];
  return hasOwn(group, name) ? group[name] : DEFAULT_TAG_EMOJI;
};
const TagItem = ({ tag, explanation }) => {
// Extract the name without prefix
const name = tag.split(":")[1];
const emoji = getTagEmoji(tag);
return (
<Paper
elevation={0}
sx={{
height: "100%",
display: "flex",
flexDirection: "column",
borderRadius: 2,
border: "1px solid",
borderColor: "divider",
overflow: "hidden",
}}
>
<Box
sx={{
bgcolor: (theme) =>
alpha(
theme.palette.primary.main,
theme.palette.mode === "dark" ? 0.15 : 0.05
),
py: 2,
px: 2,
borderRadius: 0,
mb: 2,
position: "relative",
}}
>
<Typography
variant="h6"
sx={{
fontWeight: 700,
color: "text.primary",
letterSpacing: "-0.02em",
pr: 5,
textTransform: "capitalize",
}}
>
{emoji} &nbsp;&nbsp; {name}
</Typography>
</Box>
<Box sx={{ px: 2, pb: 2 }}>
<Typography
variant="body2"
sx={{
color: "text.secondary",
mb: 2,
fontSize: "0.75rem",
}}
>
<strong>{tag.split(":")[0]}</strong>:{tag.split(":")[1]}
</Typography>
{explanation && (
<Typography
variant="body2"
sx={{
color: "text.secondary",
flex: 1,
}}
dangerouslySetInnerHTML={{ __html: explanation }}
/>
)}
</Box>
</Paper>
);
};
const TagSection = ({ title, description, tags, explanations }) => {
// Determine if this section should have 4 columns
const shouldHaveFourColumns = [
"Submission type",
"Test set status",
"Judges",
"Domain",
].includes(title);
return (
<Box sx={{ mb: 8 }}>
<Typography variant="h6" sx={{ fontWeight: 600, mb: 1 }}>
{title}
</Typography>
{description && (
<Typography variant="body1" sx={{ mb: 4, color: "text.secondary" }}>
{description}
</Typography>
)}
<Grid container spacing={2}>
{tags.map((tag, index) => (
<Grid
item
xs={12}
sm={6}
md={shouldHaveFourColumns ? 3 : 4}
key={index}
>
<TagItem
tag={tag}
explanation={explanations ? explanations[index] : null}
/>
</Grid>
))}
</Grid>
</Box>
);
};
const HowToSubmitPage = () => {
return (
<Box sx={{ width: "100%", maxWidth: 1200, margin: "0 auto", padding: 4 }}>
<PageHeader
title="How to submit ?"
subtitle={
<>
Join the <span style={{ fontWeight: 600 }}>community</span> of{" "}
<span style={{ fontWeight: 600 }}>"leaderboards on the Hub"</span>
</>
}
/>
<Section title="Configuration steps">
<Box
sx={{
display: "flex",
gap: 4,
flexDirection: { xs: "column", md: "column", lg: "row" },
}}
>
<Stack spacing={4} sx={{ flex: { xs: "1 1 auto", md: "0 0 45%" } }}>
<Stack spacing={3}>
<Stack direction="row" spacing={2} alignItems="center">
<StepNumber number={1} />
<Typography
variant="subtitle1"
sx={{
fontWeight: 600,
color: "text.primary",
letterSpacing: "-0.01em",
}}
>
Create a Space
</Typography>
</Stack>
<Box sx={{ pl: 7 }}>
<Typography variant="body2" color="text.secondary">
Your leaderboard must be hosted on a{" "}
<Link
href="https://huggingface.co/docs/hub/spaces"
target="_blank"
rel="noopener noreferrer"
>
Hugging Face Space
</Link>
.
</Typography>
</Box>
</Stack>
<Stack spacing={3}>
<Stack direction="row" spacing={2} alignItems="center">
<StepNumber number={2} />
<Typography
variant="subtitle1"
sx={{
fontWeight: 600,
color: "text.primary",
letterSpacing: "-0.01em",
}}
>
Add metadata
</Typography>
</Stack>
<Box sx={{ pl: 7 }}>
<Typography
variant="body2"
color="text.secondary"
sx={{ mb: 2 }}
>
Like{" "}
<Link
href="https://huggingface.co/docs/hub/model-cards"
target="_blank"
rel="noopener noreferrer"
>
model cards
</Link>
, your Space's{" "}
<InputLabel
sx={{
display: "inline-flex",
fontSize: "0.75rem",
color: "text.secondary",
fontFamily: "monospace",
bgcolor: "background.paper",
px: 1,
py: 0.5,
borderRadius: 1,
border: "1px solid",
borderColor: "divider",
mx: 0.5,
}}
>
README.md
</InputLabel>{" "}
file should include specific <strong>metadata</strong> in a
YAML section at the top:
</Typography>
<ul
style={{
margin: 0,
paddingLeft: "20px",
color: "text.secondary",
}}
>
<li>
<Typography
variant="body2"
color="text.secondary"
sx={{ display: "flex", alignItems: "center", gap: 0.5 }}
>
Add either the <strong>leaderboard</strong> or{" "}
<strong>arena</strong> tag
<Tooltip
title={
<Box sx={{ p: 1, maxWidth: 300 }}>
<Typography
variant="subtitle2"
sx={{
mb: 1,
fontWeight: 600,
color: "text.secondary",
}}
>
Choose between:
</Typography>
<Typography
variant="body2"
component="div"
sx={{ mb: 1 }}
>
<strong>arena</strong> - for human evaluations
<br />
<Box component="span" sx={{ pl: 2 }}>
requires <Tag>judge:humans</Tag>
</Box>
</Typography>
<Typography variant="body2" component="div">
<strong>leaderboard</strong> - for automated
evaluations
<br />
<Box component="span" sx={{ pl: 2 }}>
with <Tag>judge:function</Tag> or{" "}
<Tag>judge:model</Tag>
</Box>
</Typography>
</Box>
}
arrow
placement="right"
componentsProps={{
tooltip: {
sx: {
bgcolor: "background.paper",
color: "text.primary",
"& .MuiTooltip-arrow": {
color: "background.paper",
},
boxShadow: (theme) => theme.shadows[2],
},
},
}}
>
<IconButton
size="small"
sx={{
p: 0.5,
color: "text.secondary",
"&:hover": {
color: "primary.main",
bgcolor: (theme) =>
alpha(theme.palette.primary.main, 0.1),
},
}}
>
<InfoOutlinedIcon sx={{ fontSize: "1rem" }} />
</IconButton>
</Tooltip>
</Typography>
</li>
<li>
<Typography variant="body2" color="text.secondary">
Include a <strong>short_description</strong> field to
explain the purpose of your evaluation
</Typography>
</li>
<li>
<Typography variant="body2" color="text.secondary">
Add <strong>metadata tags</strong> to categorize your
evaluation (see examples on the right)
</Typography>
</li>
</ul>
</Box>
</Stack>
</Stack>
<Box sx={{ flex: 1 }}>
<CodeBlock>
---
<br />
<span className="key">short_description</span>
<span className="punctuation">:</span>{" "}
<span className="value">
Evaluating LLMs on math reasoning tasks
</span>
<br />
<span className="key">tags</span>
<span className="punctuation">:</span>
<br />
<span className="punctuation">&nbsp;&nbsp;-</span>{" "}
<span className="value">leaderboard</span>
<span className="comment">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;#
Type of leaderboard
</span>
<br />
<span className="punctuation">&nbsp;&nbsp;-</span>{" "}
<span className="value">submission:automatic</span>{" "}
<span className="comment"># How models are submitted</span>
<br />
<span className="punctuation">&nbsp;&nbsp;-</span>{" "}
<span className="value">test:public</span>{" "}
<span className="comment">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;# Test set
visibility
</span>
<br />
<span className="punctuation">&nbsp;&nbsp;-</span>{" "}
<span className="value">judge:function</span>{" "}
<span className="comment">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;# Evaluation method
</span>
<br />
<span className="punctuation">&nbsp;&nbsp;-</span>{" "}
<span className="value">modality:text</span>{" "}
<span className="comment">
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;# Input/output type
</span>
<br />
<span className="punctuation">&nbsp;&nbsp;-</span>{" "}
<span className="value">language:english</span>{" "}
<span className="comment">
&nbsp;&nbsp;&nbsp;&nbsp;# Language coverage
</span>
<br />
<span className="punctuation">&nbsp;&nbsp;-</span>{" "}
<span className="value">domain:financial</span>{" "}
<span className="comment">
&nbsp;&nbsp;&nbsp;&nbsp;# Specific domain
</span>
<br />
---
</CodeBlock>
</Box>
</Box>
</Section>
<Section title="What do the tags mean?">
<TagSection
title="Domain"
description="Indicates the specific domain of the leaderboard"
tags={[
"domain:medical",
"domain:chemistry",
"domain:physics",
"domain:biology",
"domain:financial",
"domain:legal",
"domain:commercial",
"domain:translation",
]}
/>
<TagSection
title="Modalities"
description="Can be any (or several) of the following list"
tags={[
"modality:text",
"modality:image",
"modality:audio",
"modality:video",
"modality:agent",
"modality:artefacts",
"modality:3d",
]}
explanations={[
"",
"",
"",
"",
"requires added <strong>tool usage</strong> - mostly for <strong>assistant models</strong> (a bit outside of usual modalities)",
"the leaderboard concerns itself with <strong>machine learning artefacts</strong> as themselves, for example, quality evaluation of <strong>text embeddings</strong>",
"",
]}
/>
<TagSection
title="Evaluation categories"
description="Can be any (or several) of the following list"
tags={[
"eval:generation",
"eval:math",
"eval:code",
"eval:reasoning",
"eval:performance",
"eval:safety",
"eval:hallucination",
"eval:rag",
]}
explanations={[
"the evaluation looks at <strong>generation capabilities</strong> specifically (can be image generation, text generation, ...)",
"the evaluation tests <strong>math abilities</strong>",
"the evaluation tests <strong>coding capabilities</strong>",
"the evaluation tests <strong>reasoning abilities</strong>",
"model <strong>performance</strong> (speed, energy consumption, ...)",
"the evaluation considers <strong>safety</strong>, <strong>toxicity</strong>, <strong>bias</strong>",
"the evaluation measures the model's tendency to <strong>hallucinate</strong> or generate <strong>false information</strong>",
"the evaluation tests <strong>RAG</strong> (Retrieval-Augmented Generation) capabilities",
]}
/>
<TagSection
title="Language"
description="You can indicate the languages covered by your benchmark like so: language:mylanguage."
tags={[
"language:english",
"language:french",
"language:your own language",
]}
explanations={[
"",
"",
"At the moment, we do not support language codes, please use the language name in English.",
]}
/>
<TagSection
title="Submission type"
description="Arenas are not concerned by this category."
tags={[
"submission:automatic",
"submission:semiautomatic",
"submission:manual",
"submission:closed",
]}
explanations={[
"users can submit their models as such to the leaderboard, and evaluation is run <strong>automatically</strong> without human intervention",
"the leaderboard requires the <strong>model owner</strong> to run evaluations on his side and submit the results",
"the leaderboard requires the <strong>leaderboard owner</strong> to run evaluations for new submissions",
"the leaderboard <strong>does not accept</strong> submissions at the moment",
]}
/>
<TagSection
title="Test set status"
description="Arenas are not concerned by this category."
tags={["test:public", "test:mix", "test:private", "test:rolling"]}
explanations={[
"all the test sets used are <strong>public</strong>, the evaluations are completely <strong>reproducible</strong>",
"some test sets are <strong>public</strong> and some <strong>private</strong>",
"all the test sets used are <strong>private</strong>, the evaluations are hard to game",
"the test sets used <strong>change regularly</strong> through time and evaluation scores are refreshed",
]}
/>
<TagSection
title="Judges"
tags={[
"judge:function",
"judge:model",
"judge:humans",
"judge:vibe check",
]}
explanations={[
"evaluations are run <strong>automatically</strong>, using an evaluation suite such as <strong>lm_eval</strong> or <strong>lighteval</strong>",
"evaluations are run using a <strong>model as a judge</strong> approach to rate answer",
"evaluations are <strong>done by humans</strong> to rate answer - <strong>this is an arena</strong>",
"evaluations are <strong>done manually</strong> by one or several humans",
]}
/>
<Typography
variant="body2"
sx={{
mt: 3,
color: "text.secondary",
fontSize: "0.875rem",
fontStyle: "italic",
}}
>
If you would like to see a tag that is not currently represented,
please contact{" "}
<Link
href="https://huggingface.co/clementine"
target="_blank"
rel="noopener noreferrer"
sx={{
color: "primary.main",
textDecoration: "none",
"&:hover": {
textDecoration: "underline",
},
}}
>
Clémentine Fourrier
</Link>{" "}
on Hugging Face.
</Typography>
</Section>
</Box>
);
};
export default HowToSubmitPage;