/**
*
* Copyright 2023-2025 InspectorRAGet Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/
'use client';
import { groupBy, isEmpty } from 'lodash';
import cx from 'classnames';
import { useState, useEffect, useMemo } from 'react';
import {
DataTable,
TableContainer,
Table,
TableHead,
TableRow,
TableHeader,
TableBody,
TableCell,
} from '@carbon/react';
import { WarningAlt } from '@carbon/icons-react';
import {
SimpleBarChart,
RadarChart,
GroupedBarChart,
} from '@carbon/charts-react';
import { Alignments, ScaleTypes } from '@carbon/charts';
import { useTheme } from '@/src/theme';
import {
AggregationConfidenceLevels,
AggregationStatistics,
Aggregator,
Metric,
Model,
TaskEvaluation,
} from '@/src/types';
import {
extractMetricDisplayName,
castToNumber,
} from '@/src/utilities/metrics';
import {
meanAggregator,
medianAggregator,
majorityAggregator,
} from '@/src/utilities/aggregators';
import { areObjectsIntersecting } from '@/src/utilities/objects';
import { getModelColorPalette } from '@/src/utilities/colors';
import AggregatorSelector from '@/src/components/selectors/AggregatorSelector';
import Filters from '@/src/components/filters/Filters';
import HidePanel from '@/src/views/performance-overview/Hide';
import '@carbon/charts-react/styles.css';
import classes from './PerformanceOverview.module.scss';
// ===================================================================================
// TYPES
// ===================================================================================
interface Props {
evaluationsPerMetric: { [key: string]: TaskEvaluation[] };
models: Model[];
metrics: Metric[];
filters: { [key: string]: string[] };
numTasks: number;
}
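// Illustrative usage (a minimal sketch; the surrounding page is expected to
// supply real data, and the prop values below are hypothetical):
//
//   <PerformanceOverview
//     evaluationsPerMetric={{ faithfulness: faithfulnessEvaluations }}
//     models={models}
//     metrics={metrics}
//     filters={{ category: ['rag'] }}
//     numTasks={100}
//   />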
// ===================================================================================
// COMPUTE FUNCTIONS
// ===================================================================================
function calculateRanks(
data: {
model: string;
metric: string;
score: number;
rank: number;
std?: number;
order?: 'ascending' | 'descending';
levels: { low: number; medium: number; high: number };
}[],
) {
const performancePerMetric: {
[key: string]: {
model: string;
score: number;
rank: number;
std?: number;
}[];
} = {};
const order: { [key: string]: 'ascending' | 'descending' } = {};
for (const entry of data) {
if (performancePerMetric.hasOwnProperty(entry.metric)) {
performancePerMetric[entry.metric].push(entry);
} else {
performancePerMetric[entry.metric] = [entry];
}
if (!order.hasOwnProperty(entry.metric)) {
order[entry.metric] = entry.order ?? 'ascending';
}
}
for (const [metric, performance] of Object.entries(performancePerMetric)) {
// For 'ascending' metrics a higher score is better, so the best score sorts first
performance.sort((a, b) => {
if (order[metric] === 'ascending') {
return a.score > b.score ? -1 : 1;
} else {
return a.score > b.score ? 1 : -1;
}
});
let rank = 0;
performance.forEach((entry, idx) => {
if (idx !== 0 && entry.score === performance[idx - 1].score) {
entry['rank'] = rank;
} else {
entry['rank'] = rank + 1;
rank += 1;
}
});
}
}
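// Worked example: with order 'ascending' (higher is better) and scores
// [0.9, 0.9, 0.7], the three entries receive dense ranks [1, 1, 2]: tied
// scores share a rank and the next distinct score takes the following one.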
// ===================================================================================
// RENDER FUNCTIONS
// ===================================================================================
function sparkline(
distribution: { [key: string]: number } | undefined,
theme?: string,
) {
if (distribution === undefined) {
return null;
} else {
return (
<SimpleBarChart
data={Object.entries(distribution).map(([value, count]) => {
return { group: value, value: count };
})}
options={{
color: {
scale: { low: '#fa4d56', medium: '#f1c21b', high: '#42be65' },
},
axes: {
left: {
mapsTo: 'value',
visible: false,
scaleType: ScaleTypes.LINEAR,
},
bottom: {
mapsTo: 'group',
visible: false,
scaleType: ScaleTypes.LABELS,
},
},
grid: {
y: {
enabled: false,
},
x: {
enabled: false,
},
},
legend: {
enabled: false,
},
toolbar: {
enabled: false,
},
theme: theme,
height: '24px',
width: '48px',
}}
></SimpleBarChart>
);
}
}
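// For example, sparkline({ low: 2, medium: 5, high: 13 }, theme) renders
// three tiny bars (red, yellow, green) whose heights encode how many tasks
// fall into each confidence bucket.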
function drawTable(
data: {
model: string;
metric: string;
score: number;
rank: number;
std?: number;
levels: { low: number; medium: number; high: number };
}[],
metrics: string[],
plot: boolean = false,
theme?: string,
) {
// Step 1: Define headers
const headers = [
{ key: 'model', header: 'Model' },
...metrics.map((metric) => {
return { key: metric, header: metric };
}),
{ key: 'rank', header: 'Rank' },
];
// Step 2: Group data per model
const dataPerModel: {
[key: string]: {
model: string;
metric: string;
score: number;
rank: number;
std?: number;
}[];
} = groupBy(data, (entry) => entry.model);
// Step 3: Compute overall rank
const overallRank: [string, number][] = Object.entries(dataPerModel).map(
([model, entry]) => [model, entry.reduce((n, { rank }) => n + rank, 0)],
);
// Step 4: Sort based on overall rank, if necessary
if (overallRank.length > 1) {
overallRank.sort((a, b) => {
return a[1] - b[1];
});
}
// Step 5: Define distribution map, keyed to match Carbon's `rowId:headerKey` cell ids
const distributions = new Map(
data.map((entry) => [`${entry.model}:${entry.metric}`, entry.levels]),
);
// Step 6: Define rows
const rows: { [key: string]: string }[] = [];
overallRank.forEach(([model, sum], index) => {
rows.push({
id: model,
model: model,
...Object.fromEntries(
dataPerModel[model].map((record) => [
record.metric,
record.std
? `${record.score} ± ${record.std} (${record.rank})`
: `${record.score} (${record.rank})`,
]),
),
rank: `${sum.toLocaleString()} (${(index + 1).toLocaleString()})`,
});
});
// Step 7: Draw table
return (
<DataTable rows={rows} headers={headers}>
{({ rows, headers, getTableProps, getHeaderProps, getRowProps }) => (
<TableContainer>
<Table {...getTableProps()}>
<TableHead>
<TableRow>
{headers.map((header, index) => (
<TableHeader key={`header-${index}`} {...getHeaderProps({ header })}>
{header.key === 'rank' ? (
<strong>Σ {header.header}</strong>
) : (
header.header
)}
</TableHeader>
))}
</TableRow>
</TableHead>
<TableBody>
{rows.map((row, index) => (
<TableRow key={`row-${index}`} {...getRowProps({ row })}>
{row.cells.map((cell) => (
<TableCell key={cell.id}>
{cell.value ? (
cell.value.includes('(1)') ? (
<strong>{cell.value}</strong>
) : (
cell.value
)
) : (
'-'
)}
{plot && metrics.includes(cell.info.header)
? sparkline(distributions.get(cell.id), theme)
: null}
</TableCell>
))}
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
)}
</DataTable>
);
}
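// A rendered cell reads like "0.82 ± 0.05 (1)" (score ± std, per-metric rank)
// and the rank column reads like "4 (1)" (rank sum, overall position); the
// values here are hypothetical. Cells containing "(1)" are bolded as the best.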
function disclaimers({
std = false,
showSparkline = false,
theme,
}: {
std?: boolean;
showSparkline?: boolean;
theme?: string;
}) {
// NOTE: markup and the `disclaimers` class name are reconstructed
return (
<div className={classes.disclaimers}>
<p>
* <i>(rank)</i> indicates the model&apos;s comparative position w.r.t.
other models for a given metric
</p>
{std && (
<p>
* <i>value ± std</i> shows averages of aggregate values and standard
deviation across all tasks
</p>
)}
{showSparkline && (
<p>
* {sparkline({ low: 5, medium: 10, high: 15 }, theme)} reflects the
confidence level on the aggregate values:{' '}
<span style={{ color: '#fa4d56' }}>■</span> # of tasks where the
minority rating is far from the majority rating,{' '}
<span style={{ color: '#f1c21b' }}>■</span> # of tasks where the
minority rating is similar to the majority rating, and{' '}
<span style={{ color: '#42be65' }}>■</span> # of tasks where all
annotators chose the same rating
</p>
)}
</div>
);
}
// ===================================================================================
// MAIN FUNCTION
// ===================================================================================
export default function PerformanceOverview({
evaluationsPerMetric,
models,
metrics,
filters,
numTasks,
}: Props) {
// Step 1: Initialize state and necessary variables
const [WindowWidth, setWindowWidth] = useState<number>(
global?.window ? window.innerWidth : 0,
);
const [WindowHeight, setWindowHeight] = useState<number>(
global?.window ? window.innerHeight : 0,
);
const aggregators: Aggregator[] = [
meanAggregator,
medianAggregator,
majorityAggregator,
];
const [selectedAggregators, setSelectedAggregators] = useState<{
[key: string]: Aggregator;
}>(
Object.fromEntries(
metrics
.filter((metric) => metric.author === 'human')
.map((metric) => [
metric.name,
metric.aggregator === 'majority'
? majorityAggregator
: metric.aggregator === 'median'
? medianAggregator
: meanAggregator,
]),
),
);
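// e.g. a human-authored metric declared with `aggregator: 'median'` starts
// with the median aggregator; unrecognized or missing values fall back to mean.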
const [selectedFilters, setSelectedFilters] = useState<{
[key: string]: string[];
}>({});
const [modelColors, modelOrder] = getModelColorPalette(models);
const [hiddenModels, setHiddenModels] = useState<Model[]>([]);
const [hiddenMetrics, setHiddenMetrics] = useState<Metric[]>([]);
// Step 2: Run effects
// Step 2.a: Adjust graph width & height based on window size
useEffect(() => {
const handleWindowResize = () => {
setWindowWidth(window.innerWidth);
setWindowHeight(window.innerHeight);
};
// Step: Add event listener
window.addEventListener('resize', handleWindowResize);
// Step: Cleanup to remove event listener
return () => {
window.removeEventListener('resize', handleWindowResize);
};
}, []);
// Step 2.b: Fetch theme
const { theme } = useTheme();
// Step 2.c: Generate performance data for human and algorithmic metrics
const [humanMetricsData, algorithmicMetricsData, numSelectedTasks] =
useMemo(() => {
// Eligible metrics
const eligibleMetrics = Object.fromEntries(
metrics.map((metric) => [metric.name, metric]),
);
let hData: {
model: string;
metric: string;
score: number;
std: number;
rank: number;
size: number;
levels: { low: number; medium: number; high: number };
order?: 'ascending' | 'descending';
}[] = [];
let aData: {
model: string;
metric: string;
score: number;
std?: number;
rank: number;
size: number;
levels: { low: number; medium: number; high: number };
order?: 'ascending' | 'descending';
}[] = [];
const performancePerModel: {
[key: string]: {
[key: string]: {
value: number;
std: number;
levels: { low: number; medium: number; high: number };
};
};
} = {};
const eligibleEvaluationsPerModel: {
[key: string]: { [key: string]: number };
} = {};
// Step 1: Calculate model performance across entire dataset
let selectedTasksCount = 0;
for (const [metric, evaluations] of Object.entries(
evaluationsPerMetric,
)) {
const aggregator = selectedAggregators[metric] || meanAggregator;
// Select evaluations based on selected filters
const selectedEvaluations = !isEmpty(selectedFilters)
? evaluations.filter((e) => {
return areObjectsIntersecting(selectedFilters, e);
})
: evaluations;
// Calculate selected tasks count
selectedTasksCount = selectedEvaluations.length / models.length;
selectedEvaluations.forEach((evaluation) => {
// Step 1.a: Calculate aggregated value
const aggregateStatistics: AggregationStatistics = aggregator.apply(
Object.values(evaluation.annotations[`${metric}`]).map(
(entry) => entry.value,
),
eligibleMetrics[metric].values,
);
// Step 1.b: Skip evaluation if no majority agreement exists
if (
aggregator.name === 'majority' &&
aggregateStatistics.value === 'Indeterminate'
) {
return;
}
// Step 1.c: Cast to numeric value for further processing
const aggregateValue = castToNumber(
aggregateStatistics.value,
eligibleMetrics[metric].values,
);
// Step 1.d: Translate model id to model name
const modelName =
models.find((model) => model.modelId === evaluation.modelId)
?.name || evaluation.modelId;
// Step 1.e: Update performance per model object
if (performancePerModel.hasOwnProperty(modelName)) {
if (performancePerModel[modelName].hasOwnProperty(metric)) {
performancePerModel[modelName][metric].value += aggregateValue;
performancePerModel[modelName][metric].std +=
aggregateStatistics.std;
if (
aggregateStatistics.confidence ===
AggregationConfidenceLevels.LOW
) {
performancePerModel[modelName][metric].levels.low += 1;
}
if (
aggregateStatistics.confidence ===
AggregationConfidenceLevels.MEDIUM
) {
performancePerModel[modelName][metric].levels.medium += 1;
}
if (
aggregateStatistics.confidence ===
AggregationConfidenceLevels.HIGH
) {
performancePerModel[modelName][metric].levels.high += 1;
}
} else {
performancePerModel[modelName][metric] = {
value: aggregateValue,
std: aggregateStatistics.std,
levels: {
low:
aggregateStatistics.confidence ===
AggregationConfidenceLevels.LOW
? 1
: 0,
medium:
aggregateStatistics.confidence ===
AggregationConfidenceLevels.MEDIUM
? 1
: 0,
high:
aggregateStatistics.confidence ===
AggregationConfidenceLevels.HIGH
? 1
: 0,
},
};
}
} else {
performancePerModel[modelName] = {
[metric]: {
value: aggregateValue,
std: aggregateStatistics.std,
levels: {
low:
aggregateStatistics.confidence ===
AggregationConfidenceLevels.LOW
? 1
: 0,
medium:
aggregateStatistics.confidence ===
AggregationConfidenceLevels.MEDIUM
? 1
: 0,
high:
aggregateStatistics.confidence ===
AggregationConfidenceLevels.HIGH
? 1
: 0,
},
},
};
}
// Step 1.f: Update eligible evaluations per model object
if (eligibleEvaluationsPerModel.hasOwnProperty(modelName)) {
if (eligibleEvaluationsPerModel[modelName].hasOwnProperty(metric)) {
eligibleEvaluationsPerModel[modelName][metric] += 1;
} else {
eligibleEvaluationsPerModel[modelName][metric] = 1;
}
} else {
eligibleEvaluationsPerModel[modelName] = {
[metric]: 1,
};
}
});
}
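// At this point `performancePerModel[model][metric].value` holds the SUM of
// per-task aggregate scores; Step 2 divides by the task count to get a mean,
// e.g. task scores [1, 0, 1] over 3 tasks yield 2 / 3 ≈ 0.67.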
// Step 2: Add raw performance data
for (const [model, performance] of Object.entries(performancePerModel)) {
for (const [metric, statistics] of Object.entries(performance)) {
if (eligibleMetrics.hasOwnProperty(metric)) {
if (eligibleMetrics[metric].author === 'human') {
hData.push({
model: model,
metric: extractMetricDisplayName(eligibleMetrics[metric]),
score: parseFloat(
(statistics.value / selectedTasksCount).toFixed(2),
),
rank: -1,
size: eligibleEvaluationsPerModel[model][metric],
std: parseFloat(
(statistics.std / selectedTasksCount).toFixed(2),
),
levels: statistics.levels,
...(eligibleMetrics[metric].order && {
order: eligibleMetrics[metric].order,
}),
});
} else if (eligibleMetrics[metric].author === 'algorithm') {
aData.push({
model: model,
metric: extractMetricDisplayName(eligibleMetrics[metric]),
score: parseFloat(
(statistics.value / selectedTasksCount).toFixed(2),
),
rank: -1,
size: eligibleEvaluationsPerModel[model][metric],
std: parseFloat(
(statistics.std / selectedTasksCount).toFixed(2),
),
levels: statistics.levels,
...(eligibleMetrics[metric].order && {
order: eligibleMetrics[metric].order,
}),
});
}
}
}
}
// Step 3: Filter hidden metrics data
const hiddenMetricNames = hiddenMetrics.map((metric) =>
extractMetricDisplayName(metric),
);
// Step 3.a: Human metrics
if (Array.isArray(hData)) {
hData = hData.filter(
(entry) => !hiddenMetricNames.includes(entry.metric),
);
}
// Step 3.b: Algorithmic metrics
if (Array.isArray(aData)) {
aData = aData.filter(
(entry) => !hiddenMetricNames.includes(entry.metric),
);
}
// Step 4: Filter hidden models data
const hiddenModelNames = hiddenModels.map((model) =>
model.name ? model.name : model.modelId,
);
// Step 4.a: Human metrics
if (Array.isArray(hData)) {
hData = hData.filter(
(entry) => !hiddenModelNames.includes(entry.model),
);
}
// Step 4.b: Algorithmic metrics
if (Array.isArray(aData)) {
aData = aData.filter(
(entry) => !hiddenModelNames.includes(entry.model),
);
}
// Step 5: Add rank information
// Step 5.a: Human metrics
if (Array.isArray(hData)) {
calculateRanks(hData);
}
// Step 5.b: Algorithmic metrics
if (Array.isArray(aData)) {
calculateRanks(aData);
}
return [hData, aData, selectedTasksCount];
}, [
evaluationsPerMetric,
metrics,
models,
selectedAggregators,
selectedFilters,
hiddenModels,
hiddenMetrics,
]);
const humanMetricsInData = new Set(
humanMetricsData.map((entry) => entry.metric),
);
const algorithmicMetricsInData = new Set(
algorithmicMetricsData.map((entry) => entry.metric),
);
// Step 3: Render
return (
<div className={classes.page}>
{/* NOTE: container markup, class names, and the prop names of the local
AggregatorSelector, HidePanel, and Filters components are reconstructed */}
<div className={classes.selectors}>
{Object.entries(selectedAggregators).map(([metricName, aggregator]) => {
const metric = metrics.find((entry) => entry.name === metricName);
return (
<div key={`${metricName}-aggregator`} className={classes.selector}>
<span>
{metric
? extractMetricDisplayName(metric)
: metricName.charAt(0).toUpperCase() +
metricName.slice(1).toLowerCase()}
</span>
<AggregatorSelector
aggregators={aggregators}
selected={aggregator}
onChange={(selection: Aggregator) => {
setSelectedAggregators({
...selectedAggregators,
[metricName]: selection,
});
}}
warn={aggregator.name === 'majority'}
warnText={
aggregator.name === 'majority'
? 'Caution: Denominator might vary for categorical metrics.'
: 'You must select an aggregator to view results.'
}
></AggregatorSelector>
</div>
);
})}
<HidePanel
models={models}
metrics={metrics}
hiddenModels={hiddenModels}
setHiddenModels={setHiddenModels}
hiddenMetrics={hiddenMetrics}
setHiddenMetrics={setHiddenMetrics}
></HidePanel>
</div>
{!isEmpty(filters) ? (
<Filters
filters={filters}
selectedFilters={selectedFilters}
setSelectedFilters={setSelectedFilters}
></Filters>
) : null}
{humanMetricsInData.size ? (
<div className={classes.section}>
<h4>Human Evaluations ({numSelectedTasks}/{numTasks})</h4>
{drawTable(
humanMetricsData,
Array.from(humanMetricsInData),
true,
theme,
)}
{disclaimers({ std: true, showSparkline: true, theme: theme })}
{humanMetricsInData.size < 3 ? (
<GroupedBarChart
data={[...humanMetricsData]
.sort((a, b) => (a.model > b.model ? -1 : 1))
.map((entry) => {
return {
group: entry.model,
key: entry.metric,
value: entry.score,
};
})}
options={{
axes: {
left: {
mapsTo: 'value',
},
bottom: {
mapsTo: 'key',
scaleType: ScaleTypes.LABELS,
},
},
width: `${Math.round(WindowWidth * 0.45)}px`,
height: `${Math.round(WindowHeight * 0.5)}px`,
toolbar: {
enabled: false,
},
color: {
scale: modelColors,
},
legend: {
order: modelOrder,
},
theme: theme,
}}
></GroupedBarChart>
) : (
<RadarChart
data={humanMetricsData.map((entry) => {
// Step 1: Find metric under consideration
const metric = metrics.find(
(m) => m.displayName === entry.metric,
);
// Step 2: Calculate normalized score
let normalizedScore = entry.score;
if (
metric?.minValue !== undefined &&
metric.maxValue !== undefined
) {
// Step 2.a: Fetch minimum value
const minValue =
typeof metric.minValue === 'number'
? metric.minValue
: castToNumber(
metric.minValue?.value,
metric.values,
);
// Step 2.b: Fetch maximum value
const maxValue =
typeof metric.maxValue === 'number'
? metric.maxValue
: castToNumber(
metric.maxValue?.value,
metric.values,
);
// Step 2.c: Calculate min-max normalized score
normalizedScore =
Math.round(
((entry.score - minValue) / (maxValue - minValue)) *
100,
) / 100;
}
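// e.g. a score of 3 on a 1-5 scale normalizes to (3 - 1) / (5 - 1) = 0.5,
// so metrics with different ranges share the radar chart's 0-to-1 axis.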
// Step 3: Return
return {
model: entry.model,
metric: entry.metric,
score: normalizedScore,
};
})}
options={{
radar: {
alignment: Alignments.CENTER,
axes: {
angle: 'metric',
value: 'score',
},
},
data: {
groupMapsTo: 'model',
},
color: {
scale: modelColors,
},
legend: {
alignment: Alignments.CENTER,
order: modelOrder,
},
width: `${Math.round(WindowWidth * 0.45)}px`,
height: `${Math.round(WindowHeight * 0.5)}px`,
toolbar: {
enabled: false,
},
theme: theme,
}}
></RadarChart>
)}
</div>
) : null}
{humanMetricsInData.size && algorithmicMetricsInData.size ? (
<div className={classes.divider}></div>
) : null}
{algorithmicMetricsInData.size ? (
<div className={classes.section}>
<h4>Algorithmic Evaluations ({numSelectedTasks}/{numTasks})</h4>
{drawTable(
algorithmicMetricsData,
Array.from(algorithmicMetricsInData),
)}
{disclaimers({})}
{algorithmicMetricsInData.size < 3 ? (
<GroupedBarChart
data={[...algorithmicMetricsData]
.sort((a, b) => (a.model > b.model ? -1 : 1))
.map((entry) => {
// Step 1: Find metric under consideration
const metric = metrics.find(
(m) => m.displayName === entry.metric,
);
// Step 2: Calculate normalized score
let normalizedScore = entry.score;
if (
metric?.minValue !== undefined &&
metric.maxValue !== undefined
) {
// Step 2.a: Fetch minimum value
const minValue =
typeof metric.minValue === 'number'
? metric.minValue
: castToNumber(
metric.minValue?.value,
metric.values,
);
// Step 2.b: Fetch maximum value
const maxValue =
typeof metric.maxValue === 'number'
? metric.maxValue
: castToNumber(
metric.maxValue?.value,
metric.values,
);
// Step 2.c: Calculate min-max normalized score
normalizedScore =
Math.round(
((entry.score - minValue) /
(maxValue - minValue)) *
100,
) / 100;
}
// Step 3: Return
return {
group: entry.model,
key: entry.metric,
value: normalizedScore,
};
})}
options={{
axes: {
left: {
mapsTo: 'value',
},
bottom: {
mapsTo: 'key',
scaleType: ScaleTypes.LABELS,
},
},
width: `${Math.round(WindowWidth * 0.45)}px`,
height: `${Math.round(WindowHeight * 0.5)}px`,
toolbar: {
enabled: false,
},
color: {
scale: modelColors,
},
legend: {
order: modelOrder,
},
theme: theme,
}}
></GroupedBarChart>
) : (
<RadarChart
data={algorithmicMetricsData.map((entry) => {
const metric = metrics.find(
(m) => m.displayName === entry.metric,
);
return {
model: entry.model,
metric: entry.metric,
score:
metric && metric.maxValue
? Math.round(
(entry.score /
(typeof metric.maxValue === 'number'
? metric.maxValue
: castToNumber(
metric.maxValue?.value,
metric.values,
))) *
100,
) / 100
: entry.score,
};
})}
options={{
radar: {
alignment: Alignments.CENTER,
axes: {
angle: 'metric',
value: 'score',
},
},
data: {
groupMapsTo: 'model',
},
color: {
scale: modelColors,
},
legend: {
alignment: Alignments.CENTER,
order: modelOrder,
},
width: `${Math.round(WindowWidth * 0.45)}px`,
height: `${Math.round(WindowHeight * 0.5)}px`,
toolbar: {
enabled: false,
},
theme: theme,
}}
></RadarChart>
)}
</div>
) : null}
{humanMetricsInData.size === 0 &&
algorithmicMetricsInData.size === 0 ? (
<div className={classes.warning}>
<WarningAlt size={24} />
<span>
{`No matching evaluations found. ${!isEmpty(selectedFilters) ? 'Please try again by removing one or more filters.' : ''}`}
</span>
</div>
) : null}
</div>
);
}