/**
 *
 * Copyright 2023-2025 InspectorRAGet Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **/

'use client';

import { countBy, isEmpty } from 'lodash';
import cx from 'classnames';
import { useState, useMemo, useEffect, useRef } from 'react';
import { Tile, Button, Slider } from '@carbon/react';
import { WarningAlt } from '@carbon/icons-react';
import { ScatterChart } from '@carbon/charts-react';

import { useTheme } from '@/src/theme';
import { Model, Metric, TaskEvaluation } from '@/src/types';
import {
  castToNumber,
  AgreementLevels,
  extractMetricDisplayName,
} from '@/src/utilities/metrics';
import { calculateFisherRandomization } from '@/src/utilities/significance';
import { areObjectsIntersecting } from '@/src/utilities/objects';
import { hash } from '@/src/utilities/strings';
import Filters from '@/src/components/filters/Filters';
import TasksTable from '@/src/views/tasks-table/TasksTable';
import ModelSelector from '@/src/components/selectors/ModelSelector';
import MetricSelector from '@/src/components/selectors/MetricSelector';
import { getModelColorPalette } from '@/src/utilities/colors';

import '@carbon/charts-react/styles.css';
import classes from './ModelComparator.module.scss';

// ===================================================================================
// TYPES
// ===================================================================================
type StatisticalInformation = {
  p: number;
  distributionA: number[];
  meanA: number;
  distributionB: number[];
  meanB: number;
  taskIds?: string[];
};

interface Props {
  evaluationsPerMetric: { [key: string]: TaskEvaluation[] };
  models: Model[];
  metrics: Metric[];
  filters: { [key: string]: string[] };
  onTaskSelection: Function;
}

// ===================================================================================
// COMPUTE FUNCTIONS
// ===================================================================================
/**
 * Build an array containing evaluations only for selected models for each task.
 *
 * Eligibility criteria:
 * 1. Must have evaluations for both selected models
 * 2. Each evaluation must have an agreement value for the selected metric
 *
 * @param evaluations evaluations for all tasks
 * @param modelA selected model
 * @param modelB selected model
 * @param metric selected metric
 * @returns
 */
function extractEvaluationsPerTask(
  evaluations: TaskEvaluation[],
  modelA: Model,
  modelB: Model,
  metric: string,
  selectedFilters: { [key: string]: string[] },
  selectedMetricRange?: number[],
) {
  // Step 1: Initialize necessary variables
  const modelEvaluationsPerTask: { [key: string]: TaskEvaluation[] } = {};

  // Step 2: Add to model evaluations for a task, if evaluation meets eligibility criteria
  evaluations.forEach((evaluation) => {
    if (
      (evaluation.modelId === modelA.modelId ||
        evaluation.modelId === modelB.modelId) &&
      evaluation[`${metric}_agg`].level !== AgreementLevels.NO_AGREEMENT &&
      (!isEmpty(selectedFilters)
        ? areObjectsIntersecting(selectedFilters, evaluation)
        : true)
    ) {
      const modelEvaluationsForTask =
        modelEvaluationsPerTask[evaluation.taskId];
      if (modelEvaluationsForTask) {
        modelEvaluationsForTask.push(evaluation);
      } else {
        modelEvaluationsPerTask[evaluation.taskId] = [evaluation];
      }
    }
  });

  // Step 3: Retain only those tasks which have evaluations for both models
  // and one or more models have an aggregate value in the selected range
  return Object.values(modelEvaluationsPerTask).filter(
    (entry) =>
      entry.length == 2 &&
      (selectedMetricRange
        ? (entry[0][`${metric}_agg`].value >= selectedMetricRange[0] &&
            entry[0][`${metric}_agg`].value <= selectedMetricRange[1]) ||
          (entry[1][`${metric}_agg`].value >= selectedMetricRange[0] &&
            entry[1][`${metric}_agg`].value <= selectedMetricRange[1])
        : true),
  );
}

/**
 * Run statistical significance test based on Fisher randomization method.
 * @param evaluationsPerMetric evaluations per metric
 * @param metrics metrics
 * @param modelA selected model
 * @param modelB selected model
 * @param selectedMetric If `undefined`, run for all metrics in `evaluationsPerMetric` object
 * @returns
 */
function runStatisticalSignificanceTest(
  evaluationsPerMetric: { [key: string]: TaskEvaluation[] },
  metrics: Metric[],
  modelA: Model,
  modelB: Model,
  selectedMetric: Metric | undefined,
  selectedFilters: { [key: string]: string[] },
  selectedMetricRange?: number[],
) {
  // Step 1: Initialize necessary variables
  const evaluationsPerMetricPerTask: { [key: string]: TaskEvaluation[][] } =
    {};

  // Step 2: Retain evaluations for tasks where both models have agreement value
  if (selectedMetric) {
    const evaluationsPerTask = extractEvaluationsPerTask(
      evaluationsPerMetric[selectedMetric.name],
      modelA,
      modelB,
      selectedMetric.name,
      selectedFilters,
      selectedMetricRange,
    );
    if (evaluationsPerTask.length !== 0) {
      evaluationsPerMetricPerTask[selectedMetric.name] = evaluationsPerTask;
    }
  } else {
    Object.keys(evaluationsPerMetric).forEach((metric) => {
      const evaluationsPerTask = extractEvaluationsPerTask(
        evaluationsPerMetric[metric],
        modelA,
        modelB,
        metric,
        selectedFilters,
        selectedMetricRange,
      );
      if (evaluationsPerTask.length !== 0) {
        evaluationsPerMetricPerTask[metric] = evaluationsPerTask;
      }
    });
  }

  // Step 3: Compute model value distribution for every metric
  const distributionA: { [key: string]: number[] } = {};
  const distributionB: { [key: string]: number[] } = {};
  const taskIds: { [key: string]: string[] } = {};
  Object.keys(evaluationsPerMetricPerTask).forEach((metric) => {
    const metricValues = metrics.find((entry) => entry.name === metric)?.values;
    taskIds[metric] = evaluationsPerMetricPerTask[metric].map(
      (entry) => entry[0].taskId,
    );
    distributionA[metric] = evaluationsPerMetricPerTask[metric].map((entry) =>
      castToNumber(
        entry[0].modelId === modelA.modelId
          ? entry[0][`${metric}_agg`].value
          : entry[1][`${metric}_agg`].value,
        metricValues,
      ),
    );
    distributionB[metric] = evaluationsPerMetricPerTask[metric].map((entry) =>
      castToNumber(
        entry[1].modelId === modelB.modelId
          ? entry[1][`${metric}_agg`].value
          : entry[0][`${metric}_agg`].value,
        metricValues,
      ),
    );
  });

  // Step 4: Compute p-value and means for every metric by comparing distributions
  const information: { [key: string]: StatisticalInformation } = {};
  Object.keys(evaluationsPerMetricPerTask).forEach((metric) => {
    const [p, meanA, meanB] = calculateFisherRandomization(
      distributionA[metric],
      distributionB[metric],
    );
    information[metric] = {
      p: p,
      distributionA: distributionA[metric],
      meanA: meanA,
      distributionB: distributionB[metric],
      meanB: meanB,
      taskIds: taskIds[metric],
    };
  });

  return information;
}

// ===================================================================================
// RENDER FUNCTIONS
// ===================================================================================
function prepareScatterPlotData(
  modelA: string,
  distributionA: number[],
  modelB: string,
  distributionB: number[],
  taskIds?: string[],
) {
  // Step 1: Distributions must be of equal length
  if (distributionA.length !== distributionB.length) {
    return [];
  }

  // Step 2: Collate model-wise predictions per task
  const distributions: { values: number[]; taskId: string }[] = [];
  distributionA.forEach((valueA, index) => {
    distributions.push({
      taskId: taskIds ? taskIds[index] : `${index}`,
      values: [valueA, distributionB[index]],
    });
  });

  // Step 3: Primary sort based on model A's value
  distributions.sort((a, b) => a.values[0] - b.values[0]);

  // Step 4: Secondary sort based on model B's value
  distributions.sort((a, b) => a.values[1] - b.values[1]);

  // Step 5: Prepare chart data
  const chartData: { [key: string]: string | number }[] = [];
  distributions.forEach((entry, idx) => {
    // Model A record
    chartData.push({
      group: modelA,
      key: idx,
      value: entry.values[0],
      ...(taskIds && { taskId: entry.taskId }),
    });

    // Model B record
    chartData.push({
      group: modelB,
      key: idx,
      value: entry.values[1],
      ...(taskIds && { taskId: entry.taskId }),
    });
  });

  return chartData;
}

function renderResult(
  statisticalInformationPerMetric: { [key: string]: StatisticalInformation },
  metric: Metric,
  modelA: Model,
  modelB: Model,
  numEvaluations: number,
  modelColors: { [key: string]: string },
  modelOrder: string[],
  theme?: string,
) {
  if (statisticalInformationPerMetric.hasOwnProperty(metric.name)) {
    return (
{extractMetricDisplayName(metric)}
p-value {statisticalInformationPerMetric[metric.name]['p'].toFixed(4)} {statisticalInformationPerMetric[metric.name]['p'] <= 0.05 ? 'Significant' : 'Not significant'}
    );
  } else {
    return null;
  }
}

// ===================================================================================
// MAIN FUNCTION
// ===================================================================================
export default function ModelComparator({
  evaluationsPerMetric,
  models,
  metrics,
  filters,
  onTaskSelection,
}: Props) {
  // Step 1: Initialize state and necessary variables
  const [WindowWidth, setWindowWidth] = useState(
    global?.window && window.innerWidth,
  );
  const [modelA, setModelA] = useState(models[0]);
  const [modelB, setModelB] = useState(models[1]);
  const [selectedMetric, setSelectedMetric] = useState<Metric | undefined>(
    undefined,
  );
  const [selectedFilters, setSelectedFilters] = useState<{
    [key: string]: string[];
  }>({});
  const [statisticalInformationPerMetric, setStatisticalInformationPerMetric] =
    useState<{ [key: string]: StatisticalInformation } | undefined>(undefined);
  const [modelColors, modelOrder] = getModelColorPalette(models);
  const [selectedMetricRange, setSelectedMetricRange] = useState<
    number[] | undefined
  >();
  const chartRef = useRef(null);

  // Step 2: Run effects
  // Step 2.a: Window resizing
  useEffect(() => {
    const handleWindowResize = () => {
      setWindowWidth(window.innerWidth);
    };

    // Step: Add event listener
    window.addEventListener('resize', handleWindowResize);

    // Step: Cleanup to remove event listener
    return () => {
      window.removeEventListener('resize', handleWindowResize);
    };
  }, []);

  // Step 2.b: Fetch theme
  const { theme } = useTheme();

  // Step 2.c: Bucket human and algorithmic metrics
  const [humanMetrics, algorithmMetrics] = useMemo(() => {
    const hMetrics: Metric[] = [];
    const aMetrics: Metric[] = [];
    Object.values(metrics).forEach((metric) => {
      if (metric.author === 'human') {
        hMetrics.push(metric);
      } else if (metric.author === 'algorithm') {
        aMetrics.push(metric);
      }
    });

    return [hMetrics, aMetrics];
  }, [metrics]);

  // Step 2.d: Reset selected metric range, only applicable for numerical metrics
  useEffect(() => {
    if (
      selectedMetric &&
      selectedMetric.type === 'numerical' &&
      selectedMetric.range
    ) {
      setSelectedMetricRange([
        selectedMetric.range[0],
        selectedMetric.range[1],
      ]);
    } else setSelectedMetricRange(undefined);
  }, [selectedMetric]);

  // Step 2.e: Identify visible evaluations
  const filteredEvaluations = useMemo(() => {
    if (selectedMetric) {
      // Step 1: Identify evaluations for selected models
      const evaluationsForSelectedModels = evaluationsPerMetric[
        selectedMetric.name
      ].filter(
        (evaluation) =>
          (evaluation.modelId === modelA.modelId ||
            evaluation.modelId === modelB.modelId) &&
          (!isEmpty(selectedFilters)
            ? areObjectsIntersecting(selectedFilters, evaluation)
            : true),
      );

      // Step 2: Collate evaluation per task id
      const evaluationsPerTask: { [key: string]: { [key: string]: number } } =
        {};
      evaluationsForSelectedModels.forEach((evaluation) => {
        const entry = evaluationsPerTask[evaluation.taskId];
        if (entry) {
          entry[evaluation.modelId] =
            evaluation[`${selectedMetric.name}_agg`].value;
        } else {
          evaluationsPerTask[evaluation.taskId] = {
            [evaluation.modelId]:
              evaluation[`${selectedMetric.name}_agg`].value,
          };
        }
      });

      // Step 3: Only select evaluation tasks where models' aggregate values differ
      // and one or more models have an aggregate value in the selected range
      const visibleEvaluationTaskIds = Object.keys(evaluationsPerTask).filter(
        (taskId) =>
          Object.keys(countBy(Object.values(evaluationsPerTask[taskId])))
            .length > 1 &&
          (selectedMetricRange
            ? (Object.values(evaluationsPerTask[taskId])[0] >=
                selectedMetricRange[0] &&
                Object.values(evaluationsPerTask[taskId])[0] <=
                  selectedMetricRange[1]) ||
              (Object.values(evaluationsPerTask[taskId])[1] >=
                selectedMetricRange[0] &&
                Object.values(evaluationsPerTask[taskId])[1] <=
                  selectedMetricRange[1])
            : true),
      );

      // Step 4: Return evaluations for selected evaluation tasks where models' aggregate values differ
      return evaluationsForSelectedModels.filter((evaluation) =>
        visibleEvaluationTaskIds.includes(evaluation.taskId),
      );
    }

    return [];
  }, [
    evaluationsPerMetric,
    selectedMetric,
    modelA,
    modelB,
    selectedMetricRange,
  ]);

  // Step 2.f: Reset statistical information, if either of the models or the filters change
  useEffect(() => {
    setStatisticalInformationPerMetric(undefined);
  }, [modelA, modelB, selectedFilters]);

  // Step 2.g: Recalculate statistical information, if metric changes
  useEffect(() => {
    if (
      !selectedMetric &&
      statisticalInformationPerMetric &&
      Object.keys(statisticalInformationPerMetric).length == 1
    ) {
      setStatisticalInformationPerMetric(
        runStatisticalSignificanceTest(
          evaluationsPerMetric,
          metrics,
          modelA,
          modelB,
          selectedMetric,
          selectedFilters,
          selectedMetricRange,
        ),
      );
    } else if (
      selectedMetric &&
      selectedMetricRange &&
      statisticalInformationPerMetric &&
      statisticalInformationPerMetric.hasOwnProperty(selectedMetric.name)
    ) {
      setStatisticalInformationPerMetric(
        runStatisticalSignificanceTest(
          evaluationsPerMetric,
          metrics,
          modelA,
          modelB,
          selectedMetric,
          selectedFilters,
          selectedMetricRange,
        ),
      );
    }
  }, [selectedMetric, selectedMetricRange]);

  // Step 2.h: Compute computation complexity
  const complexity = useMemo(() => {
    let size = 0;
    if (selectedMetric) {
      size = evaluationsPerMetric[selectedMetric.name].length / models.length;
    } else {
      size = Object.values(evaluationsPerMetric)
        .map((evaluations) => evaluations.length / models.length)
        .reduce((a, b) => a + b, 0);
    }

    if (size > 1000) {
      return 'high';
    }
    return 'low';
  }, [evaluationsPerMetric, selectedMetric]);

  // Step 2.i: Add chart event
  useEffect(() => {
    // Step 2.i.*: Local copy of reference
    let ref = null;

    // Step 2.i.**: Update reference and add event
    if (chartRef && chartRef.current) {
      ref = chartRef.current;

      //@ts-ignore
      ref.chart.services.events.addEventListener(
        'scatter-click',
        ({ detail }) => {
          onTaskSelection(detail.datum.taskId);
        },
      );
    }

    // Step 2.i.***: Cleanup function
    return () => {
      if (ref) {
        //@ts-ignore
        ref.chart.services.events.removeEventListener(
          'scatter-click',
          ({ detail }) => {
            onTaskSelection(detail.datum.taskId);
          },
        );
      }
    };
  }, [chartRef, selectedMetric, statisticalInformationPerMetric]);

  // Step 3: Render
  return (
{ const selectedModel = models.find( (model) => model.modelId === modelId, ); if (selectedModel) { setModelA(selectedModel); } }} disabledModels={[modelB]} />
{ const selectedModel = models.find( (model) => model.modelId === modelId, ); if (selectedModel) { setModelB(selectedModel); } }} disabledModels={[modelA]} />
{ setSelectedMetric(metric); }} warn={!selectedMetric} warnText={'You must select a single metric to view tasks. '} />
{selectedMetric && selectedMetric.type === 'numerical' && selectedMetric.range ? (
{ setSelectedMetricRange((prev) => [ value, valueUpper ? valueUpper : prev ? prev[1] : selectedMetric.range ? selectedMetric.range[2] : 100, ]); }} />
) : null}
{!isEmpty(filters) ? ( ) : null} {statisticalInformationPerMetric ? (
H0: {modelA.name} and {modelB.name} scores are derived from the same distribution. {'Reject the null hypothesis if p < 0.05'}
{!selectedMetric && humanMetrics.length ? (

Human Evaluations

3 ? classes.graphsGrid : classes.graphsFlex, )} > {humanMetrics.map((metric) => renderResult( statisticalInformationPerMetric, metric, modelA, modelB, evaluationsPerMetric[metric.name].length / models.length, modelColors, modelOrder, theme, ), )}
) : null} {!selectedMetric && algorithmMetrics.length ? (

Algorithmic Evaluations

3 ? classes.graphsGrid : classes.graphsFlex, )} > {algorithmMetrics.map((metric) => renderResult( statisticalInformationPerMetric, metric, modelA, modelB, evaluationsPerMetric[metric.name].length / models.length, modelColors, modelOrder, theme, ), )}
) : null} {selectedMetric && statisticalInformationPerMetric.hasOwnProperty( selectedMetric.name, ) ? (
{extractMetricDisplayName(selectedMetric)}
p-value {statisticalInformationPerMetric[selectedMetric.name][ 'p' ].toFixed(4)} {statisticalInformationPerMetric[selectedMetric.name][ 'p' ] <= 0.05 ? 'Significant' : 'Not significant'}
) : ( <>
{`Press calculate to measure statistical significance ${selectedMetric ? 'for' : 'across'} "${selectedMetric ? extractMetricDisplayName(selectedMetric) : 'all'}" metric${selectedMetric ? '' : 's'}`} {`for "${modelA.name}" and "${modelB.name}" models.`} {complexity === 'high' ? (
It might take a few minutes to build this view.
) : null}
)}
) : ( <>
{`Press calculate to measure statistical significance ${selectedMetric ? 'for' : 'across'} "${selectedMetric ? extractMetricDisplayName(selectedMetric) : 'all'}" metric${selectedMetric ? '' : 's'}`} {`for "${modelA.name}" and "${modelB.name}" models.`} {complexity === 'high' ? (
It might take a few minutes to build this view.
) : null}
)} {selectedMetric && statisticalInformationPerMetric && statisticalInformationPerMetric.hasOwnProperty(selectedMetric.name) && (

Tasks{selectedMetric && filteredEvaluations && *}

{filteredEvaluations ? ( <> * Only tasks with different model aggregate scores are shown in the above table. ) : null}
)}
); }
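
// ===================================================================================
// USAGE SKETCH (illustrative only)
// ===================================================================================
// A minimal sketch of how a parent view might render this component, based solely on
// the `Props` interface declared above. `handleTaskSelection` is a hypothetical
// callback supplied by the surrounding page; it is not defined in this file.
//
//   <ModelComparator
//     evaluationsPerMetric={evaluationsPerMetric}
//     models={models}
//     metrics={metrics}
//     filters={filters}
//     onTaskSelection={(taskId: string) => handleTaskSelection(taskId)}
//   />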