Spaces:
Running
Running
scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,0,0.35957325998039574,0.15076277502664528 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,1,0.19999999999999998,0.4843127204585538 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,2,0.13483997249264842,0.590013887163346 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,3,-0.022222222222222223,1.0 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,4,0.19999999999999998,0.4843127204585538 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,5,0.06666666666666667,0.8618005952380953 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,6,0.15555555555555553,0.6006536596119929 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,7,0.37777777777777777,0.1557418430335097 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,10,9,0.15555555555555553,0.6006536596119929 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.24444444444444444,0.38071979717813054 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.06666666666666667,0.8618005952380953 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.24444444444444444,0.38071979717813054 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.19999999999999998,0.4843127204585538 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.06666666666666667,0.8618005952380953 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.15555555555555553,0.6006536596119929 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.19999999999999998,0.4843127204585538 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.15555555555555553,0.6006536596119929 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 | |
Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.19999999999999998,0.4843127204585538 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.022222222222222223,1.0 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.37777777777777777,0.1557418430335097 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.15555555555555553,0.6006536596119929 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3333333333333333,0.21637345679012346 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 | |
Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 | |
Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 | |
Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8989331499509894,0.0003280163150135276 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9888264649460883,7.772240795323086e-05 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8989331499509894,0.0003280163150135276 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4222222222222222,0.10831349206349207 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8989331499509894,0.0003280163150135276 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9438798074485389,0.0001621317520439264 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8539864924534399,0.0006436975254696865 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8989331499509894,0.0003280163150135276 | |
HFv2 BBH Raw,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8090398349558905,0.0012254240706707103 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4494665749754947,0.07248608508684644 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7191465199607915,0.004057136032371292 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8539864924534399,0.0006436975254696865 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.522862326927363,0.03809415806109578 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.3333333333333333,0.21637345679012346 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 | |
HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5843065474681431,0.019550269092885535 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4140393356054125,0.10223484473548783 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.0,1.0 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.49441323247304414,0.048193488293190756 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,0.3680349649825889,0.14634982666257293 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,0,0.28888888888888886,0.2912483465608466 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,1,0.28888888888888886,0.2912483465608466 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6,0.016666115520282188 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,7,-0.06666666666666667,0.8618005952380953 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 | |
Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 | |
Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 | |
Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7640931774583409,0.002263469812035174 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6292532049656926,0.011921001496914019 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6741998624632421,0.0070583320485280866 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7640931774583409,0.002263469812035174 | |
Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5843065474681431,0.019550269092885535 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.3333333333333333,0.21637345679012346 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5111111111111111,0.04662257495590829 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4222222222222222,0.10831349206349207 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.06666666666666667,0.8618005952380953 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.24444444444444444,0.38071979717813054 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.1111111111111111,0.7274895282186948 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.28888888888888886,0.2912483465608466 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.28888888888888886,0.2912483465608466 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.24444444444444444,0.38071979717813054 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 | |
Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.1111111111111111,0.7274895282186948 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.22473328748774735,0.36917141633269157 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.35957325998039574,0.15076277502664528 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3146266024828463,0.2086677876982641 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4494665749754947,0.07248608508684644 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.35957325998039574,0.15076277502664528 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.47739603762933147,0.05829058013948874 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.29553088043720516,0.24112859961644273 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.26967994498529685,0.2811980995641792 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.40451991747794525,0.1059975484249457 | |
Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.1111111111111111,0.7274895282186948 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 | |
Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7640931774583409,0.002263469812035174 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8539864924534399,0.0006436975254696865 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7640931774583409,0.002263469812035174 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4494665749754947,0.07248608508684644 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5555555555555555,0.02860945767195767 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.40451991747794525,0.1059975484249457 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.4666666666666666,0.07255015432098766 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.47739603762933147,0.05829058013948874 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6292532049656926,0.011921001496914019 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5393598899705937,0.03114121059579671 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8989331499509894,0.0003280163150135276 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6292532049656926,0.011921001496914019 | |
MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5393598899705937,0.03114121059579671 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,, | |
toolbench_tuned_on_toolbench,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,, | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4944132324730442,0.048193488293190756 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.044946657497549475,0.8574624419592412 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
toolbench_avg.,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.44946657497549475,0.07248608508684644 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.3333333333333333,0.21637345679012346 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.3333333333333333,0.21637345679012346 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4944132324730442,0.048193488293190756 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.06666666666666667,0.8618005952380953 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.44946657497549475,0.07248608508684644 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4944132324730442,0.048193488293190756 | |
toolbench_open_weather,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.40451991747794525,0.1059975484249457 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.24444444444444444,0.38071979717813054 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.3595732599803958,0.15076277502664528 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.22473328748774737,0.36917141633269157 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.06666666666666667,0.8618005952380953 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.37777777777777777,0.1557418430335097 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.22473328748774737,0.36917141633269157 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.15555555555555553,0.6006536596119929 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.19999999999999998,0.4843127204585538 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
toolbench_the_cat_api,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.044946657497549475,0.8574624419592412 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.24444444444444444,0.38071979717813054 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4944132324730442,0.048193488293190756 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4222222222222222,0.10831349206349207 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.40451991747794525,0.1059975484249457 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6292532049656925,0.011921001496914019 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.022222222222222223,1.0 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.44946657497549475,0.07248608508684644 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.44946657497549475,0.07248608508684644 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.40451991747794525,0.1059975484249457 | |
toolbench_home_search,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.31462660248284624,0.2086677876982641 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.40451991747794525,0.1059975484249457 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4944132324730442,0.048193488293190756 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.522862326927363,0.03809415806109578 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6592611948214577,0.008926875535053643 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.15555555555555553,0.6006536596119929 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6292532049656925,0.011921001496914019 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5393598899705937,0.03114121059579671 | |
toolbench_trip_booking,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.5111111111111111,0.04662257495590829 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.15555555555555553,0.6006536596119929 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.28888888888888886,0.2912483465608466 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.24444444444444444,0.38071979717813054 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.022222222222222223,1.0 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.37777777777777777,0.1557418430335097 | |
toolbench_google_sheets,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4222222222222222,0.10831349206349207 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.28888888888888886,0.2912483465608466 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6292532049656925,0.011921001496914019 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
toolbench_virtualhome,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.060858061945018464,0.8237838788539289 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,-0.049690399499995326,0.8618044330490108 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.1807753815155468,0.517439239336394 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.14907119849998599,0.6015081344405899 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,-0.049690399499995326,0.8618044330490108 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,-0.10846522890932808,0.6977358290647566 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,-0.14907119849998599,0.6015081344405899 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,, | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,-0.049690399499995326,0.8618044330490108 | |
toolbench_webshop_long,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.24343224778007386,0.373077501374122 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5967623950328607,0.020456721550759976 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6531972647421808,0.014645494331075933 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.629940788348712,0.016309171877754967 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.598056818096707,0.018255860587459292 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.529150262212918,0.04363140075597019 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5060480768510598,0.0457979388108198 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5443310539518174,0.0419323909238529 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7399853698407473,0.004051149991792091 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3265986323710904,0.22226996949464695 | |
toolbench_webshop_short,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.598056818096707,0.018255860587459292 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4944132324730442,0.048193488293190756 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6,0.016666115520282188 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.764093177458341,0.002263469812035174 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.13483997249264842,0.590013887163346 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.4666666666666666,0.07255015432098766 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.37777777777777777,0.1557418430335097 | |
toolbench_tabletop,toolbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.40451991747794525,0.1059975484249457 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,1,0.4666666666666666,0.07255015432098766 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,3,0.35957325998039574,0.15076277502664528 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7640931774583409,0.002263469812035174 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7191465199607915,0.004057136032371292 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,0,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,1,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,2,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,3,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,4,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,5,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,6,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,7,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,8,0.04494665749754947,0.8574624419592412 | |
OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,10,9,0.04494665749754947,0.8574624419592412 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6292532049656926,0.011921001496914019 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7191465199607915,0.004057136032371292 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7047274841194893,0.005187148855929351 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5843065474681431,0.019550269092885535 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.8539864924534399,0.0006436975254696865 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7640931774583409,0.002263469812035174 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5111111111111111,0.04662257495590829 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.5555555555555555,0.02860945767195767 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7191465199607915,0.004057136032371292 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8539864924534399,0.0006436975254696865 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6,0.016666115520282188 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.37777777777777777,0.1557418430335097 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.5111111111111111,0.04662257495590829 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.5555555555555555,0.02860945767195767 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6,0.016666115520282188 | |
HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.28888888888888886,0.2912483465608466 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.8989331499509894,0.0003280163150135276 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.5555555555555555,0.02860945767195767 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.4222222222222222,0.10831349206349207 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.5843065474681431,0.019550269092885535 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6,0.016666115520282188 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.37777777777777777,0.1557418430335097 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 | |
HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.4666666666666666,0.07255015432098766 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.19999999999999998,0.4843127204585538 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.4666666666666666,0.07255015432098766 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.28888888888888886,0.2912483465608466 | |
HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.15555555555555553,0.6006536596119929 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
BigBench,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4222222222222222,0.10831349206349207 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7191465199607915,0.004057136032371292 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7191465199607915,0.004057136032371292 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.8989331499509894,0.0003280163150135276 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5393598899705937,0.03114121059579671 | |
BigBench Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3680349649825889,0.14634982666257293 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4494665749754947,0.07248608508684644 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6445033866354896,0.012304364739182175 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.15555555555555553,0.6006536596119929 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7501937734175208,0.0029250200956793346 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.3333333333333333,0.21637345679012346 | |
BigBench Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.24444444444444444,0.38071979717813054 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.24444444444444444,0.38071979717813054 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7047274841194893,0.005187148855929351 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.40451991747794525,0.1059975484249457 | |
BigBench Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.37777777777777777,0.1557418430335097 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.28888888888888886,0.2912483465608466 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
BigBench Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.49441323247304414,0.048193488293190756 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.5393598899705937,0.03114121059579671 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.1111111111111111,0.7274895282186948 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
BigBench Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6,0.016666115520282188 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.20459830184114206,0.4170770595205646 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.4666666666666666,0.07255015432098766 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.28888888888888886,0.2912483465608466 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.06819943394704735,0.7867749320074033 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.5555555555555555,0.02860945767195767 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6900655593423543,0.006458954266892998 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
BigBench Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.1111111111111111,0.7274895282186948 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.37777777777777777,0.1557418430335097 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6292532049656926,0.011921001496914019 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.15555555555555553,0.6006536596119929 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 | |
BigBench Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.3333333333333333,0.21637345679012346 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.7191465199607915,0.004057136032371292 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.022222222222222223,1.0 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6292532049656926,0.011921001496914019 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.4222222222222222,0.10831349206349207 | |
BigBench Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.4666666666666666,0.07255015432098766 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,0,0.4494665749754947,0.07248608508684644 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,1,0.26967994498529685,0.2811980995641792 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,3,0.4666666666666666,0.07255015432098766 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,6,0.3333333333333333,0.21637345679012346 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,8,0.522862326927363,0.03809415806109578 | |
BigBench Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.7640931774583409,0.002263469812035174 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.828078671210825,0.001082228864258374 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7191465199607915,0.004057136032371292 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7501937734175208,0.0029250200956793346 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.7501937734175208,0.0029250200956793346 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.7640931774583409,0.002263469812035174 | |
LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7501937734175208,0.0029250200956793346 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6292532049656926,0.011921001496914019 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7640931774583409,0.002263469812035174 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7640931774583409,0.002263469812035174 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.7640931774583409,0.002263469812035174 | |
LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7191465199607915,0.004057136032371292 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.6,0.016666115520282188 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6,0.016666115520282188 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.6,0.016666115520282188 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,2,0.6,0.016666115520282188 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,10,9,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,0,0.35957325998039574,0.15076277502664528 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,1,0.19999999999999998,0.4843127204585538 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,2,0.13483997249264842,0.590013887163346 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,3,-0.022222222222222223,1.0 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,4,0.19999999999999998,0.4843127204585538 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,5,0.06666666666666667,0.8618005952380953 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,6,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,7,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,10,9,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,0,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,1,0.06666666666666667,0.8618005952380953 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,2,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,3,0.19999999999999998,0.4843127204585538 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,4,0.06666666666666667,0.8618005952380953 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,5,-0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,6,0.19999999999999998,0.4843127204585538 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,7,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,10,9,0.19999999999999998,0.4843127204585538 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,1,0.022222222222222223,1.0 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,3,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,5,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,7,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,7,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,3,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,3,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,6,0.9888264649460883,7.772240795323086e-05 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,7,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,1,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.9438798074485389,0.0001621317520439264 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.8539864924534399,0.0006436975254696865 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,HFv2 BBH Raw,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.4494665749754947,0.07248608508684644 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.8539864924534399,0.0006436975254696865 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.522862326927363,0.03809415806109578 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.4140393356054125,0.10223484473548783 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.0,1.0 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.49441323247304414,0.048193488293190756 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,7,0.3680349649825889,0.14634982666257293 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,0,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,1,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,3,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,7,-0.06666666666666667,0.8618005952380953 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,8,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,5,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,8,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,0,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,2,0.6292532049656926,0.011921001496914019 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,4,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,7,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,8,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,10,9,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,1,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,4,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,5,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,1,0.06666666666666667,0.8618005952380953 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,2,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,4,0.1111111111111111,0.7274895282186948 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,5,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,6,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,7,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,10,9,0.1111111111111111,0.7274895282186948 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,0,0.22473328748774735,0.36917141633269157 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,1,0.35957325998039574,0.15076277502664528 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,2,0.3146266024828463,0.2086677876982641 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,3,0.4494665749754947,0.07248608508684644 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,4,0.35957325998039574,0.15076277502664528 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,5,0.47739603762933147,0.05829058013948874 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,6,0.29553088043720516,0.24112859961644273 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,7,0.26967994498529685,0.2811980995641792 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,8,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,10,9,0.1111111111111111,0.7274895282186948 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,0,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,4,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,7,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,8,0.6,0.016666115520282188 | |
aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,0,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,2,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,3,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,10,9,0.8539864924534399,0.0006436975254696865 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,4,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,4,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,6,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,10,9,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,0,0.8539864924534399,0.0006436975254696865 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,2,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,0,0.4494665749754947,0.07248608508684644 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,5,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,0,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,5,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,0,0.47739603762933147,0.05829058013948874 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,1,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,2,0.6292532049656926,0.011921001496914019 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,5,0.5393598899705937,0.03114121059579671 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,6,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,8,0.6292532049656926,0.011921001496914019 | |
aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,1,0.5393598899705937,0.03114121059579671 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,0,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,1,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,2,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,3,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,4,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,5,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,6,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,7,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,8,, | |
aggregate,aggregate,toolbench_tuned_on_toolbench,toolbench_240829.csv,kendall,random,10,9,, | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,3,0.4944132324730442,0.048193488293190756 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,5,0.044946657497549475,0.8574624419592412 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,6,0.6,0.016666115520282188 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_avg.,toolbench_240829.csv,kendall,random,10,9,0.44946657497549475,0.07248608508684644 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,0,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,2,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,3,0.4944132324730442,0.048193488293190756 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,5,0.06666666666666667,0.8618005952380953 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,6,0.44946657497549475,0.07248608508684644 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,8,0.4944132324730442,0.048193488293190756 | |
aggregate,aggregate,toolbench_open_weather,toolbench_240829.csv,kendall,random,10,9,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,0,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,1,0.3595732599803958,0.15076277502664528 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,2,0.22473328748774737,0.36917141633269157 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,3,0.06666666666666667,0.8618005952380953 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,4,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,5,-0.22473328748774737,0.36917141633269157 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,6,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,7,0.19999999999999998,0.4843127204585538 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_the_cat_api,toolbench_240829.csv,kendall,random,10,9,0.044946657497549475,0.8574624419592412 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,0,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,1,0.4944132324730442,0.048193488293190756 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,2,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,3,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,4,0.6292532049656925,0.011921001496914019 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,5,-0.022222222222222223,1.0 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,6,0.44946657497549475,0.07248608508684644 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,7,0.44946657497549475,0.07248608508684644 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,8,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,toolbench_home_search,toolbench_240829.csv,kendall,random,10,9,0.31462660248284624,0.2086677876982641 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,0,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,1,0.4944132324730442,0.048193488293190756 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,3,0.522862326927363,0.03809415806109578 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,4,0.6592611948214577,0.008926875535053643 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,5,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,7,0.6292532049656925,0.011921001496914019 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,8,0.5393598899705937,0.03114121059579671 | |
aggregate,aggregate,toolbench_trip_booking,toolbench_240829.csv,kendall,random,10,9,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,0,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,1,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,3,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,5,-0.022222222222222223,1.0 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,8,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,toolbench_google_sheets,toolbench_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,1,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,3,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,5,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,6,0.6292532049656925,0.011921001496914019 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,toolbench_virtualhome,toolbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,0,0.060858061945018464,0.8237838788539289 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,1,-0.049690399499995326,0.8618044330490108 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,2,0.1807753815155468,0.517439239336394 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,3,0.14907119849998599,0.6015081344405899 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,4,-0.049690399499995326,0.8618044330490108 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,5,-0.10846522890932808,0.6977358290647566 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,6,-0.14907119849998599,0.6015081344405899 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,7,, | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,8,-0.049690399499995326,0.8618044330490108 | |
aggregate,aggregate,toolbench_webshop_long,toolbench_240829.csv,kendall,random,10,9,0.24343224778007386,0.373077501374122 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,0,0.5967623950328607,0.020456721550759976 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,1,0.6531972647421808,0.014645494331075933 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,2,0.629940788348712,0.016309171877754967 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,3,0.598056818096707,0.018255860587459292 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,4,0.529150262212918,0.04363140075597019 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,5,0.5060480768510598,0.0457979388108198 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,6,0.5443310539518174,0.0419323909238529 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,7,0.7399853698407473,0.004051149991792091 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,8,0.3265986323710904,0.22226996949464695 | |
aggregate,aggregate,toolbench_webshop_short,toolbench_240829.csv,kendall,random,10,9,0.598056818096707,0.018255860587459292 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,0,0.4944132324730442,0.048193488293190756 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,1,0.6,0.016666115520282188 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,3,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,4,0.764093177458341,0.002263469812035174 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,5,0.13483997249264842,0.590013887163346 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,6,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,8,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,toolbench_tabletop,toolbench_240829.csv,kendall,random,10,9,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,1,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,3,0.35957325998039574,0.15076277502664528 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,5,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,6,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,7,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,0,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,1,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,2,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,3,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,4,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,5,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,6,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,7,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,8,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,10,9,0.04494665749754947,0.8574624419592412 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,0,0.6292532049656926,0.011921001496914019 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,1,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,2,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,4,0.7047274841194893,0.005187148855929351 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,0,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,4,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,6,0.8539864924534399,0.0006436975254696865 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,10,9,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,3,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,4,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,7,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,1,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,2,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,4,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,5,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,6,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,7,0.8539864924534399,0.0006436975254696865 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,5,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,1,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,5,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,7,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,2,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,2,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,0,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,10,9,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,5,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,8,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6,0.016666115520282188 | |
aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.9999999999999999,5.511463844797178e-07 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.19999999999999998,0.4843127204585538 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,4,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,6,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,7,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,4,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,8,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,BigBench,biggen_240829.csv,kendall,random,10,9,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,2,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,5,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,6,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,7,0.8989331499509894,0.0003280163150135276 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,8,0.5393598899705937,0.03114121059579671 | |
aggregate,aggregate,BigBench Grounding,biggen_240829.csv,kendall,random,10,9,0.3680349649825889,0.14634982666257293 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,0,0.4494665749754947,0.07248608508684644 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,2,0.6445033866354896,0.012304364739182175 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,4,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,6,0.7501937734175208,0.0029250200956793346 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,8,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,BigBench Instruction Following,biggen_240829.csv,kendall,random,10,9,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,2,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,3,0.8539864924534399,0.0006436975254696865 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,4,0.24444444444444444,0.38071979717813054 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,5,0.7047274841194893,0.005187148855929351 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,7,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,8,0.40451991747794525,0.1059975484249457 | |
aggregate,aggregate,BigBench Planning,biggen_240829.csv,kendall,random,10,9,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,0,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,4,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,8,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,BigBench Reasoning,biggen_240829.csv,kendall,random,10,9,0.49441323247304414,0.048193488293190756 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,0,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,3,0.5393598899705937,0.03114121059579671 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,4,0.1111111111111111,0.7274895282186948 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,7,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,8,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,BigBench Refinement,biggen_240829.csv,kendall,random,10,9,0.6,0.016666115520282188 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,0,0.20459830184114206,0.4170770595205646 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,1,0.5843065474681431,0.019550269092885535 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,2,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,3,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,4,0.28888888888888886,0.2912483465608466 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,5,0.06819943394704735,0.7867749320074033 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,6,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,7,0.6900655593423543,0.006458954266892998 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,8,0.5555555555555555,0.02860945767195767 | |
aggregate,aggregate,BigBench Safety,biggen_240829.csv,kendall,random,10,9,0.1111111111111111,0.7274895282186948 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,0,0.37777777777777777,0.1557418430335097 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,1,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,2,0.6292532049656926,0.011921001496914019 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,4,0.15555555555555553,0.6006536596119929 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,BigBench Theory of Mind,biggen_240829.csv,kendall,random,10,9,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,0,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,1,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,2,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,3,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,4,0.022222222222222223,1.0 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,6,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,7,0.6292532049656926,0.011921001496914019 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,8,0.4222222222222222,0.10831349206349207 | |
aggregate,aggregate,BigBench Tool Usage,biggen_240829.csv,kendall,random,10,9,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,0,0.4494665749754947,0.07248608508684644 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,1,0.26967994498529685,0.2811980995641792 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,2,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,3,0.4666666666666666,0.07255015432098766 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,4,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,5,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,6,0.3333333333333333,0.21637345679012346 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,7,0.6,0.016666115520282188 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,8,0.522862326927363,0.03809415806109578 | |
aggregate,aggregate,BigBench Multilingual,biggen_240829.csv,kendall,random,10,9,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,2,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,3,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,4,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,5,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,8,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,10,9,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,1,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,2,0.828078671210825,0.001082228864258374 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,3,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,4,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,5,0.7501937734175208,0.0029250200956793346 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,6,0.7501937734175208,0.0029250200956793346 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,8,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,10,9,0.7501937734175208,0.0029250200956793346 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,0,0.6292532049656926,0.011921001496914019 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,1,0.8090398349558905,0.0012254240706707103 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,4,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,5,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,6,0.6741998624632421,0.0070583320485280866 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,8,0.7640931774583409,0.002263469812035174 | |
aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,10,9,0.7191465199607915,0.004057136032371292 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,0,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,1,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,3,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,5,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,6,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,7,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,8,0.911111111111111,2.9761904761904762e-05 | |
aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,10,9,0.9555555555555554,5.5114638447971785e-06 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,0,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,1,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,3,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,4,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,5,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,6,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,7,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,8,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,10,9,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,0,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,1,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,2,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,3,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,4,0.8222222222222221,0.0003576940035273369 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,5,0.7777777777777777,0.0009463183421516755 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,6,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,7,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,10,9,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,0,0.6444444444444444,0.009148478835978836 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,1,0.8666666666666666,0.00011518959435626102 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,2,0.6,0.016666115520282188 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,3,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,4,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,5,0.7333333333333333,0.002212852733686067 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,6,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,7,0.5111111111111111,0.04662257495590829 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,8,0.6888888888888888,0.00468694885361552 | |
aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,10,9,0.5555555555555555,0.02860945767195767 | |