Upload organize_model_results.json with huggingface_hub
Browse files- organize_model_results.json +39 -0
organize_model_results.json
CHANGED
|
@@ -22,6 +22,7 @@
|
|
| 22 |
"whisper_large_v3": 0.8294532718704128,
|
| 23 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.4757667842702995,
|
| 24 |
"phi_4_multimodal_instruct": 1.3868687388941825,
|
|
|
|
| 25 |
"WavLLM_fairseq": 1.2058793232211378,
|
| 26 |
"SALMONN_7B": 0.7757204295537071,
|
| 27 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.4715562308464886
|
|
@@ -34,6 +35,7 @@
|
|
| 34 |
"Qwen2-Audio-7B-Instruct": 0.21342294856199182,
|
| 35 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
|
| 36 |
"phi_4_multimodal_instruct": 0.24508284335582894,
|
|
|
|
| 37 |
"WavLLM_fairseq": 0.06399522524688675,
|
| 38 |
"SALMONN_7B": 0.17175112770658157,
|
| 39 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
|
|
@@ -60,6 +62,7 @@
|
|
| 60 |
"whisper_large_v3": 14.673689493155793,
|
| 61 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
|
| 62 |
"phi_4_multimodal_instruct": 22.678131781242936,
|
|
|
|
| 63 |
"WavLLM_fairseq": 2.368659001743569,
|
| 64 |
"SALMONN_7B": 5.296039450108202,
|
| 65 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
|
|
@@ -72,6 +75,7 @@
|
|
| 72 |
"Qwen2-Audio-7B-Instruct": 53.6,
|
| 73 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
|
| 74 |
"phi_4_multimodal_instruct": 66.2,
|
|
|
|
| 75 |
"WavLLM_fairseq": 62.199999999999996,
|
| 76 |
"SALMONN_7B": 46.8,
|
| 77 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
|
|
@@ -111,6 +115,7 @@
|
|
| 111 |
"Qwen2-Audio-7B-Instruct": 71.60909856781802,
|
| 112 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
|
| 113 |
"phi_4_multimodal_instruct": 54.422914911541696,
|
|
|
|
| 114 |
"WavLLM_fairseq": 44.3133951137321,
|
| 115 |
"SALMONN_7B": 50.88458298230834,
|
| 116 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
|
|
@@ -136,6 +141,7 @@
|
|
| 136 |
"whisper_large_v3": 0.3171008846684522,
|
| 137 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
|
| 138 |
"phi_4_multimodal_instruct": 0.3470091713334957,
|
|
|
|
| 139 |
"WavLLM_fairseq": 0.4463923382842302,
|
| 140 |
"SALMONN_7B": 0.42346400454508565,
|
| 141 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
|
|
@@ -203,6 +209,7 @@
|
|
| 203 |
"whisper_large_v3": 0.1698509342851144,
|
| 204 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
|
| 205 |
"phi_4_multimodal_instruct": 0.14552883606001388,
|
|
|
|
| 206 |
"WavLLM_fairseq": 0.42541061709652933,
|
| 207 |
"SALMONN_7B": 0.24872817713464365,
|
| 208 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
|
|
@@ -215,6 +222,7 @@
|
|
| 215 |
"Qwen2-Audio-7B-Instruct": 44.800000000000004,
|
| 216 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
|
| 217 |
"phi_4_multimodal_instruct": 30.8,
|
|
|
|
| 218 |
"WavLLM_fairseq": 19.2,
|
| 219 |
"SALMONN_7B": 15.8,
|
| 220 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
|
|
@@ -242,6 +250,7 @@
|
|
| 242 |
"Qwen2-Audio-7B-Instruct": 58.31395348837209,
|
| 243 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
|
| 244 |
"phi_4_multimodal_instruct": 68.40116279069767,
|
|
|
|
| 245 |
"WavLLM_fairseq": 58.54651162790698,
|
| 246 |
"SALMONN_7B": 59.24418604651163,
|
| 247 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
|
|
@@ -300,6 +309,7 @@
|
|
| 300 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
|
| 301 |
"gemini-1.5-flash": 1.1100431601824359,
|
| 302 |
"phi_4_multimodal_instruct": 0.8529492791331231,
|
|
|
|
| 303 |
"WavLLM_fairseq": 1.2204842511249197,
|
| 304 |
"SALMONN_7B": 1.0189782362484312,
|
| 305 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
|
|
@@ -312,6 +322,7 @@
|
|
| 312 |
"Qwen2-Audio-7B-Instruct": 53.9463601532567,
|
| 313 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
|
| 314 |
"phi_4_multimodal_instruct": 51.609195402298845,
|
|
|
|
| 315 |
"WavLLM_fairseq": 51.072796934865906,
|
| 316 |
"SALMONN_7B": 41.7624521072797,
|
| 317 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
|
|
@@ -324,6 +335,7 @@
|
|
| 324 |
"Qwen2-Audio-7B-Instruct": 39.6,
|
| 325 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
|
| 326 |
"phi_4_multimodal_instruct": 43.8,
|
|
|
|
| 327 |
"WavLLM_fairseq": 46.6,
|
| 328 |
"SALMONN_7B": 36.6,
|
| 329 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
|
|
@@ -351,6 +363,7 @@
|
|
| 351 |
"Qwen2-Audio-7B-Instruct": 61.56666666666667,
|
| 352 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
|
| 353 |
"phi_4_multimodal_instruct": 36.833333333333336,
|
|
|
|
| 354 |
"WavLLM_fairseq": 46.766666666666666,
|
| 355 |
"SALMONN_7B": 42.733333333333334,
|
| 356 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
|
|
@@ -376,6 +389,7 @@
|
|
| 376 |
"whisper_large_v3": 0.2143555471246589,
|
| 377 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
|
| 378 |
"phi_4_multimodal_instruct": 0.22801359968481416,
|
|
|
|
| 379 |
"WavLLM_fairseq": 0.39796588405247263,
|
| 380 |
"SALMONN_7B": 0.34868891450584405,
|
| 381 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
|
|
@@ -400,6 +414,7 @@
|
|
| 400 |
"whisper_large_v3": 0.15887899737116104,
|
| 401 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
|
| 402 |
"phi_4_multimodal_instruct": 0.24134627375003423,
|
|
|
|
| 403 |
"WavLLM_fairseq": 0.6671766188447099,
|
| 404 |
"SALMONN_7B": 0.3597423676988383,
|
| 405 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
|
|
@@ -437,6 +452,7 @@
|
|
| 437 |
"whisper_large_v3": 46.01512198258627,
|
| 438 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
|
| 439 |
"phi_4_multimodal_instruct": 0.36465303013961253,
|
|
|
|
| 440 |
"WavLLM_fairseq": 5.933522277713613,
|
| 441 |
"SALMONN_7B": 26.89649039333571,
|
| 442 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
|
|
@@ -474,6 +490,7 @@
|
|
| 474 |
"whisper_large_v3": 0.09459022434812692,
|
| 475 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
|
| 476 |
"phi_4_multimodal_instruct": 0.09672866386388193,
|
|
|
|
| 477 |
"WavLLM_fairseq": 0.15491778414546403,
|
| 478 |
"SALMONN_7B": 0.10765150204693537,
|
| 479 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
|
|
@@ -498,6 +515,7 @@
|
|
| 498 |
"Qwen2-Audio-7B-Instruct": 33.8,
|
| 499 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
|
| 500 |
"phi_4_multimodal_instruct": 41.2,
|
|
|
|
| 501 |
"WavLLM_fairseq": 31.6,
|
| 502 |
"SALMONN_7B": 9.0,
|
| 503 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
|
|
@@ -513,6 +531,7 @@
|
|
| 513 |
"Qwen2-Audio-7B-Instruct": 0.9666666666666667,
|
| 514 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
|
| 515 |
"phi_4_multimodal_instruct": 0.5333333333333333,
|
|
|
|
| 516 |
"WavLLM_fairseq": 0.23333333333333336,
|
| 517 |
"SALMONN_7B": 0.06666666666666667,
|
| 518 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
|
|
@@ -525,6 +544,7 @@
|
|
| 525 |
"Qwen2-Audio-7B-Instruct": 92.80876494023903,
|
| 526 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
|
| 527 |
"phi_4_multimodal_instruct": 59.46215139442231,
|
|
|
|
| 528 |
"WavLLM_fairseq": 51.932270916334666,
|
| 529 |
"SALMONN_7B": 81.31474103585658,
|
| 530 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
|
|
@@ -575,6 +595,7 @@
|
|
| 575 |
"Qwen2-Audio-7B-Instruct": 66.49242028227914,
|
| 576 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
|
| 577 |
"phi_4_multimodal_instruct": 72.60846837428123,
|
|
|
|
| 578 |
"WavLLM_fairseq": 66.5446941975954,
|
| 579 |
"SALMONN_7B": 56.455828541557764,
|
| 580 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
|
|
@@ -587,6 +608,7 @@
|
|
| 587 |
"Qwen2-Audio-7B-Instruct": 40.4,
|
| 588 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
|
| 589 |
"phi_4_multimodal_instruct": 52.199999999999996,
|
|
|
|
| 590 |
"WavLLM_fairseq": 45.199999999999996,
|
| 591 |
"SALMONN_7B": 17.2,
|
| 592 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
@@ -616,6 +638,7 @@
|
|
| 616 |
"Qwen2-Audio-7B-Instruct": 42.0,
|
| 617 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
|
| 618 |
"phi_4_multimodal_instruct": 43.8,
|
|
|
|
| 619 |
"WavLLM_fairseq": 45.199999999999996,
|
| 620 |
"SALMONN_7B": 40.599999999999994,
|
| 621 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
@@ -631,6 +654,7 @@
|
|
| 631 |
"Qwen2-Audio-7B-Instruct": 24.8,
|
| 632 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
|
| 633 |
"phi_4_multimodal_instruct": 37.0,
|
|
|
|
| 634 |
"WavLLM_fairseq": 31.6,
|
| 635 |
"SALMONN_7B": 7.0,
|
| 636 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
|
|
@@ -668,6 +692,7 @@
|
|
| 668 |
"Qwen2-Audio-7B-Instruct": 0.19891712076314283,
|
| 669 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
|
| 670 |
"phi_4_multimodal_instruct": 0.1757379026471828,
|
|
|
|
| 671 |
"WavLLM_fairseq": 0.041732965094428545,
|
| 672 |
"SALMONN_7B": 0.20994052484339956,
|
| 673 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
|
|
@@ -683,6 +708,7 @@
|
|
| 683 |
"Qwen2-Audio-7B-Instruct": 2.55,
|
| 684 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
|
| 685 |
"phi_4_multimodal_instruct": 3.5166666666666666,
|
|
|
|
| 686 |
"WavLLM_fairseq": 2.6833333333333336,
|
| 687 |
"SALMONN_7B": 2.5166666666666666,
|
| 688 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
|
|
@@ -708,6 +734,7 @@
|
|
| 708 |
"whisper_large_v3": 2.451098639578599,
|
| 709 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
|
| 710 |
"phi_4_multimodal_instruct": 0.053138495633157125,
|
|
|
|
| 711 |
"WavLLM_fairseq": 0.1695522548322915,
|
| 712 |
"SALMONN_7B": 0.3649023706010388,
|
| 713 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
|
|
@@ -733,6 +760,7 @@
|
|
| 733 |
"Qwen2-Audio-7B-Instruct": 50.919591292758774,
|
| 734 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
|
| 735 |
"phi_4_multimodal_instruct": 47.86582401555663,
|
|
|
|
| 736 |
"WavLLM_fairseq": 43.01199466903598,
|
| 737 |
"SALMONN_7B": 57.75401069518716,
|
| 738 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
|
|
@@ -763,6 +791,7 @@
|
|
| 763 |
"whisper_large_v3": 0.27026366524560785,
|
| 764 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
|
| 765 |
"phi_4_multimodal_instruct": 0.44227061666711925,
|
|
|
|
| 766 |
"WavLLM_fairseq": 0.7540934640345399,
|
| 767 |
"SALMONN_7B": 0.6569229098215983,
|
| 768 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
|
|
@@ -813,6 +842,7 @@
|
|
| 813 |
"Qwen2-Audio-7B-Instruct": 45.75079872204473,
|
| 814 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
|
| 815 |
"phi_4_multimodal_instruct": 38.466453674121404,
|
|
|
|
| 816 |
"WavLLM_fairseq": 29.840255591054312,
|
| 817 |
"SALMONN_7B": 50.287539936102235,
|
| 818 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
|
|
@@ -829,6 +859,7 @@
|
|
| 829 |
"whisper_large_v3": 0.06844171360300393,
|
| 830 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
|
| 831 |
"phi_4_multimodal_instruct": 0.05739643527661961,
|
|
|
|
| 832 |
"WavLLM_fairseq": 0.10077292565771828,
|
| 833 |
"SALMONN_7B": 0.0925804013361617,
|
| 834 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
|
|
@@ -841,6 +872,7 @@
|
|
| 841 |
"Qwen2-Audio-7B-Instruct": 0.2165498391593041,
|
| 842 |
"whisper_large_v3": 0.14602420615337386,
|
| 843 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20140159998943682,
|
|
|
|
| 844 |
"WavLLM_fairseq": 0.3792176325635977,
|
| 845 |
"SALMONN_7B": 0.23699946689025367,
|
| 846 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.14540692118393275
|
|
@@ -866,6 +898,7 @@
|
|
| 866 |
"Qwen2-Audio-7B-Instruct": 44.473684210526315,
|
| 867 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
|
| 868 |
"phi_4_multimodal_instruct": 35.13157894736842,
|
|
|
|
| 869 |
"WavLLM_fairseq": 26.25,
|
| 870 |
"SALMONN_7B": 47.30263157894737,
|
| 871 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
|
|
@@ -1014,6 +1047,7 @@
|
|
| 1014 |
"Qwen2-Audio-7B-Instruct": 68.38333333333333,
|
| 1015 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
|
| 1016 |
"phi_4_multimodal_instruct": 51.68333333333334,
|
|
|
|
| 1017 |
"WavLLM_fairseq": 49.06666666666666,
|
| 1018 |
"SALMONN_7B": 59.766666666666666,
|
| 1019 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
|
|
@@ -1026,6 +1060,7 @@
|
|
| 1026 |
"Qwen2-Audio-7B-Instruct": 80.04901960784315,
|
| 1027 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
|
| 1028 |
"phi_4_multimodal_instruct": 88.33333333333334,
|
|
|
|
| 1029 |
"WavLLM_fairseq": 83.92156862745098,
|
| 1030 |
"SALMONN_7B": 83.48039215686273,
|
| 1031 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
|
|
@@ -1055,6 +1090,7 @@
|
|
| 1055 |
"Qwen2-Audio-7B-Instruct": 41.60919540229885,
|
| 1056 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
|
| 1057 |
"phi_4_multimodal_instruct": 43.524904214559385,
|
|
|
|
| 1058 |
"WavLLM_fairseq": 41.57088122605364,
|
| 1059 |
"SALMONN_7B": 30.536398467432953,
|
| 1060 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
|
|
@@ -1083,6 +1119,7 @@
|
|
| 1083 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
|
| 1084 |
"gemini-1.5-flash": 0.9690871089536138,
|
| 1085 |
"phi_4_multimodal_instruct": 0.7126483279395901,
|
|
|
|
| 1086 |
"WavLLM_fairseq": 1.2913969795037756,
|
| 1087 |
"SALMONN_7B": 1.2721817691477886,
|
| 1088 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
|
|
@@ -1107,6 +1144,7 @@
|
|
| 1107 |
"Qwen2-Audio-7B-Instruct": 51.6,
|
| 1108 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
|
| 1109 |
"phi_4_multimodal_instruct": 49.0,
|
|
|
|
| 1110 |
"WavLLM_fairseq": 50.8,
|
| 1111 |
"SALMONN_7B": 44.6,
|
| 1112 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
|
|
@@ -1122,6 +1160,7 @@
|
|
| 1122 |
"Qwen2-Audio-7B-Instruct": 46.2,
|
| 1123 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.4,
|
| 1124 |
"phi_4_multimodal_instruct": 52.599999999999994,
|
|
|
|
| 1125 |
"WavLLM_fairseq": 49.400000000000006,
|
| 1126 |
"SALMONN_7B": 24.2,
|
| 1127 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.199999999999996
|
|
|
|
| 22 |
"whisper_large_v3": 0.8294532718704128,
|
| 23 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.4757667842702995,
|
| 24 |
"phi_4_multimodal_instruct": 1.3868687388941825,
|
| 25 |
+
"seallms_audio_7b": 1.8960881769720068,
|
| 26 |
"WavLLM_fairseq": 1.2058793232211378,
|
| 27 |
"SALMONN_7B": 0.7757204295537071,
|
| 28 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.4715562308464886
|
|
|
|
| 35 |
"Qwen2-Audio-7B-Instruct": 0.21342294856199182,
|
| 36 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.120421856260385,
|
| 37 |
"phi_4_multimodal_instruct": 0.24508284335582894,
|
| 38 |
+
"seallms_audio_7b": 0.1444387454989207,
|
| 39 |
"WavLLM_fairseq": 0.06399522524688675,
|
| 40 |
"SALMONN_7B": 0.17175112770658157,
|
| 41 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.1388630786594543
|
|
|
|
| 62 |
"whisper_large_v3": 14.673689493155793,
|
| 63 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.209998552437538,
|
| 64 |
"phi_4_multimodal_instruct": 22.678131781242936,
|
| 65 |
+
"seallms_audio_7b": 18.79451062979056,
|
| 66 |
"WavLLM_fairseq": 2.368659001743569,
|
| 67 |
"SALMONN_7B": 5.296039450108202,
|
| 68 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 14.154700735606419
|
|
|
|
| 75 |
"Qwen2-Audio-7B-Instruct": 53.6,
|
| 76 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 71.6,
|
| 77 |
"phi_4_multimodal_instruct": 66.2,
|
| 78 |
+
"seallms_audio_7b": 58.2,
|
| 79 |
"WavLLM_fairseq": 62.199999999999996,
|
| 80 |
"SALMONN_7B": 46.8,
|
| 81 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.0
|
|
|
|
| 115 |
"Qwen2-Audio-7B-Instruct": 71.60909856781802,
|
| 116 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 51.727042965459134,
|
| 117 |
"phi_4_multimodal_instruct": 54.422914911541696,
|
| 118 |
+
"seallms_audio_7b": 63.184498736310026,
|
| 119 |
"WavLLM_fairseq": 44.3133951137321,
|
| 120 |
"SALMONN_7B": 50.88458298230834,
|
| 121 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 56.44481887110362
|
|
|
|
| 141 |
"whisper_large_v3": 0.3171008846684522,
|
| 142 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.32988393799204613,
|
| 143 |
"phi_4_multimodal_instruct": 0.3470091713334957,
|
| 144 |
+
"seallms_audio_7b": 0.290236182128074,
|
| 145 |
"WavLLM_fairseq": 0.4463923382842302,
|
| 146 |
"SALMONN_7B": 0.42346400454508565,
|
| 147 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.31912994075156237
|
|
|
|
| 209 |
"whisper_large_v3": 0.1698509342851144,
|
| 210 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1789273082575623,
|
| 211 |
"phi_4_multimodal_instruct": 0.14552883606001388,
|
| 212 |
+
"seallms_audio_7b": 0.6259629515980555,
|
| 213 |
"WavLLM_fairseq": 0.42541061709652933,
|
| 214 |
"SALMONN_7B": 0.24872817713464365,
|
| 215 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.17467982364056267
|
|
|
|
| 222 |
"Qwen2-Audio-7B-Instruct": 44.800000000000004,
|
| 223 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 72.2,
|
| 224 |
"phi_4_multimodal_instruct": 30.8,
|
| 225 |
+
"seallms_audio_7b": 63.8,
|
| 226 |
"WavLLM_fairseq": 19.2,
|
| 227 |
"SALMONN_7B": 15.8,
|
| 228 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 63.0
|
|
|
|
| 250 |
"Qwen2-Audio-7B-Instruct": 58.31395348837209,
|
| 251 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 73.11046511627907,
|
| 252 |
"phi_4_multimodal_instruct": 68.40116279069767,
|
| 253 |
+
"seallms_audio_7b": 57.587209302325576,
|
| 254 |
"WavLLM_fairseq": 58.54651162790698,
|
| 255 |
"SALMONN_7B": 59.24418604651163,
|
| 256 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 64.94186046511628
|
|
|
|
| 309 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.5840399155162387,
|
| 310 |
"gemini-1.5-flash": 1.1100431601824359,
|
| 311 |
"phi_4_multimodal_instruct": 0.8529492791331231,
|
| 312 |
+
"seallms_audio_7b": 1.7106737273868193,
|
| 313 |
"WavLLM_fairseq": 1.2204842511249197,
|
| 314 |
"SALMONN_7B": 1.0189782362484312,
|
| 315 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.507882090054792
|
|
|
|
| 322 |
"Qwen2-Audio-7B-Instruct": 53.9463601532567,
|
| 323 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.59003831417625,
|
| 324 |
"phi_4_multimodal_instruct": 51.609195402298845,
|
| 325 |
+
"seallms_audio_7b": 52.1455938697318,
|
| 326 |
"WavLLM_fairseq": 51.072796934865906,
|
| 327 |
"SALMONN_7B": 41.7624521072797,
|
| 328 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 45.593869731800766
|
|
|
|
| 335 |
"Qwen2-Audio-7B-Instruct": 39.6,
|
| 336 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 66.0,
|
| 337 |
"phi_4_multimodal_instruct": 43.8,
|
| 338 |
+
"seallms_audio_7b": 45.0,
|
| 339 |
"WavLLM_fairseq": 46.6,
|
| 340 |
"SALMONN_7B": 36.6,
|
| 341 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 53.8
|
|
|
|
| 363 |
"Qwen2-Audio-7B-Instruct": 61.56666666666667,
|
| 364 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 19.6,
|
| 365 |
"phi_4_multimodal_instruct": 36.833333333333336,
|
| 366 |
+
"seallms_audio_7b": 30.5,
|
| 367 |
"WavLLM_fairseq": 46.766666666666666,
|
| 368 |
"SALMONN_7B": 42.733333333333334,
|
| 369 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 25.433333333333337
|
|
|
|
| 389 |
"whisper_large_v3": 0.2143555471246589,
|
| 390 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.22881615619208825,
|
| 391 |
"phi_4_multimodal_instruct": 0.22801359968481416,
|
| 392 |
+
"seallms_audio_7b": 0.5812260145043848,
|
| 393 |
"WavLLM_fairseq": 0.39796588405247263,
|
| 394 |
"SALMONN_7B": 0.34868891450584405,
|
| 395 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.22004640235805695
|
|
|
|
| 414 |
"whisper_large_v3": 0.15887899737116104,
|
| 415 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.1448629161356777,
|
| 416 |
"phi_4_multimodal_instruct": 0.24134627375003423,
|
| 417 |
+
"seallms_audio_7b": 0.5738685499413504,
|
| 418 |
"WavLLM_fairseq": 0.6671766188447099,
|
| 419 |
"SALMONN_7B": 0.3597423676988383,
|
| 420 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.15611126487402763
|
|
|
|
| 452 |
"whisper_large_v3": 46.01512198258627,
|
| 453 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 46.80524126004861,
|
| 454 |
"phi_4_multimodal_instruct": 0.36465303013961253,
|
| 455 |
+
"seallms_audio_7b": 43.98074943006231,
|
| 456 |
"WavLLM_fairseq": 5.933522277713613,
|
| 457 |
"SALMONN_7B": 26.89649039333571,
|
| 458 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 46.79924664837527
|
|
|
|
| 490 |
"whisper_large_v3": 0.09459022434812692,
|
| 491 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.09948381629977261,
|
| 492 |
"phi_4_multimodal_instruct": 0.09672866386388193,
|
| 493 |
+
"seallms_audio_7b": 0.13672725996455154,
|
| 494 |
"WavLLM_fairseq": 0.15491778414546403,
|
| 495 |
"SALMONN_7B": 0.10765150204693537,
|
| 496 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.09515429104337297
|
|
|
|
| 515 |
"Qwen2-Audio-7B-Instruct": 33.8,
|
| 516 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 45.4,
|
| 517 |
"phi_4_multimodal_instruct": 41.2,
|
| 518 |
+
"seallms_audio_7b": 43.0,
|
| 519 |
"WavLLM_fairseq": 31.6,
|
| 520 |
"SALMONN_7B": 9.0,
|
| 521 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 37.400000000000006
|
|
|
|
| 531 |
"Qwen2-Audio-7B-Instruct": 0.9666666666666667,
|
| 532 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 7.633333333333334,
|
| 533 |
"phi_4_multimodal_instruct": 0.5333333333333333,
|
| 534 |
+
"seallms_audio_7b": 15.633333333333333,
|
| 535 |
"WavLLM_fairseq": 0.23333333333333336,
|
| 536 |
"SALMONN_7B": 0.06666666666666667,
|
| 537 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 9.666666666666666
|
|
|
|
| 544 |
"Qwen2-Audio-7B-Instruct": 92.80876494023903,
|
| 545 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 15.737051792828685,
|
| 546 |
"phi_4_multimodal_instruct": 59.46215139442231,
|
| 547 |
+
"seallms_audio_7b": 66.43426294820716,
|
| 548 |
"WavLLM_fairseq": 51.932270916334666,
|
| 549 |
"SALMONN_7B": 81.31474103585658,
|
| 550 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 44.22310756972111
|
|
|
|
| 595 |
"Qwen2-Audio-7B-Instruct": 66.49242028227914,
|
| 596 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 89.33612127548353,
|
| 597 |
"phi_4_multimodal_instruct": 72.60846837428123,
|
| 598 |
+
"seallms_audio_7b": 75.6926293779404,
|
| 599 |
"WavLLM_fairseq": 66.5446941975954,
|
| 600 |
"SALMONN_7B": 56.455828541557764,
|
| 601 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 86.4610559330894
|
|
|
|
| 608 |
"Qwen2-Audio-7B-Instruct": 40.4,
|
| 609 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 58.0,
|
| 610 |
"phi_4_multimodal_instruct": 52.199999999999996,
|
| 611 |
+
"seallms_audio_7b": 49.400000000000006,
|
| 612 |
"WavLLM_fairseq": 45.199999999999996,
|
| 613 |
"SALMONN_7B": 17.2,
|
| 614 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
|
|
| 638 |
"Qwen2-Audio-7B-Instruct": 42.0,
|
| 639 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 56.0,
|
| 640 |
"phi_4_multimodal_instruct": 43.8,
|
| 641 |
+
"seallms_audio_7b": 45.599999999999994,
|
| 642 |
"WavLLM_fairseq": 45.199999999999996,
|
| 643 |
"SALMONN_7B": 40.599999999999994,
|
| 644 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 49.0
|
|
|
|
| 654 |
"Qwen2-Audio-7B-Instruct": 24.8,
|
| 655 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 44.0,
|
| 656 |
"phi_4_multimodal_instruct": 37.0,
|
| 657 |
+
"seallms_audio_7b": 35.4,
|
| 658 |
"WavLLM_fairseq": 31.6,
|
| 659 |
"SALMONN_7B": 7.0,
|
| 660 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.0
|
|
|
|
| 692 |
"Qwen2-Audio-7B-Instruct": 0.19891712076314283,
|
| 693 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.05796819723943051,
|
| 694 |
"phi_4_multimodal_instruct": 0.1757379026471828,
|
| 695 |
+
"seallms_audio_7b": 0.30423899385222564,
|
| 696 |
"WavLLM_fairseq": 0.041732965094428545,
|
| 697 |
"SALMONN_7B": 0.20994052484339956,
|
| 698 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.07953048457785493
|
|
|
|
| 708 |
"Qwen2-Audio-7B-Instruct": 2.55,
|
| 709 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.016666666666666,
|
| 710 |
"phi_4_multimodal_instruct": 3.5166666666666666,
|
| 711 |
+
"seallms_audio_7b": 3.5833333333333335,
|
| 712 |
"WavLLM_fairseq": 2.6833333333333336,
|
| 713 |
"SALMONN_7B": 2.5166666666666666,
|
| 714 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 12.416666666666666
|
|
|
|
| 734 |
"whisper_large_v3": 2.451098639578599,
|
| 735 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 2.8327095799289337,
|
| 736 |
"phi_4_multimodal_instruct": 0.053138495633157125,
|
| 737 |
+
"seallms_audio_7b": 0.06475917031217593,
|
| 738 |
"WavLLM_fairseq": 0.1695522548322915,
|
| 739 |
"SALMONN_7B": 0.3649023706010388,
|
| 740 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 2.4245628096245917
|
|
|
|
| 760 |
"Qwen2-Audio-7B-Instruct": 50.919591292758774,
|
| 761 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 24.647544968400585,
|
| 762 |
"phi_4_multimodal_instruct": 47.86582401555663,
|
| 763 |
+
"seallms_audio_7b": 53.03840544482256,
|
| 764 |
"WavLLM_fairseq": 43.01199466903598,
|
| 765 |
"SALMONN_7B": 57.75401069518716,
|
| 766 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 29.47134606841404
|
|
|
|
| 791 |
"whisper_large_v3": 0.27026366524560785,
|
| 792 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.3035544573275043,
|
| 793 |
"phi_4_multimodal_instruct": 0.44227061666711925,
|
| 794 |
+
"seallms_audio_7b": 1.0837293290249002,
|
| 795 |
"WavLLM_fairseq": 0.7540934640345399,
|
| 796 |
"SALMONN_7B": 0.6569229098215983,
|
| 797 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.29992939962527493
|
|
|
|
| 842 |
"Qwen2-Audio-7B-Instruct": 45.75079872204473,
|
| 843 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.466453674121407,
|
| 844 |
"phi_4_multimodal_instruct": 38.466453674121404,
|
| 845 |
+
"seallms_audio_7b": 53.73801916932908,
|
| 846 |
"WavLLM_fairseq": 29.840255591054312,
|
| 847 |
"SALMONN_7B": 50.287539936102235,
|
| 848 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 17.380191693290733
|
|
|
|
| 859 |
"whisper_large_v3": 0.06844171360300393,
|
| 860 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.07041669714480775,
|
| 861 |
"phi_4_multimodal_instruct": 0.05739643527661961,
|
| 862 |
+
"seallms_audio_7b": 0.17813863896813206,
|
| 863 |
"WavLLM_fairseq": 0.10077292565771828,
|
| 864 |
"SALMONN_7B": 0.0925804013361617,
|
| 865 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.06922195401458074
|
|
|
|
| 872 |
"Qwen2-Audio-7B-Instruct": 0.2165498391593041,
|
| 873 |
"whisper_large_v3": 0.14602420615337386,
|
| 874 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.20140159998943682,
|
| 875 |
+
"seallms_audio_7b": 0.369768551146351,
|
| 876 |
"WavLLM_fairseq": 0.3792176325635977,
|
| 877 |
"SALMONN_7B": 0.23699946689025367,
|
| 878 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.14540692118393275
|
|
|
|
| 898 |
"Qwen2-Audio-7B-Instruct": 44.473684210526315,
|
| 899 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 18.88157894736842,
|
| 900 |
"phi_4_multimodal_instruct": 35.13157894736842,
|
| 901 |
+
"seallms_audio_7b": 42.10526315789473,
|
| 902 |
"WavLLM_fairseq": 26.25,
|
| 903 |
"SALMONN_7B": 47.30263157894737,
|
| 904 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 16.710526315789473
|
|
|
|
| 1047 |
"Qwen2-Audio-7B-Instruct": 68.38333333333333,
|
| 1048 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 26.35,
|
| 1049 |
"phi_4_multimodal_instruct": 51.68333333333334,
|
| 1050 |
+
"seallms_audio_7b": 50.083333333333336,
|
| 1051 |
"WavLLM_fairseq": 49.06666666666666,
|
| 1052 |
"SALMONN_7B": 59.766666666666666,
|
| 1053 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.016666666666666
|
|
|
|
| 1060 |
"Qwen2-Audio-7B-Instruct": 80.04901960784315,
|
| 1061 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 88.57843137254902,
|
| 1062 |
"phi_4_multimodal_instruct": 88.33333333333334,
|
| 1063 |
+
"seallms_audio_7b": 83.52941176470588,
|
| 1064 |
"WavLLM_fairseq": 83.92156862745098,
|
| 1065 |
"SALMONN_7B": 83.48039215686273,
|
| 1066 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 82.99019607843137
|
|
|
|
| 1090 |
"Qwen2-Audio-7B-Instruct": 41.60919540229885,
|
| 1091 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 47.356321839080465,
|
| 1092 |
"phi_4_multimodal_instruct": 43.524904214559385,
|
| 1093 |
+
"seallms_audio_7b": 51.11111111111111,
|
| 1094 |
"WavLLM_fairseq": 41.57088122605364,
|
| 1095 |
"SALMONN_7B": 30.536398467432953,
|
| 1096 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 36.81992337164751
|
|
|
|
| 1119 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 0.7824973031283711,
|
| 1120 |
"gemini-1.5-flash": 0.9690871089536138,
|
| 1121 |
"phi_4_multimodal_instruct": 0.7126483279395901,
|
| 1122 |
+
"seallms_audio_7b": 1.0639495685005393,
|
| 1123 |
"WavLLM_fairseq": 1.2913969795037756,
|
| 1124 |
"SALMONN_7B": 1.2721817691477886,
|
| 1125 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 0.6848705501618123
|
|
|
|
| 1144 |
"Qwen2-Audio-7B-Instruct": 51.6,
|
| 1145 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 74.0,
|
| 1146 |
"phi_4_multimodal_instruct": 49.0,
|
| 1147 |
+
"seallms_audio_7b": 54.2,
|
| 1148 |
"WavLLM_fairseq": 50.8,
|
| 1149 |
"SALMONN_7B": 44.6,
|
| 1150 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.800000000000004
|
|
|
|
| 1160 |
"Qwen2-Audio-7B-Instruct": 46.2,
|
| 1161 |
"cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct": 65.4,
|
| 1162 |
"phi_4_multimodal_instruct": 52.599999999999994,
|
| 1163 |
+
"seallms_audio_7b": 53.0,
|
| 1164 |
"WavLLM_fairseq": 49.400000000000006,
|
| 1165 |
"SALMONN_7B": 24.2,
|
| 1166 |
"cascade_whisper_large_v3_llama_3_8b_instruct": 57.199999999999996
|