diff --git a/static/eval_results/Default/Aquila_VL_2B/summary_results.json b/static/eval_results/Default/Aquila_VL_2B/summary_results.json index 8aaeeec492de6dabf76847d0cb433cab957a2f9d..dcb8c904dec241a846a39433a1e06c0f06a18552 100644 --- a/static/eval_results/Default/Aquila_VL_2B/summary_results.json +++ b/static/eval_results/Default/Aquila_VL_2B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.159970161379836, - "micro_mean_score": 0.15844711671722148 + "macro_mean_score": 0.159970161379836 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.24567572098570653, - "micro_mean_score": 0.2704213241616509 + "macro_mean_score": 0.24567572098570653 }, "overall_score": 0.17100157004197775 }, diff --git a/static/eval_results/Default/Aria/summary_results.json b/static/eval_results/Default/Aria/summary_results.json index 348d2d8b5f081312e2c2629ee53791750ebf9e42..262302a95067159ec3ce3b6253a1dd306b3d4658 100644 --- a/static/eval_results/Default/Aria/summary_results.json +++ b/static/eval_results/Default/Aria/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.289073788209904, - "micro_mean_score": 0.2859007507765791 + "macro_mean_score": 0.289073788209904 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.5103725263180767, - "micro_mean_score": 0.5349957007738607 + "macro_mean_score": 0.5103725263180767 }, "overall_score": 0.31755778420402525 }, diff --git a/static/eval_results/Default/Claude_3.5/summary_results.json b/static/eval_results/Default/Claude_3.5/summary_results.json index d90792e8314fea5c53f068d815fd6ebdff3bd724..90e5a47aba73675b646720cb0997dd8d8ef63eb2 100644 --- a/static/eval_results/Default/Claude_3.5/summary_results.json +++ b/static/eval_results/Default/Claude_3.5/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.5040975742801586, - "micro_mean_score": 0.5002259116666758 + "macro_mean_score": 0.5040975742801586 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.6373907158949892, - "micro_mean_score": 0.6569647463456579 + "macro_mean_score": 0.6373907158949892 }, "overall_score": 0.5212541172602853 }, diff --git a/static/eval_results/Default/Claude_3.5_new/summary_results.json b/static/eval_results/Default/Claude_3.5_new/summary_results.json index b27da6920bcbd055a5c65f822bb65e8153eeedae..b2b1262a4f91848bebb4798f1412b74159406080 100644 --- a/static/eval_results/Default/Claude_3.5_new/summary_results.json +++ b/static/eval_results/Default/Claude_3.5_new/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.5259191914020757, - "micro_mean_score": 0.5230785894131227 + "macro_mean_score": 0.5259191914020757 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.6563419761104125, - "micro_mean_score": 0.6724419604471196 + "macro_mean_score": 0.6563419761104125 }, "overall_score": 0.5427062825031487 }, diff --git a/static/eval_results/Default/GPT_4o/summary_results.json b/static/eval_results/Default/GPT_4o/summary_results.json index 6af57dc0f78b6677c89cf6d73a5396b2d10b16f8..3831c998400ed6772cfa18c65b85a8909e2815ad 100644 --- a/static/eval_results/Default/GPT_4o/summary_results.json +++ b/static/eval_results/Default/GPT_4o/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.5265030595065238, - "micro_mean_score": 0.5236338521693411 + "macro_mean_score": 0.5265030595065238 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.6478225794744895, - "micro_mean_score": 0.665391229578676 + "macro_mean_score": 0.6478225794744895 }, "overall_score": 0.5421184432647768 }, diff --git a/static/eval_results/Default/GPT_4o_mini/summary_results.json b/static/eval_results/Default/GPT_4o_mini/summary_results.json index e4ea03c1e4c0224b18df4676d6f3f1b2bbef39af..b6c0e1a1ef4160ac20baa8972de137fa688c1931 100644 --- a/static/eval_results/Default/GPT_4o_mini/summary_results.json +++ b/static/eval_results/Default/GPT_4o_mini/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.40767494558789397, - "micro_mean_score": 0.40431644154143376 + "macro_mean_score": 0.40767494558789397 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.586537827213665, - "micro_mean_score": 0.6133276010318144 + "macro_mean_score": 0.586537827213665 }, "overall_score": 0.43069690064863675 }, diff --git a/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json index 89748aec2730a7b4dd7c3fbdec0e71c34ad210d5..955aaae1b37ceb0dde39241617ad564edfe0858a 100644 --- a/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json +++ b/static/eval_results/Default/Gemini_1.5_flash_002/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.4189319021967416, - "micro_mean_score": 0.41567515414375245 + "macro_mean_score": 0.4189319021967416 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.5691365176285039, - "micro_mean_score": 0.5987532244196045 + "macro_mean_score": 0.5691365176285039 }, "overall_score": 0.4382651695295427 }, diff --git a/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json index 0710bf3e0c920cb7b8109b90e9bcbdfba2792418..c927a5f4d37a33ecf98b427dc40bf59b22d02e99 100644 --- a/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json +++ b/static/eval_results/Default/Gemini_1.5_pro_002/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.4822473962867704, - "micro_mean_score": 0.4764805563057179 + "macro_mean_score": 0.4822473962867704 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.5858190649927173, - "micro_mean_score": 0.6104901117798793 + "macro_mean_score": 0.5858190649927173 }, "overall_score": 0.4955784031499121 }, diff --git a/static/eval_results/Default/Idefics3/summary_results.json b/static/eval_results/Default/Idefics3/summary_results.json index ce15d5ce2e9339df0bcb8985694132ef9048c00a..137e4ffe4ef3455dc310488af3a98c8507d8b451 100644 --- a/static/eval_results/Default/Idefics3/summary_results.json +++ b/static/eval_results/Default/Idefics3/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.08956972487602757, - "micro_mean_score": 0.08982225274252693 + "macro_mean_score": 0.08956972487602757 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.3210866162255635, - "micro_mean_score": 0.35649183147033553 + "macro_mean_score": 0.3210866162255635 }, "overall_score": 0.11936892871309657 }, diff --git a/static/eval_results/Default/InternVL2_2B/summary_results.json b/static/eval_results/Default/InternVL2_2B/summary_results.json index 0243043e9c2981fd3df05c1a2f24eb91964c05ea..ae2b06dd6b3b09cb487cd01b1750cea89616f408 100644 --- a/static/eval_results/Default/InternVL2_2B/summary_results.json +++ b/static/eval_results/Default/InternVL2_2B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.13141974398938763, - "micro_mean_score": 0.13063500716262516 + "macro_mean_score": 0.13141974398938763 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.23864417043743646, - "micro_mean_score": 0.24901117798796224 + "macro_mean_score": 0.23864417043743646 }, "overall_score": 0.14522090778963154 }, diff --git a/static/eval_results/Default/InternVL2_5_2B/summary_results.json b/static/eval_results/Default/InternVL2_5_2B/summary_results.json index f8718f5d302518ea84ef84b781f4f5270625aa50..60dc4bee1510b276c7c1538399bc559d0981700b 100644 --- a/static/eval_results/Default/InternVL2_5_2B/summary_results.json +++ b/static/eval_results/Default/InternVL2_5_2B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.17806821966478364, - "micro_mean_score": 0.17708809739236367 + "macro_mean_score": 0.17806821966478364 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.2738430375585404, - "micro_mean_score": 0.2905417024935512 + "macro_mean_score": 0.2738430375585404 }, "overall_score": 0.19039567147289096 }, diff --git a/static/eval_results/Default/InternVL2_5_78B/summary_results.json b/static/eval_results/Default/InternVL2_5_78B/summary_results.json index 055e16e870658b5ad33e53b229171e8ec80d837e..f3a90403f21da666d10bf25990a85b1422c0f465 100644 --- a/static/eval_results/Default/InternVL2_5_78B/summary_results.json +++ b/static/eval_results/Default/InternVL2_5_78B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.44132952988532753, - "micro_mean_score": 0.4397079059379812 + "macro_mean_score": 0.44132952988532753 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.5538024772749066, - "micro_mean_score": 0.5776870163370592 + "macro_mean_score": 0.5538024772749066 }, "overall_score": 0.4558062458859664 }, diff --git a/static/eval_results/Default/InternVL2_76B/summary_results.json b/static/eval_results/Default/InternVL2_76B/summary_results.json index 2c244cef39cdaadb040968fc9007b1a1307168c5..555bfdb892e77d456f38018cf8e28b00f1673bf7 100644 --- a/static/eval_results/Default/InternVL2_76B/summary_results.json +++ b/static/eval_results/Default/InternVL2_76B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.3562710424410931, - "micro_mean_score": 0.35129859801162616 + "macro_mean_score": 0.3562710424410931 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.5192997443033639, - "micro_mean_score": 0.5421324161650903 + "macro_mean_score": 0.5192997443033639 }, "overall_score": 0.3772549347599992 }, diff --git a/static/eval_results/Default/InternVL2_8B/summary_results.json b/static/eval_results/Default/InternVL2_8B/summary_results.json index 6d91119c9457eff407742939ac2882586060f469..6361d0cdb75a067c5198a442510b744b78712c7e 100644 --- a/static/eval_results/Default/InternVL2_8B/summary_results.json +++ b/static/eval_results/Default/InternVL2_8B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.25956581776451815, - "micro_mean_score": 0.2546984460483302 + "macro_mean_score": 0.25956581776451815 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1165, - "macro_mean_score": 0.3978571701460552, - "micro_mean_score": 0.4108583690987125 + "macro_mean_score": 0.3978571701460552 }, "overall_score": 0.2773656948037259 }, diff --git a/static/eval_results/Default/Llama_3_2_11B/summary_results.json b/static/eval_results/Default/Llama_3_2_11B/summary_results.json index b9e128e5c619e8d90b92df12a38760d4d8f440b2..eae72367a5a3e1a5e4ad473951a2e971fc32fe22 100644 --- a/static/eval_results/Default/Llama_3_2_11B/summary_results.json +++ b/static/eval_results/Default/Llama_3_2_11B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.15999641916771298, - "micro_mean_score": 0.15809331016967038 + "macro_mean_score": 0.15999641916771298 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.3173342406187366, - "micro_mean_score": 0.3487962166809973 + "macro_mean_score": 0.3173342406187366 }, "overall_score": 0.1802478219287358 }, diff --git a/static/eval_results/Default/Mammoth_VL/summary_results.json b/static/eval_results/Default/Mammoth_VL/summary_results.json index 6b2cc0baf5cb8d8e9cfd4184289f35fba2e6c779..d37b186198c3323ffc97934b0986866de319ebda 100644 --- a/static/eval_results/Default/Mammoth_VL/summary_results.json +++ b/static/eval_results/Default/Mammoth_VL/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.264052880412689, - "micro_mean_score": 0.2626894374387823 + "macro_mean_score": 0.264052880412689 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.37992668750165337, - "micro_mean_score": 0.40120378331900275 + "macro_mean_score": 0.37992668750165337 }, "overall_score": 0.27896733083008046 }, diff --git a/static/eval_results/Default/MiniCPM_v2.6/summary_results.json b/static/eval_results/Default/MiniCPM_v2.6/summary_results.json index 636b1496744d89284ea5089d88cce3d34abddac2..a666c2522449456d0b15f40788df615267ac589b 100644 --- a/static/eval_results/Default/MiniCPM_v2.6/summary_results.json +++ b/static/eval_results/Default/MiniCPM_v2.6/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.22955895202146906, - "micro_mean_score": 0.22560399396899078 + "macro_mean_score": 0.22955895202146906 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.41728623355613875, - "micro_mean_score": 0.43452278589853827 + "macro_mean_score": 0.41728623355613875 }, "overall_score": 0.2537218694467236 }, diff --git a/static/eval_results/Default/NVLM/summary_results.json b/static/eval_results/Default/NVLM/summary_results.json index 1f5960546368b4fa15bc67524b10eebb5393c2ca..e3a50b6a8b72a5a1684ba8b9e6c350b854576f71 100644 --- a/static/eval_results/Default/NVLM/summary_results.json +++ b/static/eval_results/Default/NVLM/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.21589726765847422, - "micro_mean_score": 0.21406043849932396 + "macro_mean_score": 0.21589726765847422 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.3478114310231307, - "micro_mean_score": 0.3947549441100602 + "macro_mean_score": 0.3478114310231307 }, "overall_score": 0.23287631838857856 }, diff --git a/static/eval_results/Default/Phi-3.5-vision/summary_results.json b/static/eval_results/Default/Phi-3.5-vision/summary_results.json index 1e9d5d25bb8ca28106310878ffdeebc6788d2f0c..93acd3c7a360dea90731e94b1d6af9f8bc24a04c 100644 --- a/static/eval_results/Default/Phi-3.5-vision/summary_results.json +++ b/static/eval_results/Default/Phi-3.5-vision/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.22995297916629392, - "micro_mean_score": 0.22708502951025372 + "macro_mean_score": 0.22995297916629392 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.3947914647737769, - "micro_mean_score": 0.42459157351676696 + "macro_mean_score": 0.3947914647737769 }, "overall_score": 0.2511698139474551 }, diff --git a/static/eval_results/Default/Pixtral_12B/summary_results.json b/static/eval_results/Default/Pixtral_12B/summary_results.json index d7b2c538d50bf2b1e42d3ba272fa87d54e676a20..dbcfded35be661ff50a035baf732bd78e4230c40 100644 --- a/static/eval_results/Default/Pixtral_12B/summary_results.json +++ b/static/eval_results/Default/Pixtral_12B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.31362045151669854, - "micro_mean_score": 0.3100986209078182 + "macro_mean_score": 0.31362045151669854 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.4566234428542061, - "micro_mean_score": 0.4870593293207223 + "macro_mean_score": 0.4566234428542061 }, "overall_score": 0.33202677713439754 }, diff --git a/static/eval_results/Default/Qwen2_VL_2B/summary_results.json b/static/eval_results/Default/Qwen2_VL_2B/summary_results.json index 76a71eabec4ee5d88551bf968f232ee13dffdc5a..d9692119bda7b8a4151fd43bbf57680746783f1e 100644 --- a/static/eval_results/Default/Qwen2_VL_2B/summary_results.json +++ b/static/eval_results/Default/Qwen2_VL_2B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.20877163406364055, - "micro_mean_score": 0.20561526268932287 + "macro_mean_score": 0.20877163406364055 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.3154302566225611, - "micro_mean_score": 0.33856405846947557 + "macro_mean_score": 0.3154302566225611 }, "overall_score": 0.22249997162072932 }, diff --git a/static/eval_results/Default/Qwen2_VL_72B/summary_results.json b/static/eval_results/Default/Qwen2_VL_72B/summary_results.json index 21ca30c7af4a3ac146431c7c2c9ef8774c9fdd1e..85b1f578d5d184097f343aa4b5bf41ae1f60fa2c 100644 --- a/static/eval_results/Default/Qwen2_VL_72B/summary_results.json +++ b/static/eval_results/Default/Qwen2_VL_72B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.4542376574527161, - "micro_mean_score": 0.4501201906164793 + "macro_mean_score": 0.4542376574527161 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.5639771804231668, - "micro_mean_score": 0.5835339638865004 + "macro_mean_score": 0.5639771804231668 }, "overall_score": 0.4683625465479226 }, diff --git a/static/eval_results/Default/Qwen2_VL_7B/summary_results.json b/static/eval_results/Default/Qwen2_VL_7B/summary_results.json index a67230b05e5f3234888e722eab28419c004ee575..5d895b1ac4072a594c639edc3e2e170db83c13c5 100644 --- a/static/eval_results/Default/Qwen2_VL_7B/summary_results.json +++ b/static/eval_results/Default/Qwen2_VL_7B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.3293449599230247, - "micro_mean_score": 0.325331493515679 + "macro_mean_score": 0.3293449599230247 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1170, - "macro_mean_score": 0.43955105763038577, - "micro_mean_score": 0.45508547008546996 + "macro_mean_score": 0.43955105763038577 }, "overall_score": 0.34352990319228904 }, diff --git a/static/eval_results/Default/llava_onevision_72B/summary_results.json b/static/eval_results/Default/llava_onevision_72B/summary_results.json index 2eb71da75405e6141add0c6e95de67741daab5e6..f829b79dc6c0a49552888b0ef503d4b2c8c00d2e 100644 --- a/static/eval_results/Default/llava_onevision_72B/summary_results.json +++ b/static/eval_results/Default/llava_onevision_72B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.2974368415462532, - "micro_mean_score": 0.2956217833156672 + "macro_mean_score": 0.2974368415462532 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.4599484231632498, - "micro_mean_score": 0.4850386930352536 + "macro_mean_score": 0.4599484231632498 }, "overall_score": 0.31835417383358944 }, diff --git a/static/eval_results/Default/llava_onevision_7B/summary_results.json b/static/eval_results/Default/llava_onevision_7B/summary_results.json index 28bba3867965ba53739f13ae6e96aa8e53be256d..ab35106e197991e3c8120329b54a0139361aa5d5 100644 --- a/static/eval_results/Default/llava_onevision_7B/summary_results.json +++ b/static/eval_results/Default/llava_onevision_7B/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 440, "num_eval_samples": 6539, - "macro_mean_score": 0.21362697219149712, - "micro_mean_score": 0.21073910058505504 + "macro_mean_score": 0.21362697219149712 }, "open": { "num_eval_tasks": 65, "num_eval_samples": 1163, - "macro_mean_score": 0.33979975321921935, - "micro_mean_score": 0.36474634565778147 + "macro_mean_score": 0.33979975321921935 }, "overall_score": 0.2298670331158574 }, diff --git a/static/eval_results/SI/Aquila_VL_2B/summary_results.json b/static/eval_results/SI/Aquila_VL_2B/summary_results.json index ff246bf1cd8585833334967628ff5f37f092ebf6..25ad416c8ba0bfab7a949e7f25aebbe52d858fa5 100644 --- a/static/eval_results/SI/Aquila_VL_2B/summary_results.json +++ b/static/eval_results/SI/Aquila_VL_2B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.20770364903712493, - "micro_mean_score": 0.20333142638522636, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.31474202723571276, - "micro_mean_score": 0.3326568265682657, "missing_tasks": [] }, "overall_score": 0.22197543279693666 diff --git a/static/eval_results/SI/Aria/summary_results.json b/static/eval_results/SI/Aria/summary_results.json index 5648c2026d713e85a8b3c03c640ec9f3a4d53c86..e6ae8614fdc6bc902403a3536d1edb375f37eccb 100644 --- a/static/eval_results/SI/Aria/summary_results.json +++ b/static/eval_results/SI/Aria/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.3178882776147889, - "micro_mean_score": 0.3101511832828904, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.5137437248005172, - "micro_mean_score": 0.5472939729397295, "missing_tasks": [] }, "overall_score": 0.34400233723955265 diff --git a/static/eval_results/SI/Claude_3.5/summary_results.json b/static/eval_results/SI/Claude_3.5/summary_results.json index f8d28fc504e0065f3389cbbbf63b00505e1bcc62..b64b3a6585b24577c6141e797f2ae79a8fd0c98b 100644 --- a/static/eval_results/SI/Claude_3.5/summary_results.json +++ b/static/eval_results/SI/Claude_3.5/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 273, "num_eval_samples": 4116, - "macro_mean_score": 0.520276385877485, - "micro_mean_score": 0.5148202137998056 + "macro_mean_score": 0.520276385877485 }, "open": { "num_eval_tasks": 42, "num_eval_samples": 813, - "macro_mean_score": 0.6479684260295507, - "micro_mean_score": 0.6801968019680197 + "macro_mean_score": 0.6479684260295507 }, "overall_score": 0.5373019912310938 }, diff --git a/static/eval_results/SI/Claude_3.5_new/summary_results.json b/static/eval_results/SI/Claude_3.5_new/summary_results.json index 3c88676ad4f2fab27160f4dcc3b8dfe2136a9e2d..c686dc09a9e826eca8e0b0b7d29a470db2533add 100644 --- a/static/eval_results/SI/Claude_3.5_new/summary_results.json +++ b/static/eval_results/SI/Claude_3.5_new/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 273, "num_eval_samples": 4116, - "macro_mean_score": 0.5462752278980763, - "micro_mean_score": 0.5417881438289601 + "macro_mean_score": 0.5462752278980763 }, "open": { "num_eval_tasks": 42, "num_eval_samples": 813, - "macro_mean_score": 0.6764020657053476, - "micro_mean_score": 0.6924969249692496 + "macro_mean_score": 0.6764020657053476 }, "overall_score": 0.5636254729390457 }, diff --git a/static/eval_results/SI/GPT_4o/summary_results.json b/static/eval_results/SI/GPT_4o/summary_results.json index 3fd663e31742e6d071f91953e3b43b831fc05c54..3e8bfe360bf2d907c25cda15e98b5eedfdae5e75 100644 --- a/static/eval_results/SI/GPT_4o/summary_results.json +++ b/static/eval_results/SI/GPT_4o/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 273, "num_eval_samples": 4116, - "macro_mean_score": 0.5529953662872719, - "micro_mean_score": 0.5483479105928085 + "macro_mean_score": 0.5529953662872719 }, "open": { "num_eval_tasks": 42, "num_eval_samples": 813, - "macro_mean_score": 0.6600228904804206, - "micro_mean_score": 0.6801968019680197 + "macro_mean_score": 0.6600228904804206 }, "overall_score": 0.5672657028463584 }, diff --git a/static/eval_results/SI/GPT_4o_mini/summary_results.json b/static/eval_results/SI/GPT_4o_mini/summary_results.json index cc69433e9989576e5a8c7974b79e624af8c0838b..9800919d8f39047c35924ce712648302284ffff5 100644 --- a/static/eval_results/SI/GPT_4o_mini/summary_results.json +++ b/static/eval_results/SI/GPT_4o_mini/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 273, "num_eval_samples": 4116, - "macro_mean_score": 0.4431039098921726, - "micro_mean_score": 0.43780369290573373 + "macro_mean_score": 0.4431039098921726 }, "open": { "num_eval_tasks": 42, "num_eval_samples": 813, - "macro_mean_score": 0.595574663769726, - "micro_mean_score": 0.6334563345633456 + "macro_mean_score": 0.595574663769726 }, "overall_score": 0.46343334374251305 }, diff --git a/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json index 95d4d30ad782486786b62a3c713ef22d885e1b95..658f52f88bcacf519ac2252e4c873e39db874fb7 100644 --- a/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json +++ b/static/eval_results/SI/Gemini_1.5_flash_002/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 273, "num_eval_samples": 4116, - "macro_mean_score": 0.43481964330318734, - "micro_mean_score": 0.4297862001943635 + "macro_mean_score": 0.43481964330318734 }, "open": { "num_eval_tasks": 42, "num_eval_samples": 813, - "macro_mean_score": 0.5787083135236054, - "micro_mean_score": 0.6186961869618696 + "macro_mean_score": 0.5787083135236054 }, "overall_score": 0.4540047993325765 }, diff --git a/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json b/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json index b782cc9ac005a5507ee5cb19b6ce2a0a0098b4ca..66dfdb1e3ebb38c910054a925f47c88cfe17e622 100644 --- a/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json +++ b/static/eval_results/SI/Gemini_1.5_pro_002/summary_results.json @@ -3,14 +3,12 @@ "core": { "num_eval_tasks": 273, "num_eval_samples": 4116, - "macro_mean_score": 0.4914311038229404, - "micro_mean_score": 0.48323615160349853 + "macro_mean_score": 0.4914311038229404 }, "open": { "num_eval_tasks": 42, "num_eval_samples": 813, - "macro_mean_score": 0.5814975405131552, - "micro_mean_score": 0.6174661746617466 + "macro_mean_score": 0.5814975405131552 }, "overall_score": 0.5034399620483024 }, diff --git a/static/eval_results/SI/Idefics3/summary_results.json b/static/eval_results/SI/Idefics3/summary_results.json index 486dce3311f80e350c2765b963dfc7581e29f78f..a81352578651a1bcd38b81a495f96e068c0ae2aa 100644 --- a/static/eval_results/SI/Idefics3/summary_results.json +++ b/static/eval_results/SI/Idefics3/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.08941182847569326, - "micro_mean_score": 0.08779475233900695, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.3231434267517844, - "micro_mean_score": 0.3618081180811809, "missing_tasks": [] }, "overall_score": 0.12057604157917208 diff --git a/static/eval_results/SI/InternVL2_2B/summary_results.json b/static/eval_results/SI/InternVL2_2B/summary_results.json index b021be0db3f55396a6b2deb794e0c5aeea0bf1a8..f2997d9201693c35de1f253dde60092a8f6b1fe7 100644 --- a/static/eval_results/SI/InternVL2_2B/summary_results.json +++ b/static/eval_results/SI/InternVL2_2B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.12069001041308772, - "micro_mean_score": 0.11842605219090299, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.28522459992910454, - "micro_mean_score": 0.28886838868388687, "missing_tasks": [] }, "overall_score": 0.14262795568189 diff --git a/static/eval_results/SI/InternVL2_76B/summary_results.json b/static/eval_results/SI/InternVL2_76B/summary_results.json index 6aa5a3d95b342f82d82ffccd2bcf0a2a4db5aeab..8ced14b9dfa570e4377e33bb9cdf9cb457c9bacd 100644 --- a/static/eval_results/SI/InternVL2_76B/summary_results.json +++ b/static/eval_results/SI/InternVL2_76B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.3998616568018755, - "micro_mean_score": 0.39149064302628933, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.554748737158244, - "micro_mean_score": 0.5800738007380073, "missing_tasks": [] }, "overall_score": 0.42051326751605805 diff --git a/static/eval_results/SI/InternVL2_8B/summary_results.json b/static/eval_results/SI/InternVL2_8B/summary_results.json index 6626dca0ba52f31da57208eedcad8531070e052c..ede424035d3cbc58cd0965001b249eaf03334b39 100644 --- a/static/eval_results/SI/InternVL2_8B/summary_results.json +++ b/static/eval_results/SI/InternVL2_8B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.27650612401825575, - "micro_mean_score": 0.27119471729837735, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.39388373890935635, - "micro_mean_score": 0.4045510455104551, "missing_tasks": [] }, "overall_score": 0.29215647267040246 diff --git a/static/eval_results/SI/Llama_3_2_11B/summary_results.json b/static/eval_results/SI/Llama_3_2_11B/summary_results.json index 1f38c1b0b9723c7ad1eb5965264683efa3b48f73..93eaa0f1273ca0bd2215a352663a9167d97bc46e 100644 --- a/static/eval_results/SI/Llama_3_2_11B/summary_results.json +++ b/static/eval_results/SI/Llama_3_2_11B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.20789144960796493, - "micro_mean_score": 0.20163641703273802, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.3861125858565788, - "micro_mean_score": 0.4130381303813038, "missing_tasks": [] }, "overall_score": 0.2316542677744468 diff --git a/static/eval_results/SI/MiniCPM_v2.6/summary_results.json b/static/eval_results/SI/MiniCPM_v2.6/summary_results.json index 1e91edbcb5931b29cb88fcb6d0990c607e10cb5f..8897e3589252728510a8f52efb169e0ac456ea08 100644 --- a/static/eval_results/SI/MiniCPM_v2.6/summary_results.json +++ b/static/eval_results/SI/MiniCPM_v2.6/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.23230765810722817, - "micro_mean_score": 0.22684118052665975, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.4360655066213874, - "micro_mean_score": 0.4588560885608856, "missing_tasks": [] }, "overall_score": 0.2594753712424494 diff --git a/static/eval_results/SI/Molmo_72B/summary_results.json b/static/eval_results/SI/Molmo_72B/summary_results.json index 67df8a5ceb069e1926824e120e649ddeba93073b..ceecd07087332338c36b4975d37ee56f4c31cc00 100644 --- a/static/eval_results/SI/Molmo_72B/summary_results.json +++ b/static/eval_results/SI/Molmo_72B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4073, "num_not_eval_samples": 0, "macro_mean_score": 0.36480000609384927, - "micro_mean_score": 0.36205779758110807, "missing_tasks": [ "planning_screenshot_termes", "table_understanding", @@ -17,7 +16,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.4465682063915481, - "micro_mean_score": 0.4850553505535054, "missing_tasks": [] }, "overall_score": 0.3758072638262318 diff --git a/static/eval_results/SI/Molmo_7B_D/summary_results.json b/static/eval_results/SI/Molmo_7B_D/summary_results.json index a75e051b5c4fd6a1eb5c021faa7ba3bfd17d761d..ec9c71746e686b73a33db7b970b416af5c002aa1 100644 --- a/static/eval_results/SI/Molmo_7B_D/summary_results.json +++ b/static/eval_results/SI/Molmo_7B_D/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4102, "num_not_eval_samples": 0, "macro_mean_score": 0.2098088446992518, - "micro_mean_score": 0.20550929661464645, "missing_tasks": [ "MMSoc_Misinformation_PolitiFact" ] @@ -15,7 +14,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.35697926179118733, - "micro_mean_score": 0.38936039360393604, "missing_tasks": [] }, "overall_score": 0.22949405972428777 diff --git a/static/eval_results/SI/NVLM/summary_results.json b/static/eval_results/SI/NVLM/summary_results.json index 45c4dacafa84da381f3a9b804029c98426e57384..34d127ca7759178b27916786ff7ace3c7be8b17b 100644 --- a/static/eval_results/SI/NVLM/summary_results.json +++ b/static/eval_results/SI/NVLM/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.32989872890926025, - "micro_mean_score": 0.32315683713111915, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.4469349818134809, - "micro_mean_score": 0.4881303813038132, "missing_tasks": [] }, "overall_score": 0.34550356262982296 diff --git a/static/eval_results/SI/POINTS_15_7B/summary_results.json b/static/eval_results/SI/POINTS_15_7B/summary_results.json index 644eeeb1861af7a618903bbbf3e87ee094ef44c8..e5820b2f5f746740c61e0977d309a00cdb422f6f 100644 --- a/static/eval_results/SI/POINTS_15_7B/summary_results.json +++ b/static/eval_results/SI/POINTS_15_7B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.31355970638319003, - "micro_mean_score": 0.30728203432446294, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.41331219301389166, - "micro_mean_score": 0.42749077490774917, "missing_tasks": [] }, "overall_score": 0.32686003793395024 diff --git a/static/eval_results/SI/POINTS_7B/summary_results.json b/static/eval_results/SI/POINTS_7B/summary_results.json index 8db3edd07587d8cd344d3781063797b53af6eae2..f216b228915366ca37a68836b27ccb5c2910fed6 100644 --- a/static/eval_results/SI/POINTS_7B/summary_results.json +++ b/static/eval_results/SI/POINTS_7B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.25511317681632334, - "micro_mean_score": 0.24927711632415062, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.30315625179016, - "micro_mean_score": 0.3313653136531366, "missing_tasks": [] }, "overall_score": 0.26151892014616823 diff --git a/static/eval_results/SI/Phi-3.5-vision/summary_results.json b/static/eval_results/SI/Phi-3.5-vision/summary_results.json index b7443fff17692c8d16a8171ae077403abf95772d..847a379e7361bfb1cd61e50f5cf1368ba6ee1118 100644 --- a/static/eval_results/SI/Phi-3.5-vision/summary_results.json +++ b/static/eval_results/SI/Phi-3.5-vision/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.2561274958722834, - "micro_mean_score": 0.2504214576875906, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.4272267419054576, - "micro_mean_score": 0.445879458794588, "missing_tasks": [] }, "overall_score": 0.2789407286767066 diff --git a/static/eval_results/SI/Pixtral_12B/summary_results.json b/static/eval_results/SI/Pixtral_12B/summary_results.json index cf0689e6dbc300e655b0ab20bbcac39388d1c437..0c03664f175bf64c62204d412dd7ad05566f89b2 100644 --- a/static/eval_results/SI/Pixtral_12B/summary_results.json +++ b/static/eval_results/SI/Pixtral_12B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.3436942439614412, - "micro_mean_score": 0.3373564384613738, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.4417271955536318, - "micro_mean_score": 0.4845633456334564, "missing_tasks": [] }, "overall_score": 0.3567653041737333 diff --git a/static/eval_results/SI/Qwen2_VL_2B/summary_results.json b/static/eval_results/SI/Qwen2_VL_2B/summary_results.json index 9b971e81cae22201b809b20a03940d5a8fa91adb..1709d5a3f31d0203ada3f95070f0a56883170019 100644 --- a/static/eval_results/SI/Qwen2_VL_2B/summary_results.json +++ b/static/eval_results/SI/Qwen2_VL_2B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.22787906973244856, - "micro_mean_score": 0.2234748515064842, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.3509364634962041, - "micro_mean_score": 0.3768757687576875, "missing_tasks": [] }, "overall_score": 0.24428672223428263 diff --git a/static/eval_results/SI/Qwen2_VL_72B/summary_results.json b/static/eval_results/SI/Qwen2_VL_72B/summary_results.json index ede9d54993b54c73ddf7fd14fa46ff74244d04e5..543d0e7df8fb59051a55e264c6e7fc24a28e0531 100644 --- a/static/eval_results/SI/Qwen2_VL_72B/summary_results.json +++ b/static/eval_results/SI/Qwen2_VL_72B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.4730536307784527, - "micro_mean_score": 0.4659830915476831, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.5510079982505317, - "micro_mean_score": 0.5826568265682657, "missing_tasks": [] }, "overall_score": 0.48344754644139654 diff --git a/static/eval_results/SI/Qwen2_VL_7B/summary_results.json b/static/eval_results/SI/Qwen2_VL_7B/summary_results.json index c602e711d4391693f5f1065275958be22caa3a30..f09914f408bc7f742c3db855e501365fa09f4558 100644 --- a/static/eval_results/SI/Qwen2_VL_7B/summary_results.json +++ b/static/eval_results/SI/Qwen2_VL_7B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.3538656561495699, - "micro_mean_score": 0.34581250459157137, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.4517429592549692, - "micro_mean_score": 0.4730012300123002, "missing_tasks": [] }, "overall_score": 0.3669159632302898 diff --git a/static/eval_results/SI/SmolVLM/summary_results.json b/static/eval_results/SI/SmolVLM/summary_results.json index 97be21070ed94838e45c9cd7983b884ba1236b63..ed6ee02267cb206119a1ba694ab8999afa6b3949 100644 --- a/static/eval_results/SI/SmolVLM/summary_results.json +++ b/static/eval_results/SI/SmolVLM/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.07348385181460795, - "micro_mean_score": 0.0732694668402814, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.2427337975725658, - "micro_mean_score": 0.2504920049200492, "missing_tasks": [] }, "overall_score": 0.09605051124900234 diff --git a/static/eval_results/SI/llava_onevision_72B/summary_results.json b/static/eval_results/SI/llava_onevision_72B/summary_results.json index c67b592ab42608c0b29d97657f792f219073bd8b..b20ad6721edd9fae45db594487c5b1f587967ad4 100644 --- a/static/eval_results/SI/llava_onevision_72B/summary_results.json +++ b/static/eval_results/SI/llava_onevision_72B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.312618242621264, - "micro_mean_score": 0.3098623876487132, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.4425822460912829, - "micro_mean_score": 0.47539975399754, "missing_tasks": [] }, "overall_score": 0.32994677641726655 diff --git a/static/eval_results/SI/llava_onevision_7B/summary_results.json b/static/eval_results/SI/llava_onevision_7B/summary_results.json index 0a8707a14e183e8e3fb3dbb2232b4d1fca07b301..27b6e63d4a3e9c602546195539b2c343af4cf99a 100644 --- a/static/eval_results/SI/llava_onevision_7B/summary_results.json +++ b/static/eval_results/SI/llava_onevision_7B/summary_results.json @@ -5,7 +5,6 @@ "num_eval_samples": 4116, "num_not_eval_samples": 0, "macro_mean_score": 0.23683339637631812, - "micro_mean_score": 0.23283041278687175, "missing_tasks": [] }, "open": { @@ -13,7 +12,6 @@ "num_eval_samples": 813, "num_not_eval_samples": 0, "macro_mean_score": 0.3871602360316429, - "micro_mean_score": 0.4113161131611316, "missing_tasks": [] }, "overall_score": 0.25687697499702805 diff --git a/utils.py b/utils.py index a33c572a63f4bd4b13e5ceeb7b3636fde093d31d..0fe74b03137cd943088ba8058ebf156703d9de77 100644 --- a/utils.py +++ b/utils.py @@ -33,6 +33,8 @@ MODEL_NAME_MAP = { "Mammoth_VL": "Mammoth-VL-8B", "SmolVLM": "SmolVLM-1.7B", "POINTS_15_7B": "POINTS-1.5-8B", + "InternVL2_5_78B": "InternVL2.5-78B", + "InternVL2_5_2B": "InternVL2.5-2B", } DIMENSION_NAME_MAP = { @@ -125,12 +127,12 @@ class BaseDataLoader: # Define the base MODEL_GROUPS structure BASE_MODEL_GROUPS = { "All": list(MODEL_NAME_MAP.keys()), - "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B'], - "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B"], + "Flagship Models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002', 'Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', 'Molmo_72B', 'InternVL2_5_78B'], + "Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini', 'Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B"], "Proprietary Flagship models": ['Claude_3.5_new', 'GPT_4o', 'Claude_3.5', 'Gemini_1.5_pro_002'], "Proprietary Efficiency Models": ['Gemini_1.5_flash_002', 'GPT_4o_mini'], - "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B"], - "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B",] + "Open-source Flagship Models": ['Qwen2_VL_72B', 'InternVL2_76B', 'llava_onevision_72B', 'NVLM', "Molmo_72B", "InternVL2_5_78B"], + "Open-source Efficiency Models": ['Qwen2_VL_7B', 'Pixtral_12B', 'Aria', 'InternVL2_8B', 'Phi-3.5-vision', 'MiniCPM_v2.6', 'llava_onevision_7B', 'Llama_3_2_11B', 'Idefics3', 'Molmo_7B_D', "Aquila_VL_2B", "POINTS_7B", "Qwen2_VL_2B", "InternVL2_2B", "InternVL2_5_2B"] } def __init__(self):