liaojiajia committed on
Commit
9ec00c3
·
1 Parent(s): 5740e03

add mm results

Browse files
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import abc
2
  import gradio as gr
3
  import os
 
4
 
5
  from gen_table import *
6
  from meta_data import *
@@ -242,6 +243,112 @@ with gr.Blocks(title="Open Agent Leaderboard") as demo:
242
  outputs=data_component
243
  )
244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  with gr.Row():
247
  with gr.Accordion("📙 Citation", open=False):
 
1
  import abc
2
  import gradio as gr
3
  import os
4
+ import pandas as pd
5
 
6
  from gen_table import *
7
  from meta_data import *
 
243
  outputs=data_component
244
  )
245
 
246
# NOTE(review): the hunk header shows this tab lives inside
# `with gr.Blocks(title="Open Agent Leaderboard") as demo:` -- re-indent this
# block to that nesting level when applying it to app.py.
with gr.Tab(label='🏅 Open Agent Multi-Modal Leaderboard'):
    gr.Markdown(LEADERBOARD_MD['MULTI_MODAL_MAIN'])
    struct_multi_modal = load_results(MULTIMODAL_SCORE_FILE)
    timestamp = struct_multi_modal['time']
    EVAL_TIME_MM = format_timestamp(timestamp)

    # Use BUILD_L3_DF to process multi-modal results (pass the list directly)
    table_mm, check_box_mm = BUILD_L3_DF(
        struct_multi_modal['multi_modal_results'], DEFAULT_MULTI_MODAL_BENCH
    )

    # Save the complete table as a CSV file
    csv_path_multi_modal = os.path.join(os.getcwd(), 'src/multi_modal_results.csv')
    table_mm.to_csv(csv_path_multi_modal, index=False)
    print(f"Multi-modal results saved to {csv_path_multi_modal}")

    type_map_mm = check_box_mm['type_map']

    def _visible_headers_mm(fields):
        """Return the ordered, de-duplicated display columns present in table_mm.

        'Rank' and the essential columns are always shown first. dict.fromkeys
        drops repeats (e.g. when the user also ticks 'Rank') while keeping
        order, so the DataFrame never receives the same column twice.
        """
        wanted = ['Rank'] + check_box_mm['essential'] + list(fields or [])
        return [h for h in dict.fromkeys(wanted) if h in table_mm.columns]

    # Only pre-select values that are real choices; the essential columns are
    # always displayed and are not offered as checkboxes.
    default_fields_mm = [
        f for f in check_box_mm['required'] if f in check_box_mm['all']
    ]
    checkbox_group_mm = gr.CheckboxGroup(
        choices=check_box_mm['all'],
        value=default_fields_mm,
        label='Evaluation Dimension',
        interactive=True,
    )

    agent_choices_mm = table_mm['Agent'].unique().tolist()
    agent_name_mm = gr.CheckboxGroup(
        choices=agent_choices_mm,
        value=agent_choices_mm,
        label='Agent',
        interactive=True
    )

    vlm_choices_mm = table_mm['VLMs'].unique().tolist()
    vlm_name_mm = gr.CheckboxGroup(
        choices=vlm_choices_mm,
        value=vlm_choices_mm,
        label='VLMs',
        interactive=True
    )

    available_headers_mm = _visible_headers_mm(default_fields_mm)

    data_component_mm = gr.components.DataFrame(
        value=table_mm[available_headers_mm],
        type='pandas',
        datatype=[type_map_mm[x] for x in available_headers_mm],
        interactive=False,
        wrap=True,
        visible=True
    )

    def filter_df_mm(fields, agents, vlms, *args):
        """Rebuild the leaderboard table for the current checkbox selections.

        Args:
            fields: Metric columns ticked in the 'Evaluation Dimension' group.
            agents: Agent names to keep; an empty selection keeps every agent.
            vlms: VLM names to keep; an empty selection keeps every VLM.
            *args: Ignored; accepted so extra event inputs do not break the call.

        Returns:
            A gr.components.DataFrame holding the filtered rows and columns.
        """
        # An empty selection means "no filter" rather than "hide everything".
        if not agents:
            agents = table_mm['Agent'].unique().tolist()
        if not vlms:
            vlms = table_mm['VLMs'].unique().tolist()

        # Vectorised membership test; also well-defined on an empty table.
        keep = table_mm['Agent'].isin(agents) & table_mm['VLMs'].isin(vlms)
        df = table_mm[keep]

        headers = _visible_headers_mm(fields)

        return gr.components.DataFrame(
            value=df[headers],
            type='pandas',
            datatype=[type_map_mm.get(col, 'str') for col in headers],
            interactive=False,
            wrap=True,
            visible=True
        )

    # Any of the three selectors re-renders the table from the same inputs.
    for control_mm in (checkbox_group_mm, agent_name_mm, vlm_name_mm):
        control_mm.change(
            fn=filter_df_mm,
            inputs=[checkbox_group_mm, agent_name_mm, vlm_name_mm],
            outputs=data_component_mm
        )
352
 
353
  with gr.Row():
354
  with gr.Accordion("📙 Citation", open=False):
gen_table.py CHANGED
@@ -97,14 +97,14 @@ def BUILD_L2_DF(results, fields):
97
  # Create DataFrame
98
  df = pd.DataFrame(res)
99
 
100
- # 获取所有唯一的 Algorithm LLM
101
  unique_algorithms = df['Algorithm'].unique().tolist()
102
  unique_llms = df['LLM'].unique().tolist()
103
 
104
  # Set checkbox configuration
105
  check_box = {}
106
- check_box['Algorithm_options'] = unique_algorithms # 添加 Algorithm 可选项
107
- check_box['LLM_options'] = unique_llms # 添加 LLM 可选项
108
 
109
  # Sort by Dataset and Score in descending order
110
  df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
@@ -183,7 +183,7 @@ def generate_table(results, fields):
183
  df = pd.concat([valid, missing])
184
  df = df.sort_values('Rank')
185
 
186
- # 重新排列列顺序
187
  columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
188
  for d in fields:
189
  columns.extend([f"{d}-Score", f"{d}-Cost($)"])
@@ -238,4 +238,72 @@ def generate_table_detail(results, fields):
238
  remaining_columns = [col for col in df.columns if col not in columns]
239
  df = df[columns + remaining_columns]
240
 
241
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  # Create DataFrame
98
  df = pd.DataFrame(res)
99
 
100
+ # Get all unique Algorithms and LLM
101
  unique_algorithms = df['Algorithm'].unique().tolist()
102
  unique_llms = df['LLM'].unique().tolist()
103
 
104
  # Set checkbox configuration
105
  check_box = {}
106
+ check_box['Algorithm_options'] = unique_algorithms # Add Algorithm Options
107
+ check_box['LLM_options'] = unique_llms # Add LLM option
108
 
109
  # Sort by Dataset and Score in descending order
110
  df = df.sort_values(['Dataset', 'Score'], ascending=[True, False])
 
183
  df = pd.concat([valid, missing])
184
  df = df.sort_values('Rank')
185
 
186
+ # Rearrange column order
187
  columns = ['Rank', 'Algorithm', 'LLM', 'Eval Date', 'Avg Score']
188
  for d in fields:
189
  columns.extend([f"{d}-Score", f"{d}-Cost($)"])
 
238
  remaining_columns = [col for col in df.columns if col not in columns]
239
  df = df[columns + remaining_columns]
240
 
241
+ return df
242
+
243
def generate_multi_modal_table(results, fields):
    """Build a ranked multi-modal results table.

    Args:
        results: Either a mapping whose values are per-run dicts, or an
            iterable of such dicts. Each dict may carry 'Agent', 'VLMs' and
            the metric keys named in ``fields``.
        fields: Metric column names to extract from every entry.

    Returns:
        A DataFrame with columns ``['Rank', 'Agent', 'VLMs', *fields]``.
        Rows are sorted by 'Score' (descending) when that column is requested;
        'Rank' numbers the rows 1..N in the resulting order.
    """
    fields = list(fields)
    entries = results.values() if isinstance(results, dict) else results

    res = defaultdict(list)
    for entry in entries:
        # Add Agent and VLMs
        res['Agent'].append(entry.get('Agent', 'Unknown'))
        res['VLMs'].append(entry.get('VLMs', 'Unknown'))

        # Add numeric fields
        for field in fields:
            res[field].append(entry.get(field, None))

    # Passing the column list explicitly keeps the schema intact even when
    # there are no entries (otherwise the frame would have no columns at all).
    df = pd.DataFrame(dict(res), columns=['Agent', 'VLMs'] + fields)

    # Sort by Score in descending order (only possible if Score was requested)
    if 'Score' in df.columns:
        df = df.sort_values('Score', ascending=False)

    # Add Rank column
    df['Rank'] = range(1, len(df) + 1)

    # Rearrange column order
    columns = ['Rank', 'Agent', 'VLMs'] + fields
    return df[columns]
268
+
269
def BUILD_L3_DF(results, fields):
    """Build the multi-modal leaderboard table and its checkbox configuration.

    Args:
        results: Iterable of per-run dicts (the 'multi_modal_results' list).
            Each dict may carry 'Agent', 'VLMs' and the keys in ``fields``.
        fields: Metric column names to extract from every entry.

    Returns:
        A ``(df, check_box)`` tuple. ``df`` has columns
        ``['Rank', 'Agent', 'VLMs', *fields]``, sorted by 'Score' descending
        when that column is requested. ``check_box`` holds:
          - 'essential': columns that are always shown,
          - 'required': metric columns selected by default (a subset of 'all'),
          - 'all': every selectable column,
          - 'type_map': column name -> 'str' / 'number' for the UI table.
    """
    fields = list(fields)
    res = defaultdict(list)

    # Iterate over each entry in the multi-modal results (results is a list)
    for entry in results:
        # Add Agent and VLMs
        res['Agent'].append(entry.get('Agent', 'Unknown'))
        res['VLMs'].append(entry.get('VLMs', 'Unknown'))

        # Add numeric fields
        for field in fields:
            res[field].append(entry.get(field, None))

    # Passing the column list explicitly keeps the schema intact even when
    # there are no entries (otherwise the frame would have no columns at all).
    df = pd.DataFrame(dict(res), columns=['Agent', 'VLMs'] + fields)

    # Sort by Score in descending order (only possible if Score was requested)
    if 'Score' in df.columns:
        df = df.sort_values('Score', ascending=False)

    # Add Rank column
    df['Rank'] = range(1, len(df) + 1)

    # Rearrange column order
    columns = ['Rank', 'Agent', 'VLMs'] + fields
    df = df[columns]

    # Set checkbox configuration. 'required' must only contain values that are
    # also in 'all': it is used as the pre-selected value of a checkbox group
    # whose choices are 'all', and the essential columns are shown regardless.
    check_box = {}
    check_box['essential'] = ['Agent', 'VLMs']
    check_box['required'] = list(fields)
    check_box['all'] = ['Rank'] + fields

    # Anything that is not an identifying text column is rendered as a number.
    type_map = defaultdict(lambda: 'number')
    type_map['Agent'] = 'str'
    type_map['VLMs'] = 'str'
    type_map['Rank'] = 'number'
    for field in fields:
        type_map[field] = 'number'
    check_box['type_map'] = type_map

    return df, check_box
meta_data.py CHANGED
@@ -1,12 +1,13 @@
1
  # CONSTANTS-URL
2
  OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
3
  DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
 
4
 
5
  # CONSTANTS-TEXT
6
  LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
7
  ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, ToT, etc. The agents are implemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
8
 
9
- We are excited to announce that the paper "Unifying Language Agent Algorithms with Graph-based Orchestration Engine for Reproducible Agent Research" has been accepted to ACL 2025 Systems Demonstration Track! 🎉
10
 
11
  This leaderboard was last updated: {}.
12
 
@@ -18,6 +19,9 @@ DEFAULT_MATH_BENCH = [
18
  'gsm8k', 'AQuA', 'MATH-500',
19
  ]
20
 
 
 
 
21
  # The README file for each benchmark
22
  LEADERBOARD_MD = {}
23
 
@@ -69,6 +73,19 @@ LEADERBOARD_MD['MATH_DETAIL'] = f"""
69
  - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
70
  """
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  META_FIELDS = [
73
  'Algorithm', 'LLM', 'Eval Date'
74
  ]
 
1
  # CONSTANTS-URL
2
  OVERALL_MATH_SCORE_FILE = "src/overall_math_score.json"
3
  DETAIL_MATH_SCORE_FILE = "src/detail_math_score.json"
4
+ MULTIMODAL_SCORE_FILE = "src/multi_modal_results.json"
5
 
6
  # CONSTANTS-TEXT
7
  LEADERBORAD_INTRODUCTION = """# Open Agent Leaderboard
8
  ### Welcome to the Open Agent Leaderboard! We share the evaluation results of open agents: CoT, SC-CoT, PoT, ReAct, ToT, etc. The agents are implemented by the OpenSource Framework: [*OmAgent*](https://github.com/om-ai-lab/OmAgent)
9
 
10
+ We are excited to announce that the paper "Unifying Language Agent Algorithms with Graph-based Orchestration Engine for Reproducible Agent Research" has been accepted to ACL 2025 Systems Demonstration Track! [*Paper*](https://arxiv.org/abs/2505.24354) 🎉
11
 
12
  This leaderboard was last updated: {}.
13
 
 
19
  'gsm8k', 'AQuA', 'MATH-500',
20
  ]
21
 
22
+ DEFAULT_MULTI_MODAL_BENCH = ['Score', 'Pass Rate', 'Total Input Tokens', 'Total Output Tokens', 'All Tokens']
23
+
24
+
25
  # The README file for each benchmark
26
  LEADERBOARD_MD = {}
27
 
 
73
  - ReAct-Pro*: We modified ReAct to ReAct-Pro, following the Reflexion repository. Implementation details can be found in the [*OmAgent*](https://github.com/om-ai-lab/OmAgent) repository.
74
  """
75
 
76
+ LEADERBOARD_MD['MULTI_MODAL_MAIN'] = f"""
77
+ ## Multi-modal task main Evaluation Results
78
+
79
+ - Metrics:
80
+ - Score: The evaluation score on each benchmark (the higher the better).
81
+ - Pass rate: The percentage of responses that are valid, where a response is valid if it is neither empty nor null.
82
+
83
+ - By default, we present the overall evaluation results based on MME-RealWorld, sorted by the descending order of Score.
84
+
85
+ - IO (Input-Output): The baseline method that directly prompts the model with the question and expects an answer without any intermediate reasoning steps.
86
+
87
+ """
88
+
89
  META_FIELDS = [
90
  'Algorithm', 'LLM', 'Eval Date'
91
  ]
preprocess.py CHANGED
@@ -174,7 +174,50 @@ def process_csv_to_overall_json():
174
  with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
175
  json.dump(result, f, indent=4, ensure_ascii=False)
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  if __name__ == "__main__":
178
- # Generate JSON files in two formats
179
  process_csv_to_json()
180
- process_csv_to_overall_json()
 
 
174
  with open('src/overall_math_score.json', 'w', encoding='utf-8') as f:
175
  json.dump(result, f, indent=4, ensure_ascii=False)
176
 
177
def process_multi_modal_csv(csv_path='src/multi-modal.csv',
                            json_path='src/multi_modal_results.json'):
    """Convert the multi-modal results CSV into the leaderboard JSON file.

    The CSV is expected to use spreadsheet-style text escaping, where every
    cell (and header) looks like ``="value"``. That wrapper is removed, the
    numeric columns are parsed, and the rows are written to ``json_path``
    under 'multi_modal_results' together with a 'time' stamp.

    Args:
        csv_path: Input CSV path. Defaults to the repository location.
        json_path: Output JSON path. Defaults to the repository location.
    """
    def strip_cell_wrapper(value):
        # Only strings can carry the ="..." wrapper; leave NaN/numbers alone.
        if isinstance(value, str):
            return value.replace('="', '').replace('"', '').strip()
        return value

    def is_missing(value):
        return pd.isna(value) or value == '-'

    def parse_float(value):
        # Tolerate thousands separators, same as the token columns.
        return float(str(value).replace(',', ''))

    # Helper function to parse numbers with commas
    def parse_number(value):
        # NOTE(review): a missing count ('-') is stored as 0, so "not reported"
        # is indistinguishable from a real zero downstream -- confirm intended.
        if is_missing(value):
            return 0
        return int(parse_float(value))

    # Read the CSV file
    df = pd.read_csv(csv_path, skipinitialspace=True)

    # Clean the header names and every string cell. Series.map is applied per
    # column because DataFrame.applymap is deprecated in recent pandas.
    df.columns = [strip_cell_wrapper(name) for name in df.columns]
    for column in df.columns:
        df[column] = df[column].map(strip_cell_wrapper)

    # Process numeric fields
    df['Score'] = df['Score'].apply(
        lambda x: 0.0 if is_missing(x) else round(parse_float(x), 2)
    )
    # Pass Rate arrives as a percentage and is stored as a 0-1 fraction.
    df['Pass Rate'] = df['Pass Rate'].apply(
        lambda x: 0.0 if is_missing(x) else round(parse_float(x) / 100, 4)
    )
    for column in ('Total Input Tokens', 'Total Output Tokens', 'All Tokens'):
        df[column] = df[column].apply(parse_number)

    # Convert to Hugging Face-compatible format
    result = {
        "time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "multi_modal_results": df.to_dict(orient='records')
    }

    # Save as JSON file
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
218
+
219
  if __name__ == "__main__":
220
+ # Generate JSON files in three formats
221
  process_csv_to_json()
222
+ process_csv_to_overall_json()
223
+ process_multi_modal_csv()
src/detail_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-03-05 13:15:02",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
 
1
  {
2
+ "time": "2025-06-25 18:17:55",
3
  "results": {
4
  "IO": {
5
  "gpt-3.5-turbo": {
src/multi-modal.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ "=""Agent""","=""VLMs""","=""Score""","=""Pass Rate""","=""Total Input Tokens""","=""Total Output Tokens""","=""All Tokens""
2
+ "=""ZoomEye""","=""Qwen2.5-VL-72B-Instruct""","=""51.56""","=""99.81""","=""76,808,965""","=""1,276,460""","=""78,085,425""
3
+ "=""ZoomEye""","=""Qwen2.5-VL-7B-Instruct""","=""48.06""","=""96.50""","=""94,418,593""","=""1,472,836""","=""95,891,429""
4
+ "=""IO""","=""Qwen2.5-VL-72B-Instruct""","=""44.47""","=""100.00""","=""6,174,490""","=""2,114""","=""6,176,604""
5
+ "=""ZoomEye""","=""InternVL2.5-8B""","=""43.42""","=""99.34""","=""153,857,588""","=""2,017,170""","=""155,874,758""
6
+ "=""IO""","=""InternVL2.5-8B""","=""42.95""","=""100.00""","=""2,779,778""","=""2,335""","=""2,782,113""
7
+ "=""IO""","=""Qwen2.5-VL-7B-Instruct""","=""42.86""","=""100.00""","=""6,174,490""","=""2,114""","=""6,176,604""
8
+ "=""ZoomEye""","=""Llava-v1.5-7B""","=""31.60""","=""98.86""","=""113,073,261""","=""1,368,724""","=""114,441,985""
9
+ "=""IO""","=""Llava-v1.5-7B""","=""24.79""","=""100.00""","=""734,868""","=""17,036""","=""751,904""
10
+ "=""V*""","=""seal_vqa & seal_vsm""","=""15.14""","=""72.37""","=""-""","=""-""","=""-"""
src/multi_modal_results.csv ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Rank,Agent,VLMs,Score,Pass Rate,Total Input Tokens,Total Output Tokens,All Tokens
2
+ 1,ZoomEye,Qwen2.5-VL-72B-Instruct,51.56,0.9981,76808965,1276460,78085425
3
+ 2,ZoomEye,Qwen2.5-VL-7B-Instruct,48.06,0.965,94418593,1472836,95891429
4
+ 3,IO,Qwen2.5-VL-72B-Instruct,44.47,1.0,6174490,2114,6176604
5
+ 4,ZoomEye,InternVL2.5-8B,43.42,0.9934,153857588,2017170,155874758
6
+ 5,IO,InternVL2.5-8B,42.95,1.0,2779778,2335,2782113
7
+ 6,IO,Qwen2.5-VL-7B-Instruct,42.86,1.0,6174490,2114,6176604
8
+ 7,ZoomEye,Llava-v1.5-7B,31.6,0.9886,113073261,1368724,114441985
9
+ 8,IO,Llava-v1.5-7B,24.79,1.0,734868,17036,751904
10
+ 9,V*,seal_vqa & seal_vsm,15.14,0.7237,0,0,0
src/multi_modal_results.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "time": "2025-06-25 18:17:55",
3
+ "multi_modal_results": [
4
+ {
5
+ "Agent": "ZoomEye",
6
+ "VLMs": "Qwen2.5-VL-72B-Instruct",
7
+ "Score": 51.56,
8
+ "Pass Rate": 0.9981,
9
+ "Total Input Tokens": 76808965,
10
+ "Total Output Tokens": 1276460,
11
+ "All Tokens": 78085425
12
+ },
13
+ {
14
+ "Agent": "ZoomEye",
15
+ "VLMs": "Qwen2.5-VL-7B-Instruct",
16
+ "Score": 48.06,
17
+ "Pass Rate": 0.965,
18
+ "Total Input Tokens": 94418593,
19
+ "Total Output Tokens": 1472836,
20
+ "All Tokens": 95891429
21
+ },
22
+ {
23
+ "Agent": "IO",
24
+ "VLMs": "Qwen2.5-VL-72B-Instruct",
25
+ "Score": 44.47,
26
+ "Pass Rate": 1.0,
27
+ "Total Input Tokens": 6174490,
28
+ "Total Output Tokens": 2114,
29
+ "All Tokens": 6176604
30
+ },
31
+ {
32
+ "Agent": "ZoomEye",
33
+ "VLMs": "InternVL2.5-8B",
34
+ "Score": 43.42,
35
+ "Pass Rate": 0.9934,
36
+ "Total Input Tokens": 153857588,
37
+ "Total Output Tokens": 2017170,
38
+ "All Tokens": 155874758
39
+ },
40
+ {
41
+ "Agent": "IO",
42
+ "VLMs": "InternVL2.5-8B",
43
+ "Score": 42.95,
44
+ "Pass Rate": 1.0,
45
+ "Total Input Tokens": 2779778,
46
+ "Total Output Tokens": 2335,
47
+ "All Tokens": 2782113
48
+ },
49
+ {
50
+ "Agent": "IO",
51
+ "VLMs": "Qwen2.5-VL-7B-Instruct",
52
+ "Score": 42.86,
53
+ "Pass Rate": 1.0,
54
+ "Total Input Tokens": 6174490,
55
+ "Total Output Tokens": 2114,
56
+ "All Tokens": 6176604
57
+ },
58
+ {
59
+ "Agent": "ZoomEye",
60
+ "VLMs": "Llava-v1.5-7B",
61
+ "Score": 31.6,
62
+ "Pass Rate": 0.9886,
63
+ "Total Input Tokens": 113073261,
64
+ "Total Output Tokens": 1368724,
65
+ "All Tokens": 114441985
66
+ },
67
+ {
68
+ "Agent": "IO",
69
+ "VLMs": "Llava-v1.5-7B",
70
+ "Score": 24.79,
71
+ "Pass Rate": 1.0,
72
+ "Total Input Tokens": 734868,
73
+ "Total Output Tokens": 17036,
74
+ "All Tokens": 751904
75
+ },
76
+ {
77
+ "Agent": "V*",
78
+ "VLMs": "seal_vqa & seal_vsm",
79
+ "Score": 15.14,
80
+ "Pass Rate": 0.7237,
81
+ "Total Input Tokens": 0,
82
+ "Total Output Tokens": 0,
83
+ "All Tokens": 0
84
+ }
85
+ ]
86
+ }
src/overall_math_score.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "time": "2025-03-05 13:15:02",
3
  "results": {
4
  "IO": {
5
  "META": {
 
1
  {
2
+ "time": "2025-06-25 18:17:55",
3
  "results": {
4
  "IO": {
5
  "META": {