Spaces:
Running
Running
Commit
·
a325fdc
1
Parent(s):
c52f9e4
Updating Metric
Browse files
src/data/metrics/absolute_improvement_to_baseline.json
CHANGED
@@ -60,12 +60,12 @@
|
|
60 |
"MLAB (llama3-1-405b-instruct)": 3.8
|
61 |
},
|
62 |
"backdoor-trigger-recovery": {
|
63 |
-
"MLAB (gpt-4o)": 74.0,
|
64 |
-
"Top Human in Competition": 621.3,
|
65 |
"CoI-Agent (o1) + MLAB (gpt-4o)": 24.9,
|
|
|
66 |
"MLAB (claude-3-5-sonnet-v2)": 247.9,
|
67 |
"MLAB (gemini-exp-1206)": 80.4,
|
68 |
"MLAB (o3-mini)": 38.8,
|
|
|
69 |
"MLAB (llama3-1-405b-instruct)": 71.7,
|
70 |
"Human Idea + MLAB (gpt-4o)": 54.5
|
71 |
}
|
|
|
60 |
"MLAB (llama3-1-405b-instruct)": 3.8
|
61 |
},
|
62 |
"backdoor-trigger-recovery": {
|
|
|
|
|
63 |
"CoI-Agent (o1) + MLAB (gpt-4o)": 24.9,
|
64 |
+
"Top Human in Competition": 621.3,
|
65 |
"MLAB (claude-3-5-sonnet-v2)": 247.9,
|
66 |
"MLAB (gemini-exp-1206)": 80.4,
|
67 |
"MLAB (o3-mini)": 38.8,
|
68 |
+
"MLAB (gpt-4o)": 64.5,
|
69 |
"MLAB (llama3-1-405b-instruct)": 71.7,
|
70 |
"Human Idea + MLAB (gpt-4o)": 54.5
|
71 |
}
|
src/data/metrics/relative_improvement_to_human.json
CHANGED
@@ -60,12 +60,12 @@
|
|
60 |
"MLAB (llama3-1-405b-instruct)": 6.2
|
61 |
},
|
62 |
"backdoor-trigger-recovery": {
|
63 |
-
"MLAB (gpt-4o)": 11.9,
|
64 |
-
"Top Human in Competition": 100.0,
|
65 |
"CoI-Agent (o1) + MLAB (gpt-4o)": 4.0,
|
|
|
66 |
"MLAB (claude-3-5-sonnet-v2)": 39.9,
|
67 |
"MLAB (gemini-exp-1206)": 12.9,
|
68 |
"MLAB (o3-mini)": 6.2,
|
|
|
69 |
"MLAB (llama3-1-405b-instruct)": 11.5,
|
70 |
"Human Idea + MLAB (gpt-4o)": 8.8
|
71 |
}
|
|
|
60 |
"MLAB (llama3-1-405b-instruct)": 6.2
|
61 |
},
|
62 |
"backdoor-trigger-recovery": {
|
|
|
|
|
63 |
"CoI-Agent (o1) + MLAB (gpt-4o)": 4.0,
|
64 |
+
"Top Human in Competition": 100.0,
|
65 |
"MLAB (claude-3-5-sonnet-v2)": 39.9,
|
66 |
"MLAB (gemini-exp-1206)": 12.9,
|
67 |
"MLAB (o3-mini)": 6.2,
|
68 |
+
"MLAB (gpt-4o)": 10.4,
|
69 |
"MLAB (llama3-1-405b-instruct)": 11.5,
|
70 |
"Human Idea + MLAB (gpt-4o)": 8.8
|
71 |
}
|