Upload 7 files

- README.md +40 -0
- about.md +25 -0
- app.py +210 -0
- gen_table.py +142 -0
- meta_data.py +54 -0
- requirements.txt +3 -0
- results.json +136 -0
README.md
ADDED
@@ -0,0 +1,40 @@
---
title: Cybersecurity Leaderboard
emoji: 🌎
colorFrom: blue
colorTo: green
sdk: gradio
app_file: app.py
pinned: true
license: apache-2.0
tags:
- leaderboard
short_description: 'Cybersecurity Capability Evaluation Results Collection'
sdk_version: 4.44.1
---

## Submit your benchmark

This leaderboard is a collection of cybersecurity-relevant benchmarks. To submit your benchmark, please use this form: https://docs.google.com/forms/d/e/1FAIpQLSd0arYQ0xy9FpGbXwu68rAFpCm0HNb-8ZK8Mma3Ru2oa2Astg/viewform. We update the leaderboard regularly.

## Paper & Blog

Paper: https://arxiv.org/abs/2504.05408
Blog: https://rdi.berkeley.edu/frontier-ai-impact-on-cybersecurity/

## Survey

We are also launching an expert survey on this topic. We invite all AI and security researchers and practitioners to take the survey here: https://berkeley.qualtrics.com/jfe/form/SV_6zmYIqEyv7bfOrs

## Citation

Please consider citing the report if this resource is useful to your research:

```bibtex
@article{guo2025sok,
  title={{Frontier AI's Impact on the Cybersecurity Landscape}},
  author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
  journal={arXiv preprint arXiv:2504.05408},
  year={2025}
}
```
about.md
ADDED
@@ -0,0 +1,25 @@
## Submit your benchmark

This leaderboard is a collection of cybersecurity-relevant benchmarks. To submit your benchmark, please use this form: https://docs.google.com/forms/d/e/1FAIpQLSd0arYQ0xy9FpGbXwu68rAFpCm0HNb-8ZK8Mma3Ru2oa2Astg/viewform. We update the leaderboard regularly.

## Paper & Blog

Paper: https://arxiv.org/abs/2504.05408
Blog: https://rdi.berkeley.edu/frontier-ai-impact-on-cybersecurity/

## Survey

We are also launching an expert survey on this topic. We invite all AI and security researchers and practitioners to take the survey here: https://berkeley.qualtrics.com/jfe/form/SV_6zmYIqEyv7bfOrs

## Citation

Please consider citing the report if this resource is useful to your research:

```bibtex
@article{guo2025sok,
  title={{Frontier AI's Impact on the Cybersecurity Landscape}},
  author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
  journal={arXiv preprint arXiv:2504.05408},
  year={2025}
}
```
app.py
ADDED
@@ -0,0 +1,210 @@
import copy as cp
from types import SimpleNamespace

import gradio as gr

from gen_table import *
from meta_data import *

# import pandas as pd
# pd.set_option('display.max_colwidth', 0)

head_style = """
<style>
@media (min-width: 1536px)
{
    .gradio-container {
        min-width: var(--size-full) !important;
    }
}
</style>
"""

TAB_CSS = """
/* 1. Target the real tab-list container (old & new class names + role attr) */
#leaderboard_tabs [role="tablist"],
#leaderboard_tabs .gradio-tabs-tablist,
#leaderboard_tabs .tab-container[role="tablist"] {
    display: flex !important;
    flex-wrap: wrap !important;       /* allow multi-row */
    white-space: normal !important;   /* cancel nowrap */
    overflow-x: visible !important;   /* don't clip off */
    height: auto !important;          /* grow as tall as needed */
    max-width: none !important;       /* cancel any max-width */
}

/* 2. Stop each button from flexing */
#leaderboard_tabs [role="tab"],
#leaderboard_tabs .tab-container[role="tablist"] .tab-button,
#leaderboard_tabs .gradio-tabs-tab {
    flex: none !important;
}

/* 3. Hide every possible "more/overflow" toggle */
#leaderboard_tabs .overflow-menu,
#leaderboard_tabs [class*="overflow-button"],
#leaderboard_tabs button[aria-label*="More"],
#leaderboard_tabs .gradio-tabs-overflow,
#leaderboard_tabs .gradio-tabs-overflow-button {
    display: none !important;
}
"""

with gr.Blocks(title="Cybersecurity Leaderboard", head=head_style, css=TAB_CSS) as demo:
    struct = load_results()
    timestamp = struct['time']
    EVAL_TIME = format_timestamp(timestamp)
    results = struct['results']

    # Collect the task / benchmark / model universe from the results file.
    model_list = []
    task_list = []
    benchmark_list = []
    for task in results:
        task_list += [task]
        for benchmark in results[task]:
            if benchmark != 'category':
                benchmark_list += [benchmark]
                model_list += list(results[task][benchmark].keys())

    model_list = list(set(model_list))
    N_MODEL = len(model_list)
    N_TASK = len(task_list)
    N_DATA = len(list(set(benchmark_list)))
    DATASETS = benchmark_list

    gr.Markdown(LEADERBOARD_INTRODUCTION.format(N_DATA, N_TASK, EVAL_TIME))
    # One mutable attribute container per task tab.
    structs = [SimpleNamespace() for _ in range(N_TASK)]

    with gr.Tabs(elem_id="leaderboard_tabs", elem_classes='tab-buttons') as tabs:
        with gr.TabItem('🏅 Cybersecurity Main Leaderboard', elem_id='main', id=0):
            gr.Markdown(LEADERBOARD_MD['MAIN'].format(N_DATA, N_DATA))
            _, check_box = BUILD_L1_DF(results, DEFAULT_TASK)
            table = generate_table(results, DEFAULT_TASK)

            type_map = check_box['type_map']

            checkbox_group = gr.CheckboxGroup(
                choices=check_box['all'],
                value=check_box['required'],
                label='Aspects of Cybersecurity Work',
                interactive=True,
            )

            headers = check_box['essential'] + checkbox_group.value
            with gr.Row():
                model_name = gr.Textbox(
                    value='Input the Model Name (fuzzy, case insensitive)',
                    label='Model Name',
                    interactive=True,
                    visible=True)
            data_component = gr.components.DataFrame(
                value=table[headers],
                type='pandas',
                datatype=[type_map[x] for x in headers],
                interactive=False,
                wrap=True,
                visible=True)

            def filter_df(fields, model_name):
                headers = check_box['essential'] + fields
                df = generate_table(results, fields)

                default_val = 'Input the Model Name (fuzzy, case insensitive)'
                if model_name != default_val:
                    model_name = model_name.lower()
                    # Strip any HTML anchor wrapper, then do a case-insensitive substring match.
                    method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
                    flag = [model_name in name for name in method_names]
                    df['TEMP_FLAG'] = flag
                    df = df[df['TEMP_FLAG'] == True]
                    df.pop('TEMP_FLAG')

                comp = gr.components.DataFrame(
                    value=df[headers],
                    type='pandas',
                    datatype=[type_map[x] for x in headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                return comp

            checkbox_group.change(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
            model_name.submit(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)

        with gr.TabItem('🔍 About', elem_id='about', id=1):
            with open("about.md", 'r', encoding="utf-8") as file:
                gr.Markdown(file.read())

        for i, task in enumerate(task_list):
            with gr.TabItem(f'📊 {task} Leaderboard', elem_id=task, id=i + 2):
                if task in LEADERBOARD_MD:
                    gr.Markdown(LEADERBOARD_MD[task])

                s = structs[i]
                s.table, s.check_box = BUILD_L2_DF(results, task)
                s.type_map = s.check_box['type_map']

                s.checkbox_group = gr.CheckboxGroup(
                    choices=s.check_box['all'],
                    value=s.check_box['required'],
                    label=f'{task} CheckBoxes',
                    interactive=True,
                )
                s.headers = s.check_box['essential'] + s.checkbox_group.value

                with gr.Row():
                    s.model_name = gr.Textbox(
                        value='Input the Model Name (fuzzy, case insensitive)',
                        label='Model Name',
                        interactive=True,
                        visible=True)
                s.data_component = gr.components.DataFrame(
                    value=s.table[s.headers],
                    type='pandas',
                    datatype=[s.type_map[x] for x in s.headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                # Hidden textbox so the shared callback knows which task tab it serves.
                s.dataset = gr.Textbox(value=task, label=task, visible=False)

                def filter_df_l2(dataset_name, fields, model_name):
                    s = structs[task_list.index(dataset_name)]
                    headers = s.check_box['essential'] + fields
                    df = cp.deepcopy(s.table)
                    default_val = 'Input the Model Name (fuzzy, case insensitive)'
                    if model_name != default_val:
                        model_name = model_name.lower()
                        method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
                        flag = [model_name in name for name in method_names]
                        df['TEMP_FLAG'] = flag
                        df = df[df['TEMP_FLAG'] == True]
                        df.pop('TEMP_FLAG')

                    comp = gr.components.DataFrame(
                        value=df[headers],
                        type='pandas',
                        datatype=[s.type_map[x] for x in headers],
                        interactive=False,
                        wrap=True,
                        visible=True)
                    return comp

                s.checkbox_group.change(
                    fn=filter_df_l2,
                    inputs=[s.dataset, s.checkbox_group, s.model_name],
                    outputs=s.data_component)
                s.model_name.submit(
                    fn=filter_df_l2,
                    inputs=[s.dataset, s.checkbox_group, s.model_name],
                    outputs=s.data_component)

    with gr.Row():
        with gr.Accordion('Citation', open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button')

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0', share=True)
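The fuzzy model-name filter first strips any HTML anchor wrapper from a `Model` cell before substring matching. A minimal sketch of that parse (the linked cell below is a hypothetical example; results.json currently stores plain names):

```python
# How filter_df / filter_df_l2 normalize a 'Model' cell before matching:
# keep the text between the last '>' and '</a>' if the cell is a link,
# otherwise the cell itself, then lowercase it.
cells = [
    'GPT-4o',                                               # plain cell
    '<a href="https://example.com">Claude-3.5-Sonnet</a>',  # hypothetical linked cell
]
names = [x.split('</a>')[0].split('>')[-1].lower() for x in cells]
print(names)  # ['gpt-4o', 'claude-3.5-sonnet']
```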
gen_table.py
ADDED
@@ -0,0 +1,142 @@
import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_TASK


def listinstr(lst, s):
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results():
    # data = json.loads(urlopen(URL).read())
    with open('results.json', 'r') as file:
        data = json.load(file)
    return data


def nth_large(val, vals):
    return sum([1 for v in vals if v > val]) + 1


def format_timestamp(timestamp):
    # 'YYYYMMDD' -> 'YYYY.MM.DD'
    date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
    return date


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Model']
    # revise here to set the default tasks
    check_box['required'] = DEFAULT_TASK
    check_box['all'] = DEFAULT_TASK
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map

    df = generate_table(results, fields)
    return df, check_box


def BUILD_L2_DF(results, task):
    results = results[task]
    model_list = []
    benchmark_list = []
    all_fields = []
    for benchmark in results:
        if benchmark != 'category':
            benchmark_list += [benchmark]
            if benchmark not in ["CRUXEval", "AutoPenBench"]:
                all_fields += [benchmark]
            else:
                # These two report separate autonomous / assisted scores.
                all_fields += [benchmark + ' (autonomous)', benchmark + ' (assisted)']
            model_list += list(results[benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list

    for benchmark in benchmark_list:
        if benchmark not in ["CRUXEval", "AutoPenBench"]:
            for model in model_list:
                if model in results[benchmark]:
                    res[benchmark].append(results[benchmark][model])
                else:
                    res[benchmark].append(None)
        else:
            for model in model_list:
                if model in results[benchmark]:
                    res[benchmark + ' (autonomous)'].append(results[benchmark][model]['autonomous'])
                    res[benchmark + ' (assisted)'].append(results[benchmark][model]['assisted'])
                else:
                    # Guard against models missing from this benchmark.
                    res[benchmark + ' (autonomous)'].append(None)
                    res[benchmark + ' (assisted)'].append(None)

    df = pd.DataFrame(res)
    required_fields = all_fields

    check_box = {}
    check_box['essential'] = ['Model']
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map
    return df, check_box


def generate_table(results, fields):
    model_list = []
    task_list = fields
    benchmark_list = []
    for task in results:
        for benchmark in results[task]:
            if benchmark != 'category':
                benchmark_list += [benchmark]
                model_list += list(results[task][benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list

    average_score = {}
    cnt = {}
    for task in task_list:
        task_score = []
        for model in model_list:
            score = []
            for benchmark in results[task]:
                if benchmark != 'category':
                    if model not in results[task][benchmark]:
                        score.append(None)
                    elif not isinstance(results[task][benchmark][model], (int, float)):
                        # Dict-valued entries hold autonomous / assisted scores; average them.
                        score.append((results[task][benchmark][model]["autonomous"] + results[task][benchmark][model]["assisted"]) / 2)
                    else:
                        score.append(results[task][benchmark][model])
            if not any(item is not None for item in score):
                score = None
            else:
                score = np.mean([s for s in score if s is not None])
            if score is not None:
                # Running mean over the tasks for which the model has a score.
                if model not in average_score:
                    average_score[model] = score
                    cnt[model] = 1
                else:
                    average_score[model] = ((average_score[model] * cnt[model]) + score) / (cnt[model] + 1)
                    cnt[model] += 1
            task_score.append(score)
        res[task] = task_score

    # res['Avg Score'] = [average_score[model] for model in model_list]
    # res['Avg Rank'] = [sorted(res['Avg Score'], reverse=True).index(score) + 1 for score in res['Avg Score']]

    df = pd.DataFrame(res)
    # valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    # valid = valid.sort_values('Avg Score')
    # valid = valid.iloc[::-1]
    # if len(fields):
    #     missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
    #     missing = missing.iloc[::-1]
    # df = pd.concat([valid, missing])
    return df
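For a quick local sanity check of the two table builders (a minimal sketch, assuming `results.json` sits next to the script as it does in this repo, with the dependencies from requirements.txt installed):

```python
# Build the L1 (per-task) and L2 (per-benchmark) tables outside Gradio.
from gen_table import load_results, generate_table, BUILD_L2_DF
from meta_data import DEFAULT_TASK

struct = load_results()
results = struct['results']

# L1: one column per task; each cell is the mean over that task's benchmarks.
print(generate_table(results, DEFAULT_TASK))

# L2: one column per benchmark within a single task.
df, check_box = BUILD_L2_DF(results, 'CTF')
print(df[check_box['essential'] + check_box['all']])
```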
meta_data.py
ADDED
@@ -0,0 +1,54 @@
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@article{guo2025sok,
  title={{Frontier AI's Impact on the Cybersecurity Landscape}},
  author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
  journal={arXiv preprint arXiv:2504.05408},
  year={2025}
}
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# CONSTANTS-TEXT
LEADERBOARD_INTRODUCTION = """# Cybersecurity Leaderboard
### Welcome to the Cybersecurity Leaderboard! This leaderboard is a collection of benchmarks relevant to cybersecurity capabilities.
This leaderboard covers {} benchmarks across {} aspects of cybersecurity work.

This leaderboard was last updated: {} """
# CONSTANTS-FIELDS
# META_FIELDS = [
#     'Model'
# ]

DEFAULT_TASK = [
    'Vulnerable code generation', 'Attack generation', 'CTF', 'Cyber knowledge', 'Pen test', 'Vulnerability detection', 'PoC generation', 'Patching'
]
MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']

# The Markdown description shown above each leaderboard tab
LEADERBOARD_MD = {}

LEADERBOARD_MD['MAIN'] = """
## Main Evaluation Results

- Metrics:
  - Avg Score: The average score on {} Cybersecurity Benchmarks (normalized to 0 - 100; higher is better).
  - Avg Rank: The average rank on {} Cybersecurity Benchmarks (lower is better).
  - Avg Score & Rank are calculated over the selected benchmarks. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!**
"""

LEADERBOARD_MD['Vulnerable code generation'] = """Need to add a description
"""
LEADERBOARD_MD['Attack generation'] = """Need to add a description
"""
LEADERBOARD_MD['CTF'] = """Need to add a description
"""
LEADERBOARD_MD['Cyber knowledge'] = """Need to add a description
"""
LEADERBOARD_MD['Pen test'] = """Need to add a description
"""
LEADERBOARD_MD['Vulnerability detection'] = """Need to add a description
"""
LEADERBOARD_MD['PoC generation'] = """Need to add a description
"""
LEADERBOARD_MD['Patching'] = """Need to add a description
"""
requirements.txt
ADDED
@@ -0,0 +1,3 @@
gradio==4.44.1
numpy>=1.23.4
pandas>=1.5.3
results.json
ADDED
@@ -0,0 +1,136 @@
{
    "time": "20250418",
    "results": {
        "Vulnerable code generation": {
            "category": "attack",
            "CyberSecEval-3": {
                "GPT-4": 35,
                "Llama-3.1-405B": 39,
                "Llama-3.1-70B": 35
            },
            "SecCodePLT": {
                "GPT-4o": 44,
                "Llama-3.1-70B": 47
            }
        },
        "Attack generation": {
            "category": "attack",
            "CyberSecEval-3": {
                "GPT-4": 40,
                "Llama-3.1-405B": 49,
                "Llama-3.1-70B": 41
            },
            "SecCodePLT": {
                "GPT-4o": 0.2,
                "Claude-3.5-Sonnet": 0.2,
                "Llama-3.1-70B": 0
            },
            "RedCode-Gen": {
                "GPT-3.5": 32.5,
                "GPT-4": 66.9,
                "GPT-4o": 72.5,
                "Llama-2-7B": 20.7
            },
            "RedCode-Exec": {
                "GPT-4": 64.5,
                "GPT-4o": 77.23,
                "Claude-3.5-Sonnet": 67.63,
                "Llama-3.1-70B": 76.70,
                "Llama-3.1-8B": 62.87
            }
        },
        "CTF": {
            "category": "attack",
            "CyBench": {
                "GPT-4o": 12.5,
                "GPT-4.5-preview": 17.5,
                "o1-preview": 10.0,
                "o3-mini": 22.5,
                "Claude-3.5-Sonnet": 17.5,
                "Claude-3.7-Sonnet": 20,
                "Gemini-1.5-pro": 7.5,
                "Llama-3.1-405B": 7.5,
                "Llama-3.1-70B": 5.0
            },
            "NYU": {
                "GPT-4": 7.00,
                "GPT-4o": 9.50,
                "Claude-3.5-Sonnet": 13.50
            }
        },
        "Cyber knowledge": {
            "category": "attack",
            "CyberBench": {
                "GPT-3.5": 62.6,
                "GPT-4": 69.9,
                "Llama-2-7B": 50.6
            },
            "CyberMetric": {
                "GPT-3.5": 88.10,
                "GPT-4": 91.00,
                "GPT-4o": 91.25,
                "Gemini-1.0-pro": 84.00,
                "Llama-3-8B": 73.05,
                "Llama-2-70B": 72.60
            },
            "TACTL": {
                "GPT-4o": 85.2,
                "DeepSeek-R1": 91.8,
                "DeepSeek-V3": 86.3,
                "Llama-3.1-405B": 88.5,
                "Llama-3.3-70B": 78.7
            }
        },
        "Pen test": {
            "category": "defense",
            "AutoPenBench": {
                "GPT-4o": {
                    "autonomous": 21.00,
                    "assisted": 64.00
                }
            }
        },
        "Vulnerability detection": {
            "category": "defense",
            "PrimeVul": {
                "GPT-3.5": 6.21,
                "GPT-4": 12.94
            }
        },
        "PoC generation": {
            "category": "defense",
            "CRUXEval": {
                "GPT-3.5": {
                    "autonomous": 49.1,
                    "assisted": 63.3
                },
                "GPT-4": {
                    "autonomous": 74.8,
                    "assisted": 81.9
                },
                "Code-Llama-13B": {
                    "autonomous": 39.1,
                    "assisted": 39.3
                },
                "Code-Llama-34B": {
                    "autonomous": 50.4,
                    "assisted": 46.0
                }
            }
        },
        "Patching": {
            "category": "defense",
            "SWE-bench-verified": {
                "GPT-3.5": 0.4,
                "GPT-4": 22.4,
                "GPT-4o": 38.8,
                "o1": 48.9,
                "o3-mini": 49.3,
                "Claude-3.5-Sonnet": 49.0,
                "Claude-3.7-Sonnet": 70.3,
                "DeepSeek-V3": 42.0,
                "DeepSeek-R1": 49.2
            }
        }
    }
}
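The schema nests two levels under `results`: task → benchmark → model, where a score is either a single number or an `autonomous` / `assisted` pair (CRUXEval and AutoPenBench), and each task carries a `category` tag. A short sketch that walks the file and flattens dict-valued scores the same way `generate_table` does:

```python
import json

# Flatten results.json into (category, task, benchmark, model, score) rows.
with open('results.json') as f:
    data = json.load(f)

for task, benchmarks in data['results'].items():
    category = benchmarks['category']
    for benchmark, scores in benchmarks.items():
        if benchmark == 'category':
            continue
        for model, score in scores.items():
            if isinstance(score, dict):  # autonomous / assisted pair
                score = (score['autonomous'] + score['assisted']) / 2
            print(f'[{category}] {task} / {benchmark} / {model}: {score}')
```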