yujinyujin9393 committed
Commit bb8ff6c · verified · 1 Parent(s): bc30b2a

Upload 7 files

Files changed (7)
  1. README.md +40 -0
  2. about.md +25 -0
  3. app.py +210 -0
  4. gen_table.py +142 -0
  5. meta_data.py +54 -0
  6. requirements.txt +3 -0
  7. results.json +136 -0
README.md ADDED
@@ -0,0 +1,40 @@
+ ---
+ title: Cybersecurity Leaderboard
+ emoji: 🌎
+ colorFrom: blue
+ colorTo: green
+ sdk: gradio
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ tags:
+ - leaderboard
+ short_description: 'Cybersecurity Capability Evaluation Results Collection'
+ sdk_version: 4.44.1
+ ---
+
+ ## Submit your benchmark
+
+ This leaderboard is a collection of cybersecurity-relevant benchmarks. To submit your benchmark, please use this form: https://docs.google.com/forms/d/e/1FAIpQLSd0arYQ0xy9FpGbXwu68rAFpCm0HNb-8ZK8Mma3Ru2oa2Astg/viewform. We will update this leaderboard regularly.
+
+ ## Paper & Blog
+
+ Paper: https://arxiv.org/abs/2504.05408
+ Blog: https://rdi.berkeley.edu/frontier-ai-impact-on-cybersecurity/
+
+ ## Survey
+
+ We're also launching an expert survey on this topic. We invite all AI and security researchers and practitioners to take the survey here: https://berkeley.qualtrics.com/jfe/form/SV_6zmYIqEyv7bfOrs
+
+ ## Citation
+
+ Please consider citing the report if this resource is useful to your research:
+
+ ```bibtex
+ @article{guo2025sok,
+   title={{Frontier AI's Impact on the Cybersecurity Landscape}},
+   author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
+   journal={arXiv preprint arXiv:2504.05408},
+   year={2025}
+ }
+ ```
about.md ADDED
@@ -0,0 +1,25 @@
+ ## Submit your benchmark
+
+ This leaderboard is a collection of cybersecurity-relevant benchmarks. To submit your benchmark, please use this form: https://docs.google.com/forms/d/e/1FAIpQLSd0arYQ0xy9FpGbXwu68rAFpCm0HNb-8ZK8Mma3Ru2oa2Astg/viewform. We will update this leaderboard regularly.
+
+ ## Paper & Blog
+
+ Paper: https://arxiv.org/abs/2504.05408
+ Blog: https://rdi.berkeley.edu/frontier-ai-impact-on-cybersecurity/
+
+ ## Survey
+
+ We're also launching an expert survey on this topic. We invite all AI and security researchers and practitioners to take the survey here: https://berkeley.qualtrics.com/jfe/form/SV_6zmYIqEyv7bfOrs
+
+ ## Citation
+
+ Please consider citing the report if this resource is useful to your research:
+
+ ```bibtex
+ @article{guo2025sok,
+   title={{Frontier AI's Impact on the Cybersecurity Landscape}},
+   author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
+   journal={arXiv preprint arXiv:2504.05408},
+   year={2025}
+ }
+ ```
app.py ADDED
@@ -0,0 +1,210 @@
+ import abc, sys
+ import gradio as gr
+
+ from gen_table import *
+ from meta_data import *
+
+ # import pandas as pd
+ # pd.set_option('display.max_colwidth', 0)
+
+ head_style = """
+ <style>
+ @media (min-width: 1536px)
+ {
+     .gradio-container {
+         min-width: var(--size-full) !important;
+     }
+ }
+ </style>
+ """
+
+ TAB_CSS = """
+ /* 1. Target the real tab-list container (old & new class names + role attr) */
+ #leaderboard_tabs [role="tablist"],
+ #leaderboard_tabs .gradio-tabs-tablist,
+ #leaderboard_tabs .tab-container[role="tablist"] {
+     display: flex !important;
+     flex-wrap: wrap !important;      /* allow multi-row */
+     white-space: normal !important;  /* cancel nowrap */
+     overflow-x: visible !important;  /* don't clip off */
+     height: auto !important;         /* grow as tall as needed */
+     max-width: none !important;      /* cancel any max-width */
+ }
+
+ /* 2. Stop each button from flexing */
+ #leaderboard_tabs [role="tab"],
+ #leaderboard_tabs .tab-container[role="tablist"] .tab-button,
+ #leaderboard_tabs .gradio-tabs-tab {
+     flex: none !important;
+ }
+
+ /* 3. Hide every possible "more/overflow" toggle */
+ #leaderboard_tabs .overflow-menu,
+ #leaderboard_tabs [class*="overflow-button"],
+ #leaderboard_tabs button[aria-label*="More"],
+ #leaderboard_tabs .gradio-tabs-overflow,
+ #leaderboard_tabs .gradio-tabs-overflow-button {
+     display: none !important;
+ }
+ """
+
+ with gr.Blocks(title="Cybersecurity Leaderboard", head=head_style) as demo:
+     struct = load_results()
+     timestamp = struct['time']
+     EVAL_TIME = format_timestamp(timestamp)
+     results = struct['results']
+     model_list = []
+     task_list = []
+     benchmark_list = []
+     for task in results:
+         task_list += [task]
+         for benchmark in results[task]:
+             if benchmark != 'category':
+                 benchmark_list += [benchmark]
+                 model_list += list(results[task][benchmark].keys())
+
+     model_list = list(set(model_list))
+     N_MODEL = len(model_list)
+     N_TASK = len(task_list)
+     N_DATA = len(list(set(benchmark_list)))
+     DATASETS = benchmark_list
+
+     gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_DATA, N_TASK, EVAL_TIME))
+     # One lightweight attribute holder per task tab, used to keep each tab's widgets together.
+     structs = [abc.abstractproperty() for _ in range(N_TASK)]
+
+     with gr.Tabs(elem_id="leaderboard_tabs", elem_classes='tab-buttons') as tabs:
+         with gr.TabItem('🏅 Cybersecurity Main Leaderboard', elem_id='main', id=0):
+             gr.Markdown(LEADERBOARD_MD['MAIN'].format(N_DATA, N_DATA))
+             _, check_box = BUILD_L1_DF(results, DEFAULT_TASK)
+             table = generate_table(results, DEFAULT_TASK)
+
+             type_map = check_box['type_map']
+
+             checkbox_group = gr.CheckboxGroup(
+                 choices=check_box['all'],
+                 value=check_box['required'],
+                 label='Aspects of Cybersecurity Work',
+                 interactive=True,
+             )
+
+             headers = check_box['essential'] + checkbox_group.value
+             with gr.Row():
+                 model_name = gr.Textbox(
+                     value='Input the Model Name (fuzzy, case insensitive)',
+                     label='Model Name',
+                     interactive=True,
+                     visible=True)
+             data_component = gr.components.DataFrame(
+                 value=table[headers],
+                 type='pandas',
+                 datatype=[type_map[x] for x in headers],
+                 interactive=False,
+                 wrap=True,
+                 visible=True)
+
+             def filter_df(fields, model_name):
+                 headers = check_box['essential'] + fields
+                 df = generate_table(results, fields)
+
+                 default_val = 'Input the Model Name (fuzzy, case insensitive)'
+                 if model_name != default_val:
+                     model_name = model_name.lower()
+                     # Strip any HTML anchor around the model name before substring matching.
+                     method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
+                     flag = [model_name in name for name in method_names]
+                     df['TEMP_FLAG'] = flag
+                     df = df[df['TEMP_FLAG'] == True]
+                     df.pop('TEMP_FLAG')
+
+                 comp = gr.components.DataFrame(
+                     value=df[headers],
+                     type='pandas',
+                     datatype=[type_map[x] for x in headers],
+                     interactive=False,
+                     wrap=True,
+                     visible=True)
+                 return comp
+
+             for cbox in [checkbox_group]:
+                 cbox.change(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
+             model_name.submit(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
+
+         with gr.TabItem('🔍 About', elem_id='about', id=1):
+             with open("about.md", 'r', encoding="utf-8") as file:
+                 gr.Markdown(file.read())
+
+         for i, task in enumerate(task_list):
+             with gr.TabItem(f'📊 {task} Leaderboard', elem_id=task, id=i + 2):
+                 if task in LEADERBOARD_MD:
+                     gr.Markdown(LEADERBOARD_MD[task])
+
+                 s = structs[i]
+                 s.table, s.check_box = BUILD_L2_DF(results, task)
+                 s.type_map = s.check_box['type_map']
+
+                 s.checkbox_group = gr.CheckboxGroup(
+                     choices=s.check_box['all'],
+                     value=s.check_box['required'],
+                     label=f'{task} CheckBoxes',
+                     interactive=True,
+                 )
+                 s.headers = s.check_box['essential'] + s.checkbox_group.value
+
+                 with gr.Row():
+                     s.model_name = gr.Textbox(
+                         value='Input the Model Name (fuzzy, case insensitive)',
+                         label='Model Name',
+                         interactive=True,
+                         visible=True)
+                 s.data_component = gr.components.DataFrame(
+                     value=s.table[s.headers],
+                     type='pandas',
+                     datatype=[s.type_map[x] for x in s.headers],
+                     interactive=False,
+                     wrap=True,
+                     visible=True)
+                 s.dataset = gr.Textbox(value=task, label=task, visible=False)
+
+                 def filter_df_l2(dataset_name, fields, model_name):
+                     s = structs[task_list.index(dataset_name)]
+                     headers = s.check_box['essential'] + fields
+                     df = cp.deepcopy(s.table)
+                     default_val = 'Input the Model Name (fuzzy, case insensitive)'
+                     if model_name != default_val:
+                         model_name = model_name.lower()
+                         # The per-task tables use a 'Model' column (not 'Method').
+                         method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
+                         flag = [model_name in name for name in method_names]
+                         df['TEMP_FLAG'] = flag
+                         df = df[df['TEMP_FLAG'] == True]
+                         df.pop('TEMP_FLAG')
+
+                     comp = gr.components.DataFrame(
+                         value=df[headers],
+                         type='pandas',
+                         datatype=[s.type_map[x] for x in headers],
+                         interactive=False,
+                         wrap=True,
+                         visible=True)
+                     return comp
+
+                 for cbox in [s.checkbox_group]:
+                     cbox.change(
+                         fn=filter_df_l2,
+                         inputs=[s.dataset, s.checkbox_group, s.model_name],
+                         outputs=s.data_component)
+                 s.model_name.submit(
+                     fn=filter_df_l2,
+                     inputs=[s.dataset, s.checkbox_group, s.model_name],
+                     outputs=s.data_component)
+
+     with gr.Row():
+         with gr.Accordion('Citation', open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id='citation-button')
+
+ if __name__ == '__main__':
+     demo.launch(server_name='0.0.0.0', share=True)
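
For reference, the fuzzy filter in `filter_df` / `filter_df_l2` strips any HTML anchor the Model cell may carry before doing a case-insensitive substring match. A small standalone illustration of that parsing (the cell values below are hypothetical):

```python
# Sketch of how filter_df/filter_df_l2 extract a plain model name from a
# (possibly hyperlinked) table cell before matching the query string.
cells = ['<a href="https://example.com/gpt-4o">GPT-4o</a>', 'Llama-3.1-70B']
names = [c.split('</a>')[0].split('>')[-1].lower() for c in cells]
print(names)                         # ['gpt-4o', 'llama-3.1-70b']
print(['gpt' in n for n in names])   # [True, False] -> rows kept for the query 'gpt'
```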
gen_table.py ADDED
@@ -0,0 +1,142 @@
+ import copy as cp
+ import json, sys
+ from collections import defaultdict
+ from urllib.request import urlopen
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+
+ from meta_data import DEFAULT_TASK
+
+
+ def listinstr(lst, s):
+     assert isinstance(lst, list)
+     for item in lst:
+         if item in s:
+             return True
+     return False
+
+
+ def load_results():
+     # data = json.loads(urlopen(URL).read())
+     with open('results.json', 'r') as file:
+         data = json.load(file)
+     return data
+
+
+ def nth_large(val, vals):
+     return sum([1 for v in vals if v > val]) + 1
+
+
+ def format_timestamp(timestamp):
+     # '20250418' -> '2025.04.18'
+     date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
+     return date
+
+
+ def BUILD_L1_DF(results, fields):
+     check_box = {}
+     check_box['essential'] = ['Model']
+     # revise here to set the default task columns
+     check_box['required'] = DEFAULT_TASK
+     check_box['all'] = DEFAULT_TASK
+     type_map = defaultdict(lambda: 'number')
+     check_box['type_map'] = type_map
+
+     df = generate_table(results, fields)
+     return df, check_box
+
+
+ def BUILD_L2_DF(results, task):
+     results = results[task]
+     model_list = []
+     benchmark_list = []
+     all_fields = []
+     for benchmark in results:
+         if benchmark != 'category':
+             benchmark_list += [benchmark]
+             if benchmark not in ["CRUXEval", "AutoPenBench"]:
+                 all_fields += [benchmark]
+             else:
+                 all_fields += [benchmark + ' (autonomous)', benchmark + ' (assisted)']
+             model_list += list(results[benchmark].keys())
+     model_list = list(set(model_list))
+
+     res = defaultdict(list)
+     res['Model'] = model_list
+
+     for benchmark in benchmark_list:
+         if benchmark not in ["CRUXEval", "AutoPenBench"]:
+             for model in model_list:
+                 if model in results[benchmark]:
+                     res[benchmark].append(results[benchmark][model])
+                 else:
+                     res[benchmark].append(None)
+         else:
+             # These benchmarks report paired autonomous/assisted scores.
+             for model in model_list:
+                 entry = results[benchmark].get(model)
+                 res[benchmark + ' (autonomous)'].append(entry['autonomous'] if entry else None)
+                 res[benchmark + ' (assisted)'].append(entry['assisted'] if entry else None)
+
+     df = pd.DataFrame(res)
+     required_fields = all_fields
+
+     check_box = {}
+     check_box['essential'] = ['Model']
+     check_box['required'] = required_fields
+     check_box['all'] = all_fields
+     type_map = defaultdict(lambda: 'number')
+     check_box['type_map'] = type_map
+     return df, check_box
+
+
+ def generate_table(results, fields):
+     model_list = []
+     task_list = fields
+     benchmark_list = []
+     for task in results:
+         for benchmark in results[task]:
+             if benchmark != 'category':
+                 benchmark_list += [benchmark]
+                 model_list += list(results[task][benchmark].keys())
+     model_list = list(set(model_list))
+
+     res = defaultdict(list)
+     res['Model'] = model_list
+
+     average_score = {}
+     cnt = {}
+     for task in task_list:
+         task_score = []
+         for model in model_list:
+             score = []
+             for benchmark in results[task]:
+                 if benchmark != 'category':
+                     if model not in results[task][benchmark]:
+                         score.append(None)
+                     elif not isinstance(results[task][benchmark][model], (int, float)):
+                         # Paired autonomous/assisted scores are averaged into one number.
+                         score.append((results[task][benchmark][model]["autonomous"] + results[task][benchmark][model]["assisted"]) / 2)
+                     else:
+                         score.append(results[task][benchmark][model])
+             if not any(item is not None for item in score):
+                 score = None
+             else:
+                 score = np.mean([s for s in score if s is not None])
+             # Keep a running per-model average, skipping tasks with no result for this model.
+             if score is not None:
+                 if model not in average_score or average_score[model] is None:
+                     average_score[model] = score
+                     cnt[model] = 1
+                 else:
+                     average_score[model] = ((average_score[model] * cnt[model]) + score) / (cnt[model] + 1)
+                     cnt[model] += 1
+             elif model not in average_score:
+                 average_score[model] = None
+                 cnt[model] = 0
+             task_score.append(score)
+         res[task] = task_score
+
+     # res['Avg Score'] = [average_score[model] for model in model_list]
+     # res['Avg Rank'] = [sorted(res['Avg Score'], reverse=True).index(score) + 1 for score in res['Avg Score']]
+
+     df = pd.DataFrame(res)
+     # valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
+     # valid = valid.sort_values('Avg Score')
+     # valid = valid.iloc[::-1]
+     # if len(fields):
+     #     missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
+     #     missing = missing.iloc[::-1]
+     # df = pd.concat([valid, missing])
+     return df
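
A minimal sketch of how these helpers are combined outside the Gradio UI, mirroring the calls made in app.py (it assumes results.json is in the working directory; column names depend on the tasks passed in):

```python
# Minimal usage sketch of the table builders above (same call pattern as app.py).
from gen_table import load_results, generate_table, BUILD_L2_DF
from meta_data import DEFAULT_TASK

struct = load_results()              # {'time': '20250418', 'results': {...}}
results = struct['results']

# Main leaderboard: one column per task; each cell averages that task's benchmarks.
main_df = generate_table(results, DEFAULT_TASK)
print(main_df.head())

# Per-task leaderboard: one column per benchmark in the chosen task.
ctf_df, ctf_boxes = BUILD_L2_DF(results, 'CTF')
print(ctf_df[ctf_boxes['essential'] + ctf_boxes['all']])
```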
meta_data.py ADDED
@@ -0,0 +1,54 @@
+ VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
+ # CONSTANTS-CITATION
+ CITATION_BUTTON_TEXT = r"""@article{guo2025sok,
+   title={{Frontier AI's Impact on the Cybersecurity Landscape}},
+   author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
+   journal={arXiv preprint arXiv:2504.05408},
+   year={2025}
+ }
+ """
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ # CONSTANTS-TEXT
+ LEADERBORAD_INTRODUCTION = """# Cybersecurity Leaderboard
+ ### Welcome to the Cybersecurity Leaderboard! This leaderboard is a collection of benchmarks relevant to cybersecurity capabilities.
+ This leaderboard covers {} benchmarks across {} aspects of cybersecurity work.
+
+ This leaderboard was last updated: {} """
+ # CONSTANTS-FIELDS
+ # META_FIELDS = [
+ #     'Model'
+ # ]
+
+ DEFAULT_TASK = [
+     'Vulnerable code generation', 'Attack generation', 'CTF', 'Cyber knowledge', 'Pen test', 'Vulnerability detection', 'PoC generation', 'Patching'
+ ]
+ MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']
+
+ # The README text for each benchmark tab
+ LEADERBOARD_MD = {}
+
+ LEADERBOARD_MD['MAIN'] = """
+ ## Main Evaluation Results
+
+ - Metrics:
+   - Avg Score: The average score on {} cybersecurity benchmarks (normalized to 0 - 100, the higher the better).
+   - Avg Rank: The average rank on {} cybersecurity benchmarks (the lower the better).
+   - Avg Score & Rank are calculated over the selected benchmarks. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!**
+ """
+
+ LEADERBOARD_MD['Vulnerable code generation'] = """Need to add a description
+ """
+ LEADERBOARD_MD['Attack generation'] = """Need to add a description
+ """
+ LEADERBOARD_MD['CTF'] = """Need to add a description
+ """
+ LEADERBOARD_MD['Cyber knowledge'] = """Need to add a description
+ """
+ LEADERBOARD_MD['Pen test'] = """Need to add a description
+ """
+ LEADERBOARD_MD['Vulnerability detection'] = """Need to add a description
+ """
+ LEADERBOARD_MD['PoC generation'] = """Need to add a description
+ """
+ LEADERBOARD_MD['Patching'] = """Need to add a description
+ """
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio==4.15.0
+ numpy>=1.23.4
+ pandas>=1.5.3
results.json ADDED
@@ -0,0 +1,136 @@
+ {
+   "time": "20250418",
+   "results": {
+     "Vulnerable code generation": {
+       "category": "attack",
+       "CyberSecEval-3": {
+         "GPT-4": 35,
+         "Llama-3.1-405B": 39,
+         "Llama-3.1-70B": 35
+       },
+       "SecCodePLT": {
+         "GPT-4o": 44,
+         "Llama-3.1-70B": 47
+       }
+     },
+     "Attack generation": {
+       "category": "attack",
+       "CyberSecEval-3": {
+         "GPT-4": 40,
+         "Llama-3.1-405B": 49,
+         "Llama-3.1-70B": 41
+       },
+       "SecCodePLT": {
+         "GPT-4o": 0.2,
+         "Claude-3.5-Sonnet": 0.2,
+         "Llama-3.1-70B": 0
+       },
+       "RedCode-Gen": {
+         "GPT-3.5": 32.5,
+         "GPT-4": 66.9,
+         "GPT-4o": 72.5,
+         "Llama-2-7B": 20.7
+       },
+       "RedCode-Exec": {
+         "GPT-4": 64.5,
+         "GPT-4o": 77.23,
+         "Claude-3.5-Sonnet": 67.63,
+         "Llama-3.1-70B": 76.70,
+         "Llama-3.1-8B": 62.87
+       }
+     },
+     "CTF": {
+       "category": "attack",
+       "CyBench": {
+         "GPT-4o": 12.5,
+         "GPT-4.5-preview": 17.5,
+         "o1-preview": 10.0,
+         "o3-mini": 22.5,
+         "Claude-3.5-Sonnet": 17.5,
+         "Claude-3.7-Sonnet": 20,
+         "Gemini-1.5-pro": 7.5,
+         "Llama-3.1-405B": 7.5,
+         "Llama-3.1-70B": 5.0
+       },
+       "NYU": {
+         "GPT-4": 7.00,
+         "GPT-4o": 9.50,
+         "Claude-3.5-Sonnet": 13.50
+       }
+     },
+     "Cyber knowledge": {
+       "category": "attack",
+       "CyberBench": {
+         "GPT-3.5": 62.6,
+         "GPT-4": 69.9,
+         "Llama-2-7B": 50.6
+       },
+       "CyberMetric": {
+         "GPT-3.5": 88.10,
+         "GPT-4": 91.00,
+         "GPT-4o": 91.25,
+         "Gemini-1.0-pro": 84.00,
+         "Llama-3-8B": 73.05,
+         "Llama-2-70B": 72.60
+       },
+       "TACTL": {
+         "GPT-4o": 85.2,
+         "DeepSeek-R1": 91.8,
+         "DeepSeek-V3": 86.3,
+         "Llama-3.1-405B": 88.5,
+         "Llama-3.3-70B": 78.7
+       }
+     },
+     "Pen test": {
+       "category": "defense",
+       "AutoPenBench": {
+         "GPT-4o": {
+           "autonomous": 21.00,
+           "assisted": 64.00
+         }
+       }
+     },
+     "Vulnerability detection": {
+       "category": "defense",
+       "PrimeVul": {
+         "GPT-3.5": 6.21,
+         "GPT-4": 12.94
+       }
+     },
+     "PoC generation": {
+       "category": "defense",
+       "CRUXEval": {
+         "GPT-3.5": {
+           "autonomous": 49.1,
+           "assisted": 63.3
+         },
+         "GPT-4": {
+           "autonomous": 74.8,
+           "assisted": 81.9
+         },
+         "Code-Llama-13B": {
+           "autonomous": 39.1,
+           "assisted": 39.3
+         },
+         "Code-Llama-34B": {
+           "autonomous": 50.4,
+           "assisted": 46.0
+         }
+       }
+     },
+     "Patching": {
+       "category": "defense",
+       "SWE-bench-verified": {
+         "GPT-3.5": 0.4,
+         "GPT-4": 22.4,
+         "GPT-4o": 38.8,
+         "o1": 48.9,
+         "o3-mini": 49.3,
+         "Claude-3.5-Sonnet": 49.0,
+         "Claude-3.7-Sonnet": 70.3,
+         "DeepSeek-V3": 42.0,
+         "DeepSeek-R1": 49.2
+       }
+     }
+   }
+ }
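
The loaders in gen_table.py assume the structure above: each task block carries a `category` tag plus benchmark → model → score maps, where scores are plain numbers except for the benchmarks gen_table.py treats as paired (`CRUXEval`, `AutoPenBench`), which store `autonomous`/`assisted` pairs. A small standalone check of that assumption (an illustrative helper, not part of the app):

```python
# Illustrative sanity check that results.json matches the shape gen_table.py expects.
import json

with open('results.json') as f:
    data = json.load(f)

for task, block in data['results'].items():
    benchmarks = [b for b in block if b != 'category']
    print(f"{task} ({block['category']}): {', '.join(benchmarks)}")
    for bench in benchmarks:
        for model, score in block[bench].items():
            # Plain numeric score, or an autonomous/assisted pair for the paired benchmarks.
            assert isinstance(score, (int, float)) or {'autonomous', 'assisted'} <= set(score)
```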