Upload 7 files

- README.md +40 -0
- about.md +25 -0
- app.py +210 -0
- gen_table.py +142 -0
- meta_data.py +54 -0
- requirements.txt +3 -0
- results.json +136 -0
README.md
ADDED
@@ -0,0 +1,40 @@
---
title: Cybersecurity Leaderboard
emoji: 🌎
colorFrom: blue
colorTo: green
sdk: gradio
app_file: app.py
pinned: true
license: apache-2.0
tags:
- leaderboard
short_description: 'Cybersecurity Capability Evaluation Results Collection'
sdk_version: 4.44.1
---

## Submit your benchmark

This leaderboard is a collection of cybersecurity-relevant benchmarks. To submit your benchmark, please use this form: https://docs.google.com/forms/d/e/1FAIpQLSd0arYQ0xy9FpGbXwu68rAFpCm0HNb-8ZK8Mma3Ru2oa2Astg/viewform. We update the leaderboard regularly.

## Paper & Blog

Paper: https://arxiv.org/abs/2504.05408
Blog: https://rdi.berkeley.edu/frontier-ai-impact-on-cybersecurity/

## Survey

We are also launching an expert survey on this topic. We invite all AI and security researchers and practitioners to take the survey here: https://berkeley.qualtrics.com/jfe/form/SV_6zmYIqEyv7bfOrs

## Citation

Please consider citing the report if this resource is useful to your research:

```bibtex
@article{guo2025sok,
  title={{Frontier AI's Impact on the Cybersecurity Landscape}},
  author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
  journal={arXiv preprint arXiv:2504.05408},
  year={2025}
}
```
about.md
ADDED
@@ -0,0 +1,25 @@
## Submit your benchmark

This leaderboard is a collection of cybersecurity-relevant benchmarks. To submit your benchmark, please use this form: https://docs.google.com/forms/d/e/1FAIpQLSd0arYQ0xy9FpGbXwu68rAFpCm0HNb-8ZK8Mma3Ru2oa2Astg/viewform. We update the leaderboard regularly.

## Paper & Blog

Paper: https://arxiv.org/abs/2504.05408
Blog: https://rdi.berkeley.edu/frontier-ai-impact-on-cybersecurity/

## Survey

We are also launching an expert survey on this topic. We invite all AI and security researchers and practitioners to take the survey here: https://berkeley.qualtrics.com/jfe/form/SV_6zmYIqEyv7bfOrs

## Citation

Please consider citing the report if this resource is useful to your research:

```bibtex
@article{guo2025sok,
  title={{Frontier AI's Impact on the Cybersecurity Landscape}},
  author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
  journal={arXiv preprint arXiv:2504.05408},
  year={2025}
}
```
app.py
ADDED
@@ -0,0 +1,210 @@
import copy as cp
from types import SimpleNamespace

import gradio as gr

from gen_table import *
from meta_data import *

# import pandas as pd
# pd.set_option('display.max_colwidth', 0)

head_style = """
<style>
@media (min-width: 1536px)
{
    .gradio-container {
        min-width: var(--size-full) !important;
    }
}
</style>
"""

TAB_CSS = """
/* 1. Target the real tab-list container (old & new class names + role attr) */
#leaderboard_tabs [role="tablist"],
#leaderboard_tabs .gradio-tabs-tablist,
#leaderboard_tabs .tab-container[role="tablist"] {
    display: flex !important;
    flex-wrap: wrap !important;       /* allow multi-row */
    white-space: normal !important;   /* cancel nowrap */
    overflow-x: visible !important;   /* don't clip off */
    height: auto !important;          /* grow as tall as needed */
    max-width: none !important;       /* cancel any max-width */
}

/* 2. Stop each button from flexing */
#leaderboard_tabs [role="tab"],
#leaderboard_tabs .tab-container[role="tablist"] .tab-button,
#leaderboard_tabs .gradio-tabs-tab {
    flex: none !important;
}

/* 3. Hide every possible "more/overflow" toggle */
#leaderboard_tabs .overflow-menu,
#leaderboard_tabs [class*="overflow-button"],
#leaderboard_tabs button[aria-label*="More"],
#leaderboard_tabs .gradio-tabs-overflow,
#leaderboard_tabs .gradio-tabs-overflow-button {
    display: none !important;
}
"""

with gr.Blocks(title="Cybersecurity Leaderboard", head=head_style, css=TAB_CSS) as demo:
    struct = load_results()
    timestamp = struct['time']
    EVAL_TIME = format_timestamp(timestamp)
    results = struct['results']

    # Collect the task / benchmark / model universe from the results file.
    model_list = []
    task_list = []
    benchmark_list = []
    for task in results:
        task_list += [task]
        for benchmark in results[task]:
            if benchmark != 'category':
                benchmark_list += [benchmark]
                model_list += list(results[task][benchmark].keys())

    model_list = list(set(model_list))
    N_MODEL = len(model_list)
    N_TASK = len(task_list)
    N_DATA = len(list(set(benchmark_list)))
    DATASETS = benchmark_list

    gr.Markdown(LEADERBOARD_INTRODUCTION.format(N_DATA, N_TASK, EVAL_TIME))
    # One mutable attribute container per task tab.
    structs = [SimpleNamespace() for _ in range(N_TASK)]

    with gr.Tabs(elem_id="leaderboard_tabs", elem_classes='tab-buttons') as tabs:
        with gr.TabItem('🏅 Cybersecurity Main Leaderboard', elem_id='main', id=0):
            gr.Markdown(LEADERBOARD_MD['MAIN'].format(N_DATA, N_DATA))
            _, check_box = BUILD_L1_DF(results, DEFAULT_TASK)
            table = generate_table(results, DEFAULT_TASK)

            type_map = check_box['type_map']

            checkbox_group = gr.CheckboxGroup(
                choices=check_box['all'],
                value=check_box['required'],
                label='Aspects of Cybersecurity Work',
                interactive=True,
            )

            headers = check_box['essential'] + checkbox_group.value
            with gr.Row():
                model_name = gr.Textbox(
                    value='Input the Model Name (fuzzy, case insensitive)',
                    label='Model Name',
                    interactive=True,
                    visible=True)
            data_component = gr.components.DataFrame(
                value=table[headers],
                type='pandas',
                datatype=[type_map[x] for x in headers],
                interactive=False,
                wrap=True,
                visible=True)

            def filter_df(fields, model_name):
                headers = check_box['essential'] + fields
                df = generate_table(results, fields)

                default_val = 'Input the Model Name (fuzzy, case insensitive)'
                if model_name != default_val:
                    model_name = model_name.lower()
                    # Strip any HTML anchor wrapper, then do a case-insensitive substring match.
                    method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
                    flag = [model_name in name for name in method_names]
                    df['TEMP_FLAG'] = flag
                    df = df[df['TEMP_FLAG'] == True]
                    df.pop('TEMP_FLAG')

                comp = gr.components.DataFrame(
                    value=df[headers],
                    type='pandas',
                    datatype=[type_map[x] for x in headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                return comp

            checkbox_group.change(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)
            model_name.submit(fn=filter_df, inputs=[checkbox_group, model_name], outputs=data_component)

        with gr.TabItem('🔍 About', elem_id='about', id=1):
            with open("about.md", 'r', encoding="utf-8") as file:
                gr.Markdown(file.read())

        for i, task in enumerate(task_list):
            with gr.TabItem(f'📊 {task} Leaderboard', elem_id=task, id=i + 2):
                if task in LEADERBOARD_MD:
                    gr.Markdown(LEADERBOARD_MD[task])

                s = structs[i]
                s.table, s.check_box = BUILD_L2_DF(results, task)
                s.type_map = s.check_box['type_map']

                s.checkbox_group = gr.CheckboxGroup(
                    choices=s.check_box['all'],
                    value=s.check_box['required'],
                    label=f'{task} CheckBoxes',
                    interactive=True,
                )
                s.headers = s.check_box['essential'] + s.checkbox_group.value

                with gr.Row():
                    s.model_name = gr.Textbox(
                        value='Input the Model Name (fuzzy, case insensitive)',
                        label='Model Name',
                        interactive=True,
                        visible=True)
                s.data_component = gr.components.DataFrame(
                    value=s.table[s.headers],
                    type='pandas',
                    datatype=[s.type_map[x] for x in s.headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                # Hidden textbox so the shared callback knows which task tab it serves.
                s.dataset = gr.Textbox(value=task, label=task, visible=False)

                def filter_df_l2(dataset_name, fields, model_name):
                    s = structs[task_list.index(dataset_name)]
                    headers = s.check_box['essential'] + fields
                    df = cp.deepcopy(s.table)
                    default_val = 'Input the Model Name (fuzzy, case insensitive)'
                    if model_name != default_val:
                        model_name = model_name.lower()
                        method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Model']]
                        flag = [model_name in name for name in method_names]
                        df['TEMP_FLAG'] = flag
                        df = df[df['TEMP_FLAG'] == True]
                        df.pop('TEMP_FLAG')

                    comp = gr.components.DataFrame(
                        value=df[headers],
                        type='pandas',
                        datatype=[s.type_map[x] for x in headers],
                        interactive=False,
                        wrap=True,
                        visible=True)
                    return comp

                s.checkbox_group.change(
                    fn=filter_df_l2,
                    inputs=[s.dataset, s.checkbox_group, s.model_name],
                    outputs=s.data_component)
                s.model_name.submit(
                    fn=filter_df_l2,
                    inputs=[s.dataset, s.checkbox_group, s.model_name],
                    outputs=s.data_component)

    with gr.Row():
        with gr.Accordion('Citation', open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button')

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0', share=True)
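The fuzzy model-name filter first strips any HTML anchor wrapper from a `Model` cell before substring matching. A minimal sketch of that parse (the linked cell below is a hypothetical example; results.json currently stores plain names):

```python
# How filter_df / filter_df_l2 normalize a 'Model' cell before matching:
# keep the text between the last '>' and '</a>' if the cell is a link,
# otherwise the cell itself, then lowercase it.
cells = [
    'GPT-4o',                                               # plain cell
    '<a href="https://example.com">Claude-3.5-Sonnet</a>',  # hypothetical linked cell
]
names = [x.split('</a>')[0].split('>')[-1].lower() for x in cells]
print(names)  # ['gpt-4o', 'claude-3.5-sonnet']
```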
gen_table.py
ADDED
@@ -0,0 +1,142 @@
import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_TASK


def listinstr(lst, s):
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results():
    # data = json.loads(urlopen(URL).read())
    with open('results.json', 'r') as file:
        data = json.load(file)
    return data


def nth_large(val, vals):
    return sum([1 for v in vals if v > val]) + 1


def format_timestamp(timestamp):
    # 'YYYYMMDD' -> 'YYYY.MM.DD'
    date = timestamp[:-4] + '.' + timestamp[-4:-2] + '.' + timestamp[-2:]
    return date


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Model']
    # revise here to set the default tasks
    check_box['required'] = DEFAULT_TASK
    check_box['all'] = DEFAULT_TASK
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map

    df = generate_table(results, fields)
    return df, check_box


def BUILD_L2_DF(results, task):
    results = results[task]
    model_list = []
    benchmark_list = []
    all_fields = []
    for benchmark in results:
        if benchmark != 'category':
            benchmark_list += [benchmark]
            if benchmark not in ["CRUXEval", "AutoPenBench"]:
                all_fields += [benchmark]
            else:
                # These two report separate autonomous / assisted scores.
                all_fields += [benchmark + ' (autonomous)', benchmark + ' (assisted)']
            model_list += list(results[benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list

    for benchmark in benchmark_list:
        if benchmark not in ["CRUXEval", "AutoPenBench"]:
            for model in model_list:
                if model in results[benchmark]:
                    res[benchmark].append(results[benchmark][model])
                else:
                    res[benchmark].append(None)
        else:
            for model in model_list:
                if model in results[benchmark]:
                    res[benchmark + ' (autonomous)'].append(results[benchmark][model]['autonomous'])
                    res[benchmark + ' (assisted)'].append(results[benchmark][model]['assisted'])
                else:
                    # Guard against models missing from this benchmark.
                    res[benchmark + ' (autonomous)'].append(None)
                    res[benchmark + ' (assisted)'].append(None)

    df = pd.DataFrame(res)
    required_fields = all_fields

    check_box = {}
    check_box['essential'] = ['Model']
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    check_box['type_map'] = type_map
    return df, check_box


def generate_table(results, fields):
    model_list = []
    task_list = fields
    benchmark_list = []
    for task in results:
        for benchmark in results[task]:
            if benchmark != 'category':
                benchmark_list += [benchmark]
                model_list += list(results[task][benchmark].keys())
    model_list = list(set(model_list))

    res = defaultdict(list)
    res['Model'] = model_list

    average_score = {}
    cnt = {}
    for task in task_list:
        task_score = []
        for model in model_list:
            score = []
            for benchmark in results[task]:
                if benchmark != 'category':
                    if model not in results[task][benchmark]:
                        score.append(None)
                    elif not isinstance(results[task][benchmark][model], (int, float)):
                        # Dict-valued entries hold autonomous / assisted scores; average them.
                        score.append((results[task][benchmark][model]["autonomous"] + results[task][benchmark][model]["assisted"]) / 2)
                    else:
                        score.append(results[task][benchmark][model])
            if not any(item is not None for item in score):
                score = None
            else:
                score = np.mean([s for s in score if s is not None])
            if score is not None:
                # Running mean over the tasks for which the model has a score.
                if model not in average_score:
                    average_score[model] = score
                    cnt[model] = 1
                else:
                    average_score[model] = ((average_score[model] * cnt[model]) + score) / (cnt[model] + 1)
                    cnt[model] += 1
            task_score.append(score)
        res[task] = task_score

    # res['Avg Score'] = [average_score[model] for model in model_list]
    # res['Avg Rank'] = [sorted(res['Avg Score'], reverse=True).index(score) + 1 for score in res['Avg Score']]

    df = pd.DataFrame(res)
    # valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    # valid = valid.sort_values('Avg Score')
    # valid = valid.iloc[::-1]
    # if len(fields):
    #     missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
    #     missing = missing.iloc[::-1]
    # df = pd.concat([valid, missing])
    return df
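For a quick local sanity check of the two table builders (a minimal sketch, assuming `results.json` sits next to the script as it does in this repo, with the dependencies from requirements.txt installed):

```python
# Build the L1 (per-task) and L2 (per-benchmark) tables outside Gradio.
from gen_table import load_results, generate_table, BUILD_L2_DF
from meta_data import DEFAULT_TASK

struct = load_results()
results = struct['results']

# L1: one column per task; each cell is the mean over that task's benchmarks.
print(generate_table(results, DEFAULT_TASK))

# L2: one column per benchmark within a single task.
df, check_box = BUILD_L2_DF(results, 'CTF')
print(df[check_box['essential'] + check_box['all']])
```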
meta_data.py
ADDED
@@ -0,0 +1,54 @@
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@article{guo2025sok,
  title={{Frontier AI's Impact on the Cybersecurity Landscape}},
  author={Guo, Wenbo and Potter, Yujin and Shi, Tianneng and Wang, Zhun and Zhang, Andy and Song, Dawn},
  journal={arXiv preprint arXiv:2504.05408},
  year={2025}
}
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# CONSTANTS-TEXT
LEADERBOARD_INTRODUCTION = """# Cybersecurity Leaderboard
### Welcome to the Cybersecurity Leaderboard! This leaderboard is a collection of benchmarks relevant to cybersecurity capabilities.
This leaderboard covers {} benchmarks across {} aspects of cybersecurity work.

This leaderboard was last updated: {} """
# CONSTANTS-FIELDS
# META_FIELDS = [
#     'Model'
# ]

DEFAULT_TASK = [
    'Vulnerable code generation', 'Attack generation', 'CTF', 'Cyber knowledge', 'Pen test', 'Vulnerability detection', 'PoC generation', 'Patching'
]
MMBENCH_FIELDS = ['MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'CCBench']

# The Markdown description shown above each leaderboard tab
LEADERBOARD_MD = {}

LEADERBOARD_MD['MAIN'] = """
## Main Evaluation Results

- Metrics:
  - Avg Score: The average score on {} Cybersecurity Benchmarks (normalized to 0 - 100; higher is better).
  - Avg Rank: The average rank on {} Cybersecurity Benchmarks (lower is better).
  - Avg Score & Rank are calculated over the selected benchmarks. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!**
"""

LEADERBOARD_MD['Vulnerable code generation'] = """Need to add a description
"""
LEADERBOARD_MD['Attack generation'] = """Need to add a description
"""
LEADERBOARD_MD['CTF'] = """Need to add a description
"""
LEADERBOARD_MD['Cyber knowledge'] = """Need to add a description
"""
LEADERBOARD_MD['Pen test'] = """Need to add a description
"""
LEADERBOARD_MD['Vulnerability detection'] = """Need to add a description
"""
LEADERBOARD_MD['PoC generation'] = """Need to add a description
"""
LEADERBOARD_MD['Patching'] = """Need to add a description
"""
requirements.txt
ADDED
@@ -0,0 +1,3 @@
gradio==4.44.1
numpy>=1.23.4
pandas>=1.5.3
results.json
ADDED
@@ -0,0 +1,136 @@
{
    "time": "20250418",
    "results": {
        "Vulnerable code generation": {
            "category": "attack",
            "CyberSecEval-3": {
                "GPT-4": 35,
                "Llama-3.1-405B": 39,
                "Llama-3.1-70B": 35
            },
            "SecCodePLT": {
                "GPT-4o": 44,
                "Llama-3.1-70B": 47
            }
        },
        "Attack generation": {
            "category": "attack",
            "CyberSecEval-3": {
                "GPT-4": 40,
                "Llama-3.1-405B": 49,
                "Llama-3.1-70B": 41
            },
            "SecCodePLT": {
                "GPT-4o": 0.2,
                "Claude-3.5-Sonnet": 0.2,
                "Llama-3.1-70B": 0
            },
            "RedCode-Gen": {
                "GPT-3.5": 32.5,
                "GPT-4": 66.9,
                "GPT-4o": 72.5,
                "Llama-2-7B": 20.7
            },
            "RedCode-Exec": {
                "GPT-4": 64.5,
                "GPT-4o": 77.23,
                "Claude-3.5-Sonnet": 67.63,
                "Llama-3.1-70B": 76.70,
                "Llama-3.1-8B": 62.87
            }
        },
        "CTF": {
            "category": "attack",
            "CyBench": {
                "GPT-4o": 12.5,
                "GPT-4.5-preview": 17.5,
                "o1-preview": 10.0,
                "o3-mini": 22.5,
                "Claude-3.5-Sonnet": 17.5,
                "Claude-3.7-Sonnet": 20,
                "Gemini-1.5-pro": 7.5,
                "Llama-3.1-405B": 7.5,
                "Llama-3.1-70B": 5.0
            },
            "NYU": {
                "GPT-4": 7.00,
                "GPT-4o": 9.50,
                "Claude-3.5-Sonnet": 13.50
            }
        },
        "Cyber knowledge": {
            "category": "attack",
            "CyberBench": {
                "GPT-3.5": 62.6,
                "GPT-4": 69.9,
                "Llama-2-7B": 50.6
            },
            "CyberMetric": {
                "GPT-3.5": 88.10,
                "GPT-4": 91.00,
                "GPT-4o": 91.25,
                "Gemini-1.0-pro": 84.00,
                "Llama-3-8B": 73.05,
                "Llama-2-70B": 72.60
            },
            "TACTL": {
                "GPT-4o": 85.2,
                "DeepSeek-R1": 91.8,
                "DeepSeek-V3": 86.3,
                "Llama-3.1-405B": 88.5,
                "Llama-3.3-70B": 78.7
            }
        },
        "Pen test": {
            "category": "defense",
            "AutoPenBench": {
                "GPT-4o": {
                    "autonomous": 21.00,
                    "assisted": 64.00
                }
            }
        },
        "Vulnerability detection": {
            "category": "defense",
            "PrimeVul": {
                "GPT-3.5": 6.21,
                "GPT-4": 12.94
            }
        },
        "PoC generation": {
            "category": "defense",
            "CRUXEval": {
                "GPT-3.5": {
                    "autonomous": 49.1,
                    "assisted": 63.3
                },
                "GPT-4": {
                    "autonomous": 74.8,
                    "assisted": 81.9
                },
                "Code-Llama-13B": {
                    "autonomous": 39.1,
                    "assisted": 39.3
                },
                "Code-Llama-34B": {
                    "autonomous": 50.4,
                    "assisted": 46.0
                }
            }
        },
        "Patching": {
            "category": "defense",
            "SWE-bench-verified": {
                "GPT-3.5": 0.4,
                "GPT-4": 22.4,
                "GPT-4o": 38.8,
                "o1": 48.9,
                "o3-mini": 49.3,
                "Claude-3.5-Sonnet": 49.0,
                "Claude-3.7-Sonnet": 70.3,
                "DeepSeek-V3": 42.0,
                "DeepSeek-R1": 49.2
            }
        }
    }
}
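The schema nests two levels under `results`: task → benchmark → model, where a score is either a single number or an `autonomous` / `assisted` pair (CRUXEval and AutoPenBench), and each task carries a `category` tag. A short sketch that walks the file and flattens dict-valued scores the same way `generate_table` does:

```python
import json

# Flatten results.json into (category, task, benchmark, model, score) rows.
with open('results.json') as f:
    data = json.load(f)

for task, benchmarks in data['results'].items():
    category = benchmarks['category']
    for benchmark, scores in benchmarks.items():
        if benchmark == 'category':
            continue
        for model, score in scores.items():
            if isinstance(score, dict):  # autonomous / assisted pair
                score = (score['autonomous'] + score['assisted']) / 2
            print(f'[{category}] {task} / {benchmark} / {model}: {score}')
```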