File size: 4,543 Bytes
3b232e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d523c31
 
 
3b232e3
 
d523c31
 
 
 
 
 
 
 
3b232e3
d523c31
 
 
3b232e3
 
d523c31
3b232e3
 
d523c31
3b232e3
d523c31
 
3b232e3
 
 
 
d523c31
3b232e3
 
 
d523c31
 
 
3b232e3
 
 
 
 
 
 
d523c31
 
 
 
3b232e3
 
d523c31
 
 
 
 
 
 
 
 
3b232e3
 
 
 
 
 
d523c31
3b232e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d523c31
 
 
3b232e3
d523c31
 
 
 
 
 
 
 
 
3b232e3
 
 
 
 
d523c31
3b232e3
 
 
 
 
 
d523c31
3b232e3
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import gradio as gr
from scipy.stats import anderson_ksamp
from statsmodels.stats.weightstats import ztest
import json
import pandas as pd
from datetime import date
from huggingface_hub import HfApi
from os.path import isfile
from func_timeout import func_timeout, FunctionTimedOut

from scripts.bold import generate_and_evaluate_causal_lm_toxicity_bold
from scripts.honest import calculate_honest, detect_honest_bias
from scripts.winobias import calculate_wino_bias
from avidtools.datamodels.report import Report
from avidtools.datamodels.components import *
from avidtools.datamodels.enums import *

def generate_report(classof, type, risk_domain, sep, lifecycle, scraped=None):
    """Assemble an AVID report from the UI taxonomy selections.

    Parameters
    ----------
    classof :
        Selected ``ClassEnum`` value from the "Class" radio.
    type :
        Selected ``TypeEnum`` value from the "Type" radio. (The name shadows
        the ``type`` builtin; kept for backward compatibility with the
        existing Gradio wiring.)
    risk_domain :
        List of selected risk-domain strings (e.g. ``['Security']``).
    sep :
        List of selected ``SepEnum`` values ("Effect Categories" checkboxes).
    lifecycle :
        List of selected ``LifecycleEnum`` values ("Lifecycle Categories").
    scraped :
        Optional mapping with ``'title'``, ``'description'`` and ``'url'``
        keys. Defaults to the module-level ``scraped_input`` placeholder,
        preserving the original behavior.

    Returns
    -------
    dict
        The serialized ``Report`` (pydantic ``.dict()``), suitable for
        display in the ``gr.Json`` component.
    """
    # Fall back to the module-level placeholder when no scrape is supplied.
    src = scraped_input if scraped is None else scraped

    report = Report()

    report.problemtype = Problemtype(
        classof = classof,
        type = type,
        description = LangValue(
            lang = 'eng',
            value = src['title']
        )
    )
    report.references = [
        Reference(
            label = src['description'],
            url = src['url']
        )
    ]
    report.description = LangValue(
        lang = 'eng',
        value = src['description']
    )
    report.impact = Impact(
        avid = AvidTaxonomy(
            risk_domain = risk_domain,
            sep_view = sep,
            lifecycle_view = lifecycle,
            taxonomy_version = '0.2'
        )
    )
    report.reported_date = date.today()

    return report.dict()

# Placeholder paper metadata shown in Step 1 of the UI.
# NOTE(review): currently static — presumably meant to be replaced by a
# real scraper upstream; confirm before relying on these values.
scraped_input = dict(
    title="### title",
    description="description",
    url="https://link.to.arxiv.paper",
)

# Example of the selection payload the UI produces (kept for reference):
# selections = {
#     "classof": ClassEnum.llm,
#     "type": TypeEnum.detection,
#     "avid": {
#         "risk_domain": ["Security"],
#         "sep": [SepEnum.E0101],
#         "lifecycle": [LifecycleEnum.L05]
#     }
# }

# Top-level Blocks container for the report-builder UI (Soft theme).
demo = gr.Blocks(theme=gr.themes.Soft())
# demo = gr.Blocks(theme='gradio/darkdefault')

# UI layout: three-column flow — (1) scraped paper info, (2) taxonomy
# selectors, (3) report generation — followed by the click wiring and launch.
with demo:

    gr.Markdown("# Report AI Vulnerability Research")
    gr.Markdown("""
    As language models become more prevalent in day-to-day technology, it's important to develop methods to \
    investigate their biases and limitations. To this end, researchers are developing metrics like \
    BOLD, HONEST, and WinoBias that calculate scores which represent their tendency to generate "unfair" text across \
    different collections of prompts. With the widgets below, you can choose a model and a metric to run your own \
    evaluations.
    
    Generating these scores is only half the battle, though! What do you do with these numbers once you've evaluated \
    a model? [AVID](https://avidml.org)'s data model makes it easy to collect and communicate your findings with \
    structured reports.
    """)
    with gr.Row():
        # Step 1: display the (currently static) scraped title/description.
        with gr.Column(scale=2):
            gr.Markdown("""
            ## Step 1: \n\
            Select a model and a method of detection.
            """)
            with gr.Box():
                title = gr.Markdown(scraped_input['title'])
                description = gr.Markdown(scraped_input['description'])

        # Step 2: taxonomy selectors; choices come from the avidtools enums.
        with gr.Column(scale=3):
            gr.Markdown("""## Step 2: \
            Categorize your report.""")

            # NOTE(review): `type` shadows the builtin; left as-is because
            # generate_report's parameter uses the same name.
            classof = gr.Radio(label="Class", choices=[ce.value for ce in ClassEnum])
            type = gr.Radio(label="Type", choices=[te.value for te in TypeEnum])
            risk_domain = gr.CheckboxGroup(label="Risk Domain", choices=['Security','Ethics','Performance'])
            sep = gr.CheckboxGroup(label="Effect Categories", choices=[se.value for se in SepEnum])
            lifecycle = gr.CheckboxGroup(label="Lifecycle Categories", choices=[le.value for le in LifecycleEnum])

        # Step 3: trigger report generation and show the resulting JSON.
        with gr.Column(scale=5):
            gr.Markdown("""
            ## Step 3: \n\
            Generate a report that you can submit to AVID.

            The title and abstract get auto-populated from Step 1. The taxonomy categories populate from your selections in Step 2.
            """)
            report_button = gr.Button("Generate Report")
            report_json = gr.Json(label="AVID Report")

    # Wire the button: the five selection components feed generate_report,
    # whose dict return is rendered by the Json component.
    report_button.click(
        fn=generate_report,
        inputs=[classof,type,risk_domain,sep,lifecycle],
        outputs=[report_json]
    )

demo.launch()