saattrupdan committed on
Commit 5d40291 · 1 Parent(s): 9a46da5

feat: Add About tab

Files changed (1)
  1. app.py +157 -72
app.py CHANGED
@@ -21,6 +21,92 @@ logging.basicConfig(level=logging.INFO, format=fmt)
 logger = logging.getLogger("radial_plot_generator")
 
 
+INTRO_MARKDOWN = """
+# Radial Plot Generator
+
+This demo allows you to generate a radial plot comparing the performance of different
+language models on different tasks. It is based on the generative results from the
+[ScandEval benchmark](https://scandeval.com).
+"""
+
+
+ABOUT_MARKDOWN = """
+## About the ScandEval Benchmark
+
+The [ScandEval benchmark](https://scandeval.com) is used to compare pretrained language
+models on tasks in Danish, Swedish, Norwegian Bokmål, Norwegian Nynorsk, Icelandic,
+Faroese, German, Dutch and English. The benchmark supports both encoder models (such as
+BERT) and generative models (such as GPT), and leaderboards for both kinds [are
+available](https://scandeval.com).
+
+The generative models are evaluated using in-context learning with few-shot prompts.
+The few-shot examples are sampled randomly from the training split, and we benchmark
+the models 10 times with bootstrapped test sets and different few-shot examples in each
+iteration. This allows us to better measure the uncertainty of the results.
+
+We use this uncertainty in the radial plot when we compute the win ratios (i.e., the
+percentage of other models that a model beats on a task). Namely, we compute the win
+ratio as the percentage of other models that a model _significantly_ beats on a task,
+where we use a paired t-test with a significance level of 0.05 to determine whether a
+model significantly beats another model.
+
+## The Benchmark Datasets
+
+The ScandEval generative benchmark currently covers the languages Danish, Swedish,
+Norwegian, Icelandic, German, Dutch and English. For each language, the benchmark
+consists of 7 different tasks, each of which consists of 1-2 datasets. The tasks are
+the following:
+
+### Text Classification
+Given a piece of text, classify it into a number of classes. For this task we extract
+the first token of the possible labels, and choose the label whose first token has the
+highest probability. All datasets in this category are currently trinary sentiment
+classification datasets. We use the Matthews Correlation Coefficient (MCC) as the
+evaluation metric.
+
+### Information Extraction
+Given a piece of text, extract a number of entities from the text. As the model needs
+to extract multiple entities, we use [structured
+generation](https://github.com/noamgat/lm-format-enforcer) to make the model generate a
+JSON dictionary with keys being the entity categories and values being lists of the
+identified entities. All datasets in this task are named entity recognition datasets.
+We use the micro-averaged F1 score as the evaluation metric, where we ignore the
+Miscellaneous category.
+
+### Grammar
+Given a piece of text, determine whether it is grammatically correct or not. All
+datasets in this task are built from the dependency treebanks of the languages, where
+words are removed or swapped in a way that makes the sentence ungrammatical. We use
+the Matthews Correlation Coefficient (MCC) as the evaluation metric.
+
+### Question Answering
+Given a question and a piece of text, extract the answer to the question from the text.
+All datasets in this task are extractive question answering datasets. We use the exact
+match (EM) score as the evaluation metric.
+
+### Summarisation
+Given a piece of text, generate a summary of the text. All the datasets come from
+either news articles or WikiHow articles. We use the BERTScore metric as the evaluation
+metric, where the encoder model used is
+[microsoft/mdeberta-v3-base](https://huggingface.co/microsoft/mdeberta-v3-base).
+
+### Knowledge
+Given a trivia-style question with multiple choice answers, choose the correct answer.
+As with text classification, we use the probabilities of the answer letter (a, b, c or
+d) to choose the answer. The datasets in this task are machine translated versions of
+the [MMLU](https://doi.org/10.48550/arXiv.2009.03300) and
+[ARC](https://allenai.org/data/arc) datasets. We use the Matthews Correlation
+Coefficient (MCC) as the evaluation metric.
+
+### Reasoning
+Given a scenario and multiple possible endings, choose the correct ending. As with text
+classification, we use the probabilities of the answer letter (a, b, c or d) to choose
+the answer. The datasets in this task are machine translated versions of the
+[HellaSwag](https://rowanzellers.com/hellaswag/) dataset. We use the Matthews
+Correlation Coefficient (MCC) as the evaluation metric.
+"""
+
+
 UPDATE_FREQUENCY_MINUTES = 30
 
 
@@ -155,78 +241,77 @@ def main() -> None:
     })
 
     with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
-        gr.Markdown("# Radial Plot Generator")
-        gr.Markdown(
-            "This demo allows you to generate a radial plot comparing the performance "
-            "of different language models on different tasks. It is based on the "
-            "generative results from the [ScandEval benchmark](https://scandeval.com)."
-        )
-        with gr.Column():
-            with gr.Row():
-                language_names_dropdown = gr.Dropdown(
-                    choices=all_languages,
-                    multiselect=True,
-                    label="Languages",
-                    value=["Danish"],
-                    interactive=True,
-                    scale=2,
-                )
-                model_ids_dropdown = gr.Dropdown(
-                    choices=danish_models,
-                    multiselect=True,
-                    label="Models",
-                    value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
-                    interactive=True,
-                    scale=2,
-                )
-            with gr.Row():
-                use_win_ratio_checkbox = gr.Checkbox(
-                    label="Compare models with win ratios (as opposed to raw scores)",
-                    value=True,
-                    interactive=True,
-                    scale=1,
-                )
-                show_scale_checkbox = gr.Checkbox(
-                    label="Show the scale on the plot (always 0-100)",
-                    value=False,
-                    interactive=True,
-                    scale=1,
-                )
-                plot_width_slider = gr.Slider(
-                    label="Plot width",
-                    minimum=600,
-                    maximum=1000,
-                    step=10,
-                    value=800,
-                    interactive=True,
-                    scale=1,
-                )
-                plot_height_slider = gr.Slider(
-                    label="Plot height",
-                    minimum=300,
-                    maximum=700,
-                    step=10,
-                    value=500,
-                    interactive=True,
-                    scale=1,
-                )
-            with gr.Row():
-                plot = gr.Plot(
-                    value=produce_radial_plot(
-                        model_ids_dropdown.value,
-                        language_names=language_names_dropdown.value,
-                        use_win_ratio=use_win_ratio_checkbox.value,
-                        show_scale=show_scale_checkbox.value,
-                        plot_width=plot_width_slider.value,
-                        plot_height=plot_height_slider.value,
-                        results_dfs=results_dfs,
-                    ),
-                )
-            with gr.Row():
-                gr.Markdown(
-                    "<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
-                    "Alexandra Institute</a>.</center>"
-                )
+        gr.Markdown(INTRO_MARKDOWN)
+
+        with gr.Tab(label="Build a Radial Plot"):
+            with gr.Column():
+                with gr.Row():
+                    language_names_dropdown = gr.Dropdown(
+                        choices=all_languages,
+                        multiselect=True,
+                        label="Languages",
+                        value=["Danish"],
+                        interactive=True,
+                        scale=2,
+                    )
+                    model_ids_dropdown = gr.Dropdown(
+                        choices=danish_models,
+                        multiselect=True,
+                        label="Models",
+                        value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
+                        interactive=True,
+                        scale=2,
+                    )
+                with gr.Row():
+                    use_win_ratio_checkbox = gr.Checkbox(
+                        label="Compare models with win ratios (as opposed to raw scores)",
+                        value=True,
+                        interactive=True,
+                        scale=1,
+                    )
+                    show_scale_checkbox = gr.Checkbox(
+                        label="Show the scale on the plot (always 0-100)",
+                        value=False,
+                        interactive=True,
+                        scale=1,
+                    )
+                    plot_width_slider = gr.Slider(
+                        label="Plot width",
+                        minimum=600,
+                        maximum=1000,
+                        step=10,
+                        value=800,
+                        interactive=True,
+                        scale=1,
+                    )
+                    plot_height_slider = gr.Slider(
+                        label="Plot height",
+                        minimum=300,
+                        maximum=700,
+                        step=10,
+                        value=500,
+                        interactive=True,
+                        scale=1,
+                    )
+                with gr.Row():
+                    plot = gr.Plot(
+                        value=produce_radial_plot(
+                            model_ids_dropdown.value,
+                            language_names=language_names_dropdown.value,
+                            use_win_ratio=use_win_ratio_checkbox.value,
+                            show_scale=show_scale_checkbox.value,
+                            plot_width=plot_width_slider.value,
+                            plot_height=plot_height_slider.value,
+                            results_dfs=results_dfs,
+                        ),
+                    )
+                with gr.Row():
+                    gr.Markdown(
+                        "<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
+                        "Alexandra Institute</a>.</center>"
+                    )
+        with gr.Tab(label="About"):
+            gr.Markdown(ABOUT_MARKDOWN)
 
         language_names_dropdown.change(
            fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
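
The win-ratio computation described in the ABOUT_MARKDOWN text added by this commit can be illustrated with a minimal sketch. This is not part of the commit or of the ScandEval codebase; the function name, input shapes and the use of `scipy.stats.ttest_rel` are assumptions made purely for illustration.

```python
# Illustrative sketch only -- not from this commit or the ScandEval codebase.
# Win ratio as described above: the percentage of *other* models that a model
# significantly beats on a task, using a one-sided paired t-test at the 5% level
# over the matched benchmark iterations (10 bootstrapped runs per model).
from scipy.stats import ttest_rel


def win_ratio(model_scores: list[float], other_models_scores: list[list[float]]) -> float:
    """Return the percentage of other models that `model_scores` significantly beats."""
    wins = 0
    for other_scores in other_models_scores:
        # Paired t-test over the matched iterations, testing whether this model's
        # scores are higher than the other model's scores.
        result = ttest_rel(model_scores, other_scores, alternative="greater")
        if result.pvalue < 0.05:
            wins += 1
    return 100 * wins / len(other_models_scores)
```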