Commit 5d40291
Parent(s): 9a46da5
feat: Add About tab

app.py CHANGED
```diff
@@ -21,6 +21,92 @@ logging.basicConfig(level=logging.INFO, format=fmt)
 logger = logging.getLogger("radial_plot_generator")
 
 
+INTRO_MARKDOWN = """
+# Radial Plot Generator
+
+This demo allows you to generate a radial plot comparing the performance of different
+language models on different tasks. It is based on the generative results from the
+[ScandEval benchmark](https://scandeval.com).
+"""
+
+
+ABOUT_MARKDOWN = """
+## About the ScandEval Benchmark
+
+The [ScandEval benchmark](https://scandeval.com) is used to compare pretrained language
+models on tasks in Danish, Swedish, Norwegian Bokmål, Norwegian Nynorsk, Icelandic,
+Faroese, German, Dutch and English. The benchmark supports both encoder models (such as
+BERT) and generative models (such as GPT), and leaderboards for both kinds [are
+available](https://scandeval.com).
+
+The generative models are evaluated using in-context learning with few-shot prompts.
+The few-shot examples are sampled randomly from the training split, and we benchmark
+the models 10 times with bootstrapped test sets and different few-shot examples in each
+iteration. This allows us to better measure the uncertainty of the results.
+
+We use the uncertainty in the radial plot when we compute the win ratios (i.e., the
+percentage of other models that a model beats on a task). Namely, we compute the win
+ratio as the percentage of other models that a model _significantly_ beats on a task,
+where we use a paired t-test with a significance level of 0.05 to determine whether a
+model significantly beats another model.
+
+## The Benchmark Datasets
+
+The ScandEval generative benchmark currently covers the languages Danish, Swedish,
+Norwegian, Icelandic, German, Dutch and English. For each language, the benchmark
+consists of 7 different tasks, each of which consists of 1-2 datasets. The tasks are
+the following:
+
+### Text Classification
+Given a piece of text, classify it into a number of classes. For this task we extract
+the first token of the possible labels, and choose the label whose first token has the
+highest probability. All datasets in this category are currently trinary sentiment
+classification datasets. We use the Matthews Correlation Coefficient (MCC) as the
+evaluation metric.
+
+### Information Extraction
+Given a piece of text, extract a number of entities from the text. As the model needs
+to extract multiple entities, we use [structured
+generation](https://github.com/noamgat/lm-format-enforcer) to make the model generate a
+JSON dictionary with keys being the entity categories and values being lists of the
+identified entities. All datasets in this task are named entity recognition datasets.
+We use the micro-averaged F1 score as the evaluation metric, where we ignore the
+Miscellaneous category.
+
+### Grammar
+Given a piece of text, determine whether it is grammatically correct or not. All
+datasets in this task are built from the dependency treebanks of the languages, where
+words are removed or swapped in a way that makes the sentence ungrammatical. We use
+the Matthews Correlation Coefficient (MCC) as the evaluation metric.
+
+### Question Answering
+Given a question and a piece of text, extract the answer to the question from the text.
+All datasets in this task are extractive question answering datasets. We use the exact
+match (EM) score as the evaluation metric.
+
+### Summarisation
+Given a piece of text, generate a summary of the text. All the datasets come from
+either news articles or WikiHow articles. We use the BERTScore metric as the evaluation
+metric, where the encoder model used is
+[microsoft/mdeberta-v3-base](https://huggingface.co/microsoft/mdeberta-v3-base).
+
+### Knowledge
+Given a trivia-style question with multiple choice answers, choose the correct answer.
+As with text classification, we use the probabilities of the answer letter (a, b, c or
+d) to choose the answer. The datasets in this task are machine translated versions of
+the [MMLU](https://doi.org/10.48550/arXiv.2009.03300) and
+[ARC](https://allenai.org/data/arc) datasets. We use the Matthews Correlation
+Coefficient (MCC) as the evaluation metric.
+
+### Reasoning
+Given a scenario and multiple possible endings, choose the correct ending. As with text
+classification, we use the probabilities of the answer letter (a, b, c or d) to choose
+the answer. The datasets in this task are machine translated versions of the
+[HellaSwag](https://rowanzellers.com/hellaswag/) dataset. We use the Matthews
+Correlation Coefficient (MCC) as the evaluation metric.
+"""
+
+
 UPDATE_FREQUENCY_MINUTES = 30
 
 
```
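The win-ratio rule spelled out in ABOUT_MARKDOWN is concrete enough to sketch in code. Below is a minimal illustration, assuming each model's result on a task is its list of 10 per-iteration scores; the function name, the data layout, and the use of scipy's paired t-test helper are illustrative assumptions, not code from app.py.

```python
# Hypothetical sketch of the win ratio described above: the percentage of
# other models that a model *significantly* beats on a task, judged by a
# paired t-test at the 0.05 level. Names and data layout are assumptions.
from scipy.stats import ttest_rel


def win_ratio(model: str, scores: dict[str, list[float]], alpha: float = 0.05) -> float:
    """Percentage of rival models that `model` significantly beats on a task."""
    rivals = [m for m in scores if m != model]
    if not rivals:
        return 0.0
    wins = 0
    for rival in rivals:
        # One-sided paired t-test over the matched benchmark iterations.
        _, p_value = ttest_rel(scores[model], scores[rival], alternative="greater")
        if p_value < alpha:
            wins += 1
    return 100 * wins / len(rivals)
```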
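The "first token of the possible labels" scoring used for text classification (and, via answer letters, for the knowledge and reasoning tasks) can be sketched as follows. This assumes a Hugging Face causal LM; the model, prompt handling, and leading-space tokenisation are placeholder choices, not ScandEval's actual implementation.

```python
# Illustrative first-token label scoring: compare the next-token logits of
# each label's first token and pick the most probable label.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")


def pick_label(prompt: str, labels: list[str]) -> str:
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits[0, -1]  # logits for the next token
    # First token ID of each label, assuming a leading space as in GPT-style
    # tokenisers (e.g. " positive" starts with a single token).
    first_ids = [tokenizer.encode(" " + label)[0] for label in labels]
    return labels[int(torch.argmax(logits[first_ids]))]


print(pick_label("Review: Great film!\nSentiment:", ["positive", "neutral", "negative"]))
```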
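Several of the tasks above report the Matthews Correlation Coefficient; scikit-learn ships it directly, including for multi-class labels such as the trinary sentiment classes:

```python
# MCC ranges from -1 to 1, with 0 at chance level, and is well defined for
# multi-class problems such as the trinary sentiment datasets.
from sklearn.metrics import matthews_corrcoef

y_true = ["positive", "neutral", "negative", "negative"]
y_pred = ["positive", "negative", "negative", "negative"]
print(matthews_corrcoef(y_true, y_pred))  # ≈ 0.65 for this toy example
```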
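For the information-extraction task, micro-averaged F1 with the Miscellaneous category ignored can be sketched like this, assuming gold and predicted entities are represented as (category, text) pairs; that representation and the "MISC" tag name are assumptions for illustration.

```python
# Micro-averaged entity F1 that drops the Miscellaneous ("MISC") category
# before counting true positives; the entity representation is illustrative.
def micro_f1(gold: set[tuple[str, str]], pred: set[tuple[str, str]]) -> float:
    gold = {e for e in gold if e[0] != "MISC"}
    pred = {e for e in pred if e[0] != "MISC"}
    tp = len(gold & pred)  # exact (category, text) matches
    precision = tp / len(pred) if pred else 0.0
    recall = tp / len(gold) if gold else 0.0
    return 2 * precision * recall / (precision + recall) if tp else 0.0
```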
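The summarisation section names BERTScore with microsoft/mdeberta-v3-base as the encoder. A hedged sketch with the bert-score package follows; the num_layers value is a guess (the package wants an explicit layer for models outside its defaults table), and the diff does not show the benchmark's actual settings.

```python
# Hypothetical BERTScore call; model_type matches the text above, but
# num_layers=9 is an assumed layer choice, not ScandEval's setting.
from bert_score import score

candidates = ["A short generated summary."]
references = ["The human-written reference summary."]
P, R, F1 = score(
    candidates,
    references,
    model_type="microsoft/mdeberta-v3-base",
    num_layers=9,
)
print(F1.mean().item())
```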
```diff
@@ -155,78 +241,77 @@ def main() -> None:
     })
 
     with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
-        gr.Markdown(
-            ... 70 lines of inline Markdown (not rendered in this view) ...
-        )
+        gr.Markdown(INTRO_MARKDOWN)
+
+        with gr.Tab(label="Build a Radial Plot"):
+            with gr.Column():
+                with gr.Row():
+                    language_names_dropdown = gr.Dropdown(
+                        choices=all_languages,
+                        multiselect=True,
+                        label="Languages",
+                        value=["Danish"],
+                        interactive=True,
+                        scale=2,
+                    )
+                    model_ids_dropdown = gr.Dropdown(
+                        choices=danish_models,
+                        multiselect=True,
+                        label="Models",
+                        value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"],
+                        interactive=True,
+                        scale=2,
+                    )
+                with gr.Row():
+                    use_win_ratio_checkbox = gr.Checkbox(
+                        label="Compare models with win ratios (as opposed to raw scores)",
+                        value=True,
+                        interactive=True,
+                        scale=1,
+                    )
+                    show_scale_checkbox = gr.Checkbox(
+                        label="Show the scale on the plot (always 0-100)",
+                        value=False,
+                        interactive=True,
+                        scale=1,
+                    )
+                    plot_width_slider = gr.Slider(
+                        label="Plot width",
+                        minimum=600,
+                        maximum=1000,
+                        step=10,
+                        value=800,
+                        interactive=True,
+                        scale=1,
+                    )
+                    plot_height_slider = gr.Slider(
+                        label="Plot height",
+                        minimum=300,
+                        maximum=700,
+                        step=10,
+                        value=500,
+                        interactive=True,
+                        scale=1,
+                    )
+                with gr.Row():
+                    plot = gr.Plot(
+                        value=produce_radial_plot(
+                            model_ids_dropdown.value,
+                            language_names=language_names_dropdown.value,
+                            use_win_ratio=use_win_ratio_checkbox.value,
+                            show_scale=show_scale_checkbox.value,
+                            plot_width=plot_width_slider.value,
+                            plot_height=plot_height_slider.value,
+                            results_dfs=results_dfs,
+                        ),
+                    )
+                with gr.Row():
+                    gr.Markdown(
+                        "<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">"
+                        "Alexandra Institute</a>.</center>"
+                    )
+        with gr.Tab(label="About"):
+            gr.Markdown(ABOUT_MARKDOWN)
 
         language_names_dropdown.change(
             fn=partial(update_model_ids_dropdown, results_dfs=results_dfs),
```
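The second hunk is the heart of the commit: the old inline gr.Markdown(...) wall becomes two tabs, with the intro kept above them. A stripped-down, runnable sketch of the same layout pattern (minus the app's data loading and plotting) might look like this; the stand-in strings are not the real constants.

```python
# Minimal two-tab Blocks layout mirroring the structure this commit adds.
import gradio as gr

INTRO = "# Radial Plot Generator"  # stand-in for INTRO_MARKDOWN
ABOUT = "## About the ScandEval Benchmark"  # stand-in for ABOUT_MARKDOWN

with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(INTRO)  # shown above both tabs
    with gr.Tab(label="Build a Radial Plot"):
        gr.Markdown("Controls and the radial plot go here.")
    with gr.Tab(label="About"):
        gr.Markdown(ABOUT)

if __name__ == "__main__":
    demo.launch()
```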