Nathan Habib committed
Commit e3aaf53 · Parent(s): 26286b2
add new evals to the leaderboard
Files changed:
- app.py +22 -19
- src/assets/hardcoded_evals.py +3 -0
- src/assets/text_content.py +53 -1
- src/get_model_info/utils.py +3 -0
- src/plots/read_results.py +6 -3
    	
app.py CHANGED

@@ -88,6 +88,9 @@ BENCHMARK_COLS = [
         AutoEvalColumn.hellaswag,
         AutoEvalColumn.mmlu,
         AutoEvalColumn.truthfulqa,
+        AutoEvalColumn.winogrande,
+        AutoEvalColumn.gsm8k,
+        AutoEvalColumn.drop
     ]
 ]

@@ -107,7 +110,7 @@ update_collections(original_df.copy())
 leaderboard_df = original_df.copy()

 models = original_df["model_name_for_query"].tolist()  # needed for model backlinks in their to the leaderboard
-plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
+#plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 to_be_dumped = f"models = {repr(models)}\n"

 (
@@ -516,24 +519,24 @@ with demo:
                 queue=True,
             )

-        with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
+        #     with gr.Row():
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["Average ⬆️"],
+        #                 HUMAN_BASELINES,
+        #                 title="Average of Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K", "DROP"],
+        #                 HUMAN_BASELINES,
+        #                 title="Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

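Since plot_df is now commented out in the hunk above, the "Metrics evolution through time" tab that consumed it is commented out as well; only the three new benchmark columns are actually wired in. As a point of reference, a minimal sketch of what BENCHMARK_COLS looks like after this change (the surrounding `c.name for c in [...]` comprehension and the AutoEvalColumn.arc entry are assumptions, since they sit outside the hunk; only the listed columns and the closing brackets are visible in the diff):

from src.get_model_info.utils import AutoEvalColumn

# Assumed overall shape of BENCHMARK_COLS after this commit; the hunk only
# shows the entries from hellaswag onward and the two closing brackets.
BENCHMARK_COLS = [
    c.name
    for c in [
        AutoEvalColumn.arc,         # assumed to precede the lines shown
        AutoEvalColumn.hellaswag,
        AutoEvalColumn.mmlu,
        AutoEvalColumn.truthfulqa,
        AutoEvalColumn.winogrande,  # added in this commit
        AutoEvalColumn.gsm8k,       # added in this commit
        AutoEvalColumn.drop,        # added in this commit
    ]
]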
    	
src/assets/hardcoded_evals.py CHANGED

@@ -35,6 +35,9 @@ baseline = {
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
+    AutoEvalColumn.winogrande.name: 50.0,
+    AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }
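The three new baseline values are not all random-chance scores: 50.0 is the random baseline for the binary-choice Winogrande task, while the GSM8K (0.21) and DROP (0.47) numbers are reference scores, as the side note added to src/assets/text_content.py below explains. A small sketch of how this baseline row averages out under the new 7-benchmark mean introduced in src/plots/read_results.py (the ARC value of 25.0 is an assumption, since it sits outside the hunk):

# Baseline scores per benchmark, as hard-coded in this commit.
baseline_scores = {
    "ARC": 25.0,          # assumed; not visible in the hunk above
    "HellaSwag": 25.0,
    "MMLU": 25.0,
    "TruthfulQA": 25.0,
    "Winogrande": 50.0,
    "GSM8K": 0.21,
    "DROP": 0.47,
}

# Same aggregation as read_results.py after this commit: sum divided by 7.0.
baseline_average = sum(baseline_scores.values()) / 7.0
print(round(baseline_average, 2))  # 21.53 with the assumed ARC value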
    	
src/assets/text_content.py CHANGED

@@ -31,7 +31,10 @@ If there is no icon, we have not uploaded the information on the model yet, feel
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank">  TruthfulQA </a> (0-shot) - a test to measure a model
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank">  TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually a minima a 6-shots task, as it is prepended by 6 examples systematically, even when launched using 0 for the number of few-shot examples.
+- <a href="https://arxiv.org/abs/1907.10641" target="_blank">  Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
+- <a href="https://arxiv.org/abs/2110.14168" target="_blank">  GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
+- <a href="https://arxiv.org/abs/1903.00161" target="_blank">  DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.

 For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
@@ -55,6 +58,14 @@ The tasks and few shots parameters are:
 - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
+- Winogrande: 5-shot, *winogrande* (`acc`)
+- GSM8k: 5-shot, *gsm8k* (`acc`)
+- DROP: 3-shot, *drop* (`f1`)
+
+Side note on the baseline scores:
+- for log-likelihood evaluation, we select the random baseline
+- for DROP, we select the best submission score according to [their leaderboard](https://leaderboard.allenai.org/drop/submissions/public) when the paper came out (NAQANet score)
+- for GSM8K, we select the score obtained in the paper after inetuning a 6B model on the full GSM8K training set for 50 epochs

 ## Quantization
 To get more information about quantization, see:
@@ -166,4 +177,45 @@ CITATION_BUTTON_TEXT = r"""
       eprint={2109.07958},
       archivePrefix={arXiv},
       primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+      title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+      author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+      year={2019},
+      eprint={1907.10641},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+      title={Training Verifiers to Solve Math Word Problems},
+      author={Karl Cobbe and
+                  Vineet Kosaraju and
+                  Mohammad Bavarian and
+                  Mark Chen and
+                  Heewoo Jun and
+                  Lukasz Kaiser and
+                  Matthias Plappert and
+                  Jerry Tworek and
+                  Jacob Hilton and
+                  Reiichiro Nakano and
+                  Christopher Hesse and
+                  John Schulman},
+      year={2021},
+      eprint={2110.14168},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1903-00161,
+      title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
+                  Over Paragraphs},
+      author={Dheeru Dua and
+                  Yizhong Wang and
+                  Pradeep Dasigi and
+                  Gabriel Stanovsky and
+                  Sameer Singh and
+                  Matt Gardner},
+      year={2019},
+      eprinttype={arXiv},
+      eprint={1903.00161},
+      primaryClass={cs.CL}
 }"""
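The task list in the hunk above pairs each leaderboard column with a Harness task, a few-shot count, and a reported metric. A compact restatement of those settings as a Python dict, purely for illustration (this structure does not exist in the repository, and the MMLU entry abbreviates the 57 hendrycksTest-* subtasks whose `acc` scores are averaged):

# Illustrative summary of the evaluation settings documented above;
# not a structure taken from the repository itself.
EVAL_SETTINGS = {
    # column:     (few-shot, harness task,      reported metric)
    "ARC":        (25, "arc:challenge",   "acc_norm"),
    "HellaSwag":  (10, "hellaswag",       "acc_norm"),
    "MMLU":       (5,  "hendrycksTest-*", "acc"),  # averaged over 57 subtasks
    "TruthfulQA": (0,  "truthfulqa:mc",   "mc2"),
    "Winogrande": (5,  "winogrande",      "acc"),
    "GSM8K":      (5,  "gsm8k",           "acc"),
    "DROP":       (3,  "drop",            "f1"),
}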
    	
src/get_model_info/utils.py CHANGED

@@ -29,6 +29,9 @@ class AutoEvalColumn:  # Auto evals column
     hellaswag = ColumnContent("HellaSwag", "number", True)
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA", "number", True)
+    winogrande = ColumnContent("Winogrande", "number", True)
+    gsm8k = ColumnContent("GSM8K", "number", True)
+    drop = ColumnContent("DROP", "number", True)
     model_type = ColumnContent("Type", "str", False)
     precision = ColumnContent("Precision", "str", False)  # , True)
     license = ColumnContent("Hub License", "str", False)
    	
src/plots/read_results.py CHANGED

@@ -8,13 +8,16 @@ import numpy as np

 from src.get_model_info.utils import AutoEvalColumn, make_clickable_model

-METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
-BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
+METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
+BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "winogrande", "gsm8k", "drop"]
 BENCH_TO_NAME = {
     "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
     "hendrycksTest": AutoEvalColumn.mmlu.name,
     "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
+    "winogrande": AutoEvalColumn.winogrande.name,
+    "gsm8k": AutoEvalColumn.gsm8k.name,
+    "drop": AutoEvalColumn.drop.name,
 }


@@ -46,7 +49,7 @@ class EvalResult:
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
-        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) /
+        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 7.0
         data_dict[AutoEvalColumn.still_on_hub.name] = (
             is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
         )
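The two lists above are positional: METRICS[i] is the metric read for BENCHMARKS[i], which is why both grew by the same three entries. A minimal sketch (not code from the repository; the helper names are hypothetical) of that pairing and of the new fixed divisor in the leaderboard average:

# METRICS[i] is the metric reported for BENCHMARKS[i]; both lists gained
# winogrande/gsm8k/drop entries in this commit.
METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc",
              "winogrande", "gsm8k", "drop"]

# Hypothetical helpers; only the pairing logic and the 7.0 divisor are in the diff.
BENCH_TO_METRIC = dict(zip(BENCHMARKS, METRICS))   # e.g. {"drop": "f1", ...}

def leaderboard_average(results: dict) -> float:
    # results maps benchmark name -> score, like EvalResult.results in the diff;
    # the divisor is now hard-coded to the 7 benchmarks, matching the hunk above.
    return sum(results.values()) / 7.0

print(BENCH_TO_METRIC["gsm8k"])                            # acc
print(leaderboard_average({b: 50.0 for b in BENCHMARKS}))  # 50.0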