	Fix TruthQA typo
app.py CHANGED

@@ -43,11 +43,11 @@ def load_results(model, benchmark, metric):
    with open(file_path) as fp:
        data = json.load(fp)
    accs = np.array([v[metric] for k, v in data["results"].items()])
-    mean_acc = np.mean(accs)
+    mean_acc = np.mean(accs)
    return mean_acc, data["config"]["model_args"]


-COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "…
+COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthfulQA (0-shot) ⬆️"]
TYPES = ["markdown","str", "number", "number", "number", "number", "number", ]

if not IS_PUBLIC:

@@ -57,36 +57,36 @@ if not IS_PUBLIC:
EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
def get_leaderboard():
-    if repo:
+    if repo:
        print("pulling changes")
        repo.git_pull()
-
+
    all_data = get_eval_results_dicts(IS_PUBLIC)
-
+
    if not IS_PUBLIC:
        gpt4_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
-            "Revision":"tech report",
+            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
+            "Revision":"tech report",
            "8bit":None,
            "Average ⬆️":84.3,
            "ARC (25-shot) ⬆️":96.3,
            "HellaSwag (10-shot) ⬆️":95.3,
            "MMLU (5-shot) ⬆️":86.4,
-            "…
+            "TruthfulQA (0-shot) ⬆️":59.0,
        }
        all_data.append(gpt4_values)
        gpt35_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
-            "Revision":"tech report",
+            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
+            "Revision":"tech report",
            "8bit":None,
            "Average ⬆️":71.9,
            "ARC (25-shot) ⬆️":85.2,
            "HellaSwag (10-shot) ⬆️":85.5,
            "MMLU (5-shot) ⬆️":70.0,
-            "…
+            "TruthfulQA (0-shot) ⬆️":47.0,
        }
        all_data.append(gpt35_values)
-
+
    dataframe = pd.DataFrame.from_records(all_data)
    dataframe = dataframe.sort_values(by=['Average ⬆️'], ascending=False)
    print(dataframe)

@@ -94,38 +94,38 @@ def get_leaderboard():
    return dataframe

def get_eval_table():
-    if repo:
+    if repo:
        print("pulling changes for eval")
        repo.git_pull()
-    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
+    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
    all_evals = []
-
+
    for entry in entries:
        print(entry)
        if ".json"in entry:
            file_path = os.path.join("evals/eval_requests", entry)
            with open(file_path) as fp:
                data = json.load(fp)
-
+
            data["# params"] = "unknown"
            data["model"] = make_clickable_model(data["model"])
            data["revision"] = data.get("revision", "main")
-
+

            all_evals.append(data)
        else:
            # this is a folder
-            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
+            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
            for sub_entry in sub_entries:
                file_path = os.path.join("evals/eval_requests", entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)
-
+
                #data["# params"] = get_n_params(data["model"])
                data["model"] = make_clickable_model(data["model"])
                all_evals.append(data)

-
+
    dataframe = pd.DataFrame.from_records(all_evals)
    return dataframe[EVAL_COLS]


@@ -137,12 +137,12 @@ def is_model_on_hub(model_name, revision) -> bool:
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision)
        return True
-
+
    except Exception as e:
        print("Could not get the model config from the hub")
        print(e)
        return False
-
+


def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):

@@ -152,12 +152,12 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
    if is_delta_weight and not is_model_on_hub(base_model, revision):
        print(base_model, "base model not found on hub")
        return
-
+
    if not is_model_on_hub(model, revision):
        print(model, "not found on hub")
        return
    print("adding new eval")
-
+
    eval_entry = {
        "model" : model,
        "base_model" : base_model,

@@ -166,22 +166,22 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
        "8bit_eval" : is_8_bit_eval,
        "is_delta_weight" : is_delta_weight,
        "status" : "PENDING"
-    }
-
+    }
+
    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]
-
+
    OUT_DIR=f"eval_requests/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
-
+
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))
    LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
-
+
    api = HfApi()
    api.upload_file(
        path_or_fileobj=out_path,

@@ -191,14 +191,14 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
        repo_type="dataset",
    )

-
+
def refresh():
    return get_leaderboard(), get_eval_table()
-
+


block = gr.Blocks()
-with block:
+with block:
    with gr.Row():
        gr.Markdown(f"""
# 🤗 Open LLM Leaderboard

@@ -208,49 +208,47 @@ Evaluation is performed against 4 popular benchmarks:
- <a href="https://arxiv.org/abs/1803.05457" target="_blank">  AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
- <a href="https://arxiv.org/abs/1905.07830" target="_blank">  HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
- <a href="https://arxiv.org/abs/2009.03300" target="_blank">  MMLU </a>  (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank">  …
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank">  TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.

We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
        """)
-
+
    with gr.Row():
        leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                    datatype=TYPES, max_rows=5)

-
-
+
+
    with gr.Row():
        gr.Markdown(f"""
    # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
-
+
    """)
    with gr.Accordion("Evaluation Queue", open=False):
        with gr.Row():
            eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
-                                                        datatype=EVAL_TYPES, max_rows=5)
-
+                                                        datatype=EVAL_TYPES, max_rows=5)
+
    with gr.Row():
        refresh_button = gr.Button("Refresh")
-        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
-
+        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
+
    with gr.Accordion("Submit a new model for evaluation"):
-        # with gr.Row():
-        #     gr.Markdown(f"""# Submit a new model for evaluation""")
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name")
                revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
-
+
            with gr.Column():
                is_8bit_toggle = gr.Checkbox(False, label="8 bit eval", visible=not IS_PUBLIC)
                private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                is_delta_weight = gr.Checkbox(False, label="Delta weights")
                base_model_name_textbox = gr.Textbox(label="base model (for delta)")
-
+
        with gr.Row():
            submit_button = gr.Button("Submit Eval")
            submit_button.click(add_new_eval, [model_name_textbox, base_model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
-
+

    block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
block.launch()
    	
utils.py CHANGED

@@ -21,7 +21,7 @@ BENCH_TO_NAME = {
    "arc_challenge":"ARC (25-shot) ⬆️",
     "hellaswag":"HellaSwag (10-shot) ⬆️",
     "hendrycks":"MMLU (5-shot) ⬆️",
-     "truthfulqa_mc":"…
+     "truthfulqa_mc":"TruthfulQA (0-shot) ⬆️",
}
def make_clickable_model(model_name):
    LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
 
			

