Spaces:
				
			
			
	
			
			
					
		Running
		
			on 
			
			CPU Upgrade
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
			on 
			
			CPU Upgrade
	
		edbeeching
		
	committed on
		
		
					Commit 
							
							·
						
						fcb01e3
	
1
								Parent(s):
							
							b2c063a
								
updates table to include revision
Browse files
    	
        app.py
    CHANGED
    
    | @@ -46,8 +46,8 @@ def load_results(model, benchmark, metric): | |
| 46 | 
             
                return mean_acc, data["config"]["model_args"]
         | 
| 47 |  | 
| 48 |  | 
| 49 | 
            -
            COLS = [" | 
| 50 | 
            -
            TYPES = ["str", | 
| 51 |  | 
| 52 | 
             
            EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
         | 
| 53 | 
             
            EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
         | 
| @@ -59,7 +59,7 @@ def get_leaderboard(): | |
| 59 | 
             
                all_data = get_eval_results_dicts()
         | 
| 60 | 
             
                dataframe = pd.DataFrame.from_records(all_data)
         | 
| 61 | 
             
                dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)
         | 
| 62 | 
            -
                
         | 
| 63 | 
             
                dataframe = dataframe[COLS]
         | 
| 64 | 
             
                return dataframe
         | 
| 65 |  | 
|  | |
| 46 | 
             
                return mean_acc, data["config"]["model_args"]
         | 
| 47 |  | 
| 48 |  | 
| 49 | 
            +
            COLS = ["base_model", "revision", "8bit", "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️"]
         | 
| 50 | 
            +
            TYPES = ["markdown","str", "bool", "number", "number", "number", "number", "number", ]
         | 
| 51 |  | 
| 52 | 
             
            EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
         | 
| 53 | 
             
            EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
         | 
|  | |
| 59 | 
             
                all_data = get_eval_results_dicts()
         | 
| 60 | 
             
                dataframe = pd.DataFrame.from_records(all_data)
         | 
| 61 | 
             
                dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)
         | 
| 62 | 
            +
                print(dataframe)
         | 
| 63 | 
             
                dataframe = dataframe[COLS]
         | 
| 64 | 
             
                return dataframe
         | 
| 65 |  | 
    	
        utils.py
    CHANGED
    
    | @@ -50,6 +50,7 @@ class EvalResult: | |
| 50 | 
             
                eval_name : str
         | 
| 51 | 
             
                org : str
         | 
| 52 | 
             
                model : str
         | 
|  | |
| 53 | 
             
                is_8bit : bool
         | 
| 54 | 
             
                results : dict
         | 
| 55 |  | 
| @@ -60,8 +61,11 @@ class EvalResult: | |
| 60 | 
             
                    else:
         | 
| 61 | 
             
                        base_model =f"{self.model}"
         | 
| 62 | 
             
                    data_dict = {}
         | 
|  | |
| 63 | 
             
                    data_dict["eval_name"] = self.eval_name
         | 
|  | |
| 64 | 
             
                    data_dict["base_model"] = make_clickable_model(base_model)
         | 
|  | |
| 65 | 
             
                    data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
         | 
| 66 | 
             
                    data_dict["# params"] = get_n_params(base_model)
         | 
| 67 |  | 
| @@ -83,21 +87,22 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]: | |
| 83 |  | 
| 84 | 
             
                path_split = json_filepath.split("/")
         | 
| 85 | 
             
                org = None
         | 
| 86 | 
            -
                model = path_split[- | 
| 87 | 
             
                is_8bit = path_split[-2] == "8bit"
         | 
| 88 | 
            -
                 | 
|  | |
| 89 | 
             
                    # handles gpt2 type models that don't have an org
         | 
| 90 | 
            -
                    result_key = f"{path_split[-3]}_{path_split[-2]}"
         | 
| 91 | 
            -
                else:
         | 
| 92 | 
             
                    result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
         | 
| 93 | 
            -
             | 
|  | |
|  | |
| 94 |  | 
| 95 | 
             
                eval_result = None
         | 
| 96 | 
             
                for benchmark, metric  in zip(BENCHMARKS, METRICS):
         | 
| 97 | 
             
                    if benchmark in json_filepath:
         | 
| 98 | 
             
                        accs = np.array([v[metric] for k, v in data["results"].items()])
         | 
| 99 | 
             
                        mean_acc = round(np.mean(accs),3)
         | 
| 100 | 
            -
                        eval_result = EvalResult(result_key, org, model, is_8bit, {benchmark:mean_acc})
         | 
| 101 |  | 
| 102 | 
             
                return result_key, eval_result
         | 
| 103 |  | 
|  | |
| 50 | 
             
                eval_name : str
         | 
| 51 | 
             
                org : str
         | 
| 52 | 
             
                model : str
         | 
| 53 | 
            +
                revision : str
         | 
| 54 | 
             
                is_8bit : bool
         | 
| 55 | 
             
                results : dict
         | 
| 56 |  | 
|  | |
| 61 | 
             
                    else:
         | 
| 62 | 
             
                        base_model =f"{self.model}"
         | 
| 63 | 
             
                    data_dict = {}
         | 
| 64 | 
            +
                    
         | 
| 65 | 
             
                    data_dict["eval_name"] = self.eval_name
         | 
| 66 | 
            +
                    data_dict["8bit"] = self.is_8bit
         | 
| 67 | 
             
                    data_dict["base_model"] = make_clickable_model(base_model)
         | 
| 68 | 
            +
                    data_dict["revision"] = self.revision
         | 
| 69 | 
             
                    data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
         | 
| 70 | 
             
                    data_dict["# params"] = get_n_params(base_model)
         | 
| 71 |  | 
|  | |
| 87 |  | 
| 88 | 
             
                path_split = json_filepath.split("/")
         | 
| 89 | 
             
                org = None
         | 
| 90 | 
            +
                model = path_split[-4]
         | 
| 91 | 
             
                is_8bit = path_split[-2] == "8bit"
         | 
| 92 | 
            +
                revision = path_split[-3]
         | 
| 93 | 
            +
                if len(path_split)== 6:
         | 
| 94 | 
             
                    # handles gpt2 type models that don't have an org
         | 
|  | |
|  | |
| 95 | 
             
                    result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
         | 
| 96 | 
            +
                else:
         | 
| 97 | 
            +
                    result_key = f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
         | 
| 98 | 
            +
                    org = path_split[-5]
         | 
| 99 |  | 
| 100 | 
             
                eval_result = None
         | 
| 101 | 
             
                for benchmark, metric  in zip(BENCHMARKS, METRICS):
         | 
| 102 | 
             
                    if benchmark in json_filepath:
         | 
| 103 | 
             
                        accs = np.array([v[metric] for k, v in data["results"].items()])
         | 
| 104 | 
             
                        mean_acc = round(np.mean(accs),3)
         | 
| 105 | 
            +
                        eval_result = EvalResult(result_key, org, model, revision, is_8bit, {benchmark:mean_acc})
         | 
| 106 |  | 
| 107 | 
             
                return result_key, eval_result
         | 
| 108 |  | 
 
			
