Clémentine committed

Commit beb2b32 · Parent(s): aa85eec

update v2
Files changed:
- README.md +4 -2
- app.py +404 -25
- app_bkp.py +0 -316
- gif.gif +0 -3
- pyproject.toml +4 -2
- requirements.txt +7 -2
- src/display/about.py +85 -2
- src/display/css_html_js.py +39 -15
- src/display/formatting.py +1 -1
- src/display/utils.py +67 -50
- src/envs.py +6 -4
- src/leaderboard/filter_models.py +8 -118
- src/populate.py +4 -3
- src/submission/submit.py +2 -0
- src/tools/plots.py +1 -1
- src/voting/vote_system.py +151 -0
    	
README.md CHANGED
@@ -1,15 +1,17 @@
 ---
-title: Open LLM Leaderboard
+title: Open LLM Leaderboard 2
 emoji: 🏆
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.36.1
 app_file: app.py
 pinned: true
 license: apache-2.0
+duplicated_from: open-llm-leaderboard/open_llm_leaderboard
 fullWidth: true
 startup_duration_timeout: 1h
+hf_oauth: true
 space_ci:
   private: true
   secrets:
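
The `hf_oauth: true` flag added above is what enables Hugging Face OAuth for the Space: the app.py changes below add a `gr.LoginButton` and a `check_login(profile: gr.OAuthProfile | None)` helper that relies on Gradio injecting the OAuth profile into annotated parameters. A minimal, self-contained sketch of that pattern (the `greet` function and this standalone demo are illustrative, not part of the commit; attribute names follow Gradio 4.x):

```python
import gradio as gr

def greet(profile: gr.OAuthProfile | None) -> str:
    # Gradio fills in `profile` for parameters annotated with gr.OAuthProfile,
    # provided the Space declares `hf_oauth: true` in its README front matter.
    if profile is None:
        return "Not logged in."
    return f"Logged in as {profile.username}"

with gr.Blocks() as demo:
    gr.LoginButton()        # "Sign in with Hugging Face" button
    status = gr.Markdown()
    demo.load(greet, inputs=None, outputs=status)  # refresh status on page load

demo.launch()
```

Outside a Space, or before the visitor signs in, `profile` is simply `None`, which is the case `check_login` guards against.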
    	
app.py CHANGED
@@ -1,12 +1,17 @@
 import os
 import logging
 import time
+import schedule
 import datetime
 import gradio as gr
+from threading import Thread
 import datasets
 from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
 from src.display.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
@@ -27,6 +32,7 @@ from src.display.utils import (
     Precision,
     WeightType,
     fields,
+    EvalQueueColumn
 )
 from src.envs import (
     API,
@@ -35,35 +41,343 @@ from src.envs import (
     HF_TOKEN,
     QUEUE_REPO,
     REPO_ID,
+    VOTES_REPO,
+    VOTES_PATH,
     HF_HOME,
 )
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
+from src.voting.vote_system import VoteManager, run_scheduler
 
-
-
-
-
+# Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+
+# Start ephemeral Spaces on PRs (see config in README.md)
+from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
+
+# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
+# This controls whether a full initialization should be performed.
+DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
+LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
+LEADERBOARD_DF = None
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+
+
+def time_diff_wrapper(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        diff = end_time - start_time
+        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
+        return result
+
+    return wrapper
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+@time_diff_wrapper
+def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
+    """Download dataset with exponential backoff retries."""
+    attempt = 0
+    while attempt < max_attempts:
+        try:
+            logging.info(f"Downloading {repo_id} to {local_dir}")
+            snapshot_download(
+                repo_id=repo_id,
+                local_dir=local_dir,
+                repo_type=repo_type,
+                tqdm_class=None,
+                etag_timeout=30,
+                max_workers=8,
+            )
+            logging.info("Download successful")
+            return
+        except Exception as e:
+            wait_time = backoff_factor**attempt
+            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
+            time.sleep(wait_time)
+            attempt += 1
+    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
+
+def get_latest_data_leaderboard(leaderboard_initial_df = None):
+    current_time = datetime.datetime.now()
+    global LAST_UPDATE_LEADERBOARD
+    if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
+        return leaderboard_initial_df
+    LAST_UPDATE_LEADERBOARD = current_time
+    leaderboard_dataset = datasets.load_dataset(
+        AGGREGATED_REPO,
+        "default",
+        split="train",
+        cache_dir=HF_HOME,
+        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
+        verification_mode="no_checks"
+    )
+
+    global LEADERBOARD_DF
+    LEADERBOARD_DF = get_leaderboard_df(
+        leaderboard_dataset=leaderboard_dataset,
+        cols=COLS,
+        benchmark_cols=BENCHMARK_COLS,
     )
-
-
+
+    return LEADERBOARD_DF
+
+def get_latest_data_queue():
+    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    return eval_queue_dfs
+
+def init_space():
+    """Initializes the application space, loading only necessary data."""
+    if DO_FULL_INIT:
+        # These downloads only occur on full initialization
+        try:
+            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+            download_dataset(VOTES_REPO, VOTES_PATH)
+        except Exception:
+            restart_space()
+
+    # Always redownload the leaderboard DataFrame
+    global LEADERBOARD_DF
+    LEADERBOARD_DF = get_latest_data_leaderboard()
+
+    # Evaluation queue DataFrame retrieval is independent of initialization detail level
+    eval_queue_dfs = get_latest_data_queue()
+
+    return LEADERBOARD_DF, eval_queue_dfs
+
+# Initialize VoteManager
+vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
+
+
+# Schedule the upload_votes method to run every 15 minutes
+schedule.every(15).minutes.do(vote_manager.upload_votes)
+
+# Start the scheduler in a separate thread
+scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
+scheduler_thread.start()
+
+# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
+# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
+LEADERBOARD_DF, eval_queue_dfs = init_space()
+finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
+
+
+# Data processing for plots now only on demand in the respective Gradio tab
+def load_and_create_plots():
+    plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
+    return plot_df
+
+# Function to check if a user is logged in
+def check_login(profile: gr.OAuthProfile | None) -> bool:
+    if profile is None:
+        return False
+    return True
+
+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            ),
+            ColumnFilter(
+                AutoEvalColumn.merged.name, type="boolean", label="Merge/MoErge", default=True
+            ),
+            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
+            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
+            ColumnFilter(AutoEvalColumn.maintainers_highlight.name, type="boolean", label="Show only maintainer's highlight", default=False),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
+    )
+
+main_block = gr.Blocks(css=custom_css)
+with main_block:
+    with gr.Row(elem_id="header-row"):
+        gr.HTML(TITLE)
+
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+        with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Row():
+                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+                with gr.Row():
+                    with gr.Column():
+                        model_name_textbox = gr.Textbox(label="Model name")
+                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="latest")
+                        with gr.Row():
+                            model_type = gr.Dropdown(
+                                choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                                label="Model type",
+                                multiselect=False,
+                                value=ModelType.FT.to_str(" : "),
+                                interactive=True,
+                            )
+                            chat_template_toggle = gr.Checkbox(
+                                label="Use chat template",
+                                value=False,
+                                info="Is your model a chat model?",
+                            )
+
+                    with gr.Column():
+                        precision = gr.Dropdown(
+                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                            label="Precision",
+                            multiselect=False,
+                            value="float16",
+                            interactive=True,
+                        )
+                        weight_type = gr.Dropdown(
+                            choices=[i.value.name for i in WeightType],
+                            label="Weights type",
+                            multiselect=False,
+                            value="Original",
+                            interactive=True,
+                        )
+                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                        ):
+                            with gr.Row():
+                                finished_eval_table = gr.components.Dataframe(
+                                    value=finished_eval_queue_df,
+                                    headers=EVAL_COLS,
+                                    datatype=EVAL_TYPES,
+                                    row_count=5,
+                                )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                            )
+
+                submit_button = gr.Button("Submit Eval")
+                submission_result = gr.Markdown()
+
+                # The chat template checkbox update function
+                def update_chat_checkbox(model_type_value):
+                    return ModelType.from_str(model_type_value) == ModelType.chat
+
+                model_type.change(
+                    fn=update_chat_checkbox,
+                    inputs=[model_type],  # Pass the current checkbox value
+                    outputs=chat_template_toggle,
+                )
+
+                submit_button.click(
+                    add_new_eval,
+                    [
+                        model_name_textbox,
+                        base_model_name_textbox,
+                        revision_name_textbox,
+                        precision,
+                        weight_type,
+                        model_type,
+                        chat_template_toggle,
+                    ],
+                    submission_result,
+                )
+
+            # Ensure  the values in 'pending_eval_queue_df' are correct and ready for the DataFrame component
+            with gr.TabItem("🆙 Model Vote"):
+                with gr.Row():
+                    gr.Markdown(
+                        "## Vote for the models which should be evaluated first! \nYou'll need to sign in with the button above first. All votes are recorded.",
+                        elem_classes="markdown-text"
+                    )
+                    login_button = gr.LoginButton(elem_id="oauth-button")
+
+
+                with gr.Row():
+                    pending_models = pending_eval_queue_df[EvalQueueColumn.model_name.name].to_list()
+
+                    with gr.Column():
+                        selected_model = gr.Dropdown(
+                            choices=pending_models,
+                            label="Models",
+                            multiselect=False,
+                            value="str",
+                            interactive=True,
+                        )
+
+                        vote_button = gr.Button("Vote", variant="primary")
+
+                with gr.Row():
+                    with gr.Accordion(
+                        f"Available models pending ({len(pending_eval_queue_df)})",
+                        open=True,
+                    ):
+                        with gr.Row():
+                            pending_eval_table_votes = gr.components.Dataframe(
+                                value=vote_manager.create_request_vote_df(
+                                    pending_eval_queue_df
+                                ),
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
+                                interactive=False
+                            )
+
+                # Set the click event for the vote button
+                vote_button.click(
+                    vote_manager.add_vote,
+                    inputs=[selected_model, pending_eval_table],
+                    outputs=[pending_eval_table_votes]
+                )
+
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
@@ -75,4 +389,69 @@ with demo:
                 show_copy_button=True,
             )
 
-
+    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
+    leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+    pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
+
+main_block.queue(default_concurrency_limit=40)
+
+
+def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
+    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
+    # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
+    # ht to Lucain!
+    if SPACE_ID is None:
+        print("Not in a Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    if IS_EPHEMERAL_SPACE:
+        print("In an ephemeral Space: Space CI disabled.")
+        return WebhooksServer(ui=main_block)
+
+    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
+    config = card.data.get("space_ci", {})
+    print(f"Enabling Space CI with config from README: {config}")
+
+    return configure_space_ci(
+        blocks=ui,
+        trusted_authors=config.get("trusted_authors"),
+        private=config.get("private", "auto"),
+        variables=config.get("variables", "auto"),
+        secrets=config.get("secrets"),
+        hardware=config.get("hardware"),
+        storage=config.get("storage"),
+    )
+
+# Create webhooks server (with CI url if in Space and not ephemeral)
+webhooks_server = enable_space_ci_and_return_server(ui=main_block)
+
+# Add webhooks
+@webhooks_server.add_webhook
+def update_leaderboard(payload: WebhookPayload) -> None:
+    """Redownloads the leaderboard dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+            verification_mode="no_checks"
+        )
+
+# The below code is not used at the moment, as we can manage the queue file locally
+LAST_UPDATE_QUEUE = datetime.datetime.now()
+@webhooks_server.add_webhook
+def update_queue(payload: WebhookPayload) -> None:
+    """Redownloads the queue dataset each time it updates"""
+    if payload.repo.type == "dataset" and payload.event.action == "update":
+        current_time = datetime.datetime.now()
+        global LAST_UPDATE_QUEUE
+        if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
+            print("Would have updated the queue")
+            # We only redownload is last update was more than 10 minutes ago, as the queue is
+            # updated regularly and heavy to download
+            #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+            LAST_UPDATE_QUEUE = datetime.datetime.now()
+
+webhooks_server.launch()
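
The vote-upload wiring above pairs the `schedule` package with a daemon thread: `schedule.every(15).minutes.do(vote_manager.upload_votes)` registers the job, and `run_scheduler` (imported from `src/voting/vote_system.py`, added by this commit but not shown on this page) presumably drains pending jobs in a loop. A minimal sketch of that pattern, with a hypothetical `run_scheduler` body and a stubbed `VoteManager`, could look like this:

```python
import time
from threading import Thread

import schedule

class VoteManager:
    """Stand-in for src.voting.vote_system.VoteManager (simplified for illustration)."""
    def upload_votes(self):
        print("Uploading accumulated votes...")

def run_scheduler(vote_manager: VoteManager, interval: float = 1.0):
    # Hypothetical implementation: the real one lives in src/voting/vote_system.py.
    # `vote_manager` mirrors the argument passed in app.py; the loop simply keeps
    # executing whatever jobs were registered with `schedule`.
    while True:
        schedule.run_pending()
        time.sleep(interval)

vote_manager = VoteManager()
schedule.every(15).minutes.do(vote_manager.upload_votes)  # same registration as in app.py

# daemon=True: the thread dies with the main process instead of blocking shutdown.
scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
scheduler_thread.start()

time.sleep(2)  # keep this toy script alive briefly so the thread gets a chance to run
```

The trade-off of `daemon=True` is that a job running at shutdown can be cut short, which is presumably acceptable here since votes are kept locally at VOTES_PATH between the 15-minute uploads.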
    	
app_bkp.py DELETED
@@ -1,316 +0,0 @@
-import os
-import logging
-import time
-import datetime
-import gradio as gr
-import datasets
-from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-
-from src.display.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    FAQ_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    AutoEvalColumn,
-    ModelType,
-    Precision,
-    WeightType,
-    fields,
-)
-from src.envs import (
-    API,
-    EVAL_REQUESTS_PATH,
-    AGGREGATED_REPO,
-    HF_TOKEN,
-    QUEUE_REPO,
-    REPO_ID,
-    HF_HOME,
-)
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
-
-
-# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
-# This controls whether a full initialization should be performed.
-DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
-LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
-
-def restart_space():
-    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
-
-
-def time_diff_wrapper(func):
-    def wrapper(*args, **kwargs):
-        start_time = time.time()
-        result = func(*args, **kwargs)
-        end_time = time.time()
-        diff = end_time - start_time
-        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
-        return result
-
-    return wrapper
-
-
-@time_diff_wrapper
-def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
-    """Download dataset with exponential backoff retries."""
-    attempt = 0
-    while attempt < max_attempts:
-        try:
-            logging.info(f"Downloading {repo_id} to {local_dir}")
-            snapshot_download(
-                repo_id=repo_id,
-                local_dir=local_dir,
-                repo_type=repo_type,
-                tqdm_class=None,
-                etag_timeout=30,
-                max_workers=8,
-            )
-            logging.info("Download successful")
-            return
-        except Exception as e:
-            wait_time = backoff_factor**attempt
-            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
-            time.sleep(wait_time)
-            attempt += 1
-    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
-
-def get_latest_data_leaderboard(leaderboard_initial_df = None):
-    current_time = datetime.datetime.now()
-    global LAST_UPDATE_LEADERBOARD
-    if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
-        return leaderboard_initial_df
-    LAST_UPDATE_LEADERBOARD = current_time
-    leaderboard_dataset = datasets.load_dataset(
-        AGGREGATED_REPO,
-        "default",
-        split="train",
-        cache_dir=HF_HOME,
-        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
-        verification_mode="no_checks"
-    )
-
-    leaderboard_df = get_leaderboard_df(
-        leaderboard_dataset=leaderboard_dataset,
-        cols=COLS,
-        benchmark_cols=BENCHMARK_COLS,
-    )
-
-    return leaderboard_df
-
-def get_latest_data_queue():
-    eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-    return eval_queue_dfs
-
-def init_space():
-    """Initializes the application space, loading only necessary data."""
-    if DO_FULL_INIT:
-        # These downloads only occur on full initialization
-        try:
-            download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        except Exception:
-            restart_space()
-
-    # Always redownload the leaderboard DataFrame
-    leaderboard_df = get_latest_data_leaderboard()
-
-    # Evaluation queue DataFrame retrieval is independent of initialization detail level
-    eval_queue_dfs = get_latest_data_queue()
-
-    return leaderboard_df, eval_queue_dfs
-
-
-# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
-# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
-leaderboard_df, eval_queue_dfs = init_space()
-finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
-
-
-# Data processing for plots now only on demand in the respective Gradio tab
-def load_and_create_plots():
-    plot_df = create_plot_df(create_scores_df(leaderboard_df))
-    return plot_df
-
-def init_leaderboard(dataframe):
-    return Leaderboard(
-        value = dataframe,
-        datatype=[c.type for c in fields(AutoEvalColumn)],
-        select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
-            label="Select Columns to Display:",
-        ),
-        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
-        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
-        filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
-            ),
-            ColumnFilter(
-                AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
-            ),
-            ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
-            ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
-        ],
-        bool_checkboxgroup_label="Hide models",
                    interactive=False,
         | 
| 181 | 
            -
                )
         | 
| 182 | 
            -
             | 
| 183 | 
            -
            demo = gr.Blocks(css=custom_css)
         | 
| 184 | 
            -
            with demo:
         | 
| 185 | 
            -
                gr.HTML(TITLE)
         | 
| 186 | 
            -
                gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
         | 
| 187 | 
            -
             | 
| 188 | 
            -
                with gr.Tabs(elem_classes="tab-buttons") as tabs:
         | 
| 189 | 
            -
                    with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
         | 
| 190 | 
            -
                        leaderboard = init_leaderboard(leaderboard_df)
         | 
| 191 | 
            -
             | 
| 192 | 
            -
                    with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
         | 
| 193 | 
            -
                        with gr.Row():
         | 
| 194 | 
            -
                            with gr.Column():
         | 
| 195 | 
            -
                                plot_df = load_and_create_plots()
         | 
| 196 | 
            -
                                chart = create_metric_plot_obj(
         | 
| 197 | 
            -
                                    plot_df,
         | 
| 198 | 
            -
                                    [AutoEvalColumn.average.name],
         | 
| 199 | 
            -
                                    title="Average of Top Scores and Human Baseline Over Time (from last update)",
         | 
| 200 | 
            -
                                )
         | 
| 201 | 
            -
                                gr.Plot(value=chart, min_width=500)
         | 
| 202 | 
            -
                            with gr.Column():
         | 
| 203 | 
            -
                                plot_df = load_and_create_plots()
         | 
| 204 | 
            -
                                chart = create_metric_plot_obj(
         | 
| 205 | 
            -
                                    plot_df,
         | 
| 206 | 
            -
                                    BENCHMARK_COLS,
         | 
| 207 | 
            -
                                    title="Top Scores and Human Baseline Over Time (from last update)",
         | 
| 208 | 
            -
                                )
         | 
| 209 | 
            -
                                gr.Plot(value=chart, min_width=500)
         | 
| 210 | 
            -
             | 
| 211 | 
            -
                    with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
         | 
| 212 | 
            -
                        gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
         | 
| 213 | 
            -
             | 
| 214 | 
            -
                    with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
         | 
| 215 | 
            -
                        gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
         | 
| 216 | 
            -
             | 
| 217 | 
            -
                    with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
         | 
| 218 | 
            -
                        countdown = gr.HTML(
         | 
| 219 | 
            -
                            """<div align="center">
         | 
| 220 | 
            -
                            <div position: relative>
         | 
| 221 | 
            -
                            <img 
         | 
| 222 | 
            -
                                src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif" 
         | 
| 223 | 
            -
                                allowtransparency="true" 
         | 
| 224 | 
            -
                                style="display:block;width:100%;height:auto;" 
         | 
| 225 | 
            -
                            />
         | 
| 226 | 
            -
                            <iframe 
         | 
| 227 | 
            -
                                src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&timezone=Europe%2FParis&width=&style=circles&uid=815898&loc=https://logwork.com/countdown-fxmc&language=en&textcolor=&background=%23ffd21e&date=2024-06-26%2015%3A00%3A00&digitscolor=%23ff9d00&unitscolor=&" 
         | 
| 228 | 
            -
                                style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;" 
         | 
| 229 | 
            -
                                scrolling="no" 
         | 
| 230 | 
            -
                                allowtransparency="true" 
         | 
| 231 | 
            -
                                frameborder="0"
         | 
| 232 | 
            -
                                allowfullscreen
         | 
| 233 | 
            -
                            />
         | 
| 234 | 
            -
                            </div>
         | 
| 235 | 
            -
                            </div>"""
         | 
| 236 | 
            -
                        )
         | 
| 237 | 
            -
                        #gif = gr.Image(value="./gif.gif", interactive=False)
         | 
| 238 | 
            -
                        gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
         | 
| 239 | 
            -
             | 
| 240 | 
            -
                with gr.Row():
         | 
| 241 | 
            -
                    with gr.Accordion("📙 Citation", open=False):
         | 
| 242 | 
            -
                        citation_button = gr.Textbox(
         | 
| 243 | 
            -
                            value=CITATION_BUTTON_TEXT,
         | 
| 244 | 
            -
                            label=CITATION_BUTTON_LABEL,
         | 
| 245 | 
            -
                            lines=20,
         | 
| 246 | 
            -
                            elem_id="citation-button",
         | 
| 247 | 
            -
                            show_copy_button=True,
         | 
| 248 | 
            -
                        )
         | 
| 249 | 
            -
             | 
| 250 | 
            -
                demo.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
         | 
| 251 | 
            -
                
         | 
| 252 | 
            -
             | 
| 253 | 
            -
            demo.queue(default_concurrency_limit=40)
         | 
| 254 | 
            -
             | 
| 255 | 
            -
            # Start ephemeral Spaces on PRs (see config in README.md)
         | 
| 256 | 
            -
            from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
         | 
| 257 | 
            -
             | 
| 258 | 
            -
            def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
         | 
| 259 | 
            -
                # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
         | 
| 260 | 
            -
                # Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
         | 
| 261 | 
            -
                # ht to Lucain!
         | 
| 262 | 
            -
                if SPACE_ID is None:
         | 
| 263 | 
            -
                    print("Not in a Space: Space CI disabled.")
         | 
| 264 | 
            -
                    return WebhooksServer(ui=demo)
         | 
| 265 | 
            -
             | 
| 266 | 
            -
                if IS_EPHEMERAL_SPACE:
         | 
| 267 | 
            -
                    print("In an ephemeral Space: Space CI disabled.")
         | 
| 268 | 
            -
                    return WebhooksServer(ui=demo)
         | 
| 269 | 
            -
             | 
| 270 | 
            -
                card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
         | 
| 271 | 
            -
                config = card.data.get("space_ci", {})
         | 
| 272 | 
            -
                print(f"Enabling Space CI with config from README: {config}")
         | 
| 273 | 
            -
             | 
| 274 | 
            -
                return configure_space_ci(
         | 
| 275 | 
            -
                    blocks=ui,
         | 
| 276 | 
            -
                    trusted_authors=config.get("trusted_authors"),
         | 
| 277 | 
            -
                    private=config.get("private", "auto"),
         | 
| 278 | 
            -
                    variables=config.get("variables", "auto"),
         | 
| 279 | 
            -
                    secrets=config.get("secrets"),
         | 
| 280 | 
            -
                    hardware=config.get("hardware"),
         | 
| 281 | 
            -
                    storage=config.get("storage"),
         | 
| 282 | 
            -
                )
         | 
| 283 | 
            -
             | 
| 284 | 
            -
            # Create webhooks server (with CI url if in Space and not ephemeral)
         | 
| 285 | 
            -
            webhooks_server = enable_space_ci_and_return_server(ui=demo)
         | 
| 286 | 
            -
             | 
| 287 | 
            -
            # Add webhooks
         | 
| 288 | 
            -
            @webhooks_server.add_webhook
         | 
| 289 | 
            -
            def update_leaderboard(payload: WebhookPayload) -> None:
         | 
| 290 | 
            -
                """Redownloads the leaderboard dataset each time it updates"""
         | 
| 291 | 
            -
                if payload.repo.type == "dataset" and payload.event.action == "update":
         | 
| 292 | 
            -
                    datasets.load_dataset(
         | 
| 293 | 
            -
                        AGGREGATED_REPO, 
         | 
| 294 | 
            -
                        "default", 
         | 
| 295 | 
            -
                        split="train", 
         | 
| 296 | 
            -
                        cache_dir=HF_HOME, 
         | 
| 297 | 
            -
                        download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD, 
         | 
| 298 | 
            -
                        verification_mode="no_checks"
         | 
| 299 | 
            -
                    )
         | 
| 300 | 
            -
             | 
| 301 | 
            -
            # The below code is not used at the moment, as we can manage the queue file locally
         | 
| 302 | 
            -
            LAST_UPDATE_QUEUE = datetime.datetime.now()
         | 
| 303 | 
            -
            @webhooks_server.add_webhook    
         | 
| 304 | 
            -
            def update_queue(payload: WebhookPayload) -> None:
         | 
| 305 | 
            -
                """Redownloads the queue dataset each time it updates"""
         | 
| 306 | 
            -
                if payload.repo.type == "dataset" and payload.event.action == "update":
         | 
| 307 | 
            -
                    current_time = datetime.datetime.now()
         | 
| 308 | 
            -
                    global LAST_UPDATE_QUEUE
         | 
| 309 | 
            -
                    if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
         | 
| 310 | 
            -
                        print("Would have updated the queue")
         | 
| 311 | 
            -
                        # We only redownload is last update was more than 10 minutes ago, as the queue is 
         | 
| 312 | 
            -
                        # updated regularly and heavy to download
         | 
| 313 | 
            -
                        #download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
         | 
| 314 | 
            -
                        LAST_UPDATE_QUEUE = datetime.datetime.now()
         | 
| 315 | 
            -
             | 
| 316 | 
            -
            webhooks_server.launch()
         | 
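Note for readers skimming the diff: `update_queue` above is a time-based debounce around an expensive download. A minimal, self-contained sketch of the same pattern (the 10-minute window comes from the code above; `maybe_refresh` and `refresh_fn` are illustrative names, not from the repo):

import datetime

LAST_UPDATE = datetime.datetime.now()
DEBOUNCE_WINDOW = datetime.timedelta(minutes=10)

def maybe_refresh(refresh_fn) -> bool:
    """Run `refresh_fn` only if the previous refresh is old enough."""
    global LAST_UPDATE
    now = datetime.datetime.now()
    if now - LAST_UPDATE > DEBOUNCE_WINDOW:
        refresh_fn()  # e.g. download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
        LAST_UPDATE = now  # the window restarts only after a real refresh
        return True
    return False  # called again too soon: skip the heavy download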
gif.gif DELETED
(binary file, stored with Git LFS)

pyproject.toml CHANGED

@@ -38,16 +38,18 @@ numpy = "1.26.0"
 pandas = "2.2.2"
 plotly = "5.14.1"
 python-dateutil = "2.8.2"
-requests = "2.28.2"
 sentencepiece = "^0.2.0"
 tqdm = "4.65.0"
 transformers = "4.41.1"
 tokenizers = ">=0.15.0"
 gradio-space-ci = {git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci", rev = "0.2.3"}
-gradio = "4.20.0"
 isort = "^5.13.2"
 ruff = "^0.3.5"
 gradio-leaderboard = "0.0.8"
+gradio = {extras = ["oauth"], version = "^4.36.1"}
+requests = "^2.31.0"
+requests-oauthlib = "^1.3.1"
+schedule = "^1.2.2"
 
 [build-system]
 requires = ["poetry-core"]
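The switch to `gradio = {extras = ["oauth"], version = "^4.36.1"}` is what enables Hugging Face login in the Space. A hedged sketch of the standard pattern this unlocks in a Space with `hf_oauth: true` (the greeting demo is invented for illustration; Gradio injects a `gr.OAuthProfile` into any event handler that declares one):

import gradio as gr

def greet(profile: gr.OAuthProfile | None) -> str:
    # `profile` is None when nobody is logged in.
    if profile is None:
        return "Please log in with your Hugging Face account."
    return f"Logged in as {profile.username}."

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    demo.load(greet, inputs=None, outputs=status)

demo.launch()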
    	
requirements.txt CHANGED

@@ -8,11 +8,16 @@ numpy==1.26.0
 pandas==2.2.2
 plotly==5.14.1
 python-dateutil==2.8.2
-requests==2.28.2
 sentencepiece
 tqdm==4.65.0
 transformers==4.41.1
 tokenizers>=0.15.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
-
+isort
+ruff
+gradio==4.31.0
+gradio[oauth]
 gradio_leaderboard==0.0.9
+requests==2.31.0
+requests-oauthlib==1.3.1
+schedule==1.2.2
    	
src/display/about.py CHANGED

@@ -219,6 +219,89 @@ CITATION_BUTTON_TEXT = r"""
   publisher = {Hugging Face},
   howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
 }
-
-
+@software{eval-harness,
+  author       = {Gao, Leo and
+                  Tow, Jonathan and
+                  Biderman, Stella and
+                  Black, Sid and
+                  DiPofi, Anthony and
+                  Foster, Charles and
+                  Golding, Laurence and
+                  Hsu, Jeffrey and
+                  McDonell, Kyle and
+                  Muennighoff, Niklas and
+                  Phang, Jason and
+                  Reynolds, Laria and
+                  Tang, Eric and
+                  Thite, Anish and
+                  Wang, Ben and
+                  Wang, Kevin and
+                  Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = sep,
+  year         = 2021,
+  publisher    = {Zenodo},
+  version      = {v0.0.1},
+  doi          = {10.5281/zenodo.5371628},
+  url          = {https://doi.org/10.5281/zenodo.5371628}
+}
+@misc{clark2018think,
+      title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
+      author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
+      year={2018},
+      eprint={1803.05457},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+@misc{zellers2019hellaswag,
+      title={HellaSwag: Can a Machine Really Finish Your Sentence?},
+      author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
+      year={2019},
+      eprint={1905.07830},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{hendrycks2021measuring,
+      title={Measuring Massive Multitask Language Understanding},
+      author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+      year={2021},
+      eprint={2009.03300},
+      archivePrefix={arXiv},
+      primaryClass={cs.CY}
+}
+@misc{lin2022truthfulqa,
+      title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
+      author={Stephanie Lin and Jacob Hilton and Owain Evans},
+      year={2022},
+      eprint={2109.07958},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+      title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+      author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+      year={2019},
+      eprint={1907.10641},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+      title={Training Verifiers to Solve Math Word Problems},
+      author={Karl Cobbe and
+              Vineet Kosaraju and
+              Mohammad Bavarian and
+              Mark Chen and
+              Heewoo Jun and
+              Lukasz Kaiser and
+              Matthias Plappert and
+              Jerry Tworek and
+              Jacob Hilton and
+              Reiichiro Nakano and
+              Christopher Hesse and
+              John Schulman},
+      year={2021},
+      eprint={2110.14168},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
 """
    	
src/display/css_html_js.py CHANGED

@@ -9,7 +9,7 @@ table th:first-child {
 
 /* Full width space */
 .gradio-container {
-    max-width: 95%!important;
+    max-width: 95% !important;
 }
 
 /* Text style and margins */
@@ -48,7 +48,7 @@ table th:first-child {
 }
 
 /* Filters style */
-#filter_type{
+#filter_type {
     border: 0;
     padding-left: 0;
    padding-top: 0;
@@ -56,29 +56,53 @@ table th:first-child {
 #filter_type label {
     display: flex;
 }
-#filter_type label > span{
+#filter_type label > span {
     margin-top: var(--spacing-lg);
     margin-right: 0.5em;
 }
-#filter_type label > .wrap{
+#filter_type label > .wrap {
     width: 103px;
 }
-#filter_type label > .wrap .wrap-inner{
+#filter_type label > .wrap .wrap-inner {
     padding: 2px;
 }
-#filter_type label > .wrap .wrap-inner input{
-    width: 1px
+#filter_type label > .wrap .wrap-inner input {
+    width: 1px;
 }
-#filter-columns-type{
-    border:0;
-    padding:0.5;
+#filter-columns-type {
+    border: 0;
+    padding: 0.5;
+}
+#filter-columns-size {
+    border: 0;
+    padding: 0.5;
 }
-#filter-columns-size{
-    border:0;
-    padding:0.5;
+#box-filter > .form {
+    border: 0;
 }
-#box-filter > .form{
-    border:0;
+
+/* Header styles */
+#header-title {
+    text-align: left;
+    display: inline-block;
+}
+
+#header-row {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+}
+
+#header-row .gradio-html {
+    flex-grow: 1;
+}
+
+#oauth-button {
+    height: auto;
+    min-width: max-content;
+    white-space: nowrap;
+    padding: 10px 20px;
+    border-radius: 4px;
 }
 """
 
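The new `#header-row`, `#header-title`, and `#oauth-button` rules only take effect on components that carry those elem_ids. A minimal sketch of the kind of layout they target (the exact markup the app uses is an assumption here, not taken from the commit):

import gradio as gr
from src.display.css_html_js import custom_css  # the stylesheet edited above

with gr.Blocks(css=custom_css) as demo:
    with gr.Row(elem_id="header-row"):
        gr.HTML("<h1>Open LLM Leaderboard</h1>", elem_id="header-title")
        gr.LoginButton(elem_id="oauth-button")  # styled by the #oauth-button rule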
    	
src/display/formatting.py CHANGED

@@ -11,7 +11,7 @@ def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"
 
     details_model_name = model_name.replace("/", "__")
-    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
+    details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"
 
     return model_hyperlink(link, model_name) + "  " + model_hyperlink(details_link, "📑")
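The effect of the one-line change is easiest to see on a concrete model id (the model name below is just an example):

model_name = "meta-llama/Meta-Llama-3-8B"
details_model_name = model_name.replace("/", "__")
details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/{details_model_name}-details"
print(details_link)
# https://huggingface.co/datasets/open-llm-leaderboard/meta-llama__Meta-Llama-3-8B-details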
    	
        src/display/utils.py
    CHANGED
    
    | @@ -49,12 +49,23 @@ class Task: | |
| 49 |  | 
| 50 |  | 
| 51 | 
             
            class Tasks(Enum):
         | 
| 52 | 
            -
                 | 
| 53 | 
            -
                 | 
| 54 | 
            -
             | 
| 55 | 
            -
                 | 
| 56 | 
            -
                 | 
| 57 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 58 |  | 
| 59 |  | 
| 60 | 
             
            # These classes are for user facing column names,
         | 
| @@ -77,7 +88,8 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma | |
| 77 | 
             
            # Scores
         | 
| 78 | 
             
            auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
         | 
| 79 | 
             
            for task in Tasks:
         | 
| 80 | 
            -
                 | 
|  | |
| 81 | 
             
            # Model information
         | 
| 82 | 
             
            auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
         | 
| 83 | 
             
            auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
         | 
| @@ -94,7 +106,10 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh | |
| 94 | 
             
            auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
         | 
| 95 | 
             
            auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
         | 
| 96 | 
             
            auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
         | 
| 97 | 
            -
             | 
|  | |
|  | |
|  | |
| 98 | 
             
            auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
         | 
| 99 |  | 
| 100 | 
             
            # We use make dataclass to dynamically fill the scores from Tasks
         | 
| @@ -103,30 +118,31 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen= | |
| 103 |  | 
| 104 | 
             
            @dataclass(frozen=True)
         | 
| 105 | 
             
            class EvalQueueColumn:  # Queue column
         | 
| 106 | 
            -
                 | 
|  | |
| 107 | 
             
                revision = ColumnContent("revision", "str", True)
         | 
| 108 | 
            -
                private = ColumnContent("private", "bool", True)
         | 
| 109 | 
             
                precision = ColumnContent("precision", "str", True)
         | 
| 110 | 
            -
                weight_type = ColumnContent("weight_type", "str", "Original")
         | 
| 111 | 
             
                status = ColumnContent("status", "str", True)
         | 
| 112 |  | 
| 113 |  | 
| 114 | 
            -
            baseline_row = {
         | 
| 115 | 
            -
             | 
| 116 | 
            -
             | 
| 117 | 
            -
             | 
| 118 | 
            -
             | 
| 119 | 
            -
             | 
| 120 | 
            -
             | 
| 121 | 
            -
             | 
| 122 | 
            -
             | 
| 123 | 
            -
             | 
| 124 | 
            -
             | 
| 125 | 
            -
             | 
| 126 | 
            -
             | 
| 127 | 
            -
             | 
| 128 | 
            -
             | 
| 129 | 
            -
            }
         | 
| 130 |  | 
| 131 | 
             
            # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
         | 
| 132 | 
             
            # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
         | 
| @@ -136,22 +152,22 @@ baseline_row = { | |
| 136 | 
             
            # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
         | 
| 137 | 
             
            # GSM8K: paper
         | 
| 138 | 
             
            # Define the human baselines
         | 
| 139 | 
            -
            human_baseline_row = {
         | 
| 140 | 
            -
             | 
| 141 | 
            -
             | 
| 142 | 
            -
             | 
| 143 | 
            -
             | 
| 144 | 
            -
             | 
| 145 | 
            -
             | 
| 146 | 
            -
             | 
| 147 | 
            -
             | 
| 148 | 
            -
             | 
| 149 | 
            -
             | 
| 150 | 
            -
             | 
| 151 | 
            -
             | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| 154 | 
            -
            }
         | 
| 155 |  | 
| 156 |  | 
| 157 | 
             
            @dataclass
         | 
| @@ -166,22 +182,22 @@ class ModelType(Enum): | |
| 166 | 
             
                FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
         | 
| 167 | 
             
                chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
         | 
| 168 | 
             
                merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
         | 
| 169 | 
            -
                Unknown = ModelDetails(name="", symbol=" | 
| 170 |  | 
| 171 | 
             
                def to_str(self, separator=" "):
         | 
| 172 | 
             
                    return f"{self.value.symbol}{separator}{self.value.name}"
         | 
| 173 |  | 
| 174 | 
             
                @staticmethod
         | 
| 175 | 
            -
                def from_str( | 
| 176 | 
            -
                    if  | 
| 177 | 
             
                        return ModelType.FT
         | 
| 178 | 
            -
                    if " | 
| 179 | 
             
                        return ModelType.CPT
         | 
| 180 | 
            -
                    if "pretrained" in  | 
| 181 | 
             
                        return ModelType.PT
         | 
| 182 | 
            -
                    if any([k in  | 
| 183 | 
             
                        return ModelType.chat
         | 
| 184 | 
            -
                    if "merge" in  | 
| 185 | 
             
                        return ModelType.merges
         | 
| 186 | 
             
                    return ModelType.Unknown
         | 
| 187 |  | 
| @@ -200,6 +216,7 @@ class Precision(Enum): | |
| 200 | 
             
                qt_GPTQ = ModelDetails("GPTQ")
         | 
| 201 | 
             
                Unknown = ModelDetails("?")
         | 
| 202 |  | 
|  | |
| 203 | 
             
                def from_str(precision):
         | 
| 204 | 
             
                    if precision in ["torch.float16", "float16"]:
         | 
| 205 | 
             
                        return Precision.float16
         | 
|  | |
| 49 |  | 
| 50 |  | 
| 51 | 
             
            class Tasks(Enum):
         | 
| 52 | 
            +
                ifeval = Task("leaderboard_ifeval", "strict_acc,none", "IFEval")
         | 
| 53 | 
            +
                ifeval_raw = Task("leaderboard_ifeval", "strict_acc,none", "IFEval Raw")
         | 
| 54 | 
            +
             | 
| 55 | 
            +
                bbh = Task("leaderboard_bbh", "acc_norm,none", "BBH")
         | 
| 56 | 
            +
                bbh_raw = Task("leaderboard_bbh", "acc_norm,none", "BBH Raw")
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                math = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5")
         | 
| 59 | 
            +
                math_raw = Task("leaderboard_math_hard", "exact_match,none", "MATH Lvl 5 Raw")
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                gpqa = Task("leaderboard_gpqa", "acc_norm,none", "GPQA")
         | 
| 62 | 
            +
                gpqa_raw = Task("leaderboard_gpqa", "acc_norm,none", "GPQA Raw")
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                musr = Task("leaderboard_musr", "acc_norm,none", "MUSR")
         | 
| 65 | 
            +
                musr_raw = Task("leaderboard_musr", "acc_norm,none", "MUSR Raw")
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                mmlu_pro = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO")
         | 
| 68 | 
            +
                mmlu_pro_raw = Task("leaderboard_mmlu_pro", "acc,none", "MMLU-PRO Raw")
         | 
| 69 |  | 
| 70 |  | 
| 71 | 
             
            # These classes are for user facing column names,
         | 
|  | |
| 88 | 
             
            # Scores
         | 
| 89 | 
             
            auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
         | 
| 90 | 
             
            for task in Tasks:
         | 
| 91 | 
            +
                displayed_by_default = not task.name.endswith("_raw")
         | 
| 92 | 
            +
                auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", displayed_by_default=displayed_by_default)])
         | 
| 93 | 
             
            # Model information
         | 
| 94 | 
             
            auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
         | 
| 95 | 
             
            auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
         | 
|  | |
| 106 | 
             
            auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
         | 
| 107 | 
             
            auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
         | 
| 108 | 
             
            auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
         | 
| 109 | 
            +
            auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Chat Template", "bool", False)])
         | 
| 110 | 
            +
            auto_eval_column_dict.append(["maintainers_highlight", ColumnContent, ColumnContent("Maintainer's Highlight", "bool", False, hidden=True)])
         | 
| 111 | 
            +
             | 
| 112 | 
            +
            # fullname structure: <user>/<model_name>
         | 
| 113 | 
             
            auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
         | 
| 114 |  | 
| 115 | 
             
            # We use make dataclass to dynamically fill the scores from Tasks
         | 
|  | |
| 118 |  | 
| 119 | 
             
            @dataclass(frozen=True)
         | 
| 120 | 
             
            class EvalQueueColumn:  # Queue column
         | 
| 121 | 
            +
                model_link = ColumnContent("model_link", "markdown", True)
         | 
| 122 | 
            +
                model_name = ColumnContent("model_name", "str", True)
         | 
| 123 | 
             
                revision = ColumnContent("revision", "str", True)
         | 
| 124 | 
            +
                #private = ColumnContent("private", "bool", True)  # Should not be displayed
         | 
| 125 | 
             
                precision = ColumnContent("precision", "str", True)
         | 
| 126 | 
            +
                #weight_type = ColumnContent("weight_type", "str", "Original") # Might be confusing, to think about
         | 
| 127 | 
             
                status = ColumnContent("status", "str", True)
         | 
| 128 |  | 
| 129 |  | 
| 130 | 
            +
            # baseline_row = {
         | 
| 131 | 
            +
            #     AutoEvalColumn.model.name: "<p>Baseline</p>",
         | 
| 132 | 
            +
            #     AutoEvalColumn.revision.name: "N/A",
         | 
| 133 | 
            +
            #     AutoEvalColumn.precision.name: None,
         | 
| 134 | 
            +
            #     AutoEvalColumn.merged.name: False,
         | 
| 135 | 
            +
            #     AutoEvalColumn.average.name: 31.0,
         | 
| 136 | 
            +
            #     AutoEvalColumn.arc.name: 25.0,
         | 
| 137 | 
            +
            #     AutoEvalColumn.hellaswag.name: 25.0,
         | 
| 138 | 
            +
            #     AutoEvalColumn.mmlu.name: 25.0,
         | 
| 139 | 
            +
            #     AutoEvalColumn.truthfulqa.name: 25.0,
         | 
| 140 | 
            +
            #     AutoEvalColumn.winogrande.name: 50.0,
         | 
| 141 | 
            +
            #     AutoEvalColumn.gsm8k.name: 0.21,
         | 
| 142 | 
            +
            #     AutoEvalColumn.fullname.name: "baseline",
         | 
| 143 | 
            +
            #     AutoEvalColumn.model_type.name: "",
         | 
| 144 | 
            +
            #     AutoEvalColumn.not_flagged.name: False,
         | 
| 145 | 
            +
            # }
         | 
| 146 |  | 
| 147 | 
             
            # Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
         | 
| 148 | 
             
            # ARC human baseline is 0.80 (source: https://lab42.global/arc/)
         | 
|  | |
| 152 | 
             
            # Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
         | 
| 153 | 
             
            # GSM8K: paper
         | 
| 154 | 
             
            # Define the human baselines
         | 
| 155 | 
            +
            # human_baseline_row = {
         | 
| 156 | 
            +
            #     AutoEvalColumn.model.name: "<p>Human performance</p>",
         | 
| 157 | 
            +
+#     AutoEvalColumn.revision.name: "N/A",
+#     AutoEvalColumn.precision.name: None,
+#     AutoEvalColumn.average.name: 92.75,
+#     AutoEvalColumn.merged.name: False,
+#     AutoEvalColumn.arc.name: 80.0,
+#     AutoEvalColumn.hellaswag.name: 95.0,
+#     AutoEvalColumn.mmlu.name: 89.8,
+#     AutoEvalColumn.truthfulqa.name: 94.0,
+#     AutoEvalColumn.winogrande.name: 94.0,
+#     AutoEvalColumn.gsm8k.name: 100,
+#     AutoEvalColumn.fullname.name: "human_baseline",
+#     AutoEvalColumn.model_type.name: "",
+#     AutoEvalColumn.not_flagged.name: False,
+# }


 @dataclass

     FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
     merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
+    Unknown = ModelDetails(name="❓ other", symbol="❓")

     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"

     @staticmethod
+    def from_str(m_type):
+        if any([k for k in m_type if k in ["fine-tuned","🔶", "finetuned"]]):
             return ModelType.FT
+        if "continuously pretrained" in m_type or "🟩" in m_type:
             return ModelType.CPT
+        if "pretrained" in m_type or "🟢" in m_type:
             return ModelType.PT
+        if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
             return ModelType.chat
+        if "merge" in m_type or "🤝" in m_type:
             return ModelType.merges
         return ModelType.Unknown

     qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

+    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
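Two details of this dispatch are easy to miss: as written, the first test iterates the characters of `m_type`, so only the single-character "🔶" symbol can actually fire it (no single character equals the string "fine-tuned"), and the "continuously pretrained" test must precede the plain "pretrained" test, since the latter is a substring of the former. A standalone sketch of the same dispatch, simplified to return plain strings and using the presumably intended substring test in the first branch:

def classify(m_type: str) -> str:
    # Order matters: "continuously pretrained" contains "pretrained",
    # so the more specific test has to run first.
    if any(k in m_type for k in ["fine-tuned", "🔶", "finetuned"]):
        return "fine-tuned"
    if "continuously pretrained" in m_type or "🟩" in m_type:
        return "continuously pretrained"
    if "pretrained" in m_type or "🟢" in m_type:
        return "pretrained"
    if any(k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]):
        return "chat"
    if "merge" in m_type or "🤝" in m_type:
        return "merge"
    return "other"

print(classify("💬 chat models (RLHF, DPO, IFT, ...)"))  # chat
print(classify("🟢 pretrained"))                          # pretrained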
    	
src/envs.py
CHANGED
@@ -4,9 +4,10 @@ from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
 HF_TOKEN = os.environ.get("HF_TOKEN", None)

-REPO_ID = "open-llm-leaderboard
-QUEUE_REPO = "open-llm-leaderboard
-AGGREGATED_REPO = "open-llm-leaderboard
+REPO_ID = "open-llm-leaderboard/open_llm_leaderboard_v2"
+QUEUE_REPO = "open-llm-leaderboard/requests"
+AGGREGATED_REPO = "open-llm-leaderboard/contents"
+VOTES_REPO = "open-llm-leaderboard/votes"

 HF_HOME = os.getenv("HF_HOME", ".")
@@ -20,11 +21,12 @@ if not os.access(HF_HOME, os.W_OK):
 else:
     print("Write access confirmed for HF_HOME")

+VOTES_PATH = os.path.join(HF_HOME, "model-votes")
 EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")

 # Rate limit variables
 RATE_LIMIT_PERIOD = 7
 RATE_LIMIT_QUOTA = 5
-HAS_HIGHER_RATE_LIMIT = [
+HAS_HIGHER_RATE_LIMIT = []

 API = HfApi(token=HF_TOKEN)
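The new VOTES_REPO/VOTES_PATH pair mirrors the existing queue constants: the votes dataset gets synced into a local working directory that VoteManager then reads from and appends to. A minimal sketch of that sync, assuming the same snapshot_download pattern app.py already uses for the queue (the actual call site is outside this excerpt):

from huggingface_hub import snapshot_download

# Pull the current votes dataset into the local VOTES_PATH working copy.
# repo_id/local_dir mirror VOTES_REPO and VOTES_PATH from src/envs.py;
# the exact kwargs used by app.py are assumed here.
snapshot_download(
    repo_id="open-llm-leaderboard/votes",
    local_dir="./model-votes",
    repo_type="dataset",
)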
    	
src/leaderboard/filter_models.py
CHANGED
@@ -4,122 +4,8 @@ from src.display.utils import AutoEvalColumn

 # Models which have been flagged by users as being problematic for a reason or another
 # (Model name to forum discussion link)
-FLAGGED_MODELS = {
-    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
-    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
-    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
-    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
-    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
-    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
-    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
-    "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
-    "jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
-    "dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
-    "udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
-    "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
-    "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
-    # Merges not indicated
-    "gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
-    "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
-    "udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    # MoErges
-    "cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    "macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
-    # Other - contamination mostly
-    "DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
-    "CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
-    "Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
-    "Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
-}
+# None for the v2 so far!
+FLAGGED_MODELS = {}

 # Models which have been requested by orgs to not be submitted on the leaderboard
 DO_NOT_SUBMIT_MODELS = [
@@ -133,12 +19,16 @@ DO_NOT_SUBMIT_MODELS = [
 def flag_models(leaderboard_data: list[dict]):
     """Flags models based on external criteria or flagged status."""
     for model_data in leaderboard_data:
+        # Skip flagging if maintainers_highlight is True
+        if model_data.get(AutoEvalColumn.maintainers_highlight.name, False):
+            model_data[AutoEvalColumn.not_flagged.name] = True
+            continue
+
         # If a model is not flagged, use its "fullname" as a key
         if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         else:
-            flag_key = "merged"
+            flag_key = None

         # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
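With FLAGGED_MODELS emptied, flag_models reduces to the new highlight exemption plus a lookup that currently never hits. A standalone sketch of the resulting control flow, with plain string keys standing in for the AutoEvalColumn.*.name lookups:

FLAGGED_MODELS = {}  # empty in v2, as in the diff above

def flag_models(leaderboard_data: list[dict]) -> None:
    for model_data in leaderboard_data:
        # Maintainer-highlighted models are exempt from flagging outright.
        if model_data.get("maintainers_highlight", False):
            model_data["not_flagged"] = True
            continue
        # Unflagged rows are keyed by fullname; flagged rows no longer fall
        # back to the v1 "merged" catch-all key.
        flag_key = model_data["fullname"] if model_data["not_flagged"] else None
        if flag_key in FLAGGED_MODELS:
            # The real code (below this hunk, not shown) attaches the forum
            # discussion link and marks the row as flagged here.
            model_data["not_flagged"] = False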
    	
src/populate.py
CHANGED
@@ -2,14 +2,15 @@ import pathlib
 import pandas as pd
 from datasets import Dataset
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.filter_models import filter_models_flags
 from src.display.utils import load_json_data


 def _process_model_data(entry, model_name_key="model", revision_key="revision"):
     """Enrich model data with clickable links and revisions."""
-    entry[EvalQueueColumn.
+    entry[EvalQueueColumn.model_name.name] = entry.get(model_name_key, "")
+    entry[EvalQueueColumn.model_link.name] = make_clickable_model(entry.get(model_name_key, ""))
     entry[EvalQueueColumn.revision.name] = entry.get(revision_key, "main")
     return entry

@@ -50,4 +51,4 @@ def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list, benchmark_cols:
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return df
+    return df
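The queue entries previously carried a single clickable model cell; splitting it into a plain model_name and an HTML model_link is what lets the vote system below match votes by name while the UI keeps the link. The same enrichment restated outside the helper, for a single entry:

from src.display.formatting import make_clickable_model

entry = {"model": "org/model", "revision": "abc123"}
entry["model_name"] = entry.get("model", "")                # plain name: what votes are matched on
entry["model_link"] = make_clickable_model(entry["model"])  # HTML anchor for the queue tables
entry["revision"] = entry.get("revision", "main")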
    	
src/submission/submit.py
CHANGED
@@ -32,6 +32,7 @@ def add_new_eval(
     precision: str,
     weight_type: str,
     model_type: str,
+    use_chat_template: bool,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -129,6 +130,7 @@ def add_new_eval(
         "model_type": model_type,
         "job_id": -1,
         "job_start_time": None,
+        "use_chat_template": use_chat_template,
     }

     supplementary_info = {
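The new use_chat_template flag is threaded from the submission form straight into the request file, so the evaluation backend knows whether to wrap prompts with the tokenizer's chat template. A sketch of a resulting request payload; only the last four fields are visible in this diff, the rest are assumed for illustration:

eval_entry = {
    "model": "org/model",        # assumed field
    "revision": "main",          # assumed field
    "precision": "bfloat16",     # assumed field
    "weight_type": "Original",   # assumed field
    "model_type": "💬 chat models (RLHF, DPO, IFT, ...)",
    "job_id": -1,
    "job_start_time": None,
    "use_chat_template": True,   # new in this commit
}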
    	
src/tools/plots.py
CHANGED
@@ -4,7 +4,7 @@ import plotly.express as px
 from plotly.graph_objs import Figure

 from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
-from src.display.utils import human_baseline_row as HUMAN_BASELINE
+# from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
    	
src/voting/vote_system.py
ADDED
@@ -0,0 +1,151 @@
+import json
+import logging
+import pathlib
+import pandas as pd
+import gradio as gr
+import schedule
+import time
+from datetime import datetime, timezone
+from src.display.utils import EvalQueueColumn
+
+from src.envs import API
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class VoteManager:
+    def __init__(self, votes_path, eval_requests_path, repo_id):
+        self.votes_path = votes_path
+        self.eval_requests_path = eval_requests_path
+        self.repo_id = repo_id
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+        self.votes_to_upload = []
+
+    def init_vote_dataset(self):
+        self.vote_dataset = self.read_vote_dataset()
+        self.vote_check_set = self.make_check_set(self.vote_dataset)
+
+    def read_vote_dataset(self):
+        result = []
+        votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+        if votes_file.exists():
+            with open(votes_file, "r") as f:
+                for line in f:
+                    data = json.loads(line.strip())
+                    result.append(data)
+        result = pd.DataFrame(result)
+        return result
+
+    def make_check_set(self, vote_dataset: pd.DataFrame):
+        result = list()
+        for row in vote_dataset.itertuples(index=False, name='vote'):
+            result.append((row.model, row.revision, row.username))
+        return set(result)
+
+    def get_model_revision(self, selected_model: str) -> str:
+        """Fetch the revision for the given model from the request files."""
+        for user_folder in pathlib.Path(self.eval_requests_path).iterdir():
+            if user_folder.is_dir():
+                for file in user_folder.glob("*.json"):
+                    with open(file, "r") as f:
+                        data = json.load(f)
+                        if data.get("model") == selected_model:
+                            return data.get("revision", "main")
+        return "main"
+
+    def create_request_vote_df(self, pending_models_df: gr.Dataframe):
+        if pending_models_df.empty or not "model_name" in pending_models_df.columns:
+            return pending_models_df
+        self.vote_dataset = self.read_vote_dataset()
+        vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')
+
+        pending_models_df_votes = pd.merge(
+            pending_models_df,
+            vote_counts,
+            left_on=["model_name", 'revision'],
+            right_on=['model', 'revision'],
+            how='left'
+        )
+        # Filling empty votes
+        pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
+        pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
+        # Removing useless columns
+        pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
+        return pending_models_df_votes
+
+    # Function to be called when a user votes for a model
+    def add_vote(
+            self,
+            selected_model: str,
+            pending_models_df: gr.Dataframe,
+            profile: gr.OAuthProfile | None
+        ):
+        logger.debug(f"Type of list before usage: {type(list)}")
+        # model_name, revision, user_id, timestamp
+        if selected_model in ["str", ""]:
+            gr.Warning("No model selected")
+            return
+
+        if profile is None:
+            gr.Warning("Hub Login required")
+            return
+
+        vote_username = profile.username
+        model_revision = self.get_model_revision(selected_model)
+
+        # tuple (immutable) for checking than already voted for model
+        check_tuple = (selected_model, model_revision, vote_username)
+        if check_tuple in self.vote_check_set:
+            gr.Warning("Already voted for this model")
+            return
+
+        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+        vote_obj = {
+            "model": selected_model,
+            "revision": model_revision,
+            "username": vote_username,
+            "timestamp": current_time
+        }
+
+        # Append the vote to the JSONL file
+        try:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            with open(votes_file, "a") as f:
+                f.write(json.dumps(vote_obj) + "\n")
+            logger.info(f"Vote added locally: {vote_obj}")
+
+            self.votes_to_upload.append(vote_obj)
+        except Exception as e:
+            logger.error(f"Failed to write vote to file: {e}")
+            gr.Warning("Failed to record vote. Please try again")
+            return
+
+        self.vote_check_set.add(check_tuple)
+        gr.Info(f"Voted for {selected_model}")
+
+        return self.create_request_vote_df(pending_models_df)
+
+    def upload_votes(self):
+        if self.votes_to_upload:
+            votes_file = pathlib.Path(self.votes_path) / "votes_data.jsonl"
+            try:
+                with open(votes_file, "rb") as f:
+                    API.upload_file(
+                        path_or_fileobj=f,
+                        path_in_repo="votes_data.jsonl",
+                        repo_id=self.repo_id,
+                        repo_type="dataset",
+                        commit_message="Updating votes_data.jsonl with new votes",
+                    )
+                logger.info("Votes uploaded to votes repository")
+                self.votes_to_upload.clear()
+            except Exception as e:
+                logger.error(f"Failed to upload votes to repository: {e}")
+
+def run_scheduler(vote_manager):
+    while True:
+        schedule.run_pending()
+        time.sleep(1)
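run_scheduler only drives schedule.run_pending(); the upload job itself has to be registered and the loop pushed onto a background thread by the caller (app.py in this commit, which is outside this excerpt). A plausible wiring sketch, with the interval chosen arbitrarily:

from threading import Thread

import schedule

from src.envs import EVAL_REQUESTS_PATH, VOTES_PATH, VOTES_REPO
from src.voting.vote_system import VoteManager, run_scheduler

vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)

# Flush locally recorded votes back to the votes dataset periodically.
schedule.every(15).minutes.do(vote_manager.upload_votes)

# run_scheduler blocks, so it lives in a daemon thread alongside the Gradio app.
scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
scheduler_thread.start()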
