File size: 3,247 Bytes
167137b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4ae93a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from typing import Literal

from huggingface_hub import HfFileSystem, hf_hub_download

KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}
CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

PROPRIETARY_LICENSES = [
    "Proprietary",
]


def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Downloads the latest data file of the specified file type from the given repository space.

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.
    """

    def extract_date(filename):
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    latest_file = sorted(files, key=extract_date, reverse=True)[0]

    latest_filepath_local = hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )
    return latest_filepath_local


def get_constants(dfs):
    """
    Calculate and return the minimum and maximum Elo scores, as well as the maximum number of models per month.

    Parameters:
    - dfs (dict): A dictionary containing DataFrames for different categories.

    Returns:
    - min_elo_score (float): The minimum Elo score across all DataFrames.
    - max_elo_score (float): The maximum Elo score across all DataFrames.
    - upper_models_per_month (int): The maximum number of models per month per license across all DataFrames.
    """
    filter_ranges = {}
    for k, df in dfs.items():
        filter_ranges[k] = {
            "min_elo_score": df["rating"].min().round(),
            "max_elo_score": df["rating"].max().round(),
            "upper_models_per_month": int(
                df.groupby(["Month-Year", "License"])["rating"]
                .apply(lambda x: x.count())
                .max()
            ),
        }

    min_elo_score = float("inf")
    max_elo_score = float("-inf")
    upper_models_per_month = 0

    for _, value in filter_ranges.items():
        min_elo_score = min(min_elo_score, value["min_elo_score"])
        max_elo_score = max(max_elo_score, value["max_elo_score"])
        upper_models_per_month = max(
            upper_models_per_month, value["upper_models_per_month"]
        )
    return min_elo_score, max_elo_score, upper_models_per_month