File size: 1,873 Bytes
167137b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from typing import Literal

from huggingface_hub import HfFileSystem, hf_hub_download

# Maps short category keys to their category names.
# Every value here is also a key of CAT_NAME_TO_EXPLANATION, so the two
# tables must be kept in sync when a category is added or renamed.
KEY_TO_CATEGORY_NAME = {
    "full": "Overall",
    "coding": "Coding",
    "long_user": "Longer Query",
    "english": "English",
    "chinese": "Chinese",
    "french": "French",
    "no_tie": "Exclude Ties",
    "no_short": "Exclude Short Query (< 5 tokens)",
    "no_refusal": "Exclude Refusal",
}
# Maps each category name (the values of KEY_TO_CATEGORY_NAME) to a longer
# explanation string describing what the category includes or excludes.
CAT_NAME_TO_EXPLANATION = {
    "Overall": "Overall Questions",
    "Coding": "Coding: whether conversation contains code snippets",
    "Longer Query": "Longer Query (>= 500 tokens)",
    "English": "English Prompts",
    "Chinese": "Chinese Prompts",
    "French": "French Prompts",
    "Exclude Ties": "Exclude Ties and Bothbad",
    "Exclude Short Query (< 5 tokens)": "Exclude Short User Query (< 5 tokens)",
    "Exclude Refusal": 'Exclude model responses with refusal (e.g., "I cannot answer")',
}

# License names regarded as proprietary.
# NOTE(review): presumably compared against a model's license field to tell
# proprietary models from open ones -- usage is not visible here; confirm
# against callers.
PROPRIETARY_LICENSES = [
    "Proprietary",
]


def download_latest_data_from_space(
    repo_id: str, file_type: Literal["pkl", "csv"]
) -> str:
    """
    Downloads the latest data file of the specified file type from the given repository space.

    Candidate files are those matched by the glob ``spaces/{repo_id}/*.{file_type}``.
    "Latest" is decided by the last ``_``-separated token of each file's base
    name (the part before the first ``.``), compared as a plain string. This
    orders correctly only when that token is a fixed-width, zero-padded date
    such as ``YYYYMMDD`` -- the naming scheme itself is assumed, not checked.

    Args:
        repo_id (str): The ID of the repository space.
        file_type (Literal["pkl", "csv"]): The type of the data file to download. Must be either "pkl" or "csv".

    Returns:
        str: The local file path of the downloaded data file.

    Raises:
        FileNotFoundError: If no file in the space matches the glob pattern.
    """

    def extract_date(filename: str) -> str:
        # ".../<name>_<date>.<ext>" -> "<date>"; a name without "_" yields
        # the whole stem, which then takes part in the string comparison.
        return filename.split("/")[-1].split(".")[0].split("_")[-1]

    fs = HfFileSystem()
    data_file_path = f"spaces/{repo_id}/*.{file_type}"
    files = fs.glob(data_file_path)
    if not files:
        # Fail with an actionable message instead of an IndexError from
        # indexing into an empty result.
        raise FileNotFoundError(
            f"No '.{file_type}' files found matching '{data_file_path}'"
        )

    # max() returns the first element with the largest key, which is the same
    # element a stable descending sort would place at index 0.
    latest_file = max(files, key=extract_date)

    # glob() yields full "spaces/<repo_id>/<file>" paths, while
    # hf_hub_download expects the path relative to the repository root.
    return hf_hub_download(
        repo_id=repo_id,
        filename=latest_file.split("/")[-1],
        repo_type="space",
    )