import json
import os
import re
import random
from typing import Any, Iterable, Union

from datasets import Dataset, concatenate_datasets, load_dataset

from data_utils import (
    lower_keys,
    parse_question,
    parse_ground_truth,
)


def load_jsonl(file):
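    """Yield one parsed JSON object per line of a jsonl file."""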
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                yield json.loads(line)
            except json.JSONDecodeError:
                print("Error in loading:", line)
                raise


def load_data(
    data_name,
    split='test',
    data_dir='./data',
    num_test_sample=-1,
):
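    """Load the evaluation split for data_name.

    Reads {data_dir}/{data_name}/{split}.jsonl if it exists; otherwise the
    dataset is downloaded from the Hugging Face Hub, its keys are lower-cased,
    and the result is cached to that path. Returns a list of example dicts,
    each with an 'idx' field, optionally truncated to the first
    num_test_sample examples.
    """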
    if data_name.lower() == "math":
        data_name = 'MATH'  # use the 500-problem test split from "Let's Verify Step-by-Step"
    data_file = f"{data_dir}/{data_name}/{split}.jsonl"
    if os.path.exists(data_file):
        examples = list(load_jsonl(data_file))
    else:
        if data_name == "mmlu_stem":
            dataset = load_dataset("hails/mmlu_no_train", 'all', split='test')
            # only keep stem subjects
            stem_subjects = ['abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry',
                'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security',
                'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology',
                'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics',
                'high_school_physics', 'high_school_statistics', 'machine_learning']
            dataset = dataset.rename_column("subject", "type")
            dataset = dataset.filter(lambda x: x['type'] in stem_subjects)
        elif data_name == "mathvista":
            raise NotImplementedError(data_name)
        elif data_name == "gpqa":
            dataset = load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train")
        elif data_name == "codeforces":
            raise NotImplementedError(data_name)
        else:
            raise NotImplementedError(data_name)

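        # normalize keys and cache the split locally so later runs read the jsonl file above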
        examples = list(dataset)
        examples = [lower_keys(example) for example in examples]
        dataset = Dataset.from_list(examples)
        os.makedirs(f"{data_dir}/{data_name}", exist_ok=True)
        dataset.to_json(data_file)

    # add an 'idx' field if it is missing
    if 'idx' not in examples[0]:
        examples = [{'idx': i, **example} for i, example in enumerate(examples)]

    # sort by idx
    examples = sorted(examples, key=lambda x: x['idx'])

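    # optionally keep only the first num_test_sample examples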
    if num_test_sample > 0:
        examples = examples[:num_test_sample]

    return examples


if __name__ == "__main__":
    examples = load_data("gpqa", "test")
    print(f"loaded {len(examples)} gpqa examples")