File size: 4,991 Bytes
734e414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Normalizes the speaker tag of a source conversation turn to one of the
# chat roles 'system', 'user' or 'assistant'. Lookups are case-sensitive
# ('AI' is upper-case, every other tag is lower-case).
roles_map = dict(
    system='system',
    user='user',
    human='user',
    assistant='assistant',
    gpt='assistant',
    AI='assistant',
)


def _from_value_turns_to_messages(turns):
    """Convert turns keyed by 'from'/'value' into 'role'/'content' messages.

    The speaker tag of each turn is translated through the module-level
    `roles_map`; a tag missing from that mapping raises `KeyError`.
    """
    return [
        {'role': roles_map[turn['from']], 'content': turn['value']}
        for turn in turns
    ]


def _instruction_row_to_messages(row):
    """Build a system/user/assistant message list from one record.

    Reads the 'instruction', 'input' and 'output' keys of `row` and emits
    them, in that order, as the system, user and assistant messages.
    """
    return [
        {'role': 'system', 'content': row['instruction']},
        {'role': 'user', 'content': row['input']},
        {'role': 'assistant', 'content': row['output']},
    ]


# Instruct dataset specs. Every source below is cut into ten consecutive
# 10% slices of its 'train' split (0-10%, 10-20%, ..., 90-100%), giving
# one spec dict per slice.
# NOTE(review): 'field' and 'transform' are presumably interpreted by the
# loader that consumes this list (not visible here) — confirm there how a
# spec without 'field' passes data to 'transform'.
core_instruct_datasets = [
    #
    # general instructs
    #
    # 1.48 GB, 1,420,909
    # mlabonne/open-perfectblend
    #   meta-math/MetaMathQA                                  395,000
    #   openbmb/UltraInteract_sft                             288,579
    #   HuggingFaceH4/ultrachat_200k                          207,865
    #   microsoft/orca-math-word-problems-200k                200,035
    #   HuggingFaceH4/ultrafeedback_binarized                 187,405
    #   theblackcat102/evol-codealpaca-v1                     111,272
    #   Post-training-Data-Flywheel/AutoIF-instruct-61k        61,492
    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt     57,362
    *(
        {
            'kind': 'instruct',
            'path': 'mlabonne/open-perfectblend',
            'split': 'train[{}%:{}%]'.format(start, start + 10),
            'field': 'conversations',
            'transform': _from_value_turns_to_messages,
        }
        for start in range(0, 100, 10)
    ),
    # 1.41 GB, 939,343
    # allenai/tulu-3-sft-mixture
    #   CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
    #   FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
    #   No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al. 2023)
    #   OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Kopf et al., 2024)
    #   Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
    #   Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
    #   Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
    #   Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
    #   Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
    #   NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al. 2024)
    #   Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
    #   Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Wildteaming, 2024)
    #   Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
    #   Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
    #   WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
    #   TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
    #   SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
    #   Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
    # No 'transform' here: the 'messages' field is used as-is.
    *(
        {
            'kind': 'instruct',
            'path': 'allenai/tulu-3-sft-mixture',
            'split': 'train[{}%:{}%]'.format(start, start + 10),
            'field': 'messages',
        }
        for start in range(0, 100, 10)
    ),

    #
    # multilingual instructs
    #
    # 2.48 GB, 5,808,694
    # rombodawg/Everything_Instruct_Multilingual
    # Science:
    #     antiven0m/physical-reasoning-dpoScience
    #     LawalAfeez/science-dataset
    # Social media:
    #     Kyle1668/AG-Tweets
    #     euclaise/reddit-instruct-curated
    # General Knowledge:
    #     NousResearch/CharacterCodex_Characters
    #     jstet/quotes-500k_Famous_Quotes
    #     FronkonGames/steam-games-dataset_Video_Games
    #     totuta_youtube_subs_howto100M_HowTo
    # Multi-lingual:
    #     Amani27/massive_translation_dataset
    #     udmurtNLP/udmurt-russian-english-labse
    #     grosenthal/latin_english
    #     msarmi9/korean-english-multitarget-ted-talks-task
    #     HaiderSultanArc/MT-Urdu-English_Translate
    #     Garsa3112/ChineseEnglishTranslationDataset
    # Cooking:
    #     andrewsiah/se_cooking_preference_sft
    #     Hieu-Phamkaggle/food_recipes
    # Writing:
    #     shahules786/PoetryFoundationData
    #     euclaise/writingprompts
    #     qwedsacf/ivypanda-essaysEssay
    # Medicine:
    #     keivalya/MedQuad-MedicalQnADataset
    #     nuvocare/MSD
    # History:
    #     ambrosfitz10k/history_data_v4
    # Law:
    #     dzunggg/legal-qa-v1
    # Role-Play:
    #     roleplay4/fun_CoupleRP
    #     Undi95andrijdavid/roleplay-conversation-sharegpt
    # News:
    #     RealTimeData/bbc_news_alltime
    # Coding: (rombodawg/code_bagel)
    #     layoric/tiny-codes-alpaca
    #     glaiveai/glaive-code-assistant-v3
    #     ajibawa-2023/Code-290k-ShareGPT
    #     chargoddard/commitpack-ft-instruct-rated
    #     iamtarun/code_instructions_120k_alpaca
    #     ise-uiuc/Magicoder-Evol-Instruct-110K
    #     cognitivecomputations/dolphin-coder
    #     nickrosh/Evol-Instruct-Code-80k-v1
    #     coseal/CodeUltraFeedback_binarized
    #     CyberNative/Code_Vulnerability_Security_DPO
    # Math: (rombodawg/code_bagel)
    #     TIGER-Lab/MathInstruct
    # Function calling: (rombodawg/code_bagel)
    #     glaiveai/glaive-function-calling-v2
    # General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
    #     teknium/OpenHermes-2.5
    # No 'field' here: the transform reads 'instruction', 'input' and
    # 'output' directly from whatever it is given.
    *(
        {
            'kind': 'instruct',
            'path': 'rombodawg/Everything_Instruct_Multilingual',
            'split': 'train[{}%:{}%]'.format(start, start + 10),
            'transform': _instruction_row_to_messages,
        }
        for start in range(0, 100, 10)
    ),
]