tyfeng1997 committed on
Commit
1be5a68
·
verified ·
1 Parent(s): 566ca24

Model save

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Math-1.5B
3
+ library_name: transformers
4
+ model_name: Qwen2.5-Math-1.5B-Open-R1-Distill
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for Qwen2.5-Math-1.5B-Open-R1-Distill
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="tyfeng1997/Qwen2.5-Math-1.5B-Open-R1-Distill", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/bofeng1997-ty/huggingface/runs/c3qlabub)
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.0.dev0
38
+ - Transformers: 4.49.0.dev0
39
+ - Pytorch: 2.5.1
40
+ - Datasets: 3.2.0
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 6.955833048956928e+17,
3
+ "train_loss": 0.8593901687198215,
4
+ "train_runtime": 5693.7016,
5
+ "train_samples": 16610,
6
+ "train_samples_per_second": 3.797,
7
+ "train_steps_per_second": 0.079
8
+ }
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-Math-1.5B",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 8960,
13
+ "max_position_embeddings": 4096,
14
+ "max_window_layers": 21,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 10000,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.49.0.dev0",
26
+ "use_cache": false,
27
+ "use_mrope": false,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.49.0.dev0"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a6d7ab943ee0bfd695540e1eecfceb4768abca0d47b104021be35bdc8c3abfc
3
+ size 3087467144
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|endoftext|>"
25
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- 
'<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|endoftext|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 6.955833048956928e+17,
3
+ "train_loss": 0.8593901687198215,
4
+ "train_runtime": 5693.7016,
5
+ "train_samples": 16610,
6
+ "train_samples_per_second": 3.797,
7
+ "train_steps_per_second": 0.079
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,843 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9991673605328892,
5
+ "eval_steps": 50,
6
+ "global_step": 450,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.011101859561476548,
13
+ "grad_norm": 1.140625,
14
+ "learning_rate": 2.222222222222222e-06,
15
+ "loss": 1.0928,
16
+ "mean_token_accuracy": 0.717196784696785,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.022203719122953096,
21
+ "grad_norm": 1.1015625,
22
+ "learning_rate": 4.444444444444444e-06,
23
+ "loss": 1.1024,
24
+ "mean_token_accuracy": 0.7158291392437733,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.03330557868442964,
29
+ "grad_norm": 1.03125,
30
+ "learning_rate": 6.666666666666667e-06,
31
+ "loss": 1.0782,
32
+ "mean_token_accuracy": 0.7194658119658119,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.04440743824590619,
37
+ "grad_norm": 0.96484375,
38
+ "learning_rate": 8.888888888888888e-06,
39
+ "loss": 1.0777,
40
+ "mean_token_accuracy": 0.7197588522588524,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.055509297807382736,
45
+ "grad_norm": 0.7734375,
46
+ "learning_rate": 1.1111111111111113e-05,
47
+ "loss": 1.0614,
48
+ "mean_token_accuracy": 0.7216748066748067,
49
+ "step": 25
50
+ },
51
+ {
52
+ "epoch": 0.06661115736885928,
53
+ "grad_norm": 0.6796875,
54
+ "learning_rate": 1.3333333333333333e-05,
55
+ "loss": 1.059,
56
+ "mean_token_accuracy": 0.719067969067969,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.07771301693033583,
61
+ "grad_norm": 0.6171875,
62
+ "learning_rate": 1.555555555555556e-05,
63
+ "loss": 1.0328,
64
+ "mean_token_accuracy": 0.7237962962962966,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.08881487649181238,
69
+ "grad_norm": 0.55859375,
70
+ "learning_rate": 1.7777777777777777e-05,
71
+ "loss": 1.014,
72
+ "mean_token_accuracy": 0.7259676434676435,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.09991673605328892,
77
+ "grad_norm": 0.46484375,
78
+ "learning_rate": 2e-05,
79
+ "loss": 1.0066,
80
+ "mean_token_accuracy": 0.725960520960521,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.11101859561476547,
85
+ "grad_norm": 0.4140625,
86
+ "learning_rate": 1.9992479525042305e-05,
87
+ "loss": 0.9537,
88
+ "mean_token_accuracy": 0.7386558811558809,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.11101859561476547,
93
+ "eval_loss": 0.9796226024627686,
94
+ "eval_mean_token_accuracy": 0.7271781595311009,
95
+ "eval_runtime": 9.7194,
96
+ "eval_samples_per_second": 13.272,
97
+ "eval_steps_per_second": 3.395,
98
+ "step": 50
99
+ },
100
+ {
101
+ "epoch": 0.12212045517624202,
102
+ "grad_norm": 0.359375,
103
+ "learning_rate": 1.996992941167792e-05,
104
+ "loss": 0.9602,
105
+ "mean_token_accuracy": 0.7342826617826618,
106
+ "step": 55
107
+ },
108
+ {
109
+ "epoch": 0.13322231473771856,
110
+ "grad_norm": 0.326171875,
111
+ "learning_rate": 1.9932383577419432e-05,
112
+ "loss": 0.916,
113
+ "mean_token_accuracy": 0.74437307661244,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.1443241742991951,
118
+ "grad_norm": 0.3515625,
119
+ "learning_rate": 1.9879898494768093e-05,
120
+ "loss": 0.909,
121
+ "mean_token_accuracy": 0.7461548636548637,
122
+ "step": 65
123
+ },
124
+ {
125
+ "epoch": 0.15542603386067166,
126
+ "grad_norm": 0.3125,
127
+ "learning_rate": 1.9812553106273848e-05,
128
+ "loss": 0.9079,
129
+ "mean_token_accuracy": 0.74519129019129,
130
+ "step": 70
131
+ },
132
+ {
133
+ "epoch": 0.16652789342214822,
134
+ "grad_norm": 0.333984375,
135
+ "learning_rate": 1.973044870579824e-05,
136
+ "loss": 0.9071,
137
+ "mean_token_accuracy": 0.7446357346357346,
138
+ "step": 75
139
+ },
140
+ {
141
+ "epoch": 0.17762975298362477,
142
+ "grad_norm": 0.30078125,
143
+ "learning_rate": 1.9633708786158803e-05,
144
+ "loss": 0.8953,
145
+ "mean_token_accuracy": 0.746184371184371,
146
+ "step": 80
147
+ },
148
+ {
149
+ "epoch": 0.18873161254510132,
150
+ "grad_norm": 0.275390625,
151
+ "learning_rate": 1.9522478853384154e-05,
152
+ "loss": 0.8582,
153
+ "mean_token_accuracy": 0.7562728937728938,
154
+ "step": 85
155
+ },
156
+ {
157
+ "epoch": 0.19983347210657784,
158
+ "grad_norm": 0.296875,
159
+ "learning_rate": 1.9396926207859085e-05,
160
+ "loss": 0.8719,
161
+ "mean_token_accuracy": 0.7528663003663003,
162
+ "step": 90
163
+ },
164
+ {
165
+ "epoch": 0.2109353316680544,
166
+ "grad_norm": 0.271484375,
167
+ "learning_rate": 1.9257239692688907e-05,
168
+ "loss": 0.8735,
169
+ "mean_token_accuracy": 0.7512274542096703,
170
+ "step": 95
171
+ },
172
+ {
173
+ "epoch": 0.22203719122953094,
174
+ "grad_norm": 0.2734375,
175
+ "learning_rate": 1.9103629409661468e-05,
176
+ "loss": 0.8656,
177
+ "mean_token_accuracy": 0.7536752136752136,
178
+ "step": 100
179
+ },
180
+ {
181
+ "epoch": 0.22203719122953094,
182
+ "eval_loss": 0.9016062617301941,
183
+ "eval_mean_token_accuracy": 0.7405376758317936,
184
+ "eval_runtime": 9.7188,
185
+ "eval_samples_per_second": 13.273,
186
+ "eval_steps_per_second": 3.395,
187
+ "step": 100
188
+ },
189
+ {
190
+ "epoch": 0.2331390507910075,
191
+ "grad_norm": 0.2734375,
192
+ "learning_rate": 1.8936326403234125e-05,
193
+ "loss": 0.8687,
194
+ "mean_token_accuracy": 0.7531064306064306,
195
+ "step": 105
196
+ },
197
+ {
198
+ "epoch": 0.24424091035248405,
199
+ "grad_norm": 0.267578125,
200
+ "learning_rate": 1.8755582313020912e-05,
201
+ "loss": 0.8548,
202
+ "mean_token_accuracy": 0.75515422674015,
203
+ "step": 110
204
+ },
205
+ {
206
+ "epoch": 0.2553427699139606,
207
+ "grad_norm": 0.26953125,
208
+ "learning_rate": 1.8561668995302668e-05,
209
+ "loss": 0.8543,
210
+ "mean_token_accuracy": 0.7542501017501018,
211
+ "step": 115
212
+ },
213
+ {
214
+ "epoch": 0.2664446294754371,
215
+ "grad_norm": 0.26953125,
216
+ "learning_rate": 1.8354878114129368e-05,
217
+ "loss": 0.8373,
218
+ "mean_token_accuracy": 0.7589896214896215,
219
+ "step": 120
220
+ },
221
+ {
222
+ "epoch": 0.2775464890369137,
223
+ "grad_norm": 0.265625,
224
+ "learning_rate": 1.8135520702629677e-05,
225
+ "loss": 0.8489,
226
+ "mean_token_accuracy": 0.7568091168091166,
227
+ "step": 125
228
+ },
229
+ {
230
+ "epoch": 0.2886483485983902,
231
+ "grad_norm": 0.267578125,
232
+ "learning_rate": 1.7903926695187595e-05,
233
+ "loss": 0.8557,
234
+ "mean_token_accuracy": 0.7542979242979242,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.2997502081598668,
239
+ "grad_norm": 0.26171875,
240
+ "learning_rate": 1.766044443118978e-05,
241
+ "loss": 0.8329,
242
+ "mean_token_accuracy": 0.7595054945054944,
243
+ "step": 135
244
+ },
245
+ {
246
+ "epoch": 0.31085206772134333,
247
+ "grad_norm": 0.2490234375,
248
+ "learning_rate": 1.740544013109005e-05,
249
+ "loss": 0.874,
250
+ "mean_token_accuracy": 0.748472730972731,
251
+ "step": 140
252
+ },
253
+ {
254
+ "epoch": 0.32195392728281985,
255
+ "grad_norm": 0.255859375,
256
+ "learning_rate": 1.7139297345578992e-05,
257
+ "loss": 0.8295,
258
+ "mean_token_accuracy": 0.7612077737077737,
259
+ "step": 145
260
+ },
261
+ {
262
+ "epoch": 0.33305578684429643,
263
+ "grad_norm": 0.251953125,
264
+ "learning_rate": 1.686241637868734e-05,
265
+ "loss": 0.8092,
266
+ "mean_token_accuracy": 0.7669424094424093,
267
+ "step": 150
268
+ },
269
+ {
270
+ "epoch": 0.33305578684429643,
271
+ "eval_loss": 0.8762778639793396,
272
+ "eval_mean_token_accuracy": 0.7449022654905009,
273
+ "eval_runtime": 9.7018,
274
+ "eval_samples_per_second": 13.296,
275
+ "eval_steps_per_second": 3.401,
276
+ "step": 150
277
+ },
278
+ {
279
+ "epoch": 0.34415764640577295,
280
+ "grad_norm": 0.251953125,
281
+ "learning_rate": 1.657521368569064e-05,
282
+ "loss": 0.8355,
283
+ "mean_token_accuracy": 0.7576159951159951,
284
+ "step": 155
285
+ },
286
+ {
287
+ "epoch": 0.35525950596724953,
288
+ "grad_norm": 0.2578125,
289
+ "learning_rate": 1.627812124672099e-05,
290
+ "loss": 0.8398,
291
+ "mean_token_accuracy": 0.7564778061483249,
292
+ "step": 160
293
+ },
294
+ {
295
+ "epoch": 0.36636136552872606,
296
+ "grad_norm": 0.251953125,
297
+ "learning_rate": 1.5971585917027864e-05,
298
+ "loss": 0.8576,
299
+ "mean_token_accuracy": 0.7524267399267397,
300
+ "step": 165
301
+ },
302
+ {
303
+ "epoch": 0.37746322509020264,
304
+ "grad_norm": 0.2578125,
305
+ "learning_rate": 1.5656068754865388e-05,
306
+ "loss": 0.8254,
307
+ "mean_token_accuracy": 0.760271918721826,
308
+ "step": 170
309
+ },
310
+ {
311
+ "epoch": 0.38856508465167916,
312
+ "grad_norm": 0.255859375,
313
+ "learning_rate": 1.5332044328016916e-05,
314
+ "loss": 0.8502,
315
+ "mean_token_accuracy": 0.7546072446072445,
316
+ "step": 175
317
+ },
318
+ {
319
+ "epoch": 0.3996669442131557,
320
+ "grad_norm": 0.25390625,
321
+ "learning_rate": 1.5000000000000002e-05,
322
+ "loss": 0.8418,
323
+ "mean_token_accuracy": 0.7569129019129018,
324
+ "step": 180
325
+ },
326
+ {
327
+ "epoch": 0.41076880377463226,
328
+ "grad_norm": 0.255859375,
329
+ "learning_rate": 1.4660435197025391e-05,
330
+ "loss": 0.8346,
331
+ "mean_token_accuracy": 0.7576149776149775,
332
+ "step": 185
333
+ },
334
+ {
335
+ "epoch": 0.4218706633361088,
336
+ "grad_norm": 0.24609375,
337
+ "learning_rate": 1.4313860656812537e-05,
338
+ "loss": 0.8478,
339
+ "mean_token_accuracy": 0.7535816035816034,
340
+ "step": 190
341
+ },
342
+ {
343
+ "epoch": 0.43297252289758537,
344
+ "grad_norm": 0.259765625,
345
+ "learning_rate": 1.396079766039157e-05,
346
+ "loss": 0.8107,
347
+ "mean_token_accuracy": 0.7654395604395604,
348
+ "step": 195
349
+ },
350
+ {
351
+ "epoch": 0.4440743824590619,
352
+ "grad_norm": 0.251953125,
353
+ "learning_rate": 1.3601777248047105e-05,
354
+ "loss": 0.8124,
355
+ "mean_token_accuracy": 0.763194953194953,
356
+ "step": 200
357
+ },
358
+ {
359
+ "epoch": 0.4440743824590619,
360
+ "eval_loss": 0.8637130260467529,
361
+ "eval_mean_token_accuracy": 0.7472996502408268,
362
+ "eval_runtime": 9.6971,
363
+ "eval_samples_per_second": 13.303,
364
+ "eval_steps_per_second": 3.403,
365
+ "step": 200
366
+ },
367
+ {
368
+ "epoch": 0.4551762420205384,
369
+ "grad_norm": 0.25390625,
370
+ "learning_rate": 1.3237339420583213e-05,
371
+ "loss": 0.8375,
372
+ "mean_token_accuracy": 0.757132682132682,
373
+ "step": 205
374
+ },
375
+ {
376
+ "epoch": 0.466278101582015,
377
+ "grad_norm": 0.251953125,
378
+ "learning_rate": 1.2868032327110904e-05,
379
+ "loss": 0.8577,
380
+ "mean_token_accuracy": 0.750907610907611,
381
+ "step": 210
382
+ },
383
+ {
384
+ "epoch": 0.4773799611434915,
385
+ "grad_norm": 0.2431640625,
386
+ "learning_rate": 1.2494411440579814e-05,
387
+ "loss": 0.82,
388
+ "mean_token_accuracy": 0.7625814000814001,
389
+ "step": 215
390
+ },
391
+ {
392
+ "epoch": 0.4884818207049681,
393
+ "grad_norm": 0.25390625,
394
+ "learning_rate": 1.211703872229411e-05,
395
+ "loss": 0.8099,
396
+ "mean_token_accuracy": 0.7646743996743997,
397
+ "step": 220
398
+ },
399
+ {
400
+ "epoch": 0.4995836802664446,
401
+ "grad_norm": 0.2490234375,
402
+ "learning_rate": 1.1736481776669307e-05,
403
+ "loss": 0.8195,
404
+ "mean_token_accuracy": 0.7627518372346456,
405
+ "step": 225
406
+ },
407
+ {
408
+ "epoch": 0.5106855398279212,
409
+ "grad_norm": 0.2490234375,
410
+ "learning_rate": 1.1353312997501313e-05,
411
+ "loss": 0.8248,
412
+ "mean_token_accuracy": 0.7611874236874236,
413
+ "step": 230
414
+ },
415
+ {
416
+ "epoch": 0.5217873993893978,
417
+ "grad_norm": 0.23828125,
418
+ "learning_rate": 1.0968108707031792e-05,
419
+ "loss": 0.8174,
420
+ "mean_token_accuracy": 0.7629131054131055,
421
+ "step": 235
422
+ },
423
+ {
424
+ "epoch": 0.5328892589508742,
425
+ "grad_norm": 0.244140625,
426
+ "learning_rate": 1.0581448289104759e-05,
427
+ "loss": 0.8105,
428
+ "mean_token_accuracy": 0.7650734632542214,
429
+ "step": 240
430
+ },
431
+ {
432
+ "epoch": 0.5439911185123508,
433
+ "grad_norm": 0.244140625,
434
+ "learning_rate": 1.0193913317718245e-05,
435
+ "loss": 0.81,
436
+ "mean_token_accuracy": 0.7640618640618639,
437
+ "step": 245
438
+ },
439
+ {
440
+ "epoch": 0.5550929780738274,
441
+ "grad_norm": 0.2421875,
442
+ "learning_rate": 9.806086682281759e-06,
443
+ "loss": 0.8205,
444
+ "mean_token_accuracy": 0.7610907610907611,
445
+ "step": 250
446
+ },
447
+ {
448
+ "epoch": 0.5550929780738274,
449
+ "eval_loss": 0.8572535514831543,
450
+ "eval_mean_token_accuracy": 0.748435551376728,
451
+ "eval_runtime": 9.7051,
452
+ "eval_samples_per_second": 13.292,
453
+ "eval_steps_per_second": 3.4,
454
+ "step": 250
455
+ },
456
+ {
457
+ "epoch": 0.5661948376353039,
458
+ "grad_norm": 0.255859375,
459
+ "learning_rate": 9.418551710895243e-06,
460
+ "loss": 0.8235,
461
+ "mean_token_accuracy": 0.7599084249084251,
462
+ "step": 255
463
+ },
464
+ {
465
+ "epoch": 0.5772966971967805,
466
+ "grad_norm": 0.251953125,
467
+ "learning_rate": 9.03189129296821e-06,
468
+ "loss": 0.8286,
469
+ "mean_token_accuracy": 0.7585877085877086,
470
+ "step": 260
471
+ },
472
+ {
473
+ "epoch": 0.588398556758257,
474
+ "grad_norm": 0.259765625,
475
+ "learning_rate": 8.646687002498692e-06,
476
+ "loss": 0.8256,
477
+ "mean_token_accuracy": 0.7592775742775744,
478
+ "step": 265
479
+ },
480
+ {
481
+ "epoch": 0.5995004163197336,
482
+ "grad_norm": 0.251953125,
483
+ "learning_rate": 8.263518223330698e-06,
484
+ "loss": 0.8217,
485
+ "mean_token_accuracy": 0.7614479039479042,
486
+ "step": 270
487
+ },
488
+ {
489
+ "epoch": 0.6106022758812101,
490
+ "grad_norm": 0.2431640625,
491
+ "learning_rate": 7.882961277705897e-06,
492
+ "loss": 0.8354,
493
+ "mean_token_accuracy": 0.7570135306548853,
494
+ "step": 275
495
+ },
496
+ {
497
+ "epoch": 0.6217041354426867,
498
+ "grad_norm": 0.2490234375,
499
+ "learning_rate": 7.505588559420188e-06,
500
+ "loss": 0.8222,
501
+ "mean_token_accuracy": 0.7614336589336589,
502
+ "step": 280
503
+ },
504
+ {
505
+ "epoch": 0.6328059950041632,
506
+ "grad_norm": 0.244140625,
507
+ "learning_rate": 7.131967672889101e-06,
508
+ "loss": 0.8579,
509
+ "mean_token_accuracy": 0.7512146494805758,
510
+ "step": 285
511
+ },
512
+ {
513
+ "epoch": 0.6439078545656397,
514
+ "grad_norm": 0.25390625,
515
+ "learning_rate": 6.762660579416791e-06,
516
+ "loss": 0.8253,
517
+ "mean_token_accuracy": 0.7587220187220187,
518
+ "step": 290
519
+ },
520
+ {
521
+ "epoch": 0.6550097141271163,
522
+ "grad_norm": 0.2412109375,
523
+ "learning_rate": 6.3982227519528986e-06,
524
+ "loss": 0.8294,
525
+ "mean_token_accuracy": 0.7600905599174735,
526
+ "step": 295
527
+ },
528
+ {
529
+ "epoch": 0.6661115736885929,
530
+ "grad_norm": 0.2470703125,
531
+ "learning_rate": 6.039202339608432e-06,
532
+ "loss": 0.8129,
533
+ "mean_token_accuracy": 0.763727106227106,
534
+ "step": 300
535
+ },
536
+ {
537
+ "epoch": 0.6661115736885929,
538
+ "eval_loss": 0.854372501373291,
539
+ "eval_mean_token_accuracy": 0.7488219841161018,
540
+ "eval_runtime": 9.716,
541
+ "eval_samples_per_second": 13.277,
542
+ "eval_steps_per_second": 3.396,
543
+ "step": 300
544
+ },
545
+ {
546
+ "epoch": 0.6772134332500694,
547
+ "grad_norm": 0.2431640625,
548
+ "learning_rate": 5.686139343187468e-06,
549
+ "loss": 0.8089,
550
+ "mean_token_accuracy": 0.7635978835978834,
551
+ "step": 305
552
+ },
553
+ {
554
+ "epoch": 0.6883152928115459,
555
+ "grad_norm": 0.2421875,
556
+ "learning_rate": 5.339564802974615e-06,
557
+ "loss": 0.8162,
558
+ "mean_token_accuracy": 0.7625620675620676,
559
+ "step": 310
560
+ },
561
+ {
562
+ "epoch": 0.6994171523730225,
563
+ "grad_norm": 0.25,
564
+ "learning_rate": 5.000000000000003e-06,
565
+ "loss": 0.818,
566
+ "mean_token_accuracy": 0.7617165242165239,
567
+ "step": 315
568
+ },
569
+ {
570
+ "epoch": 0.7105190119344991,
571
+ "grad_norm": 0.251953125,
572
+ "learning_rate": 4.66795567198309e-06,
573
+ "loss": 0.8164,
574
+ "mean_token_accuracy": 0.7618395600557701,
575
+ "step": 320
576
+ },
577
+ {
578
+ "epoch": 0.7216208714959755,
579
+ "grad_norm": 0.248046875,
580
+ "learning_rate": 4.343931245134616e-06,
581
+ "loss": 0.8391,
582
+ "mean_token_accuracy": 0.7558740333740334,
583
+ "step": 325
584
+ },
585
+ {
586
+ "epoch": 0.7327227310574521,
587
+ "grad_norm": 0.251953125,
588
+ "learning_rate": 4.028414082972141e-06,
589
+ "loss": 0.8288,
590
+ "mean_token_accuracy": 0.7581603581603582,
591
+ "step": 330
592
+ },
593
+ {
594
+ "epoch": 0.7438245906189287,
595
+ "grad_norm": 0.2470703125,
596
+ "learning_rate": 3.7218787532790167e-06,
597
+ "loss": 0.8266,
598
+ "mean_token_accuracy": 0.758972323972324,
599
+ "step": 335
600
+ },
601
+ {
602
+ "epoch": 0.7549264501804053,
603
+ "grad_norm": 0.23828125,
604
+ "learning_rate": 3.424786314309365e-06,
605
+ "loss": 0.8169,
606
+ "mean_token_accuracy": 0.7615511468202194,
607
+ "step": 340
608
+ },
609
+ {
610
+ "epoch": 0.7660283097418817,
611
+ "grad_norm": 0.248046875,
612
+ "learning_rate": 3.1375836213126653e-06,
613
+ "loss": 0.8207,
614
+ "mean_token_accuracy": 0.7615923890923892,
615
+ "step": 345
616
+ },
617
+ {
618
+ "epoch": 0.7771301693033583,
619
+ "grad_norm": 0.23828125,
620
+ "learning_rate": 2.8607026544210115e-06,
621
+ "loss": 0.8251,
622
+ "mean_token_accuracy": 0.7595614570614571,
623
+ "step": 350
624
+ },
625
+ {
626
+ "epoch": 0.7771301693033583,
627
+ "eval_loss": 0.8534859418869019,
628
+ "eval_mean_token_accuracy": 0.7491665197547551,
629
+ "eval_runtime": 9.7117,
630
+ "eval_samples_per_second": 13.283,
631
+ "eval_steps_per_second": 3.398,
632
+ "step": 350
633
+ },
634
+ {
635
+ "epoch": 0.7882320288648349,
636
+ "grad_norm": 0.2314453125,
637
+ "learning_rate": 2.594559868909956e-06,
638
+ "loss": 0.789,
639
+ "mean_token_accuracy": 0.7690638990638987,
640
+ "step": 355
641
+ },
642
+ {
643
+ "epoch": 0.7993338884263114,
644
+ "grad_norm": 0.2490234375,
645
+ "learning_rate": 2.339555568810221e-06,
646
+ "loss": 0.8195,
647
+ "mean_token_accuracy": 0.76007733007733,
648
+ "step": 360
649
+ },
650
+ {
651
+ "epoch": 0.810435747987788,
652
+ "grad_norm": 0.2578125,
653
+ "learning_rate": 2.0960733048124082e-06,
654
+ "loss": 0.8358,
655
+ "mean_token_accuracy": 0.7567429792429792,
656
+ "step": 365
657
+ },
658
+ {
659
+ "epoch": 0.8215376075492645,
660
+ "grad_norm": 0.234375,
661
+ "learning_rate": 1.8644792973703252e-06,
662
+ "loss": 0.797,
663
+ "mean_token_accuracy": 0.7672700447700449,
664
+ "step": 370
665
+ },
666
+ {
667
+ "epoch": 0.832639467110741,
668
+ "grad_norm": 0.2333984375,
669
+ "learning_rate": 1.6451218858706374e-06,
670
+ "loss": 0.8269,
671
+ "mean_token_accuracy": 0.7588715913715915,
672
+ "step": 375
673
+ },
674
+ {
675
+ "epoch": 0.8437413266722176,
676
+ "grad_norm": 0.244140625,
677
+ "learning_rate": 1.4383310046973365e-06,
678
+ "loss": 0.8243,
679
+ "mean_token_accuracy": 0.7594324261790015,
680
+ "step": 380
681
+ },
682
+ {
683
+ "epoch": 0.8548431862336942,
684
+ "grad_norm": 0.2412109375,
685
+ "learning_rate": 1.2444176869790925e-06,
686
+ "loss": 0.8229,
687
+ "mean_token_accuracy": 0.7609747659747659,
688
+ "step": 385
689
+ },
690
+ {
691
+ "epoch": 0.8659450457951707,
692
+ "grad_norm": 0.24609375,
693
+ "learning_rate": 1.0636735967658785e-06,
694
+ "loss": 0.804,
695
+ "mean_token_accuracy": 0.7651831501831503,
696
+ "step": 390
697
+ },
698
+ {
699
+ "epoch": 0.8770469053566472,
700
+ "grad_norm": 0.2470703125,
701
+ "learning_rate": 8.963705903385344e-07,
702
+ "loss": 0.8512,
703
+ "mean_token_accuracy": 0.7523127798127797,
704
+ "step": 395
705
+ },
706
+ {
707
+ "epoch": 0.8881487649181238,
708
+ "grad_norm": 0.259765625,
709
+ "learning_rate": 7.427603073110967e-07,
710
+ "loss": 0.8437,
711
+ "mean_token_accuracy": 0.7555453805453806,
712
+ "step": 400
713
+ },
714
+ {
715
+ "epoch": 0.8881487649181238,
716
+ "eval_loss": 0.8532679080963135,
717
+ "eval_mean_token_accuracy": 0.748890434184552,
718
+ "eval_runtime": 9.7147,
719
+ "eval_samples_per_second": 13.279,
720
+ "eval_steps_per_second": 3.397,
721
+ "step": 400
722
+ },
723
+ {
724
+ "epoch": 0.8992506244796004,
725
+ "grad_norm": 0.2490234375,
726
+ "learning_rate": 6.030737921409169e-07,
727
+ "loss": 0.8155,
728
+ "mean_token_accuracy": 0.7616107041107041,
729
+ "step": 405
730
+ },
731
+ {
732
+ "epoch": 0.9103524840410768,
733
+ "grad_norm": 0.25390625,
734
+ "learning_rate": 4.775211466158469e-07,
735
+ "loss": 0.8045,
736
+ "mean_token_accuracy": 0.7646845746845747,
737
+ "step": 410
738
+ },
739
+ {
740
+ "epoch": 0.9214543436025534,
741
+ "grad_norm": 0.2412109375,
742
+ "learning_rate": 3.662912138411967e-07,
743
+ "loss": 0.848,
744
+ "mean_token_accuracy": 0.7535592185592186,
745
+ "step": 415
746
+ },
747
+ {
748
+ "epoch": 0.93255620316403,
749
+ "grad_norm": 0.2412109375,
750
+ "learning_rate": 2.6955129420176193e-07,
751
+ "loss": 0.796,
752
+ "mean_token_accuracy": 0.7675030525030523,
753
+ "step": 420
754
+ },
755
+ {
756
+ "epoch": 0.9436580627255066,
757
+ "grad_norm": 0.2451171875,
758
+ "learning_rate": 1.874468937261531e-07,
759
+ "loss": 0.8322,
760
+ "mean_token_accuracy": 0.7581685062193277,
761
+ "step": 425
762
+ },
763
+ {
764
+ "epoch": 0.954759922286983,
765
+ "grad_norm": 0.265625,
766
+ "learning_rate": 1.201015052319099e-07,
767
+ "loss": 0.8023,
768
+ "mean_token_accuracy": 0.7657336182336185,
769
+ "step": 430
770
+ },
771
+ {
772
+ "epoch": 0.9658617818484596,
773
+ "grad_norm": 0.23828125,
774
+ "learning_rate": 6.761642258056977e-08,
775
+ "loss": 0.8094,
776
+ "mean_token_accuracy": 0.7642338217338217,
777
+ "step": 435
778
+ },
779
+ {
780
+ "epoch": 0.9769636414099362,
781
+ "grad_norm": 0.2392578125,
782
+ "learning_rate": 3.0070588322079765e-08,
783
+ "loss": 0.8206,
784
+ "mean_token_accuracy": 0.7607519332519332,
785
+ "step": 440
786
+ },
787
+ {
788
+ "epoch": 0.9880655009714127,
789
+ "grad_norm": 0.2490234375,
790
+ "learning_rate": 7.520474957699586e-09,
791
+ "loss": 0.8151,
792
+ "mean_token_accuracy": 0.7614092389092388,
793
+ "step": 445
794
+ },
795
+ {
796
+ "epoch": 0.9991673605328892,
797
+ "grad_norm": 0.23828125,
798
+ "learning_rate": 0.0,
799
+ "loss": 0.7954,
800
+ "mean_token_accuracy": 0.7673087098087098,
801
+ "step": 450
802
+ },
803
+ {
804
+ "epoch": 0.9991673605328892,
805
+ "eval_loss": 0.8533338904380798,
806
+ "eval_mean_token_accuracy": 0.748884884179002,
807
+ "eval_runtime": 9.7042,
808
+ "eval_samples_per_second": 13.293,
809
+ "eval_steps_per_second": 3.401,
810
+ "step": 450
811
+ },
812
+ {
813
+ "epoch": 0.9991673605328892,
814
+ "step": 450,
815
+ "total_flos": 6.955833048956928e+17,
816
+ "train_loss": 0.8593901687198215,
817
+ "train_runtime": 5693.7016,
818
+ "train_samples_per_second": 3.797,
819
+ "train_steps_per_second": 0.079
820
+ }
821
+ ],
822
+ "logging_steps": 5,
823
+ "max_steps": 450,
824
+ "num_input_tokens_seen": 0,
825
+ "num_train_epochs": 1,
826
+ "save_steps": 500,
827
+ "stateful_callbacks": {
828
+ "TrainerControl": {
829
+ "args": {
830
+ "should_epoch_stop": false,
831
+ "should_evaluate": false,
832
+ "should_log": false,
833
+ "should_save": false,
834
+ "should_training_stop": false
835
+ },
836
+ "attributes": {}
837
+ }
838
+ },
839
+ "total_flos": 6.955833048956928e+17,
840
+ "train_batch_size": 6,
841
+ "trial_name": null,
842
+ "trial_params": null
843
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:537681bb904cf0275134a3c7dfb9434bed4c62bd2b44f4bd71f5ed2ed6af1348
3
+ size 6008
vocab.json ADDED
The diff for this file is too large to render. See raw diff