Ahaa1234 committed on
Commit a3e38bd · verified · 1 Parent(s): c1564c6

Model save

README.md ADDED
@@ -0,0 +1,57 @@
+ ---
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: clip-roberta-finetuned
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/ahmedchtourou2019/huggingface/runs/3nvkg166)
+ # clip-roberta-finetuned
+
+ This model was trained from scratch on an unknown dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 1.7544
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 64
+ - eval_batch_size: 64
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 2.0
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:-----:|:----:|:---------------:|
+ | 4.0141        | 1.0   | 1    | 1.6837          |
+ | 4.1484        | 2.0   | 2    | 1.7544          |
+
+
+ ### Framework versions
+
+ - Transformers 4.41.0.dev0
+ - Pytorch 2.2.1+cu121
+ - Datasets 2.19.1
+ - Tokenizers 0.19.1
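The card above names the checkpoint and framework versions but gives no usage example. Below is a minimal inference sketch, assuming the hub id `Ahaa1234/clip-roberta-finetuned` (the committer's namespace plus the model name; a local path works the same) and the `VisionTextDualEncoderModel`/`VisionTextDualEncoderProcessor` classes declared in this commit's config.json and preprocessor_config.json:

```python
# Minimal inference sketch; the repo id is an assumption, substitute your own path.
import torch
from PIL import Image
from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor

repo = "Ahaa1234/clip-roberta-finetuned"  # assumed hub id
model = VisionTextDualEncoderModel.from_pretrained(repo)
processor = VisionTextDualEncoderProcessor.from_pretrained(repo)

image = Image.open("example.jpg")  # any RGB image
texts = ["a photo of a cat", "a photo of a dog"]
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)

# logits_per_image holds scaled image-text similarities; softmax ranks the captions.
print(outputs.logits_per_image.softmax(dim=-1))
```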
all_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+     "epoch": 2.0,
+     "eval_loss": 3.893137216567993,
+     "eval_runtime": 22.5319,
+     "eval_samples_per_second": 6.657,
+     "eval_steps_per_second": 0.133,
+     "total_flos": 266767525440000.0,
+     "train_loss": 4.029143333435059,
+     "train_runtime": 1253.0431,
+     "train_samples_per_second": 1.595,
+     "train_steps_per_second": 0.026
+ }
config.json ADDED
@@ -0,0 +1,168 @@
+ {
+   "_name_or_path": "./clip-roberta",
+   "architectures": [
+     "VisionTextDualEncoderModel"
+   ],
+   "logit_scale_init_value": 2.6592,
+   "model_type": "vision-text-dual-encoder",
+   "projection_dim": 512,
+   "text_config": {
+     "_name_or_path": "FacebookAI/roberta-base",
+     "add_cross_attention": false,
+     "architectures": [
+       "RobertaForMaskedLM"
+     ],
+     "attention_probs_dropout_prob": 0.1,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "classifier_dropout": null,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 2,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu",
+     "hidden_dropout_prob": 0.1,
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 514,
+     "min_length": 0,
+     "model_type": "roberta",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 1,
+     "position_embedding_type": "absolute",
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "type_vocab_size": 1,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 50265
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.41.0.dev0",
+   "vision_config": {
+     "_name_or_path": "openai/clip-vit-base-patch32",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.0,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 768,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 224,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 3072,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 12,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 12,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 32,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 512,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   }
+ }
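The `_name_or_path` fields inside `text_config` and `vision_config` show this checkpoint pairs a RoBERTa text encoder with a CLIP ViT-B/32 vision tower. A sketch of how such a config is typically assembled (it mirrors the values above, not necessarily the author's exact script; `./clip-roberta` matches the top-level `_name_or_path`):

```python
# Graft a CLIP vision tower onto a RoBERTa text encoder, then save the combined
# dual encoder; this reproduces the shape of the config.json above.
from transformers import (
    AutoImageProcessor,
    AutoTokenizer,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
)

model = VisionTextDualEncoderModel.from_vision_text_pretrained(
    "openai/clip-vit-base-patch32",  # vision_config._name_or_path
    "FacebookAI/roberta-base",       # text_config._name_or_path
)

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)

model.save_pretrained("./clip-roberta")
processor.save_pretrained("./clip-roberta")
```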
eval_results.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "epoch": 2.0,
+     "eval_loss": 3.893137216567993,
+     "eval_runtime": 22.5319,
+     "eval_samples_per_second": 6.657,
+     "eval_steps_per_second": 0.133
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2d0ebbb17fb374fae361481a680b79025b82eb5ee94c53d54d073230224800f4
+ size 851603588
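The three lines above are a Git LFS pointer, not the weights themselves; the actual `model.safetensors` (roughly 852 MB, consistent with a float32 RoBERTa-base plus ViT-B/32 dual encoder) is fetched from LFS on clone or download. A sketch for inspecting the downloaded file, assuming the `safetensors` package is installed:

```python
# List the tensors stored in the checkpoint without loading them all into memory.
from safetensors import safe_open

with safe_open("model.safetensors", framework="pt") as f:
    keys = list(f.keys())

print(len(keys))  # number of parameter tensors
print(keys[:3])   # e.g. names under the text_model / vision_model towers
```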
preprocessor_config.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "_valid_processor_keys": [
+     "images",
+     "do_resize",
+     "size",
+     "resample",
+     "do_center_crop",
+     "crop_size",
+     "do_rescale",
+     "rescale_factor",
+     "do_normalize",
+     "image_mean",
+     "image_std",
+     "do_convert_rgb",
+     "return_tensors",
+     "data_format",
+     "input_data_format"
+   ],
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "processor_class": "VisionTextDualEncoderProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
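These fields describe the standard CLIP image pipeline: resize the shortest edge to 224 with bicubic resampling (`resample: 3`), center-crop to 224x224, rescale by 1/255 (`0.00392156862745098`), and normalize with the CLIP mean/std. A sketch building the equivalent processor directly from the values above:

```python
# Rebuild the image preprocessing described by preprocessor_config.json.
from PIL import Image
from transformers import CLIPImageProcessor

image_processor = CLIPImageProcessor(
    do_resize=True, size={"shortest_edge": 224}, resample=3,  # 3 = bicubic
    do_center_crop=True, crop_size={"height": 224, "width": 224},
    do_rescale=True, rescale_factor=1 / 255,
    do_normalize=True,
    image_mean=[0.48145466, 0.4578275, 0.40821073],
    image_std=[0.26862954, 0.26130258, 0.27577711],
    do_convert_rgb=True,
)

pixel_values = image_processor(Image.open("example.jpg"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```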
runs/Apr23_14-12-21_724d2d989f55/events.out.tfevents.1713881874.724d2d989f55.12119.0 ADDED
File without changes
runs/Apr23_14-40-22_87b1853ebd42/events.out.tfevents.1713883848.87b1853ebd42.3330.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb2494cf2d64d21827ff711044fe3759abfa8c46941dd2dd2fa7b8b1bd9b0d4d
+ size 8996
runs/Apr23_14-40-22_87b1853ebd42/events.out.tfevents.1713884578.87b1853ebd42.3330.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:793658b5b5d4594ec526456dc91c37cb8d2871742fc12e6965698b2ef76cf5ac
+ size 354
runs/Apr24_12-35-31_f7b1006b82d0/events.out.tfevents.1713962760.f7b1006b82d0.1724.0 ADDED
File without changes
runs/Apr24_14-17-17_4d9e3066714e/events.out.tfevents.1713968742.4d9e3066714e.2953.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b51f2f6bc88803030833400f2a1599001a68d724678f51913b74d997f2bf047
+ size 38793
runs/Apr24_14-17-17_4d9e3066714e/events.out.tfevents.1713970006.4d9e3066714e.2953.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:63ec20676a89d242f2fcd9331a586da26ceaf263687428217da54be3eea20903
+ size 354
runs/Apr24_20-57-27_a73a230dd558/events.out.tfevents.1713992256.a73a230dd558.12284.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41270281a2b21fe5d9151ba193437a21ae3c9cc373389b2ab272a0f58a7b5f9c
+ size 17493
runs/Apr24_20-57-27_a73a230dd558/events.out.tfevents.1713993703.a73a230dd558.12284.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39eee467960c4a5f9817accbc3f4e8e9fd5e1dad7b9ca66a2f7bb2707481fa19
+ size 354
runs/Apr25_16-44-45_e00da5affe68/events.out.tfevents.1714064392.e00da5affe68.1696.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da646197cc6225afb1b1f5ffbf12c604682a98335bb3343dc0c58b5be72988fd
+ size 8745
runs/May03_15-03-33_b0efebecdfbe/events.out.tfevents.1714749244.b0efebecdfbe.15850.0 ADDED
File without changes
runs/May04_11-11-39_185903ad0266/events.out.tfevents.1714821106.185903ad0266.9545.0 ADDED
File without changes
runs/May06_10-39-50_b0cf0c1defaa/events.out.tfevents.1714992000.b0cf0c1defaa.11656.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:330c472929a65aef85c3767fe701044f10ab50fe3e236b158401b87bc0173388
+ size 10039
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "processor_class": "VisionTextDualEncoderProcessor",
+   "sep_token": "</s>",
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "unk_token": "<unk>"
+ }
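The two files above define a stock RoBERTa tokenizer: `<s>`/`</s>` double as BOS/CLS and EOS/SEP, `<pad>` sits at id 1, and `<mask>` at id 50264. A quick sanity-check sketch, again assuming the hub id used earlier:

```python
# Verify the special-token layout described in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Ahaa1234/clip-roberta-finetuned")  # assumed id
print(tok.cls_token, tok.sep_token, tok.pad_token_id, tok.mask_token_id)
# expected: <s> </s> 1 50264

enc = tok("a photo of a cat")
print(enc["input_ids"])  # starts with 0 (<s>) and ends with 2 (</s>)
```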
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 2.0,
+     "total_flos": 266767525440000.0,
+     "train_loss": 4.029143333435059,
+     "train_runtime": 1253.0431,
+     "train_samples_per_second": 1.595,
+     "train_steps_per_second": 0.026
+ }
trainer_state.json ADDED
@@ -0,0 +1,286 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.0,
+   "eval_steps": 1.0,
+   "global_step": 32,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0625,
+       "eval_loss": 4.102813243865967,
+       "eval_runtime": 46.3336,
+       "eval_samples_per_second": 3.237,
+       "eval_steps_per_second": 0.065,
+       "step": 1
+     },
+     {
+       "epoch": 0.125,
+       "eval_loss": 4.02440071105957,
+       "eval_runtime": 8.6463,
+       "eval_samples_per_second": 17.348,
+       "eval_steps_per_second": 0.347,
+       "step": 2
+     },
+     {
+       "epoch": 0.1875,
+       "eval_loss": 4.143311500549316,
+       "eval_runtime": 16.1378,
+       "eval_samples_per_second": 9.295,
+       "eval_steps_per_second": 0.186,
+       "step": 3
+     },
+     {
+       "epoch": 0.25,
+       "eval_loss": 4.173377990722656,
+       "eval_runtime": 8.329,
+       "eval_samples_per_second": 18.009,
+       "eval_steps_per_second": 0.36,
+       "step": 4
+     },
+     {
+       "epoch": 0.3125,
+       "eval_loss": 4.01168155670166,
+       "eval_runtime": 16.236,
+       "eval_samples_per_second": 9.239,
+       "eval_steps_per_second": 0.185,
+       "step": 5
+     },
+     {
+       "epoch": 0.375,
+       "eval_loss": 4.010961055755615,
+       "eval_runtime": 8.3188,
+       "eval_samples_per_second": 18.031,
+       "eval_steps_per_second": 0.361,
+       "step": 6
+     },
+     {
+       "epoch": 0.4375,
+       "eval_loss": 4.041440486907959,
+       "eval_runtime": 15.8046,
+       "eval_samples_per_second": 9.491,
+       "eval_steps_per_second": 0.19,
+       "step": 7
+     },
+     {
+       "epoch": 0.5,
+       "eval_loss": 4.010050296783447,
+       "eval_runtime": 8.7811,
+       "eval_samples_per_second": 17.082,
+       "eval_steps_per_second": 0.342,
+       "step": 8
+     },
+     {
+       "epoch": 0.5625,
+       "eval_loss": 4.008200645446777,
+       "eval_runtime": 15.9749,
+       "eval_samples_per_second": 9.39,
+       "eval_steps_per_second": 0.188,
+       "step": 9
+     },
+     {
+       "epoch": 0.625,
+       "eval_loss": 3.9981045722961426,
+       "eval_runtime": 8.2704,
+       "eval_samples_per_second": 18.137,
+       "eval_steps_per_second": 0.363,
+       "step": 10
+     },
+     {
+       "epoch": 0.6875,
+       "eval_loss": 3.9941446781158447,
+       "eval_runtime": 15.9208,
+       "eval_samples_per_second": 9.422,
+       "eval_steps_per_second": 0.188,
+       "step": 11
+     },
+     {
+       "epoch": 0.75,
+       "eval_loss": 3.995903253555298,
+       "eval_runtime": 8.2167,
+       "eval_samples_per_second": 18.255,
+       "eval_steps_per_second": 0.365,
+       "step": 12
+     },
+     {
+       "epoch": 0.8125,
+       "eval_loss": 3.9886744022369385,
+       "eval_runtime": 15.9708,
+       "eval_samples_per_second": 9.392,
+       "eval_steps_per_second": 0.188,
+       "step": 13
+     },
+     {
+       "epoch": 0.875,
+       "eval_loss": 3.982693672180176,
+       "eval_runtime": 8.8265,
+       "eval_samples_per_second": 16.994,
+       "eval_steps_per_second": 0.34,
+       "step": 14
+     },
+     {
+       "epoch": 0.9375,
+       "eval_loss": 3.9799349308013916,
+       "eval_runtime": 15.9139,
+       "eval_samples_per_second": 9.426,
+       "eval_steps_per_second": 0.189,
+       "step": 15
+     },
+     {
+       "epoch": 1.0,
+       "eval_loss": 3.9809114933013916,
+       "eval_runtime": 2.7645,
+       "eval_samples_per_second": 54.259,
+       "eval_steps_per_second": 1.085,
+       "step": 16
+     },
+     {
+       "epoch": 1.0625,
+       "eval_loss": 3.9646639823913574,
+       "eval_runtime": 8.3504,
+       "eval_samples_per_second": 17.963,
+       "eval_steps_per_second": 0.359,
+       "step": 17
+     },
+     {
+       "epoch": 1.125,
+       "eval_loss": 3.9485819339752197,
+       "eval_runtime": 2.9291,
+       "eval_samples_per_second": 51.21,
+       "eval_steps_per_second": 1.024,
+       "step": 18
+     },
+     {
+       "epoch": 1.1875,
+       "eval_loss": 3.9438858032226562,
+       "eval_runtime": 15.8414,
+       "eval_samples_per_second": 9.469,
+       "eval_steps_per_second": 0.189,
+       "step": 19
+     },
+     {
+       "epoch": 1.25,
+       "eval_loss": 3.941105842590332,
+       "eval_runtime": 2.7691,
+       "eval_samples_per_second": 54.17,
+       "eval_steps_per_second": 1.083,
+       "step": 20
+     },
+     {
+       "epoch": 1.3125,
+       "eval_loss": 3.933875322341919,
+       "eval_runtime": 8.7798,
+       "eval_samples_per_second": 17.085,
+       "eval_steps_per_second": 0.342,
+       "step": 21
+     },
+     {
+       "epoch": 1.375,
+       "eval_loss": 3.9402499198913574,
+       "eval_runtime": 3.0846,
+       "eval_samples_per_second": 48.629,
+       "eval_steps_per_second": 0.973,
+       "step": 22
+     },
+     {
+       "epoch": 1.4375,
+       "eval_loss": 3.964958429336548,
+       "eval_runtime": 16.1126,
+       "eval_samples_per_second": 9.31,
+       "eval_steps_per_second": 0.186,
+       "step": 23
+     },
+     {
+       "epoch": 1.5,
+       "eval_loss": 3.963437080383301,
+       "eval_runtime": 2.8397,
+       "eval_samples_per_second": 52.822,
+       "eval_steps_per_second": 1.056,
+       "step": 24
+     },
+     {
+       "epoch": 1.5625,
+       "eval_loss": 3.9432413578033447,
+       "eval_runtime": 8.5666,
+       "eval_samples_per_second": 17.51,
+       "eval_steps_per_second": 0.35,
+       "step": 25
+     },
+     {
+       "epoch": 1.625,
+       "eval_loss": 3.9292993545532227,
+       "eval_runtime": 2.8879,
+       "eval_samples_per_second": 51.94,
+       "eval_steps_per_second": 1.039,
+       "step": 26
+     },
+     {
+       "epoch": 1.6875,
+       "eval_loss": 3.916614532470703,
+       "eval_runtime": 16.1071,
+       "eval_samples_per_second": 9.313,
+       "eval_steps_per_second": 0.186,
+       "step": 27
+     },
+     {
+       "epoch": 1.75,
+       "eval_loss": 3.9063076972961426,
+       "eval_runtime": 2.884,
+       "eval_samples_per_second": 52.01,
+       "eval_steps_per_second": 1.04,
+       "step": 28
+     },
+     {
+       "epoch": 1.8125,
+       "eval_loss": 3.9020252227783203,
+       "eval_runtime": 8.445,
+       "eval_samples_per_second": 17.762,
+       "eval_steps_per_second": 0.355,
+       "step": 29
+     },
+     {
+       "epoch": 1.875,
+       "eval_loss": 3.8990707397460938,
+       "eval_runtime": 3.0029,
+       "eval_samples_per_second": 49.952,
+       "eval_steps_per_second": 0.999,
+       "step": 30
+     },
+     {
+       "epoch": 1.9375,
+       "eval_loss": 3.8950419425964355,
+       "eval_runtime": 16.0252,
+       "eval_samples_per_second": 9.36,
+       "eval_steps_per_second": 0.187,
+       "step": 31
+     },
+     {
+       "epoch": 2.0,
+       "eval_loss": 3.893137216567993,
+       "eval_runtime": 2.8946,
+       "eval_samples_per_second": 51.821,
+       "eval_steps_per_second": 1.036,
+       "step": 32
+     },
+     {
+       "epoch": 2.0,
+       "step": 32,
+       "total_flos": 266767525440000.0,
+       "train_loss": 4.029143333435059,
+       "train_runtime": 1253.0431,
+       "train_samples_per_second": 1.595,
+       "train_steps_per_second": 0.026
+     }
+   ],
+   "logging_steps": 500,
+   "max_steps": 32,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 2,
+   "save_steps": 500,
+   "total_flos": 266767525440000.0,
+   "train_batch_size": 64,
+   "trial_name": null,
+   "trial_params": null
+ }
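`log_history` records one evaluation per step (`eval_steps: 1.0`) over 32 steps; eval loss drops from about 4.10 at step 1 to 3.89 at step 32. A sketch extracting that curve from the file:

```python
# Pull the eval-loss curve out of trainer_state.json; the final summary entry
# has no "eval_loss" key and is skipped.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
for step, loss in curve:
    print(f"step {step:2d}: eval_loss {loss:.4f}")
```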
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c2fe482bbe8d183d739c92755f0d99020a7c9af4003925ae1d9f71d59774596
+ size 5112
vocab.json ADDED
The diff for this file is too large to render. See raw diff