alexue4 committed
Commit b88d047
1 Parent(s): cc5b775

End of training

README.md ADDED
@@ -0,0 +1,79 @@
+ ---
+ license: apache-2.0
+ base_model: google/mt5-small
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: text-translit-detector-ru
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # text-translit-detector-ru
+
+ This model is a fine-tuned version of [google/mt5-small](https://huggingface.co/google/mt5-small) on an unspecified dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.0393
+ - Mean Distance: 0
+ - Max Distance: 1
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (see the illustrative sketch after this list):
+ - learning_rate: 0.0001
+ - train_batch_size: 15
+ - eval_batch_size: 15
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 20
+
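The hyperparameters above map onto the Hugging Face `Seq2SeqTrainingArguments` roughly as in the sketch below. This is illustrative only: the output directory and evaluation strategy are assumptions (though the training log does show one evaluation per epoch), and the dataset, collator, and metric code used for the actual run are not part of this commit.

```python
# Hedged reconstruction of the reported hyperparameters; values not in the
# list above (output_dir, evaluation_strategy) are assumptions.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="text-translit-detector-ru",  # assumed
    learning_rate=1e-4,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    seed=42,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    num_train_epochs=20,
    evaluation_strategy="epoch",  # trainer_state.json logs one eval per epoch
)
```

The Adam settings in the list (betas 0.9/0.999, epsilon 1e-08) match the Trainer's default optimizer configuration, so they need no explicit arguments here.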
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Mean Distance | Max Distance |
+ |:-------------:|:-----:|:-----:|:---------------:|:-------------:|:------------:|
+ | 0.2324 | 1.0 | 2664 | 0.1072 | 0 | 1 |
+ | 0.0151 | 2.0 | 5328 | 0.0436 | 0 | 1 |
+ | 0.0094 | 3.0 | 7992 | 0.0241 | 0 | 1 |
+ | 0.0056 | 4.0 | 10656 | 0.0309 | 0 | 1 |
+ | 0.0068 | 5.0 | 13320 | 0.0356 | 0 | 1 |
+ | 0.0041 | 6.0 | 15984 | 0.0186 | 0 | 1 |
+ | 0.0034 | 7.0 | 18648 | 0.0426 | 0 | 1 |
+ | 0.0043 | 8.0 | 21312 | 0.0172 | 0 | 1 |
+ | 0.004 | 9.0 | 23976 | 0.0272 | 0 | 1 |
+ | 0.0005 | 10.0 | 26640 | 0.0333 | 0 | 1 |
+ | 0.0025 | 11.0 | 29304 | 0.0358 | 0 | 1 |
+ | 0.0021 | 12.0 | 31968 | 0.0474 | 0 | 1 |
+ | 0.0007 | 13.0 | 34632 | 0.0402 | 0 | 1 |
+ | 0.0017 | 14.0 | 37296 | 0.0392 | 0 | 1 |
+ | 0.0007 | 15.0 | 39960 | 0.0394 | 0 | 1 |
+ | 0.0013 | 16.0 | 42624 | 0.0442 | 0 | 1 |
+ | 0.0002 | 17.0 | 45288 | 0.0443 | 0 | 1 |
+ | 0.0013 | 18.0 | 47952 | 0.0389 | 0 | 1 |
+ | 0.0001 | 19.0 | 50616 | 0.0412 | 0 | 1 |
+ | 0.0001 | 20.0 | 53280 | 0.0393 | 0 | 1 |
+
+
+ ### Framework versions
+
+ - Transformers 4.35.0
+ - Pytorch 2.1.0+cu118
+ - Datasets 2.14.6
+ - Tokenizers 0.14.1
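The usage sections of the card are still empty, so here is a minimal inference sketch. The Hub repository id is inferred from the committer and model name and may differ, and the exact input/output contract of the detector is not documented in this card; treat both as assumptions.

```python
# Minimal seq2seq inference sketch; repo id and example input are assumptions.
from transformers import AutoTokenizer, MT5ForConditionalGeneration

model_id = "alexue4/text-translit-detector-ru"  # assumed Hub id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = MT5ForConditionalGeneration.from_pretrained(model_id)

text = "privet kak dela"  # hypothetical transliterated Russian input
inputs = tokenizer(text, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```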
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "google/mt5-small",
+   "architectures": [
+     "MT5ForConditionalGeneration"
+   ],
+   "classifier_dropout": 0.0,
+   "d_ff": 1024,
+   "d_kv": 64,
+   "d_model": 512,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "mt5",
+   "num_decoder_layers": 8,
+   "num_heads": 6,
+   "num_layers": 8,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "tokenizer_class": "T5Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.35.0",
+   "use_cache": true,
+   "vocab_size": 250112
+ }
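The architecture fields above are those of google/mt5-small. As a rough sanity check, the sketch below rebuilds an equivalent (randomly initialised) model from the same values; its size should be on the order of 300M parameters, consistent with the ~1.2 GB float32 model.safetensors added further down.

```python
# Build an mt5-small-shaped model from the config values above (random
# weights) just to inspect its size; this does not load the trained weights.
from transformers import MT5Config, MT5ForConditionalGeneration

config = MT5Config(
    d_model=512,
    d_ff=1024,
    d_kv=64,
    num_layers=8,
    num_decoder_layers=8,
    num_heads=6,
    vocab_size=250112,
    tie_word_embeddings=False,
)
model = MT5ForConditionalGeneration(config)
print(f"{model.num_parameters() / 1e6:.0f}M parameters")  # roughly 300M
```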
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.35.0"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:139f4efafa81328de028fe45cfd22391527b568ecc61cf081db24d84a0d357e1
+ size 1200729512
runs/Nov12_15-27-02_DESKTOP-A45193E/events.out.tfevents.1699795624.DESKTOP-A45193E ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a516146ba1a3a0134d3be7ecde0b3b4703f4834cc0427d2afc7dedce56be6d9
+ size 44274
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
+ size 4309802
tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "</s>",
+   "extra_ids": 0,
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "T5Tokenizer",
+   "unk_token": "<unk>"
+ }
trainer_state.json ADDED
@@ -0,0 +1,1428 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 20.0,
5
+ "eval_steps": 500,
6
+ "global_step": 53280,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 1.876876876876877e-08,
14
+ "loss": 26.3774,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.1,
19
+ "learning_rate": 5.011261261261262e-06,
20
+ "loss": 28.5448,
21
+ "step": 267
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.0022522522522524e-05,
26
+ "loss": 27.0258,
27
+ "step": 534
28
+ },
29
+ {
30
+ "epoch": 0.3,
31
+ "learning_rate": 1.5033783783783784e-05,
32
+ "loss": 22.7776,
33
+ "step": 801
34
+ },
35
+ {
36
+ "epoch": 0.4,
37
+ "learning_rate": 2.0045045045045048e-05,
38
+ "loss": 17.4592,
39
+ "step": 1068
40
+ },
41
+ {
42
+ "epoch": 0.5,
43
+ "learning_rate": 2.505630630630631e-05,
44
+ "loss": 8.3509,
45
+ "step": 1335
46
+ },
47
+ {
48
+ "epoch": 0.6,
49
+ "learning_rate": 3.006756756756757e-05,
50
+ "loss": 0.8438,
51
+ "step": 1602
52
+ },
53
+ {
54
+ "epoch": 0.7,
55
+ "learning_rate": 3.507882882882883e-05,
56
+ "loss": 0.3406,
57
+ "step": 1869
58
+ },
59
+ {
60
+ "epoch": 0.8,
61
+ "learning_rate": 4.0090090090090096e-05,
62
+ "loss": 0.2968,
63
+ "step": 2136
64
+ },
65
+ {
66
+ "epoch": 0.9,
67
+ "learning_rate": 4.510135135135135e-05,
68
+ "loss": 0.2324,
69
+ "step": 2403
70
+ },
71
+ {
72
+ "epoch": 1.0,
73
+ "eval_loss": 0.10719971358776093,
74
+ "eval_max_distance": 1,
75
+ "eval_mean_distance": 0,
76
+ "eval_runtime": 1.1818,
77
+ "eval_samples_per_second": 302.937,
78
+ "eval_steps_per_second": 20.309,
79
+ "step": 2664
80
+ },
81
+ {
82
+ "epoch": 1.0,
83
+ "learning_rate": 5.011261261261262e-05,
84
+ "loss": 0.122,
85
+ "step": 2670
86
+ },
87
+ {
88
+ "epoch": 1.1,
89
+ "learning_rate": 5.512387387387388e-05,
90
+ "loss": 0.1234,
91
+ "step": 2937
92
+ },
93
+ {
94
+ "epoch": 1.2,
95
+ "learning_rate": 6.013513513513514e-05,
96
+ "loss": 0.1377,
97
+ "step": 3204
98
+ },
99
+ {
100
+ "epoch": 1.3,
101
+ "learning_rate": 6.514639639639641e-05,
102
+ "loss": 0.0721,
103
+ "step": 3471
104
+ },
105
+ {
106
+ "epoch": 1.4,
107
+ "learning_rate": 7.015765765765766e-05,
108
+ "loss": 0.0447,
109
+ "step": 3738
110
+ },
111
+ {
112
+ "epoch": 1.5,
113
+ "learning_rate": 7.516891891891891e-05,
114
+ "loss": 0.0262,
115
+ "step": 4005
116
+ },
117
+ {
118
+ "epoch": 1.6,
119
+ "learning_rate": 8.018018018018019e-05,
120
+ "loss": 0.0262,
121
+ "step": 4272
122
+ },
123
+ {
124
+ "epoch": 1.7,
125
+ "learning_rate": 8.519144144144144e-05,
126
+ "loss": 0.0235,
127
+ "step": 4539
128
+ },
129
+ {
130
+ "epoch": 1.8,
131
+ "learning_rate": 9.02027027027027e-05,
132
+ "loss": 0.0163,
133
+ "step": 4806
134
+ },
135
+ {
136
+ "epoch": 1.9,
137
+ "learning_rate": 9.521396396396397e-05,
138
+ "loss": 0.0151,
139
+ "step": 5073
140
+ },
141
+ {
142
+ "epoch": 2.0,
143
+ "eval_loss": 0.04358534514904022,
144
+ "eval_max_distance": 1,
145
+ "eval_mean_distance": 0,
146
+ "eval_runtime": 0.906,
147
+ "eval_samples_per_second": 395.165,
148
+ "eval_steps_per_second": 26.492,
149
+ "step": 5328
150
+ },
151
+ {
152
+ "epoch": 2.0,
153
+ "learning_rate": 9.997497497497498e-05,
154
+ "loss": 0.014,
155
+ "step": 5340
156
+ },
157
+ {
158
+ "epoch": 2.1,
159
+ "learning_rate": 9.941816816816817e-05,
160
+ "loss": 0.0136,
161
+ "step": 5607
162
+ },
163
+ {
164
+ "epoch": 2.2,
165
+ "learning_rate": 9.886136136136137e-05,
166
+ "loss": 0.007,
167
+ "step": 5874
168
+ },
169
+ {
170
+ "epoch": 2.31,
171
+ "learning_rate": 9.830455455455457e-05,
172
+ "loss": 0.0144,
173
+ "step": 6141
174
+ },
175
+ {
176
+ "epoch": 2.41,
177
+ "learning_rate": 9.774774774774775e-05,
178
+ "loss": 0.008,
179
+ "step": 6408
180
+ },
181
+ {
182
+ "epoch": 2.51,
183
+ "learning_rate": 9.719094094094095e-05,
184
+ "loss": 0.0109,
185
+ "step": 6675
186
+ },
187
+ {
188
+ "epoch": 2.61,
189
+ "learning_rate": 9.663413413413414e-05,
190
+ "loss": 0.0141,
191
+ "step": 6942
192
+ },
193
+ {
194
+ "epoch": 2.71,
195
+ "learning_rate": 9.607732732732732e-05,
196
+ "loss": 0.011,
197
+ "step": 7209
198
+ },
199
+ {
200
+ "epoch": 2.81,
201
+ "learning_rate": 9.552052052052053e-05,
202
+ "loss": 0.0136,
203
+ "step": 7476
204
+ },
205
+ {
206
+ "epoch": 2.91,
207
+ "learning_rate": 9.496371371371372e-05,
208
+ "loss": 0.0094,
209
+ "step": 7743
210
+ },
211
+ {
212
+ "epoch": 3.0,
213
+ "eval_loss": 0.024132976308465004,
214
+ "eval_max_distance": 1,
215
+ "eval_mean_distance": 0,
216
+ "eval_runtime": 0.9049,
217
+ "eval_samples_per_second": 395.615,
218
+ "eval_steps_per_second": 26.522,
219
+ "step": 7992
220
+ },
221
+ {
222
+ "epoch": 3.01,
223
+ "learning_rate": 9.440690690690692e-05,
224
+ "loss": 0.0158,
225
+ "step": 8010
226
+ },
227
+ {
228
+ "epoch": 3.11,
229
+ "learning_rate": 9.38501001001001e-05,
230
+ "loss": 0.0085,
231
+ "step": 8277
232
+ },
233
+ {
234
+ "epoch": 3.21,
235
+ "learning_rate": 9.329329329329329e-05,
236
+ "loss": 0.0074,
237
+ "step": 8544
238
+ },
239
+ {
240
+ "epoch": 3.31,
241
+ "learning_rate": 9.27364864864865e-05,
242
+ "loss": 0.0069,
243
+ "step": 8811
244
+ },
245
+ {
246
+ "epoch": 3.41,
247
+ "learning_rate": 9.217967967967968e-05,
248
+ "loss": 0.0049,
249
+ "step": 9078
250
+ },
251
+ {
252
+ "epoch": 3.51,
253
+ "learning_rate": 9.162287287287288e-05,
254
+ "loss": 0.0045,
255
+ "step": 9345
256
+ },
257
+ {
258
+ "epoch": 3.61,
259
+ "learning_rate": 9.106606606606607e-05,
260
+ "loss": 0.0053,
261
+ "step": 9612
262
+ },
263
+ {
264
+ "epoch": 3.71,
265
+ "learning_rate": 9.050925925925925e-05,
266
+ "loss": 0.0069,
267
+ "step": 9879
268
+ },
269
+ {
270
+ "epoch": 3.81,
271
+ "learning_rate": 8.995245245245245e-05,
272
+ "loss": 0.0067,
273
+ "step": 10146
274
+ },
275
+ {
276
+ "epoch": 3.91,
277
+ "learning_rate": 8.939564564564565e-05,
278
+ "loss": 0.0056,
279
+ "step": 10413
280
+ },
281
+ {
282
+ "epoch": 4.0,
283
+ "eval_loss": 0.030885161831974983,
284
+ "eval_max_distance": 1,
285
+ "eval_mean_distance": 0,
286
+ "eval_runtime": 0.9782,
287
+ "eval_samples_per_second": 365.991,
288
+ "eval_steps_per_second": 24.536,
289
+ "step": 10656
290
+ },
291
+ {
292
+ "epoch": 4.01,
293
+ "learning_rate": 8.883883883883885e-05,
294
+ "loss": 0.0057,
295
+ "step": 10680
296
+ },
297
+ {
298
+ "epoch": 4.11,
299
+ "learning_rate": 8.828203203203204e-05,
300
+ "loss": 0.0018,
301
+ "step": 10947
302
+ },
303
+ {
304
+ "epoch": 4.21,
305
+ "learning_rate": 8.772522522522522e-05,
306
+ "loss": 0.0034,
307
+ "step": 11214
308
+ },
309
+ {
310
+ "epoch": 4.31,
311
+ "learning_rate": 8.716841841841842e-05,
312
+ "loss": 0.0074,
313
+ "step": 11481
314
+ },
315
+ {
316
+ "epoch": 4.41,
317
+ "learning_rate": 8.661161161161162e-05,
318
+ "loss": 0.002,
319
+ "step": 11748
320
+ },
321
+ {
322
+ "epoch": 4.51,
323
+ "learning_rate": 8.605480480480482e-05,
324
+ "loss": 0.0121,
325
+ "step": 12015
326
+ },
327
+ {
328
+ "epoch": 4.61,
329
+ "learning_rate": 8.5497997997998e-05,
330
+ "loss": 0.0049,
331
+ "step": 12282
332
+ },
333
+ {
334
+ "epoch": 4.71,
335
+ "learning_rate": 8.49411911911912e-05,
336
+ "loss": 0.0045,
337
+ "step": 12549
338
+ },
339
+ {
340
+ "epoch": 4.81,
341
+ "learning_rate": 8.438438438438439e-05,
342
+ "loss": 0.0022,
343
+ "step": 12816
344
+ },
345
+ {
346
+ "epoch": 4.91,
347
+ "learning_rate": 8.382757757757757e-05,
348
+ "loss": 0.0068,
349
+ "step": 13083
350
+ },
351
+ {
352
+ "epoch": 5.0,
353
+ "eval_loss": 0.03561040014028549,
354
+ "eval_max_distance": 1,
355
+ "eval_mean_distance": 0,
356
+ "eval_runtime": 0.882,
357
+ "eval_samples_per_second": 405.884,
358
+ "eval_steps_per_second": 27.21,
359
+ "step": 13320
360
+ },
361
+ {
362
+ "epoch": 5.01,
363
+ "learning_rate": 8.327077077077078e-05,
364
+ "loss": 0.0063,
365
+ "step": 13350
366
+ },
367
+ {
368
+ "epoch": 5.11,
369
+ "learning_rate": 8.271396396396397e-05,
370
+ "loss": 0.002,
371
+ "step": 13617
372
+ },
373
+ {
374
+ "epoch": 5.21,
375
+ "learning_rate": 8.215715715715717e-05,
376
+ "loss": 0.007,
377
+ "step": 13884
378
+ },
379
+ {
380
+ "epoch": 5.31,
381
+ "learning_rate": 8.160035035035035e-05,
382
+ "loss": 0.0036,
383
+ "step": 14151
384
+ },
385
+ {
386
+ "epoch": 5.41,
387
+ "learning_rate": 8.104354354354354e-05,
388
+ "loss": 0.0036,
389
+ "step": 14418
390
+ },
391
+ {
392
+ "epoch": 5.51,
393
+ "learning_rate": 8.048673673673675e-05,
394
+ "loss": 0.0087,
395
+ "step": 14685
396
+ },
397
+ {
398
+ "epoch": 5.61,
399
+ "learning_rate": 7.992992992992994e-05,
400
+ "loss": 0.0013,
401
+ "step": 14952
402
+ },
403
+ {
404
+ "epoch": 5.71,
405
+ "learning_rate": 7.937312312312313e-05,
406
+ "loss": 0.0031,
407
+ "step": 15219
408
+ },
409
+ {
410
+ "epoch": 5.81,
411
+ "learning_rate": 7.881631631631632e-05,
412
+ "loss": 0.0031,
413
+ "step": 15486
414
+ },
415
+ {
416
+ "epoch": 5.91,
417
+ "learning_rate": 7.82595095095095e-05,
418
+ "loss": 0.0041,
419
+ "step": 15753
420
+ },
421
+ {
422
+ "epoch": 6.0,
423
+ "eval_loss": 0.018554789945483208,
424
+ "eval_max_distance": 1,
425
+ "eval_mean_distance": 0,
426
+ "eval_runtime": 0.8772,
427
+ "eval_samples_per_second": 408.133,
428
+ "eval_steps_per_second": 27.361,
429
+ "step": 15984
430
+ },
431
+ {
432
+ "epoch": 6.01,
433
+ "learning_rate": 7.77027027027027e-05,
434
+ "loss": 0.0044,
435
+ "step": 16020
436
+ },
437
+ {
438
+ "epoch": 6.11,
439
+ "learning_rate": 7.71458958958959e-05,
440
+ "loss": 0.0012,
441
+ "step": 16287
442
+ },
443
+ {
444
+ "epoch": 6.21,
445
+ "learning_rate": 7.65890890890891e-05,
446
+ "loss": 0.0036,
447
+ "step": 16554
448
+ },
449
+ {
450
+ "epoch": 6.31,
451
+ "learning_rate": 7.603228228228229e-05,
452
+ "loss": 0.0028,
453
+ "step": 16821
454
+ },
455
+ {
456
+ "epoch": 6.41,
457
+ "learning_rate": 7.547547547547547e-05,
458
+ "loss": 0.0039,
459
+ "step": 17088
460
+ },
461
+ {
462
+ "epoch": 6.51,
463
+ "learning_rate": 7.491866866866867e-05,
464
+ "loss": 0.0013,
465
+ "step": 17355
466
+ },
467
+ {
468
+ "epoch": 6.61,
469
+ "learning_rate": 7.436186186186187e-05,
470
+ "loss": 0.0026,
471
+ "step": 17622
472
+ },
473
+ {
474
+ "epoch": 6.72,
475
+ "learning_rate": 7.380505505505507e-05,
476
+ "loss": 0.0015,
477
+ "step": 17889
478
+ },
479
+ {
480
+ "epoch": 6.82,
481
+ "learning_rate": 7.324824824824825e-05,
482
+ "loss": 0.0009,
483
+ "step": 18156
484
+ },
485
+ {
486
+ "epoch": 6.92,
487
+ "learning_rate": 7.269144144144144e-05,
488
+ "loss": 0.0034,
489
+ "step": 18423
490
+ },
491
+ {
492
+ "epoch": 7.0,
493
+ "eval_loss": 0.04260706901550293,
494
+ "eval_max_distance": 1,
495
+ "eval_mean_distance": 0,
496
+ "eval_runtime": 0.9016,
497
+ "eval_samples_per_second": 397.069,
498
+ "eval_steps_per_second": 26.619,
499
+ "step": 18648
500
+ },
501
+ {
502
+ "epoch": 7.02,
503
+ "learning_rate": 7.213463463463464e-05,
504
+ "loss": 0.0027,
505
+ "step": 18690
506
+ },
507
+ {
508
+ "epoch": 7.12,
509
+ "learning_rate": 7.157782782782782e-05,
510
+ "loss": 0.0018,
511
+ "step": 18957
512
+ },
513
+ {
514
+ "epoch": 7.22,
515
+ "learning_rate": 7.102102102102103e-05,
516
+ "loss": 0.0024,
517
+ "step": 19224
518
+ },
519
+ {
520
+ "epoch": 7.32,
521
+ "learning_rate": 7.046421421421422e-05,
522
+ "loss": 0.0021,
523
+ "step": 19491
524
+ },
525
+ {
526
+ "epoch": 7.42,
527
+ "learning_rate": 6.99074074074074e-05,
528
+ "loss": 0.0017,
529
+ "step": 19758
530
+ },
531
+ {
532
+ "epoch": 7.52,
533
+ "learning_rate": 6.93506006006006e-05,
534
+ "loss": 0.0013,
535
+ "step": 20025
536
+ },
537
+ {
538
+ "epoch": 7.62,
539
+ "learning_rate": 6.879379379379379e-05,
540
+ "loss": 0.0025,
541
+ "step": 20292
542
+ },
543
+ {
544
+ "epoch": 7.72,
545
+ "learning_rate": 6.8236986986987e-05,
546
+ "loss": 0.0017,
547
+ "step": 20559
548
+ },
549
+ {
550
+ "epoch": 7.82,
551
+ "learning_rate": 6.768018018018019e-05,
552
+ "loss": 0.0022,
553
+ "step": 20826
554
+ },
555
+ {
556
+ "epoch": 7.92,
557
+ "learning_rate": 6.712337337337337e-05,
558
+ "loss": 0.0043,
559
+ "step": 21093
560
+ },
561
+ {
562
+ "epoch": 8.0,
563
+ "eval_loss": 0.017180927097797394,
564
+ "eval_max_distance": 1,
565
+ "eval_mean_distance": 0,
566
+ "eval_runtime": 0.8803,
567
+ "eval_samples_per_second": 406.66,
568
+ "eval_steps_per_second": 27.262,
569
+ "step": 21312
570
+ },
571
+ {
572
+ "epoch": 8.02,
573
+ "learning_rate": 6.656656656656657e-05,
574
+ "loss": 0.0035,
575
+ "step": 21360
576
+ },
577
+ {
578
+ "epoch": 8.12,
579
+ "learning_rate": 6.600975975975976e-05,
580
+ "loss": 0.0024,
581
+ "step": 21627
582
+ },
583
+ {
584
+ "epoch": 8.22,
585
+ "learning_rate": 6.545295295295295e-05,
586
+ "loss": 0.0036,
587
+ "step": 21894
588
+ },
589
+ {
590
+ "epoch": 8.32,
591
+ "learning_rate": 6.489614614614615e-05,
592
+ "loss": 0.0034,
593
+ "step": 22161
594
+ },
595
+ {
596
+ "epoch": 8.42,
597
+ "learning_rate": 6.433933933933934e-05,
598
+ "loss": 0.002,
599
+ "step": 22428
600
+ },
601
+ {
602
+ "epoch": 8.52,
603
+ "learning_rate": 6.378253253253254e-05,
604
+ "loss": 0.0029,
605
+ "step": 22695
606
+ },
607
+ {
608
+ "epoch": 8.62,
609
+ "learning_rate": 6.322572572572572e-05,
610
+ "loss": 0.002,
611
+ "step": 22962
612
+ },
613
+ {
614
+ "epoch": 8.72,
615
+ "learning_rate": 6.266891891891892e-05,
616
+ "loss": 0.0011,
617
+ "step": 23229
618
+ },
619
+ {
620
+ "epoch": 8.82,
621
+ "learning_rate": 6.211211211211212e-05,
622
+ "loss": 0.0006,
623
+ "step": 23496
624
+ },
625
+ {
626
+ "epoch": 8.92,
627
+ "learning_rate": 6.15553053053053e-05,
628
+ "loss": 0.004,
629
+ "step": 23763
630
+ },
631
+ {
632
+ "epoch": 9.0,
633
+ "eval_loss": 0.027217011898756027,
634
+ "eval_max_distance": 1,
635
+ "eval_mean_distance": 0,
636
+ "eval_runtime": 0.8897,
637
+ "eval_samples_per_second": 402.398,
638
+ "eval_steps_per_second": 26.976,
639
+ "step": 23976
640
+ },
641
+ {
642
+ "epoch": 9.02,
643
+ "learning_rate": 6.0998498498498503e-05,
644
+ "loss": 0.0038,
645
+ "step": 24030
646
+ },
647
+ {
648
+ "epoch": 9.12,
649
+ "learning_rate": 6.0441691691691695e-05,
650
+ "loss": 0.0022,
651
+ "step": 24297
652
+ },
653
+ {
654
+ "epoch": 9.22,
655
+ "learning_rate": 5.988488488488489e-05,
656
+ "loss": 0.0011,
657
+ "step": 24564
658
+ },
659
+ {
660
+ "epoch": 9.32,
661
+ "learning_rate": 5.932807807807807e-05,
662
+ "loss": 0.0013,
663
+ "step": 24831
664
+ },
665
+ {
666
+ "epoch": 9.42,
667
+ "learning_rate": 5.877127127127128e-05,
668
+ "loss": 0.0014,
669
+ "step": 25098
670
+ },
671
+ {
672
+ "epoch": 9.52,
673
+ "learning_rate": 5.821446446446447e-05,
674
+ "loss": 0.0037,
675
+ "step": 25365
676
+ },
677
+ {
678
+ "epoch": 9.62,
679
+ "learning_rate": 5.765765765765766e-05,
680
+ "loss": 0.0011,
681
+ "step": 25632
682
+ },
683
+ {
684
+ "epoch": 9.72,
685
+ "learning_rate": 5.7100850850850854e-05,
686
+ "loss": 0.0028,
687
+ "step": 25899
688
+ },
689
+ {
690
+ "epoch": 9.82,
691
+ "learning_rate": 5.654404404404404e-05,
692
+ "loss": 0.0012,
693
+ "step": 26166
694
+ },
695
+ {
696
+ "epoch": 9.92,
697
+ "learning_rate": 5.5987237237237245e-05,
698
+ "loss": 0.0005,
699
+ "step": 26433
700
+ },
701
+ {
702
+ "epoch": 10.0,
703
+ "eval_loss": 0.033312857151031494,
704
+ "eval_max_distance": 1,
705
+ "eval_mean_distance": 0,
706
+ "eval_runtime": 0.9102,
707
+ "eval_samples_per_second": 393.315,
708
+ "eval_steps_per_second": 26.367,
709
+ "step": 26640
710
+ },
711
+ {
712
+ "epoch": 10.02,
713
+ "learning_rate": 5.543043043043044e-05,
714
+ "loss": 0.0023,
715
+ "step": 26700
716
+ },
717
+ {
718
+ "epoch": 10.12,
719
+ "learning_rate": 5.487362362362363e-05,
720
+ "loss": 0.0032,
721
+ "step": 26967
722
+ },
723
+ {
724
+ "epoch": 10.22,
725
+ "learning_rate": 5.431681681681682e-05,
726
+ "loss": 0.0013,
727
+ "step": 27234
728
+ },
729
+ {
730
+ "epoch": 10.32,
731
+ "learning_rate": 5.3760010010010006e-05,
732
+ "loss": 0.0009,
733
+ "step": 27501
734
+ },
735
+ {
736
+ "epoch": 10.42,
737
+ "learning_rate": 5.320320320320321e-05,
738
+ "loss": 0.0039,
739
+ "step": 27768
740
+ },
741
+ {
742
+ "epoch": 10.52,
743
+ "learning_rate": 5.2646396396396403e-05,
744
+ "loss": 0.0008,
745
+ "step": 28035
746
+ },
747
+ {
748
+ "epoch": 10.62,
749
+ "learning_rate": 5.2089589589589595e-05,
750
+ "loss": 0.0009,
751
+ "step": 28302
752
+ },
753
+ {
754
+ "epoch": 10.72,
755
+ "learning_rate": 5.153278278278279e-05,
756
+ "loss": 0.002,
757
+ "step": 28569
758
+ },
759
+ {
760
+ "epoch": 10.82,
761
+ "learning_rate": 5.097597597597597e-05,
762
+ "loss": 0.0006,
763
+ "step": 28836
764
+ },
765
+ {
766
+ "epoch": 10.92,
767
+ "learning_rate": 5.0419169169169165e-05,
768
+ "loss": 0.0025,
769
+ "step": 29103
770
+ },
771
+ {
772
+ "epoch": 11.0,
773
+ "eval_loss": 0.03584469109773636,
774
+ "eval_max_distance": 1,
775
+ "eval_mean_distance": 0,
776
+ "eval_runtime": 0.8813,
777
+ "eval_samples_per_second": 406.214,
778
+ "eval_steps_per_second": 27.232,
779
+ "step": 29304
780
+ },
781
+ {
782
+ "epoch": 11.02,
783
+ "learning_rate": 4.9862362362362363e-05,
784
+ "loss": 0.0041,
785
+ "step": 29370
786
+ },
787
+ {
788
+ "epoch": 11.12,
789
+ "learning_rate": 4.930555555555556e-05,
790
+ "loss": 0.0002,
791
+ "step": 29637
792
+ },
793
+ {
794
+ "epoch": 11.23,
795
+ "learning_rate": 4.8748748748748754e-05,
796
+ "loss": 0.0001,
797
+ "step": 29904
798
+ },
799
+ {
800
+ "epoch": 11.33,
801
+ "learning_rate": 4.819194194194194e-05,
802
+ "loss": 0.0001,
803
+ "step": 30171
804
+ },
805
+ {
806
+ "epoch": 11.43,
807
+ "learning_rate": 4.763513513513514e-05,
808
+ "loss": 0.0003,
809
+ "step": 30438
810
+ },
811
+ {
812
+ "epoch": 11.53,
813
+ "learning_rate": 4.707832832832833e-05,
814
+ "loss": 0.003,
815
+ "step": 30705
816
+ },
817
+ {
818
+ "epoch": 11.63,
819
+ "learning_rate": 4.652152152152152e-05,
820
+ "loss": 0.0002,
821
+ "step": 30972
822
+ },
823
+ {
824
+ "epoch": 11.73,
825
+ "learning_rate": 4.596471471471472e-05,
826
+ "loss": 0.0037,
827
+ "step": 31239
828
+ },
829
+ {
830
+ "epoch": 11.83,
831
+ "learning_rate": 4.540790790790791e-05,
832
+ "loss": 0.0014,
833
+ "step": 31506
834
+ },
835
+ {
836
+ "epoch": 11.93,
837
+ "learning_rate": 4.48511011011011e-05,
838
+ "loss": 0.0021,
839
+ "step": 31773
840
+ },
841
+ {
842
+ "epoch": 12.0,
843
+ "eval_loss": 0.04740298539400101,
844
+ "eval_max_distance": 1,
845
+ "eval_mean_distance": 0,
846
+ "eval_runtime": 0.881,
847
+ "eval_samples_per_second": 406.347,
848
+ "eval_steps_per_second": 27.241,
849
+ "step": 31968
850
+ },
851
+ {
852
+ "epoch": 12.03,
853
+ "learning_rate": 4.42942942942943e-05,
854
+ "loss": 0.0006,
855
+ "step": 32040
856
+ },
857
+ {
858
+ "epoch": 12.13,
859
+ "learning_rate": 4.373748748748749e-05,
860
+ "loss": 0.0007,
861
+ "step": 32307
862
+ },
863
+ {
864
+ "epoch": 12.23,
865
+ "learning_rate": 4.318068068068069e-05,
866
+ "loss": 0.0015,
867
+ "step": 32574
868
+ },
869
+ {
870
+ "epoch": 12.33,
871
+ "learning_rate": 4.262387387387388e-05,
872
+ "loss": 0.0014,
873
+ "step": 32841
874
+ },
875
+ {
876
+ "epoch": 12.43,
877
+ "learning_rate": 4.2067067067067065e-05,
878
+ "loss": 0.0012,
879
+ "step": 33108
880
+ },
881
+ {
882
+ "epoch": 12.53,
883
+ "learning_rate": 4.1510260260260263e-05,
884
+ "loss": 0.0019,
885
+ "step": 33375
886
+ },
887
+ {
888
+ "epoch": 12.63,
889
+ "learning_rate": 4.0953453453453455e-05,
890
+ "loss": 0.0011,
891
+ "step": 33642
892
+ },
893
+ {
894
+ "epoch": 12.73,
895
+ "learning_rate": 4.039664664664665e-05,
896
+ "loss": 0.0016,
897
+ "step": 33909
898
+ },
899
+ {
900
+ "epoch": 12.83,
901
+ "learning_rate": 3.9839839839839846e-05,
902
+ "loss": 0.0001,
903
+ "step": 34176
904
+ },
905
+ {
906
+ "epoch": 12.93,
907
+ "learning_rate": 3.928303303303303e-05,
908
+ "loss": 0.0007,
909
+ "step": 34443
910
+ },
911
+ {
912
+ "epoch": 13.0,
913
+ "eval_loss": 0.0401989221572876,
914
+ "eval_max_distance": 1,
915
+ "eval_mean_distance": 0,
916
+ "eval_runtime": 0.887,
917
+ "eval_samples_per_second": 403.585,
918
+ "eval_steps_per_second": 27.056,
919
+ "step": 34632
920
+ },
921
+ {
922
+ "epoch": 13.03,
923
+ "learning_rate": 3.8726226226226223e-05,
924
+ "loss": 0.0006,
925
+ "step": 34710
926
+ },
927
+ {
928
+ "epoch": 13.13,
929
+ "learning_rate": 3.816941941941942e-05,
930
+ "loss": 0.0002,
931
+ "step": 34977
932
+ },
933
+ {
934
+ "epoch": 13.23,
935
+ "learning_rate": 3.7612612612612614e-05,
936
+ "loss": 0.0012,
937
+ "step": 35244
938
+ },
939
+ {
940
+ "epoch": 13.33,
941
+ "learning_rate": 3.705580580580581e-05,
942
+ "loss": 0.0027,
943
+ "step": 35511
944
+ },
945
+ {
946
+ "epoch": 13.43,
947
+ "learning_rate": 3.6498998998999e-05,
948
+ "loss": 0.0014,
949
+ "step": 35778
950
+ },
951
+ {
952
+ "epoch": 13.53,
953
+ "learning_rate": 3.594219219219219e-05,
954
+ "loss": 0.0006,
955
+ "step": 36045
956
+ },
957
+ {
958
+ "epoch": 13.63,
959
+ "learning_rate": 3.538538538538539e-05,
960
+ "loss": 0.0023,
961
+ "step": 36312
962
+ },
963
+ {
964
+ "epoch": 13.73,
965
+ "learning_rate": 3.482857857857858e-05,
966
+ "loss": 0.0,
967
+ "step": 36579
968
+ },
969
+ {
970
+ "epoch": 13.83,
971
+ "learning_rate": 3.427177177177177e-05,
972
+ "loss": 0.0056,
973
+ "step": 36846
974
+ },
975
+ {
976
+ "epoch": 13.93,
977
+ "learning_rate": 3.3714964964964965e-05,
978
+ "loss": 0.0017,
979
+ "step": 37113
980
+ },
981
+ {
982
+ "epoch": 14.0,
983
+ "eval_loss": 0.03919493407011032,
984
+ "eval_max_distance": 1,
985
+ "eval_mean_distance": 0,
986
+ "eval_runtime": 0.8793,
987
+ "eval_samples_per_second": 407.126,
988
+ "eval_steps_per_second": 27.293,
989
+ "step": 37296
990
+ },
991
+ {
992
+ "epoch": 14.03,
993
+ "learning_rate": 3.315815815815816e-05,
994
+ "loss": 0.0002,
995
+ "step": 37380
996
+ },
997
+ {
998
+ "epoch": 14.13,
999
+ "learning_rate": 3.260135135135135e-05,
1000
+ "loss": 0.0012,
1001
+ "step": 37647
1002
+ },
1003
+ {
1004
+ "epoch": 14.23,
1005
+ "learning_rate": 3.204454454454455e-05,
1006
+ "loss": 0.0009,
1007
+ "step": 37914
1008
+ },
1009
+ {
1010
+ "epoch": 14.33,
1011
+ "learning_rate": 3.148773773773774e-05,
1012
+ "loss": 0.0006,
1013
+ "step": 38181
1014
+ },
1015
+ {
1016
+ "epoch": 14.43,
1017
+ "learning_rate": 3.093093093093093e-05,
1018
+ "loss": 0.0006,
1019
+ "step": 38448
1020
+ },
1021
+ {
1022
+ "epoch": 14.53,
1023
+ "learning_rate": 3.0374124124124127e-05,
1024
+ "loss": 0.0015,
1025
+ "step": 38715
1026
+ },
1027
+ {
1028
+ "epoch": 14.63,
1029
+ "learning_rate": 2.9817317317317315e-05,
1030
+ "loss": 0.0014,
1031
+ "step": 38982
1032
+ },
1033
+ {
1034
+ "epoch": 14.73,
1035
+ "learning_rate": 2.9260510510510514e-05,
1036
+ "loss": 0.0009,
1037
+ "step": 39249
1038
+ },
1039
+ {
1040
+ "epoch": 14.83,
1041
+ "learning_rate": 2.8703703703703706e-05,
1042
+ "loss": 0.0016,
1043
+ "step": 39516
1044
+ },
1045
+ {
1046
+ "epoch": 14.93,
1047
+ "learning_rate": 2.8146896896896895e-05,
1048
+ "loss": 0.0007,
1049
+ "step": 39783
1050
+ },
1051
+ {
1052
+ "epoch": 15.0,
1053
+ "eval_loss": 0.03942238539457321,
1054
+ "eval_max_distance": 1,
1055
+ "eval_mean_distance": 0,
1056
+ "eval_runtime": 0.8858,
1057
+ "eval_samples_per_second": 404.155,
1058
+ "eval_steps_per_second": 27.094,
1059
+ "step": 39960
1060
+ },
1061
+ {
1062
+ "epoch": 15.03,
1063
+ "learning_rate": 2.7590090090090094e-05,
1064
+ "loss": 0.0003,
1065
+ "step": 40050
1066
+ },
1067
+ {
1068
+ "epoch": 15.13,
1069
+ "learning_rate": 2.7033283283283286e-05,
1070
+ "loss": 0.0002,
1071
+ "step": 40317
1072
+ },
1073
+ {
1074
+ "epoch": 15.23,
1075
+ "learning_rate": 2.6476476476476474e-05,
1076
+ "loss": 0.0002,
1077
+ "step": 40584
1078
+ },
1079
+ {
1080
+ "epoch": 15.33,
1081
+ "learning_rate": 2.5919669669669673e-05,
1082
+ "loss": 0.0,
1083
+ "step": 40851
1084
+ },
1085
+ {
1086
+ "epoch": 15.43,
1087
+ "learning_rate": 2.536286286286286e-05,
1088
+ "loss": 0.0017,
1089
+ "step": 41118
1090
+ },
1091
+ {
1092
+ "epoch": 15.53,
1093
+ "learning_rate": 2.4806056056056057e-05,
1094
+ "loss": 0.0002,
1095
+ "step": 41385
1096
+ },
1097
+ {
1098
+ "epoch": 15.64,
1099
+ "learning_rate": 2.4249249249249252e-05,
1100
+ "loss": 0.0007,
1101
+ "step": 41652
1102
+ },
1103
+ {
1104
+ "epoch": 15.74,
1105
+ "learning_rate": 2.3692442442442444e-05,
1106
+ "loss": 0.0001,
1107
+ "step": 41919
1108
+ },
1109
+ {
1110
+ "epoch": 15.84,
1111
+ "learning_rate": 2.3135635635635636e-05,
1112
+ "loss": 0.0011,
1113
+ "step": 42186
1114
+ },
1115
+ {
1116
+ "epoch": 15.94,
1117
+ "learning_rate": 2.2578828828828828e-05,
1118
+ "loss": 0.0013,
1119
+ "step": 42453
1120
+ },
1121
+ {
1122
+ "epoch": 16.0,
1123
+ "eval_loss": 0.04424785450100899,
1124
+ "eval_max_distance": 1,
1125
+ "eval_mean_distance": 0,
1126
+ "eval_runtime": 0.8837,
1127
+ "eval_samples_per_second": 405.133,
1128
+ "eval_steps_per_second": 27.16,
1129
+ "step": 42624
1130
+ },
1131
+ {
1132
+ "epoch": 16.04,
1133
+ "learning_rate": 2.2022022022022024e-05,
1134
+ "loss": 0.0006,
1135
+ "step": 42720
1136
+ },
1137
+ {
1138
+ "epoch": 16.14,
1139
+ "learning_rate": 2.146521521521522e-05,
1140
+ "loss": 0.0013,
1141
+ "step": 42987
1142
+ },
1143
+ {
1144
+ "epoch": 16.24,
1145
+ "learning_rate": 2.0908408408408408e-05,
1146
+ "loss": 0.0005,
1147
+ "step": 43254
1148
+ },
1149
+ {
1150
+ "epoch": 16.34,
1151
+ "learning_rate": 2.0351601601601603e-05,
1152
+ "loss": 0.0008,
1153
+ "step": 43521
1154
+ },
1155
+ {
1156
+ "epoch": 16.44,
1157
+ "learning_rate": 1.9794794794794795e-05,
1158
+ "loss": 0.0003,
1159
+ "step": 43788
1160
+ },
1161
+ {
1162
+ "epoch": 16.54,
1163
+ "learning_rate": 1.9237987987987987e-05,
1164
+ "loss": 0.0001,
1165
+ "step": 44055
1166
+ },
1167
+ {
1168
+ "epoch": 16.64,
1169
+ "learning_rate": 1.8681181181181182e-05,
1170
+ "loss": 0.0005,
1171
+ "step": 44322
1172
+ },
1173
+ {
1174
+ "epoch": 16.74,
1175
+ "learning_rate": 1.8124374374374374e-05,
1176
+ "loss": 0.0026,
1177
+ "step": 44589
1178
+ },
1179
+ {
1180
+ "epoch": 16.84,
1181
+ "learning_rate": 1.756756756756757e-05,
1182
+ "loss": 0.0008,
1183
+ "step": 44856
1184
+ },
1185
+ {
1186
+ "epoch": 16.94,
1187
+ "learning_rate": 1.701076076076076e-05,
1188
+ "loss": 0.0002,
1189
+ "step": 45123
1190
+ },
1191
+ {
1192
+ "epoch": 17.0,
1193
+ "eval_loss": 0.04427039995789528,
1194
+ "eval_max_distance": 1,
1195
+ "eval_mean_distance": 0,
1196
+ "eval_runtime": 0.8885,
1197
+ "eval_samples_per_second": 402.91,
1198
+ "eval_steps_per_second": 27.011,
1199
+ "step": 45288
1200
+ },
1201
+ {
1202
+ "epoch": 17.04,
1203
+ "learning_rate": 1.6453953953953954e-05,
1204
+ "loss": 0.0001,
1205
+ "step": 45390
1206
+ },
1207
+ {
1208
+ "epoch": 17.14,
1209
+ "learning_rate": 1.589714714714715e-05,
1210
+ "loss": 0.0004,
1211
+ "step": 45657
1212
+ },
1213
+ {
1214
+ "epoch": 17.24,
1215
+ "learning_rate": 1.534034034034034e-05,
1216
+ "loss": 0.0001,
1217
+ "step": 45924
1218
+ },
1219
+ {
1220
+ "epoch": 17.34,
1221
+ "learning_rate": 1.4783533533533533e-05,
1222
+ "loss": 0.0011,
1223
+ "step": 46191
1224
+ },
1225
+ {
1226
+ "epoch": 17.44,
1227
+ "learning_rate": 1.4226726726726727e-05,
1228
+ "loss": 0.001,
1229
+ "step": 46458
1230
+ },
1231
+ {
1232
+ "epoch": 17.54,
1233
+ "learning_rate": 1.3669919919919922e-05,
1234
+ "loss": 0.0003,
1235
+ "step": 46725
1236
+ },
1237
+ {
1238
+ "epoch": 17.64,
1239
+ "learning_rate": 1.3113113113113112e-05,
1240
+ "loss": 0.0007,
1241
+ "step": 46992
1242
+ },
1243
+ {
1244
+ "epoch": 17.74,
1245
+ "learning_rate": 1.2556306306306306e-05,
1246
+ "loss": 0.0002,
1247
+ "step": 47259
1248
+ },
1249
+ {
1250
+ "epoch": 17.84,
1251
+ "learning_rate": 1.19994994994995e-05,
1252
+ "loss": 0.0024,
1253
+ "step": 47526
1254
+ },
1255
+ {
1256
+ "epoch": 17.94,
1257
+ "learning_rate": 1.1442692692692693e-05,
1258
+ "loss": 0.0013,
1259
+ "step": 47793
1260
+ },
1261
+ {
1262
+ "epoch": 18.0,
1263
+ "eval_loss": 0.038870543241500854,
1264
+ "eval_max_distance": 1,
1265
+ "eval_mean_distance": 0,
1266
+ "eval_runtime": 0.8826,
1267
+ "eval_samples_per_second": 405.621,
1268
+ "eval_steps_per_second": 27.192,
1269
+ "step": 47952
1270
+ },
1271
+ {
1272
+ "epoch": 18.04,
1273
+ "learning_rate": 1.0885885885885887e-05,
1274
+ "loss": 0.0005,
1275
+ "step": 48060
1276
+ },
1277
+ {
1278
+ "epoch": 18.14,
1279
+ "learning_rate": 1.0329079079079079e-05,
1280
+ "loss": 0.0,
1281
+ "step": 48327
1282
+ },
1283
+ {
1284
+ "epoch": 18.24,
1285
+ "learning_rate": 9.772272272272273e-06,
1286
+ "loss": 0.0005,
1287
+ "step": 48594
1288
+ },
1289
+ {
1290
+ "epoch": 18.34,
1291
+ "learning_rate": 9.215465465465466e-06,
1292
+ "loss": 0.0017,
1293
+ "step": 48861
1294
+ },
1295
+ {
1296
+ "epoch": 18.44,
1297
+ "learning_rate": 8.65865865865866e-06,
1298
+ "loss": 0.0003,
1299
+ "step": 49128
1300
+ },
1301
+ {
1302
+ "epoch": 18.54,
1303
+ "learning_rate": 8.101851851851852e-06,
1304
+ "loss": 0.0012,
1305
+ "step": 49395
1306
+ },
1307
+ {
1308
+ "epoch": 18.64,
1309
+ "learning_rate": 7.545045045045046e-06,
1310
+ "loss": 0.0,
1311
+ "step": 49662
1312
+ },
1313
+ {
1314
+ "epoch": 18.74,
1315
+ "learning_rate": 6.9882382382382385e-06,
1316
+ "loss": 0.0011,
1317
+ "step": 49929
1318
+ },
1319
+ {
1320
+ "epoch": 18.84,
1321
+ "learning_rate": 6.431431431431431e-06,
1322
+ "loss": 0.0013,
1323
+ "step": 50196
1324
+ },
1325
+ {
1326
+ "epoch": 18.94,
1327
+ "learning_rate": 5.874624624624625e-06,
1328
+ "loss": 0.0001,
1329
+ "step": 50463
1330
+ },
1331
+ {
1332
+ "epoch": 19.0,
1333
+ "eval_loss": 0.04122824966907501,
1334
+ "eval_max_distance": 1,
1335
+ "eval_mean_distance": 0,
1336
+ "eval_runtime": 0.8851,
1337
+ "eval_samples_per_second": 404.458,
1338
+ "eval_steps_per_second": 27.114,
1339
+ "step": 50616
1340
+ },
1341
+ {
1342
+ "epoch": 19.04,
1343
+ "learning_rate": 5.317817817817819e-06,
1344
+ "loss": 0.0012,
1345
+ "step": 50730
1346
+ },
1347
+ {
1348
+ "epoch": 19.14,
1349
+ "learning_rate": 4.7610110110110115e-06,
1350
+ "loss": 0.0001,
1351
+ "step": 50997
1352
+ },
1353
+ {
1354
+ "epoch": 19.24,
1355
+ "learning_rate": 4.204204204204204e-06,
1356
+ "loss": 0.0002,
1357
+ "step": 51264
1358
+ },
1359
+ {
1360
+ "epoch": 19.34,
1361
+ "learning_rate": 3.647397397397397e-06,
1362
+ "loss": 0.0,
1363
+ "step": 51531
1364
+ },
1365
+ {
1366
+ "epoch": 19.44,
1367
+ "learning_rate": 3.090590590590591e-06,
1368
+ "loss": 0.0003,
1369
+ "step": 51798
1370
+ },
1371
+ {
1372
+ "epoch": 19.54,
1373
+ "learning_rate": 2.533783783783784e-06,
1374
+ "loss": 0.0021,
1375
+ "step": 52065
1376
+ },
1377
+ {
1378
+ "epoch": 19.64,
1379
+ "learning_rate": 1.976976976976977e-06,
1380
+ "loss": 0.0,
1381
+ "step": 52332
1382
+ },
1383
+ {
1384
+ "epoch": 19.74,
1385
+ "learning_rate": 1.4201701701701704e-06,
1386
+ "loss": 0.0001,
1387
+ "step": 52599
1388
+ },
1389
+ {
1390
+ "epoch": 19.84,
1391
+ "learning_rate": 8.633633633633634e-07,
1392
+ "loss": 0.0009,
1393
+ "step": 52866
1394
+ },
1395
+ {
1396
+ "epoch": 19.94,
1397
+ "learning_rate": 3.0655655655655656e-07,
1398
+ "loss": 0.0001,
1399
+ "step": 53133
1400
+ },
1401
+ {
1402
+ "epoch": 20.0,
1403
+ "eval_loss": 0.0392613410949707,
1404
+ "eval_max_distance": 1,
1405
+ "eval_mean_distance": 0,
1406
+ "eval_runtime": 0.8819,
1407
+ "eval_samples_per_second": 405.923,
1408
+ "eval_steps_per_second": 27.213,
1409
+ "step": 53280
1410
+ },
1411
+ {
1412
+ "epoch": 20.0,
1413
+ "step": 53280,
1414
+ "total_flos": 3.004276018040832e+16,
1415
+ "train_loss": 0.5359516274340651,
1416
+ "train_runtime": 5327.8303,
1417
+ "train_samples_per_second": 149.956,
1418
+ "train_steps_per_second": 10.0
1419
+ }
1420
+ ],
1421
+ "logging_steps": 267,
1422
+ "max_steps": 53280,
1423
+ "num_train_epochs": 20,
1424
+ "save_steps": 533,
1425
+ "total_flos": 3.004276018040832e+16,
1426
+ "trial_name": null,
1427
+ "trial_params": null
1428
+ }
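trainer_state.json above carries the full training log. A small sketch (assuming a local copy of the file) that pulls the per-epoch evaluation loss out of log_history:

```python
# Extract the per-epoch eval loss from a local copy of trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']}: eval_loss={entry['eval_loss']:.4f}")
```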
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c96f8fdbf9fd73c249b43d3bffe70a51dda3b7a8b070e37888600604b3e00bcb
+ size 4664