nemik commited on
Commit
78ad67f
1 Parent(s): 56465d2

End of training

Browse files
README.md CHANGED
@@ -25,16 +25,16 @@ model-index:
25
  metrics:
26
  - name: Accuracy
27
  type: accuracy
28
- value: 0.9610619469026549
29
  - name: F1
30
  type: f1
31
- value: 0.9051724137931034
32
  - name: Precision
33
  type: precision
34
- value: 0.9012875536480687
35
  - name: Recall
36
  type: recall
37
- value: 0.9090909090909091
38
  ---
39
 
40
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -44,11 +44,11 @@ should probably proofread and complete it, then remove this comment. -->
44
 
45
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the webdataset dataset.
46
  It achieves the following results on the evaluation set:
47
- - Loss: 0.1110
48
- - Accuracy: 0.9611
49
- - F1: 0.9052
50
- - Precision: 0.9013
51
- - Recall: 0.9091
52
 
53
  ## Model description
54
 
 
25
  metrics:
26
  - name: Accuracy
27
  type: accuracy
28
+ value: 0.963716814159292
29
  - name: F1
30
  type: f1
31
+ value: 0.9118279569892475
32
  - name: Precision
33
  type: precision
34
+ value: 0.905982905982906
35
  - name: Recall
36
  type: recall
37
+ value: 0.9177489177489178
38
  ---
39
 
40
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
44
 
45
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the webdataset dataset.
46
  It achieves the following results on the evaluation set:
47
+ - Loss: 0.0965
48
+ - Accuracy: 0.9637
49
+ - F1: 0.9118
50
+ - Precision: 0.9060
51
+ - Recall: 0.9177
52
 
53
  ## Model description
54
 
all_results.json CHANGED
@@ -1,8 +1,16 @@
1
  {
2
- "epoch": 17.0,
3
- "total_flos": 1.333267779734618e+18,
4
- "train_loss": 0.16752693287151701,
5
- "train_runtime": 275.2566,
6
- "train_samples_per_second": 62.502,
7
- "train_steps_per_second": 3.953
 
 
 
 
 
 
 
 
8
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.963716814159292,
4
+ "eval_f1": 0.9118279569892475,
5
+ "eval_loss": 0.09645664691925049,
6
+ "eval_precision": 0.905982905982906,
7
+ "eval_recall": 0.9177489177489178,
8
+ "eval_runtime": 0.9893,
9
+ "eval_samples_per_second": 114.226,
10
+ "eval_steps_per_second": 15.163,
11
+ "total_flos": 2.352825493649326e+18,
12
+ "train_loss": 0.05003170374160012,
13
+ "train_runtime": 517.118,
14
+ "train_samples_per_second": 58.71,
15
+ "train_steps_per_second": 3.713
16
  }
eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.963716814159292,
4
+ "eval_f1": 0.9118279569892475,
5
+ "eval_loss": 0.09645664691925049,
6
+ "eval_precision": 0.905982905982906,
7
+ "eval_recall": 0.9177489177489178,
8
+ "eval_runtime": 0.9893,
9
+ "eval_samples_per_second": 114.226,
10
+ "eval_steps_per_second": 15.163
11
+ }
runs/Jul25_04-38-12_d85b605ca0e9/events.out.tfevents.1721882854.d85b605ca0e9.546.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1d05bca1c639e8d629fa575e09355eba8e8b6b2e13660194cfc1044aab08ad6
3
+ size 560
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 17.0,
3
- "total_flos": 1.333267779734618e+18,
4
- "train_loss": 0.16752693287151701,
5
- "train_runtime": 275.2566,
6
- "train_samples_per_second": 62.502,
7
- "train_steps_per_second": 3.953
8
  }
 
1
  {
2
+ "epoch": 30.0,
3
+ "total_flos": 2.352825493649326e+18,
4
+ "train_loss": 0.05003170374160012,
5
+ "train_runtime": 517.118,
6
+ "train_samples_per_second": 58.71,
7
+ "train_steps_per_second": 3.713
8
  }
trainer_state.json CHANGED
@@ -1,903 +1,1599 @@
1
  {
2
- "best_metric": 0.13913680613040924,
3
- "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned_v2024-7-24-frost/checkpoint-1000",
4
- "epoch": 17.0,
5
  "eval_steps": 100,
6
- "global_step": 1088,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.15625,
13
- "grad_norm": 0.388886958360672,
14
- "learning_rate": 1.834862385321101e-05,
15
- "loss": 0.6876,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.3125,
20
- "grad_norm": 0.43717288970947266,
21
- "learning_rate": 3.669724770642202e-05,
22
- "loss": 0.6556,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.46875,
27
- "grad_norm": 0.41058769822120667,
28
- "learning_rate": 5.504587155963303e-05,
29
- "loss": 0.5865,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.625,
34
- "grad_norm": 0.37728703022003174,
35
- "learning_rate": 7.339449541284404e-05,
36
- "loss": 0.5101,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.78125,
41
- "grad_norm": 0.3225114643573761,
42
- "learning_rate": 9.174311926605506e-05,
43
- "loss": 0.4621,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.9375,
48
- "grad_norm": 0.27789732813835144,
49
- "learning_rate": 0.00011009174311926606,
50
- "loss": 0.4339,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 1.09375,
55
- "grad_norm": 0.3035929501056671,
56
- "learning_rate": 0.00012844036697247707,
57
- "loss": 0.3999,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 1.25,
62
- "grad_norm": 0.29363128542900085,
63
- "learning_rate": 0.0001467889908256881,
64
- "loss": 0.3808,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 1.40625,
69
- "grad_norm": 0.2852456867694855,
70
- "learning_rate": 0.0001651376146788991,
71
- "loss": 0.356,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 1.5625,
76
- "grad_norm": 0.2548869550228119,
77
- "learning_rate": 0.00018348623853211012,
78
- "loss": 0.3281,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 1.5625,
83
- "eval_accuracy": 0.9008849557522124,
84
- "eval_f1": 0.6956521739130436,
85
- "eval_loss": 0.31767982244491577,
86
- "eval_precision": 0.8767123287671232,
87
- "eval_recall": 0.5765765765765766,
88
- "eval_runtime": 0.9054,
89
- "eval_samples_per_second": 124.801,
90
- "eval_steps_per_second": 16.566,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 1.71875,
95
- "grad_norm": 0.34990283846855164,
96
- "learning_rate": 0.00019979570990806946,
97
- "loss": 0.3169,
98
  "step": 110
99
  },
100
  {
101
  "epoch": 1.875,
102
- "grad_norm": 0.4318304657936096,
103
- "learning_rate": 0.00019775280898876404,
104
- "loss": 0.307,
105
  "step": 120
106
  },
107
  {
108
  "epoch": 2.03125,
109
- "grad_norm": 0.4618605077266693,
110
- "learning_rate": 0.00019570990806945865,
111
- "loss": 0.2988,
112
  "step": 130
113
  },
114
  {
115
  "epoch": 2.1875,
116
- "grad_norm": 0.35013139247894287,
117
- "learning_rate": 0.00019366700715015323,
118
- "loss": 0.2962,
119
  "step": 140
120
  },
121
  {
122
  "epoch": 2.34375,
123
- "grad_norm": 0.2871937155723572,
124
- "learning_rate": 0.00019162410623084782,
125
- "loss": 0.2718,
126
  "step": 150
127
  },
128
  {
129
  "epoch": 2.5,
130
- "grad_norm": 0.5972371101379395,
131
- "learning_rate": 0.0001895812053115424,
132
- "loss": 0.2724,
133
  "step": 160
134
  },
135
  {
136
  "epoch": 2.65625,
137
- "grad_norm": 0.49075496196746826,
138
- "learning_rate": 0.00018753830439223698,
139
- "loss": 0.2681,
140
  "step": 170
141
  },
142
  {
143
  "epoch": 2.8125,
144
- "grad_norm": 0.23726806044578552,
145
- "learning_rate": 0.00018549540347293156,
146
- "loss": 0.2527,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 2.96875,
151
- "grad_norm": 0.4829351305961609,
152
- "learning_rate": 0.00018345250255362615,
153
- "loss": 0.2484,
154
  "step": 190
155
  },
156
  {
157
  "epoch": 3.125,
158
- "grad_norm": 0.3746371567249298,
159
- "learning_rate": 0.00018140960163432076,
160
- "loss": 0.2532,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 3.125,
165
- "eval_accuracy": 0.9176991150442478,
166
- "eval_f1": 0.7832167832167832,
167
- "eval_loss": 0.24235816299915314,
168
- "eval_precision": 0.8115942028985508,
169
- "eval_recall": 0.7567567567567568,
170
- "eval_runtime": 0.8872,
171
- "eval_samples_per_second": 127.371,
172
- "eval_steps_per_second": 16.908,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 3.28125,
177
- "grad_norm": 0.5272416472434998,
178
- "learning_rate": 0.00017936670071501534,
179
- "loss": 0.2438,
180
  "step": 210
181
  },
182
  {
183
  "epoch": 3.4375,
184
- "grad_norm": 0.2519098222255707,
185
- "learning_rate": 0.00017732379979570992,
186
- "loss": 0.2241,
187
  "step": 220
188
  },
189
  {
190
  "epoch": 3.59375,
191
- "grad_norm": 0.22354206442832947,
192
- "learning_rate": 0.0001752808988764045,
193
- "loss": 0.223,
194
  "step": 230
195
  },
196
  {
197
  "epoch": 3.75,
198
- "grad_norm": 0.32517552375793457,
199
- "learning_rate": 0.0001732379979570991,
200
- "loss": 0.2414,
201
  "step": 240
202
  },
203
  {
204
  "epoch": 3.90625,
205
- "grad_norm": 0.32100722193717957,
206
- "learning_rate": 0.00017119509703779367,
207
- "loss": 0.2188,
208
  "step": 250
209
  },
210
  {
211
  "epoch": 4.0625,
212
- "grad_norm": 0.5668182969093323,
213
- "learning_rate": 0.00016915219611848828,
214
- "loss": 0.1933,
215
  "step": 260
216
  },
217
  {
218
  "epoch": 4.21875,
219
- "grad_norm": 0.42494627833366394,
220
- "learning_rate": 0.00016710929519918286,
221
- "loss": 0.2244,
222
  "step": 270
223
  },
224
  {
225
  "epoch": 4.375,
226
- "grad_norm": 0.4030027687549591,
227
- "learning_rate": 0.00016506639427987742,
228
- "loss": 0.1939,
229
  "step": 280
230
  },
231
  {
232
  "epoch": 4.53125,
233
- "grad_norm": 0.3662806749343872,
234
- "learning_rate": 0.000163023493360572,
235
- "loss": 0.2117,
236
  "step": 290
237
  },
238
  {
239
  "epoch": 4.6875,
240
- "grad_norm": 0.23639629781246185,
241
- "learning_rate": 0.0001609805924412666,
242
- "loss": 0.1762,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 4.6875,
247
- "eval_accuracy": 0.9407079646017699,
248
- "eval_f1": 0.8452655889145496,
249
- "eval_loss": 0.18491290509700775,
250
- "eval_precision": 0.8672985781990521,
251
- "eval_recall": 0.8243243243243243,
252
- "eval_runtime": 1.2006,
253
- "eval_samples_per_second": 94.116,
254
- "eval_steps_per_second": 12.493,
255
  "step": 300
256
  },
257
  {
258
  "epoch": 4.84375,
259
- "grad_norm": 0.26584741473197937,
260
- "learning_rate": 0.0001589376915219612,
261
- "loss": 0.1844,
262
  "step": 310
263
  },
264
  {
265
  "epoch": 5.0,
266
- "grad_norm": 0.44927456974983215,
267
- "learning_rate": 0.00015689479060265578,
268
- "loss": 0.2022,
269
  "step": 320
270
  },
271
  {
272
  "epoch": 5.15625,
273
- "grad_norm": 0.24379728734493256,
274
- "learning_rate": 0.00015485188968335036,
275
- "loss": 0.185,
276
  "step": 330
277
  },
278
  {
279
  "epoch": 5.3125,
280
- "grad_norm": 0.34985587000846863,
281
- "learning_rate": 0.00015280898876404494,
282
- "loss": 0.1567,
283
  "step": 340
284
  },
285
  {
286
  "epoch": 5.46875,
287
- "grad_norm": 0.325273334980011,
288
- "learning_rate": 0.00015076608784473953,
289
- "loss": 0.1658,
290
  "step": 350
291
  },
292
  {
293
  "epoch": 5.625,
294
- "grad_norm": 0.26209816336631775,
295
- "learning_rate": 0.00014872318692543413,
296
- "loss": 0.1739,
297
  "step": 360
298
  },
299
  {
300
  "epoch": 5.78125,
301
- "grad_norm": 0.32775962352752686,
302
- "learning_rate": 0.00014668028600612872,
303
- "loss": 0.1755,
304
  "step": 370
305
  },
306
  {
307
  "epoch": 5.9375,
308
- "grad_norm": 0.2825620174407959,
309
- "learning_rate": 0.0001446373850868233,
310
- "loss": 0.1735,
311
  "step": 380
312
  },
313
  {
314
  "epoch": 6.09375,
315
- "grad_norm": 0.18162801861763,
316
- "learning_rate": 0.00014259448416751788,
317
- "loss": 0.1694,
318
  "step": 390
319
  },
320
  {
321
  "epoch": 6.25,
322
- "grad_norm": 0.3598109483718872,
323
- "learning_rate": 0.00014055158324821247,
324
- "loss": 0.1525,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 6.25,
329
- "eval_accuracy": 0.9256637168141593,
330
- "eval_f1": 0.8055555555555555,
331
- "eval_loss": 0.18338988721370697,
332
- "eval_precision": 0.8285714285714286,
333
- "eval_recall": 0.7837837837837838,
334
- "eval_runtime": 0.8607,
335
- "eval_samples_per_second": 131.29,
336
- "eval_steps_per_second": 17.428,
337
  "step": 400
338
  },
339
  {
340
  "epoch": 6.40625,
341
- "grad_norm": 0.592587947845459,
342
- "learning_rate": 0.00013850868232890705,
343
- "loss": 0.1518,
344
  "step": 410
345
  },
346
  {
347
  "epoch": 6.5625,
348
- "grad_norm": 0.1838444322347641,
349
- "learning_rate": 0.00013646578140960163,
350
- "loss": 0.161,
351
  "step": 420
352
  },
353
  {
354
  "epoch": 6.71875,
355
- "grad_norm": 0.326659232378006,
356
- "learning_rate": 0.00013442288049029624,
357
- "loss": 0.1603,
358
  "step": 430
359
  },
360
  {
361
  "epoch": 6.875,
362
- "grad_norm": 0.45708832144737244,
363
- "learning_rate": 0.00013237997957099082,
364
- "loss": 0.1609,
365
  "step": 440
366
  },
367
  {
368
  "epoch": 7.03125,
369
- "grad_norm": 0.17103692889213562,
370
- "learning_rate": 0.0001303370786516854,
371
- "loss": 0.1263,
372
  "step": 450
373
  },
374
  {
375
  "epoch": 7.1875,
376
- "grad_norm": 0.40236711502075195,
377
- "learning_rate": 0.00012829417773238,
378
- "loss": 0.1386,
379
  "step": 460
380
  },
381
  {
382
  "epoch": 7.34375,
383
- "grad_norm": 0.5935817360877991,
384
- "learning_rate": 0.00012625127681307457,
385
- "loss": 0.1419,
386
  "step": 470
387
  },
388
  {
389
  "epoch": 7.5,
390
- "grad_norm": 0.42183375358581543,
391
- "learning_rate": 0.00012420837589376916,
392
- "loss": 0.1296,
393
  "step": 480
394
  },
395
  {
396
  "epoch": 7.65625,
397
- "grad_norm": 0.28589242696762085,
398
- "learning_rate": 0.00012216547497446374,
399
- "loss": 0.1298,
400
  "step": 490
401
  },
402
  {
403
  "epoch": 7.8125,
404
- "grad_norm": 0.14183469116687775,
405
- "learning_rate": 0.00012012257405515832,
406
- "loss": 0.1447,
407
  "step": 500
408
  },
409
  {
410
  "epoch": 7.8125,
411
- "eval_accuracy": 0.9415929203539823,
412
- "eval_f1": 0.8472222222222222,
413
- "eval_loss": 0.16118969023227692,
414
- "eval_precision": 0.8714285714285714,
415
- "eval_recall": 0.8243243243243243,
416
- "eval_runtime": 0.9028,
417
- "eval_samples_per_second": 125.167,
418
- "eval_steps_per_second": 16.615,
419
  "step": 500
420
  },
421
  {
422
  "epoch": 7.96875,
423
- "grad_norm": 0.4977594017982483,
424
- "learning_rate": 0.0001180796731358529,
425
- "loss": 0.1385,
426
  "step": 510
427
  },
428
  {
429
  "epoch": 8.125,
430
- "grad_norm": 0.6004766225814819,
431
- "learning_rate": 0.0001160367722165475,
432
- "loss": 0.1532,
433
  "step": 520
434
  },
435
  {
436
  "epoch": 8.28125,
437
- "grad_norm": 0.20785477757453918,
438
- "learning_rate": 0.00011399387129724208,
439
- "loss": 0.1329,
440
  "step": 530
441
  },
442
  {
443
  "epoch": 8.4375,
444
- "grad_norm": 0.30308064818382263,
445
- "learning_rate": 0.00011195097037793667,
446
- "loss": 0.1351,
447
  "step": 540
448
  },
449
  {
450
  "epoch": 8.59375,
451
- "grad_norm": 0.40658825635910034,
452
- "learning_rate": 0.00010990806945863126,
453
- "loss": 0.1289,
454
  "step": 550
455
  },
456
  {
457
  "epoch": 8.75,
458
- "grad_norm": 0.15297789871692657,
459
- "learning_rate": 0.00010786516853932584,
460
- "loss": 0.1353,
461
  "step": 560
462
  },
463
  {
464
  "epoch": 8.90625,
465
- "grad_norm": 0.2919471859931946,
466
- "learning_rate": 0.00010582226762002043,
467
- "loss": 0.1215,
468
  "step": 570
469
  },
470
  {
471
  "epoch": 9.0625,
472
- "grad_norm": 0.23703481256961823,
473
- "learning_rate": 0.00010377936670071502,
474
- "loss": 0.1107,
475
  "step": 580
476
  },
477
  {
478
  "epoch": 9.21875,
479
- "grad_norm": 0.15953165292739868,
480
- "learning_rate": 0.0001017364657814096,
481
- "loss": 0.1349,
482
  "step": 590
483
  },
484
  {
485
  "epoch": 9.375,
486
- "grad_norm": 0.2888895869255066,
487
- "learning_rate": 9.969356486210419e-05,
488
- "loss": 0.1114,
489
  "step": 600
490
  },
491
  {
492
  "epoch": 9.375,
493
- "eval_accuracy": 0.9433628318584071,
494
- "eval_f1": 0.8545454545454546,
495
- "eval_loss": 0.15215742588043213,
496
- "eval_precision": 0.8623853211009175,
497
- "eval_recall": 0.8468468468468469,
498
- "eval_runtime": 0.8895,
499
- "eval_samples_per_second": 127.032,
500
- "eval_steps_per_second": 16.863,
501
  "step": 600
502
  },
503
  {
504
  "epoch": 9.53125,
505
- "grad_norm": 0.23866873979568481,
506
- "learning_rate": 9.765066394279879e-05,
507
- "loss": 0.1097,
508
  "step": 610
509
  },
510
  {
511
  "epoch": 9.6875,
512
- "grad_norm": 0.2734706997871399,
513
- "learning_rate": 9.560776302349337e-05,
514
- "loss": 0.1096,
515
  "step": 620
516
  },
517
  {
518
  "epoch": 9.84375,
519
- "grad_norm": 0.5498181581497192,
520
- "learning_rate": 9.356486210418795e-05,
521
- "loss": 0.1039,
522
  "step": 630
523
  },
524
  {
525
  "epoch": 10.0,
526
- "grad_norm": 0.17813219130039215,
527
- "learning_rate": 9.152196118488255e-05,
528
- "loss": 0.1002,
529
  "step": 640
530
  },
531
  {
532
  "epoch": 10.15625,
533
- "grad_norm": 0.34687340259552,
534
- "learning_rate": 8.947906026557712e-05,
535
- "loss": 0.1164,
536
  "step": 650
537
  },
538
  {
539
  "epoch": 10.3125,
540
- "grad_norm": 0.18163448572158813,
541
- "learning_rate": 8.743615934627171e-05,
542
- "loss": 0.1038,
543
  "step": 660
544
  },
545
  {
546
  "epoch": 10.46875,
547
- "grad_norm": 0.2275228351354599,
548
- "learning_rate": 8.53932584269663e-05,
549
- "loss": 0.1165,
550
  "step": 670
551
  },
552
  {
553
  "epoch": 10.625,
554
- "grad_norm": 0.3568095862865448,
555
- "learning_rate": 8.335035750766088e-05,
556
- "loss": 0.0986,
557
  "step": 680
558
  },
559
  {
560
  "epoch": 10.78125,
561
- "grad_norm": 0.2227988988161087,
562
- "learning_rate": 8.130745658835548e-05,
563
- "loss": 0.1139,
564
  "step": 690
565
  },
566
  {
567
  "epoch": 10.9375,
568
- "grad_norm": 0.3007115423679352,
569
- "learning_rate": 7.926455566905006e-05,
570
- "loss": 0.1004,
571
  "step": 700
572
  },
573
  {
574
  "epoch": 10.9375,
575
- "eval_accuracy": 0.9451327433628318,
576
- "eval_f1": 0.8571428571428571,
577
- "eval_loss": 0.15245945751667023,
578
- "eval_precision": 0.8773584905660378,
579
- "eval_recall": 0.8378378378378378,
580
- "eval_runtime": 1.1963,
581
- "eval_samples_per_second": 94.461,
582
- "eval_steps_per_second": 12.539,
583
  "step": 700
584
  },
585
  {
586
  "epoch": 11.09375,
587
- "grad_norm": 0.2825964689254761,
588
- "learning_rate": 7.722165474974464e-05,
589
- "loss": 0.1071,
590
  "step": 710
591
  },
592
  {
593
  "epoch": 11.25,
594
- "grad_norm": 0.2866518795490265,
595
- "learning_rate": 7.517875383043922e-05,
596
- "loss": 0.0797,
597
  "step": 720
598
  },
599
  {
600
  "epoch": 11.40625,
601
- "grad_norm": 0.40209805965423584,
602
- "learning_rate": 7.313585291113382e-05,
603
- "loss": 0.0919,
604
  "step": 730
605
  },
606
  {
607
  "epoch": 11.5625,
608
- "grad_norm": 0.22357816994190216,
609
- "learning_rate": 7.10929519918284e-05,
610
- "loss": 0.0862,
611
  "step": 740
612
  },
613
  {
614
  "epoch": 11.71875,
615
- "grad_norm": 0.21823176741600037,
616
- "learning_rate": 6.905005107252299e-05,
617
- "loss": 0.0921,
618
  "step": 750
619
  },
620
  {
621
  "epoch": 11.875,
622
- "grad_norm": 0.37442561984062195,
623
- "learning_rate": 6.700715015321757e-05,
624
- "loss": 0.0926,
625
  "step": 760
626
  },
627
  {
628
  "epoch": 12.03125,
629
- "grad_norm": 0.10888390988111496,
630
- "learning_rate": 6.496424923391215e-05,
631
- "loss": 0.0758,
632
  "step": 770
633
  },
634
  {
635
  "epoch": 12.1875,
636
- "grad_norm": 0.30619969964027405,
637
- "learning_rate": 6.292134831460675e-05,
638
- "loss": 0.0824,
639
  "step": 780
640
  },
641
  {
642
  "epoch": 12.34375,
643
- "grad_norm": 0.2507191598415375,
644
- "learning_rate": 6.087844739530133e-05,
645
- "loss": 0.0815,
646
  "step": 790
647
  },
648
  {
649
  "epoch": 12.5,
650
- "grad_norm": 0.1795010268688202,
651
- "learning_rate": 5.883554647599592e-05,
652
- "loss": 0.0831,
653
  "step": 800
654
  },
655
  {
656
  "epoch": 12.5,
657
- "eval_accuracy": 0.9513274336283186,
658
- "eval_f1": 0.8741418764302059,
659
- "eval_loss": 0.14417380094528198,
660
- "eval_precision": 0.8883720930232558,
661
- "eval_recall": 0.8603603603603603,
662
- "eval_runtime": 0.9097,
663
- "eval_samples_per_second": 124.22,
664
- "eval_steps_per_second": 16.489,
665
  "step": 800
666
  },
667
  {
668
  "epoch": 12.65625,
669
- "grad_norm": 0.3208577334880829,
670
- "learning_rate": 5.67926455566905e-05,
671
- "loss": 0.0759,
672
  "step": 810
673
  },
674
  {
675
  "epoch": 12.8125,
676
- "grad_norm": 0.13223229348659515,
677
- "learning_rate": 5.474974463738509e-05,
678
- "loss": 0.087,
679
  "step": 820
680
  },
681
  {
682
  "epoch": 12.96875,
683
- "grad_norm": 0.25973808765411377,
684
- "learning_rate": 5.270684371807968e-05,
685
- "loss": 0.0919,
686
  "step": 830
687
  },
688
  {
689
  "epoch": 13.125,
690
- "grad_norm": 0.2930574417114258,
691
- "learning_rate": 5.0663942798774264e-05,
692
- "loss": 0.0826,
693
  "step": 840
694
  },
695
  {
696
  "epoch": 13.28125,
697
- "grad_norm": 0.28331613540649414,
698
- "learning_rate": 4.862104187946885e-05,
699
- "loss": 0.0703,
700
  "step": 850
701
  },
702
  {
703
  "epoch": 13.4375,
704
- "grad_norm": 0.5403579473495483,
705
- "learning_rate": 4.657814096016344e-05,
706
- "loss": 0.0877,
707
  "step": 860
708
  },
709
  {
710
  "epoch": 13.59375,
711
- "grad_norm": 0.24780268967151642,
712
- "learning_rate": 4.453524004085802e-05,
713
- "loss": 0.0783,
714
  "step": 870
715
  },
716
  {
717
  "epoch": 13.75,
718
- "grad_norm": 0.43411558866500854,
719
- "learning_rate": 4.24923391215526e-05,
720
- "loss": 0.0809,
721
  "step": 880
722
  },
723
  {
724
  "epoch": 13.90625,
725
- "grad_norm": 0.33748045563697815,
726
- "learning_rate": 4.044943820224719e-05,
727
- "loss": 0.0875,
728
  "step": 890
729
  },
730
  {
731
  "epoch": 14.0625,
732
- "grad_norm": 0.18760047852993011,
733
- "learning_rate": 3.840653728294178e-05,
734
- "loss": 0.0654,
735
  "step": 900
736
  },
737
  {
738
  "epoch": 14.0625,
739
- "eval_accuracy": 0.9495575221238938,
740
- "eval_f1": 0.8689655172413793,
741
- "eval_loss": 0.13779258728027344,
742
- "eval_precision": 0.8873239436619719,
743
- "eval_recall": 0.8513513513513513,
744
- "eval_runtime": 0.8895,
745
- "eval_samples_per_second": 127.038,
746
- "eval_steps_per_second": 16.863,
747
  "step": 900
748
  },
749
  {
750
  "epoch": 14.21875,
751
- "grad_norm": 0.15740624070167542,
752
- "learning_rate": 3.6363636363636364e-05,
753
- "loss": 0.0729,
754
  "step": 910
755
  },
756
  {
757
  "epoch": 14.375,
758
- "grad_norm": 0.21466964483261108,
759
- "learning_rate": 3.4320735444330954e-05,
760
- "loss": 0.0613,
761
  "step": 920
762
  },
763
  {
764
  "epoch": 14.53125,
765
- "grad_norm": 0.32680827379226685,
766
- "learning_rate": 3.2277834525025536e-05,
767
- "loss": 0.0636,
768
  "step": 930
769
  },
770
  {
771
  "epoch": 14.6875,
772
- "grad_norm": 0.14597666263580322,
773
- "learning_rate": 3.0234933605720123e-05,
774
- "loss": 0.0758,
775
  "step": 940
776
  },
777
  {
778
  "epoch": 14.84375,
779
- "grad_norm": 0.1691855937242508,
780
- "learning_rate": 2.819203268641471e-05,
781
- "loss": 0.0788,
782
  "step": 950
783
  },
784
  {
785
  "epoch": 15.0,
786
- "grad_norm": 0.6366788148880005,
787
- "learning_rate": 2.61491317671093e-05,
788
- "loss": 0.0775,
789
  "step": 960
790
  },
791
  {
792
  "epoch": 15.15625,
793
- "grad_norm": 0.4018935561180115,
794
- "learning_rate": 2.410623084780388e-05,
795
- "loss": 0.0616,
796
  "step": 970
797
  },
798
  {
799
  "epoch": 15.3125,
800
- "grad_norm": 0.2861553728580475,
801
- "learning_rate": 2.2063329928498467e-05,
802
- "loss": 0.0648,
803
  "step": 980
804
  },
805
  {
806
  "epoch": 15.46875,
807
- "grad_norm": 0.14629952609539032,
808
- "learning_rate": 2.0020429009193057e-05,
809
- "loss": 0.0687,
810
  "step": 990
811
  },
812
  {
813
  "epoch": 15.625,
814
- "grad_norm": 0.18827380239963531,
815
- "learning_rate": 1.797752808988764e-05,
816
- "loss": 0.0583,
817
  "step": 1000
818
  },
819
  {
820
  "epoch": 15.625,
821
- "eval_accuracy": 0.9530973451327434,
822
- "eval_f1": 0.8798185941043084,
823
- "eval_loss": 0.13913680613040924,
824
- "eval_precision": 0.8858447488584474,
825
- "eval_recall": 0.8738738738738738,
826
- "eval_runtime": 1.128,
827
- "eval_samples_per_second": 100.175,
828
- "eval_steps_per_second": 13.298,
829
  "step": 1000
830
  },
831
  {
832
  "epoch": 15.78125,
833
- "grad_norm": 0.4585842788219452,
834
- "learning_rate": 1.593462717058223e-05,
835
- "loss": 0.061,
836
  "step": 1010
837
  },
838
  {
839
  "epoch": 15.9375,
840
- "grad_norm": 0.20938844978809357,
841
- "learning_rate": 1.3891726251276812e-05,
842
- "loss": 0.0682,
843
  "step": 1020
844
  },
845
  {
846
  "epoch": 16.09375,
847
- "grad_norm": 0.1632617712020874,
848
- "learning_rate": 1.18488253319714e-05,
849
- "loss": 0.065,
850
  "step": 1030
851
  },
852
  {
853
  "epoch": 16.25,
854
- "grad_norm": 0.25956493616104126,
855
- "learning_rate": 9.805924412665986e-06,
856
- "loss": 0.0761,
857
  "step": 1040
858
  },
859
  {
860
  "epoch": 16.40625,
861
- "grad_norm": 0.17571082711219788,
862
- "learning_rate": 7.763023493360572e-06,
863
- "loss": 0.0676,
864
  "step": 1050
865
  },
866
  {
867
  "epoch": 16.5625,
868
- "grad_norm": 0.1871979534626007,
869
- "learning_rate": 5.720122574055159e-06,
870
- "loss": 0.0663,
871
  "step": 1060
872
  },
873
  {
874
  "epoch": 16.71875,
875
- "grad_norm": 0.18988612294197083,
876
- "learning_rate": 3.677221654749745e-06,
877
- "loss": 0.0608,
878
  "step": 1070
879
  },
880
  {
881
  "epoch": 16.875,
882
- "grad_norm": 0.18478451669216156,
883
- "learning_rate": 1.6343207354443311e-06,
884
- "loss": 0.0511,
885
  "step": 1080
886
  },
887
  {
888
- "epoch": 17.0,
889
- "step": 1088,
890
- "total_flos": 1.333267779734618e+18,
891
- "train_loss": 0.16752693287151701,
892
- "train_runtime": 275.2566,
893
- "train_samples_per_second": 62.502,
894
- "train_steps_per_second": 3.953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
  }
896
  ],
897
  "logging_steps": 10,
898
- "max_steps": 1088,
899
  "num_input_tokens_seen": 0,
900
- "num_train_epochs": 17,
901
  "save_steps": 500,
902
  "stateful_callbacks": {
903
  "TrainerControl": {
@@ -911,7 +1607,7 @@
911
  "attributes": {}
912
  }
913
  },
914
- "total_flos": 1.333267779734618e+18,
915
  "train_batch_size": 16,
916
  "trial_name": null,
917
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.09645664691925049,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned_v2024-7-24-frost/checkpoint-500",
4
+ "epoch": 30.0,
5
  "eval_steps": 100,
6
+ "global_step": 1920,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.15625,
13
+ "grad_norm": 0.3123115003108978,
14
+ "learning_rate": 1.0416666666666668e-05,
15
+ "loss": 0.0743,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.3125,
20
+ "grad_norm": 0.10650705546140671,
21
+ "learning_rate": 2.0833333333333336e-05,
22
+ "loss": 0.0699,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.46875,
27
+ "grad_norm": 0.40636828541755676,
28
+ "learning_rate": 3.125e-05,
29
+ "loss": 0.0732,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.625,
34
+ "grad_norm": 0.42912840843200684,
35
+ "learning_rate": 4.166666666666667e-05,
36
+ "loss": 0.075,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.78125,
41
+ "grad_norm": 0.3166373372077942,
42
+ "learning_rate": 5.208333333333334e-05,
43
+ "loss": 0.0695,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.9375,
48
+ "grad_norm": 0.8551476001739502,
49
+ "learning_rate": 6.25e-05,
50
+ "loss": 0.0883,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 1.09375,
55
+ "grad_norm": 0.18066875636577606,
56
+ "learning_rate": 7.291666666666667e-05,
57
+ "loss": 0.0699,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 1.25,
62
+ "grad_norm": 0.28325945138931274,
63
+ "learning_rate": 8.333333333333334e-05,
64
+ "loss": 0.0627,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 1.40625,
69
+ "grad_norm": 0.3701513409614563,
70
+ "learning_rate": 9.375e-05,
71
+ "loss": 0.0866,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 1.5625,
76
+ "grad_norm": 0.35587912797927856,
77
+ "learning_rate": 0.00010416666666666667,
78
+ "loss": 0.0728,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 1.5625,
83
+ "eval_accuracy": 0.984070796460177,
84
+ "eval_f1": 0.9606986899563319,
85
+ "eval_loss": 0.06593623757362366,
86
+ "eval_precision": 0.9691629955947136,
87
+ "eval_recall": 0.9523809523809523,
88
+ "eval_runtime": 0.9039,
89
+ "eval_samples_per_second": 125.009,
90
+ "eval_steps_per_second": 16.594,
91
  "step": 100
92
  },
93
  {
94
  "epoch": 1.71875,
95
+ "grad_norm": 0.23759329319000244,
96
+ "learning_rate": 0.00011458333333333333,
97
+ "loss": 0.0653,
98
  "step": 110
99
  },
100
  {
101
  "epoch": 1.875,
102
+ "grad_norm": 0.6092272996902466,
103
+ "learning_rate": 0.000125,
104
+ "loss": 0.1015,
105
  "step": 120
106
  },
107
  {
108
  "epoch": 2.03125,
109
+ "grad_norm": 0.15887708961963654,
110
+ "learning_rate": 0.0001354166666666667,
111
+ "loss": 0.09,
112
  "step": 130
113
  },
114
  {
115
  "epoch": 2.1875,
116
+ "grad_norm": 0.3399417996406555,
117
+ "learning_rate": 0.00014583333333333335,
118
+ "loss": 0.0847,
119
  "step": 140
120
  },
121
  {
122
  "epoch": 2.34375,
123
+ "grad_norm": 0.2599344253540039,
124
+ "learning_rate": 0.00015625,
125
+ "loss": 0.0722,
126
  "step": 150
127
  },
128
  {
129
  "epoch": 2.5,
130
+ "grad_norm": 0.20714014768600464,
131
+ "learning_rate": 0.0001666666666666667,
132
+ "loss": 0.0915,
133
  "step": 160
134
  },
135
  {
136
  "epoch": 2.65625,
137
+ "grad_norm": 0.7900287508964539,
138
+ "learning_rate": 0.00017708333333333335,
139
+ "loss": 0.1008,
140
  "step": 170
141
  },
142
  {
143
  "epoch": 2.8125,
144
+ "grad_norm": 0.23315797746181488,
145
+ "learning_rate": 0.0001875,
146
+ "loss": 0.1142,
147
  "step": 180
148
  },
149
  {
150
  "epoch": 2.96875,
151
+ "grad_norm": 1.258319616317749,
152
+ "learning_rate": 0.0001979166666666667,
153
+ "loss": 0.1027,
154
  "step": 190
155
  },
156
  {
157
  "epoch": 3.125,
158
+ "grad_norm": 1.0168662071228027,
159
+ "learning_rate": 0.0001990740740740741,
160
+ "loss": 0.0871,
161
  "step": 200
162
  },
163
  {
164
  "epoch": 3.125,
165
+ "eval_accuracy": 0.9566371681415929,
166
+ "eval_f1": 0.8941684665226782,
167
+ "eval_loss": 0.12436065077781677,
168
+ "eval_precision": 0.8922413793103449,
169
+ "eval_recall": 0.8961038961038961,
170
+ "eval_runtime": 0.8601,
171
+ "eval_samples_per_second": 131.383,
172
+ "eval_steps_per_second": 17.44,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 3.28125,
177
+ "grad_norm": 0.38566353917121887,
178
+ "learning_rate": 0.0001979166666666667,
179
+ "loss": 0.1166,
180
  "step": 210
181
  },
182
  {
183
  "epoch": 3.4375,
184
+ "grad_norm": 0.4687894284725189,
185
+ "learning_rate": 0.00019675925925925926,
186
+ "loss": 0.108,
187
  "step": 220
188
  },
189
  {
190
  "epoch": 3.59375,
191
+ "grad_norm": 0.5190223455429077,
192
+ "learning_rate": 0.00019560185185185186,
193
+ "loss": 0.0901,
194
  "step": 230
195
  },
196
  {
197
  "epoch": 3.75,
198
+ "grad_norm": 0.5094243288040161,
199
+ "learning_rate": 0.00019444444444444446,
200
+ "loss": 0.1144,
201
  "step": 240
202
  },
203
  {
204
  "epoch": 3.90625,
205
+ "grad_norm": 0.5921277403831482,
206
+ "learning_rate": 0.00019328703703703706,
207
+ "loss": 0.1196,
208
  "step": 250
209
  },
210
  {
211
  "epoch": 4.0625,
212
+ "grad_norm": 0.23840609192848206,
213
+ "learning_rate": 0.00019212962962962963,
214
+ "loss": 0.1122,
215
  "step": 260
216
  },
217
  {
218
  "epoch": 4.21875,
219
+ "grad_norm": 0.9276812672615051,
220
+ "learning_rate": 0.00019097222222222223,
221
+ "loss": 0.1147,
222
  "step": 270
223
  },
224
  {
225
  "epoch": 4.375,
226
+ "grad_norm": 0.7325614094734192,
227
+ "learning_rate": 0.00018981481481481483,
228
+ "loss": 0.1075,
229
  "step": 280
230
  },
231
  {
232
  "epoch": 4.53125,
233
+ "grad_norm": 0.5574468374252319,
234
+ "learning_rate": 0.00018865740740740743,
235
+ "loss": 0.0958,
236
  "step": 290
237
  },
238
  {
239
  "epoch": 4.6875,
240
+ "grad_norm": 0.3893429934978485,
241
+ "learning_rate": 0.0001875,
242
+ "loss": 0.0999,
243
  "step": 300
244
  },
245
  {
246
  "epoch": 4.6875,
247
+ "eval_accuracy": 0.963716814159292,
248
+ "eval_f1": 0.9125799573560768,
249
+ "eval_loss": 0.10427873581647873,
250
+ "eval_precision": 0.8991596638655462,
251
+ "eval_recall": 0.9264069264069265,
252
+ "eval_runtime": 0.8493,
253
+ "eval_samples_per_second": 133.049,
254
+ "eval_steps_per_second": 17.661,
255
  "step": 300
256
  },
257
  {
258
  "epoch": 4.84375,
259
+ "grad_norm": 0.49028488993644714,
260
+ "learning_rate": 0.0001863425925925926,
261
+ "loss": 0.1087,
262
  "step": 310
263
  },
264
  {
265
  "epoch": 5.0,
266
+ "grad_norm": 0.6510241627693176,
267
+ "learning_rate": 0.0001851851851851852,
268
+ "loss": 0.0949,
269
  "step": 320
270
  },
271
  {
272
  "epoch": 5.15625,
273
+ "grad_norm": 0.3244408667087555,
274
+ "learning_rate": 0.00018402777777777778,
275
+ "loss": 0.0957,
276
  "step": 330
277
  },
278
  {
279
  "epoch": 5.3125,
280
+ "grad_norm": 0.32894158363342285,
281
+ "learning_rate": 0.00018287037037037038,
282
+ "loss": 0.0761,
283
  "step": 340
284
  },
285
  {
286
  "epoch": 5.46875,
287
+ "grad_norm": 0.4168912470340729,
288
+ "learning_rate": 0.00018171296296296297,
289
+ "loss": 0.1014,
290
  "step": 350
291
  },
292
  {
293
  "epoch": 5.625,
294
+ "grad_norm": 0.30746978521347046,
295
+ "learning_rate": 0.00018055555555555557,
296
+ "loss": 0.0789,
297
  "step": 360
298
  },
299
  {
300
  "epoch": 5.78125,
301
+ "grad_norm": 0.3337535262107849,
302
+ "learning_rate": 0.00017939814814814815,
303
+ "loss": 0.0891,
304
  "step": 370
305
  },
306
  {
307
  "epoch": 5.9375,
308
+ "grad_norm": 0.2659320533275604,
309
+ "learning_rate": 0.00017824074074074075,
310
+ "loss": 0.0798,
311
  "step": 380
312
  },
313
  {
314
  "epoch": 6.09375,
315
+ "grad_norm": 0.28791913390159607,
316
+ "learning_rate": 0.00017708333333333335,
317
+ "loss": 0.0961,
318
  "step": 390
319
  },
320
  {
321
  "epoch": 6.25,
322
+ "grad_norm": 0.41803187131881714,
323
+ "learning_rate": 0.00017592592592592595,
324
+ "loss": 0.0743,
325
  "step": 400
326
  },
327
  {
328
  "epoch": 6.25,
329
+ "eval_accuracy": 0.9610619469026549,
330
+ "eval_f1": 0.9043478260869565,
331
+ "eval_loss": 0.10431604832410812,
332
+ "eval_precision": 0.9082969432314411,
333
+ "eval_recall": 0.9004329004329005,
334
+ "eval_runtime": 1.3126,
335
+ "eval_samples_per_second": 86.086,
336
+ "eval_steps_per_second": 11.427,
337
  "step": 400
338
  },
339
  {
340
  "epoch": 6.40625,
341
+ "grad_norm": 0.398034930229187,
342
+ "learning_rate": 0.00017476851851851852,
343
+ "loss": 0.0798,
344
  "step": 410
345
  },
346
  {
347
  "epoch": 6.5625,
348
+ "grad_norm": 0.533364474773407,
349
+ "learning_rate": 0.00017361111111111112,
350
+ "loss": 0.0808,
351
  "step": 420
352
  },
353
  {
354
  "epoch": 6.71875,
355
+ "grad_norm": 0.6189862489700317,
356
+ "learning_rate": 0.00017245370370370372,
357
+ "loss": 0.091,
358
  "step": 430
359
  },
360
  {
361
  "epoch": 6.875,
362
+ "grad_norm": 0.31593209505081177,
363
+ "learning_rate": 0.00017129629629629632,
364
+ "loss": 0.0729,
365
  "step": 440
366
  },
367
  {
368
  "epoch": 7.03125,
369
+ "grad_norm": 0.3167741000652313,
370
+ "learning_rate": 0.0001701388888888889,
371
+ "loss": 0.0796,
372
  "step": 450
373
  },
374
  {
375
  "epoch": 7.1875,
376
+ "grad_norm": 0.30901169776916504,
377
+ "learning_rate": 0.0001689814814814815,
378
+ "loss": 0.0867,
379
  "step": 460
380
  },
381
  {
382
  "epoch": 7.34375,
383
+ "grad_norm": 0.13378705084323883,
384
+ "learning_rate": 0.0001678240740740741,
385
+ "loss": 0.0701,
386
  "step": 470
387
  },
388
  {
389
  "epoch": 7.5,
390
+ "grad_norm": 0.15507709980010986,
391
+ "learning_rate": 0.0001666666666666667,
392
+ "loss": 0.0789,
393
  "step": 480
394
  },
395
  {
396
  "epoch": 7.65625,
397
+ "grad_norm": 0.21113860607147217,
398
+ "learning_rate": 0.00016550925925925926,
399
+ "loss": 0.0647,
400
  "step": 490
401
  },
402
  {
403
  "epoch": 7.8125,
404
+ "grad_norm": 0.15848499536514282,
405
+ "learning_rate": 0.00016435185185185186,
406
+ "loss": 0.0655,
407
  "step": 500
408
  },
409
  {
410
  "epoch": 7.8125,
411
+ "eval_accuracy": 0.963716814159292,
412
+ "eval_f1": 0.9118279569892475,
413
+ "eval_loss": 0.09645664691925049,
414
+ "eval_precision": 0.905982905982906,
415
+ "eval_recall": 0.9177489177489178,
416
+ "eval_runtime": 2.2304,
417
+ "eval_samples_per_second": 50.664,
418
+ "eval_steps_per_second": 6.725,
419
  "step": 500
420
  },
421
  {
422
  "epoch": 7.96875,
423
+ "grad_norm": 0.19086486101150513,
424
+ "learning_rate": 0.00016319444444444446,
425
+ "loss": 0.0502,
426
  "step": 510
427
  },
428
  {
429
  "epoch": 8.125,
430
+ "grad_norm": 0.4851354956626892,
431
+ "learning_rate": 0.00016203703703703706,
432
+ "loss": 0.0646,
433
  "step": 520
434
  },
435
  {
436
  "epoch": 8.28125,
437
+ "grad_norm": 0.43803560733795166,
438
+ "learning_rate": 0.00016087962962962963,
439
+ "loss": 0.0668,
440
  "step": 530
441
  },
442
  {
443
  "epoch": 8.4375,
444
+ "grad_norm": 0.26552197337150574,
445
+ "learning_rate": 0.00015972222222222223,
446
+ "loss": 0.0549,
447
  "step": 540
448
  },
449
  {
450
  "epoch": 8.59375,
451
+ "grad_norm": 0.18909405171871185,
452
+ "learning_rate": 0.00015856481481481483,
453
+ "loss": 0.07,
454
  "step": 550
455
  },
456
  {
457
  "epoch": 8.75,
458
+ "grad_norm": 0.2485276311635971,
459
+ "learning_rate": 0.00015740740740740743,
460
+ "loss": 0.0525,
461
  "step": 560
462
  },
463
  {
464
  "epoch": 8.90625,
465
+ "grad_norm": 0.4424391984939575,
466
+ "learning_rate": 0.00015625,
467
+ "loss": 0.0775,
468
  "step": 570
469
  },
470
  {
471
  "epoch": 9.0625,
472
+ "grad_norm": 0.2584344446659088,
473
+ "learning_rate": 0.0001550925925925926,
474
+ "loss": 0.064,
475
  "step": 580
476
  },
477
  {
478
  "epoch": 9.21875,
479
+ "grad_norm": 0.6115286946296692,
480
+ "learning_rate": 0.0001539351851851852,
481
+ "loss": 0.0599,
482
  "step": 590
483
  },
484
  {
485
  "epoch": 9.375,
486
+ "grad_norm": 0.5155323147773743,
487
+ "learning_rate": 0.00015277777777777777,
488
+ "loss": 0.0559,
489
  "step": 600
490
  },
491
  {
492
  "epoch": 9.375,
493
+ "eval_accuracy": 0.9619469026548673,
494
+ "eval_f1": 0.9087048832271762,
495
+ "eval_loss": 0.10384609550237656,
496
+ "eval_precision": 0.8916666666666667,
497
+ "eval_recall": 0.9264069264069265,
498
+ "eval_runtime": 2.2545,
499
+ "eval_samples_per_second": 50.123,
500
+ "eval_steps_per_second": 6.653,
501
  "step": 600
502
  },
503
  {
504
  "epoch": 9.53125,
505
+ "grad_norm": 0.40844494104385376,
506
+ "learning_rate": 0.00015162037037037037,
507
+ "loss": 0.072,
508
  "step": 610
509
  },
510
  {
511
  "epoch": 9.6875,
512
+ "grad_norm": 0.2091340720653534,
513
+ "learning_rate": 0.00015046296296296297,
514
+ "loss": 0.0513,
515
  "step": 620
516
  },
517
  {
518
  "epoch": 9.84375,
519
+ "grad_norm": 0.22117160260677338,
520
+ "learning_rate": 0.00014930555555555557,
521
+ "loss": 0.0623,
522
  "step": 630
523
  },
524
  {
525
  "epoch": 10.0,
526
+ "grad_norm": 0.31236401200294495,
527
+ "learning_rate": 0.00014814814814814815,
528
+ "loss": 0.0534,
529
  "step": 640
530
  },
531
  {
532
  "epoch": 10.15625,
533
+ "grad_norm": 0.31281912326812744,
534
+ "learning_rate": 0.00014699074074074075,
535
+ "loss": 0.0644,
536
  "step": 650
537
  },
538
  {
539
  "epoch": 10.3125,
540
+ "grad_norm": 0.5201927423477173,
541
+ "learning_rate": 0.00014583333333333335,
542
+ "loss": 0.057,
543
  "step": 660
544
  },
545
  {
546
  "epoch": 10.46875,
547
+ "grad_norm": 0.2596763074398041,
548
+ "learning_rate": 0.00014467592592592594,
549
+ "loss": 0.0542,
550
  "step": 670
551
  },
552
  {
553
  "epoch": 10.625,
554
+ "grad_norm": 0.3063810467720032,
555
+ "learning_rate": 0.00014351851851851852,
556
+ "loss": 0.0389,
557
  "step": 680
558
  },
559
  {
560
  "epoch": 10.78125,
561
+ "grad_norm": 0.48713332414627075,
562
+ "learning_rate": 0.00014236111111111112,
563
+ "loss": 0.0742,
564
  "step": 690
565
  },
566
  {
567
  "epoch": 10.9375,
568
+ "grad_norm": 0.21316884458065033,
569
+ "learning_rate": 0.00014120370370370372,
570
+ "loss": 0.0517,
571
  "step": 700
572
  },
573
  {
574
  "epoch": 10.9375,
575
+ "eval_accuracy": 0.9584070796460177,
576
+ "eval_f1": 0.8997867803837952,
577
+ "eval_loss": 0.09719711542129517,
578
+ "eval_precision": 0.8865546218487395,
579
+ "eval_recall": 0.9134199134199135,
580
+ "eval_runtime": 0.8764,
581
+ "eval_samples_per_second": 128.937,
582
+ "eval_steps_per_second": 17.115,
583
  "step": 700
584
  },
585
  {
586
  "epoch": 11.09375,
587
+ "grad_norm": 0.9216361045837402,
588
+ "learning_rate": 0.00014004629629629632,
589
+ "loss": 0.0623,
590
  "step": 710
591
  },
592
  {
593
  "epoch": 11.25,
594
+ "grad_norm": 0.31130528450012207,
595
+ "learning_rate": 0.0001388888888888889,
596
+ "loss": 0.0641,
597
  "step": 720
598
  },
599
  {
600
  "epoch": 11.40625,
601
+ "grad_norm": 1.03948974609375,
602
+ "learning_rate": 0.0001377314814814815,
603
+ "loss": 0.0594,
604
  "step": 730
605
  },
606
  {
607
  "epoch": 11.5625,
608
+ "grad_norm": 0.12757237255573273,
609
+ "learning_rate": 0.0001365740740740741,
610
+ "loss": 0.0572,
611
  "step": 740
612
  },
613
  {
614
  "epoch": 11.71875,
615
+ "grad_norm": 0.25488346815109253,
616
+ "learning_rate": 0.0001354166666666667,
617
+ "loss": 0.0533,
618
  "step": 750
619
  },
620
  {
621
  "epoch": 11.875,
622
+ "grad_norm": 0.2517576813697815,
623
+ "learning_rate": 0.00013425925925925926,
624
+ "loss": 0.0557,
625
  "step": 760
626
  },
627
  {
628
  "epoch": 12.03125,
629
+ "grad_norm": 0.14332328736782074,
630
+ "learning_rate": 0.00013310185185185186,
631
+ "loss": 0.0433,
632
  "step": 770
633
  },
634
  {
635
  "epoch": 12.1875,
636
+ "grad_norm": 0.7062014937400818,
637
+ "learning_rate": 0.00013194444444444446,
638
+ "loss": 0.0569,
639
  "step": 780
640
  },
641
  {
642
  "epoch": 12.34375,
643
+ "grad_norm": 0.727057158946991,
644
+ "learning_rate": 0.00013078703703703706,
645
+ "loss": 0.0443,
646
  "step": 790
647
  },
648
  {
649
  "epoch": 12.5,
650
+ "grad_norm": 0.17331984639167786,
651
+ "learning_rate": 0.00012962962962962963,
652
+ "loss": 0.0407,
653
  "step": 800
654
  },
655
  {
656
  "epoch": 12.5,
657
+ "eval_accuracy": 0.963716814159292,
658
+ "eval_f1": 0.9110629067245118,
659
+ "eval_loss": 0.11198227852582932,
660
+ "eval_precision": 0.9130434782608695,
661
+ "eval_recall": 0.9090909090909091,
662
+ "eval_runtime": 0.9019,
663
+ "eval_samples_per_second": 125.294,
664
+ "eval_steps_per_second": 16.632,
665
  "step": 800
666
  },
667
  {
668
  "epoch": 12.65625,
669
+ "grad_norm": 0.104576975107193,
670
+ "learning_rate": 0.00012847222222222223,
671
+ "loss": 0.0465,
672
  "step": 810
673
  },
674
  {
675
  "epoch": 12.8125,
676
+ "grad_norm": 0.6552168726921082,
677
+ "learning_rate": 0.00012731481481481483,
678
+ "loss": 0.0536,
679
  "step": 820
680
  },
681
  {
682
  "epoch": 12.96875,
683
+ "grad_norm": 0.39452189207077026,
684
+ "learning_rate": 0.00012615740740740743,
685
+ "loss": 0.0514,
686
  "step": 830
687
  },
688
  {
689
  "epoch": 13.125,
690
+ "grad_norm": 0.16756129264831543,
691
+ "learning_rate": 0.000125,
692
+ "loss": 0.0417,
693
  "step": 840
694
  },
695
  {
696
  "epoch": 13.28125,
697
+ "grad_norm": 0.13866697251796722,
698
+ "learning_rate": 0.00012384259259259258,
699
+ "loss": 0.0419,
700
  "step": 850
701
  },
702
  {
703
  "epoch": 13.4375,
704
+ "grad_norm": 0.9053749442100525,
705
+ "learning_rate": 0.0001226851851851852,
706
+ "loss": 0.0548,
707
  "step": 860
708
  },
709
  {
710
  "epoch": 13.59375,
711
+ "grad_norm": 0.43149927258491516,
712
+ "learning_rate": 0.00012152777777777777,
713
+ "loss": 0.0503,
714
  "step": 870
715
  },
716
  {
717
  "epoch": 13.75,
718
+ "grad_norm": 0.49532395601272583,
719
+ "learning_rate": 0.00012037037037037037,
720
+ "loss": 0.0476,
721
  "step": 880
722
  },
723
  {
724
  "epoch": 13.90625,
725
+ "grad_norm": 0.12025842815637589,
726
+ "learning_rate": 0.00011921296296296296,
727
+ "loss": 0.049,
728
  "step": 890
729
  },
730
  {
731
  "epoch": 14.0625,
732
+ "grad_norm": 0.8570975065231323,
733
+ "learning_rate": 0.00011805555555555556,
734
+ "loss": 0.0513,
735
  "step": 900
736
  },
737
  {
738
  "epoch": 14.0625,
739
+ "eval_accuracy": 0.9557522123893806,
740
+ "eval_f1": 0.8893805309734513,
741
+ "eval_loss": 0.1092919260263443,
742
+ "eval_precision": 0.9095022624434389,
743
+ "eval_recall": 0.8701298701298701,
744
+ "eval_runtime": 1.2237,
745
+ "eval_samples_per_second": 92.344,
746
+ "eval_steps_per_second": 12.258,
747
  "step": 900
748
  },
749
  {
750
  "epoch": 14.21875,
751
+ "grad_norm": 0.4120664596557617,
752
+ "learning_rate": 0.00011689814814814815,
753
+ "loss": 0.0552,
754
  "step": 910
755
  },
756
  {
757
  "epoch": 14.375,
758
+ "grad_norm": 0.24265483021736145,
759
+ "learning_rate": 0.00011574074074074075,
760
+ "loss": 0.0434,
761
  "step": 920
762
  },
763
  {
764
  "epoch": 14.53125,
765
+ "grad_norm": 0.23618777096271515,
766
+ "learning_rate": 0.00011458333333333333,
767
+ "loss": 0.0482,
768
  "step": 930
769
  },
770
  {
771
  "epoch": 14.6875,
772
+ "grad_norm": 0.1366555392742157,
773
+ "learning_rate": 0.00011342592592592593,
774
+ "loss": 0.045,
775
  "step": 940
776
  },
777
  {
778
  "epoch": 14.84375,
779
+ "grad_norm": 0.1841152310371399,
780
+ "learning_rate": 0.00011226851851851852,
781
+ "loss": 0.0539,
782
  "step": 950
783
  },
784
  {
785
  "epoch": 15.0,
786
+ "grad_norm": 0.6849538087844849,
787
+ "learning_rate": 0.00011111111111111112,
788
+ "loss": 0.0363,
789
  "step": 960
790
  },
791
  {
792
  "epoch": 15.15625,
793
+ "grad_norm": 0.5442699790000916,
794
+ "learning_rate": 0.0001099537037037037,
795
+ "loss": 0.0372,
796
  "step": 970
797
  },
798
  {
799
  "epoch": 15.3125,
800
+ "grad_norm": 0.3825988173484802,
801
+ "learning_rate": 0.0001087962962962963,
802
+ "loss": 0.0405,
803
  "step": 980
804
  },
805
  {
806
  "epoch": 15.46875,
807
+ "grad_norm": 0.0459093414247036,
808
+ "learning_rate": 0.00010763888888888889,
809
+ "loss": 0.0386,
810
  "step": 990
811
  },
812
  {
813
  "epoch": 15.625,
814
+ "grad_norm": 0.2602522373199463,
815
+ "learning_rate": 0.00010648148148148149,
816
+ "loss": 0.0378,
817
  "step": 1000
818
  },
819
  {
820
  "epoch": 15.625,
821
+ "eval_accuracy": 0.9548672566371681,
822
+ "eval_f1": 0.8888888888888888,
823
+ "eval_loss": 0.11969945579767227,
824
+ "eval_precision": 0.8947368421052632,
825
+ "eval_recall": 0.8831168831168831,
826
+ "eval_runtime": 0.8768,
827
+ "eval_samples_per_second": 128.871,
828
+ "eval_steps_per_second": 17.107,
829
  "step": 1000
830
  },
831
  {
832
  "epoch": 15.78125,
833
+ "grad_norm": 0.07926033437252045,
834
+ "learning_rate": 0.00010532407407407407,
835
+ "loss": 0.0419,
836
  "step": 1010
837
  },
838
  {
839
  "epoch": 15.9375,
840
+ "grad_norm": 0.2084084302186966,
841
+ "learning_rate": 0.00010416666666666667,
842
+ "loss": 0.0336,
843
  "step": 1020
844
  },
845
  {
846
  "epoch": 16.09375,
847
+ "grad_norm": 0.11587415635585785,
848
+ "learning_rate": 0.00010300925925925926,
849
+ "loss": 0.0293,
850
  "step": 1030
851
  },
852
  {
853
  "epoch": 16.25,
854
+ "grad_norm": 0.4128260314464569,
855
+ "learning_rate": 0.00010185185185185186,
856
+ "loss": 0.0346,
857
  "step": 1040
858
  },
859
  {
860
  "epoch": 16.40625,
861
+ "grad_norm": 0.2051563411951065,
862
+ "learning_rate": 0.00010069444444444445,
863
+ "loss": 0.0404,
864
  "step": 1050
865
  },
866
  {
867
  "epoch": 16.5625,
868
+ "grad_norm": 1.0257600545883179,
869
+ "learning_rate": 9.953703703703704e-05,
870
+ "loss": 0.0521,
871
  "step": 1060
872
  },
873
  {
874
  "epoch": 16.71875,
875
+ "grad_norm": 0.13610199093818665,
876
+ "learning_rate": 9.837962962962963e-05,
877
+ "loss": 0.0513,
878
  "step": 1070
879
  },
880
  {
881
  "epoch": 16.875,
882
+ "grad_norm": 0.5424107909202576,
883
+ "learning_rate": 9.722222222222223e-05,
884
+ "loss": 0.0662,
885
  "step": 1080
886
  },
887
  {
888
+ "epoch": 17.03125,
889
+ "grad_norm": 0.1417212188243866,
890
+ "learning_rate": 9.606481481481482e-05,
891
+ "loss": 0.0364,
892
+ "step": 1090
893
+ },
894
+ {
895
+ "epoch": 17.1875,
896
+ "grad_norm": 0.15864621102809906,
897
+ "learning_rate": 9.490740740740742e-05,
898
+ "loss": 0.0487,
899
+ "step": 1100
900
+ },
901
+ {
902
+ "epoch": 17.1875,
903
+ "eval_accuracy": 0.9646017699115044,
904
+ "eval_f1": 0.9137931034482759,
905
+ "eval_loss": 0.09552007168531418,
906
+ "eval_precision": 0.9098712446351931,
907
+ "eval_recall": 0.9177489177489178,
908
+ "eval_runtime": 0.8603,
909
+ "eval_samples_per_second": 131.352,
910
+ "eval_steps_per_second": 17.436,
911
+ "step": 1100
912
+ },
913
+ {
914
+ "epoch": 17.34375,
915
+ "grad_norm": 0.5382766127586365,
916
+ "learning_rate": 9.375e-05,
917
+ "loss": 0.0315,
918
+ "step": 1110
919
+ },
920
+ {
921
+ "epoch": 17.5,
922
+ "grad_norm": 0.16078180074691772,
923
+ "learning_rate": 9.25925925925926e-05,
924
+ "loss": 0.0245,
925
+ "step": 1120
926
+ },
927
+ {
928
+ "epoch": 17.65625,
929
+ "grad_norm": 0.05187100172042847,
930
+ "learning_rate": 9.143518518518519e-05,
931
+ "loss": 0.0342,
932
+ "step": 1130
933
+ },
934
+ {
935
+ "epoch": 17.8125,
936
+ "grad_norm": 0.4016551077365875,
937
+ "learning_rate": 9.027777777777779e-05,
938
+ "loss": 0.0285,
939
+ "step": 1140
940
+ },
941
+ {
942
+ "epoch": 17.96875,
943
+ "grad_norm": 0.22832362353801727,
944
+ "learning_rate": 8.912037037037037e-05,
945
+ "loss": 0.0379,
946
+ "step": 1150
947
+ },
948
+ {
949
+ "epoch": 18.125,
950
+ "grad_norm": 1.3720444440841675,
951
+ "learning_rate": 8.796296296296297e-05,
952
+ "loss": 0.0369,
953
+ "step": 1160
954
+ },
955
+ {
956
+ "epoch": 18.28125,
957
+ "grad_norm": 0.23146755993366241,
958
+ "learning_rate": 8.680555555555556e-05,
959
+ "loss": 0.0343,
960
+ "step": 1170
961
+ },
962
+ {
963
+ "epoch": 18.4375,
964
+ "grad_norm": 0.2672041356563568,
965
+ "learning_rate": 8.564814814814816e-05,
966
+ "loss": 0.0353,
967
+ "step": 1180
968
+ },
969
+ {
970
+ "epoch": 18.59375,
971
+ "grad_norm": 0.17212288081645966,
972
+ "learning_rate": 8.449074074074074e-05,
973
+ "loss": 0.0315,
974
+ "step": 1190
975
+ },
976
+ {
977
+ "epoch": 18.75,
978
+ "grad_norm": 0.12989170849323273,
979
+ "learning_rate": 8.333333333333334e-05,
980
+ "loss": 0.0272,
981
+ "step": 1200
982
+ },
983
+ {
984
+ "epoch": 18.75,
985
+ "eval_accuracy": 0.9566371681415929,
986
+ "eval_f1": 0.8927789934354485,
987
+ "eval_loss": 0.10875095427036285,
988
+ "eval_precision": 0.9026548672566371,
989
+ "eval_recall": 0.8831168831168831,
990
+ "eval_runtime": 1.2152,
991
+ "eval_samples_per_second": 92.985,
992
+ "eval_steps_per_second": 12.343,
993
+ "step": 1200
994
+ },
995
+ {
996
+ "epoch": 18.90625,
997
+ "grad_norm": 0.15251386165618896,
998
+ "learning_rate": 8.217592592592593e-05,
999
+ "loss": 0.0353,
1000
+ "step": 1210
1001
+ },
1002
+ {
1003
+ "epoch": 19.0625,
1004
+ "grad_norm": 0.1500956416130066,
1005
+ "learning_rate": 8.101851851851853e-05,
1006
+ "loss": 0.0258,
1007
+ "step": 1220
1008
+ },
1009
+ {
1010
+ "epoch": 19.21875,
1011
+ "grad_norm": 0.16236737370491028,
1012
+ "learning_rate": 7.986111111111112e-05,
1013
+ "loss": 0.0318,
1014
+ "step": 1230
1015
+ },
1016
+ {
1017
+ "epoch": 19.375,
1018
+ "grad_norm": 0.5188699960708618,
1019
+ "learning_rate": 7.870370370370372e-05,
1020
+ "loss": 0.0388,
1021
+ "step": 1240
1022
+ },
1023
+ {
1024
+ "epoch": 19.53125,
1025
+ "grad_norm": 0.14171747863292694,
1026
+ "learning_rate": 7.75462962962963e-05,
1027
+ "loss": 0.0371,
1028
+ "step": 1250
1029
+ },
1030
+ {
1031
+ "epoch": 19.6875,
1032
+ "grad_norm": 0.355496883392334,
1033
+ "learning_rate": 7.638888888888889e-05,
1034
+ "loss": 0.0278,
1035
+ "step": 1260
1036
+ },
1037
+ {
1038
+ "epoch": 19.84375,
1039
+ "grad_norm": 0.30447283387184143,
1040
+ "learning_rate": 7.523148148148149e-05,
1041
+ "loss": 0.0307,
1042
+ "step": 1270
1043
+ },
1044
+ {
1045
+ "epoch": 20.0,
1046
+ "grad_norm": 1.8779629468917847,
1047
+ "learning_rate": 7.407407407407407e-05,
1048
+ "loss": 0.0378,
1049
+ "step": 1280
1050
+ },
1051
+ {
1052
+ "epoch": 20.15625,
1053
+ "grad_norm": 0.24115116894245148,
1054
+ "learning_rate": 7.291666666666667e-05,
1055
+ "loss": 0.0234,
1056
+ "step": 1290
1057
+ },
1058
+ {
1059
+ "epoch": 20.3125,
1060
+ "grad_norm": 0.07465353608131409,
1061
+ "learning_rate": 7.175925925925926e-05,
1062
+ "loss": 0.0241,
1063
+ "step": 1300
1064
+ },
1065
+ {
1066
+ "epoch": 20.3125,
1067
+ "eval_accuracy": 0.963716814159292,
1068
+ "eval_f1": 0.9114470842332613,
1069
+ "eval_loss": 0.09792255610227585,
1070
+ "eval_precision": 0.9094827586206896,
1071
+ "eval_recall": 0.9134199134199135,
1072
+ "eval_runtime": 0.8811,
1073
+ "eval_samples_per_second": 128.253,
1074
+ "eval_steps_per_second": 17.025,
1075
+ "step": 1300
1076
+ },
1077
+ {
1078
+ "epoch": 20.46875,
1079
+ "grad_norm": 0.44642359018325806,
1080
+ "learning_rate": 7.060185185185186e-05,
1081
+ "loss": 0.0271,
1082
+ "step": 1310
1083
+ },
1084
+ {
1085
+ "epoch": 20.625,
1086
+ "grad_norm": 0.16677480936050415,
1087
+ "learning_rate": 6.944444444444444e-05,
1088
+ "loss": 0.0199,
1089
+ "step": 1320
1090
+ },
1091
+ {
1092
+ "epoch": 20.78125,
1093
+ "grad_norm": 0.05306961014866829,
1094
+ "learning_rate": 6.828703703703704e-05,
1095
+ "loss": 0.028,
1096
+ "step": 1330
1097
+ },
1098
+ {
1099
+ "epoch": 20.9375,
1100
+ "grad_norm": 0.7962948679924011,
1101
+ "learning_rate": 6.712962962962963e-05,
1102
+ "loss": 0.0238,
1103
+ "step": 1340
1104
+ },
1105
+ {
1106
+ "epoch": 21.09375,
1107
+ "grad_norm": 0.19253899157047272,
1108
+ "learning_rate": 6.597222222222223e-05,
1109
+ "loss": 0.0367,
1110
+ "step": 1350
1111
+ },
1112
+ {
1113
+ "epoch": 21.25,
1114
+ "grad_norm": 0.22666649520397186,
1115
+ "learning_rate": 6.481481481481482e-05,
1116
+ "loss": 0.021,
1117
+ "step": 1360
1118
+ },
1119
+ {
1120
+ "epoch": 21.40625,
1121
+ "grad_norm": 0.09341959655284882,
1122
+ "learning_rate": 6.365740740740742e-05,
1123
+ "loss": 0.0217,
1124
+ "step": 1370
1125
+ },
1126
+ {
1127
+ "epoch": 21.5625,
1128
+ "grad_norm": 0.40562504529953003,
1129
+ "learning_rate": 6.25e-05,
1130
+ "loss": 0.0268,
1131
+ "step": 1380
1132
+ },
1133
+ {
1134
+ "epoch": 21.71875,
1135
+ "grad_norm": 0.20743058621883392,
1136
+ "learning_rate": 6.13425925925926e-05,
1137
+ "loss": 0.0394,
1138
+ "step": 1390
1139
+ },
1140
+ {
1141
+ "epoch": 21.875,
1142
+ "grad_norm": 0.16062897443771362,
1143
+ "learning_rate": 6.018518518518519e-05,
1144
+ "loss": 0.0311,
1145
+ "step": 1400
1146
+ },
1147
+ {
1148
+ "epoch": 21.875,
1149
+ "eval_accuracy": 0.9654867256637168,
1150
+ "eval_f1": 0.9157667386609072,
1151
+ "eval_loss": 0.11342811584472656,
1152
+ "eval_precision": 0.9137931034482759,
1153
+ "eval_recall": 0.9177489177489178,
1154
+ "eval_runtime": 0.8884,
1155
+ "eval_samples_per_second": 127.2,
1156
+ "eval_steps_per_second": 16.885,
1157
+ "step": 1400
1158
+ },
1159
+ {
1160
+ "epoch": 22.03125,
1161
+ "grad_norm": 0.08394443988800049,
1162
+ "learning_rate": 5.902777777777778e-05,
1163
+ "loss": 0.0312,
1164
+ "step": 1410
1165
+ },
1166
+ {
1167
+ "epoch": 22.1875,
1168
+ "grad_norm": 0.6736553311347961,
1169
+ "learning_rate": 5.787037037037037e-05,
1170
+ "loss": 0.0383,
1171
+ "step": 1420
1172
+ },
1173
+ {
1174
+ "epoch": 22.34375,
1175
+ "grad_norm": 0.563914954662323,
1176
+ "learning_rate": 5.6712962962962965e-05,
1177
+ "loss": 0.0287,
1178
+ "step": 1430
1179
+ },
1180
+ {
1181
+ "epoch": 22.5,
1182
+ "grad_norm": 0.08304356783628464,
1183
+ "learning_rate": 5.555555555555556e-05,
1184
+ "loss": 0.026,
1185
+ "step": 1440
1186
+ },
1187
+ {
1188
+ "epoch": 22.65625,
1189
+ "grad_norm": 0.6314889788627625,
1190
+ "learning_rate": 5.439814814814815e-05,
1191
+ "loss": 0.0337,
1192
+ "step": 1450
1193
+ },
1194
+ {
1195
+ "epoch": 22.8125,
1196
+ "grad_norm": 0.1526585817337036,
1197
+ "learning_rate": 5.3240740740740744e-05,
1198
+ "loss": 0.0386,
1199
+ "step": 1460
1200
+ },
1201
+ {
1202
+ "epoch": 22.96875,
1203
+ "grad_norm": 0.4352094829082489,
1204
+ "learning_rate": 5.208333333333334e-05,
1205
+ "loss": 0.0225,
1206
+ "step": 1470
1207
+ },
1208
+ {
1209
+ "epoch": 23.125,
1210
+ "grad_norm": 0.07802680879831314,
1211
+ "learning_rate": 5.092592592592593e-05,
1212
+ "loss": 0.028,
1213
+ "step": 1480
1214
+ },
1215
+ {
1216
+ "epoch": 23.28125,
1217
+ "grad_norm": 0.06631523370742798,
1218
+ "learning_rate": 4.976851851851852e-05,
1219
+ "loss": 0.0216,
1220
+ "step": 1490
1221
+ },
1222
+ {
1223
+ "epoch": 23.4375,
1224
+ "grad_norm": 0.4568875730037689,
1225
+ "learning_rate": 4.8611111111111115e-05,
1226
+ "loss": 0.0303,
1227
+ "step": 1500
1228
+ },
1229
+ {
1230
+ "epoch": 23.4375,
1231
+ "eval_accuracy": 0.9628318584070796,
1232
+ "eval_f1": 0.9078947368421053,
1233
+ "eval_loss": 0.10922601819038391,
1234
+ "eval_precision": 0.92,
1235
+ "eval_recall": 0.8961038961038961,
1236
+ "eval_runtime": 1.1366,
1237
+ "eval_samples_per_second": 99.417,
1238
+ "eval_steps_per_second": 13.197,
1239
+ "step": 1500
1240
+ },
1241
+ {
1242
+ "epoch": 23.59375,
1243
+ "grad_norm": 0.16732257604599,
1244
+ "learning_rate": 4.745370370370371e-05,
1245
+ "loss": 0.0182,
1246
+ "step": 1510
1247
+ },
1248
+ {
1249
+ "epoch": 23.75,
1250
+ "grad_norm": 0.8489612340927124,
1251
+ "learning_rate": 4.62962962962963e-05,
1252
+ "loss": 0.0419,
1253
+ "step": 1520
1254
+ },
1255
+ {
1256
+ "epoch": 23.90625,
1257
+ "grad_norm": 0.23256537318229675,
1258
+ "learning_rate": 4.5138888888888894e-05,
1259
+ "loss": 0.0344,
1260
+ "step": 1530
1261
+ },
1262
+ {
1263
+ "epoch": 24.0625,
1264
+ "grad_norm": 0.23274816572666168,
1265
+ "learning_rate": 4.3981481481481486e-05,
1266
+ "loss": 0.0283,
1267
+ "step": 1540
1268
+ },
1269
+ {
1270
+ "epoch": 24.21875,
1271
+ "grad_norm": 0.1935439109802246,
1272
+ "learning_rate": 4.282407407407408e-05,
1273
+ "loss": 0.0293,
1274
+ "step": 1550
1275
+ },
1276
+ {
1277
+ "epoch": 24.375,
1278
+ "grad_norm": 0.4433891773223877,
1279
+ "learning_rate": 4.166666666666667e-05,
1280
+ "loss": 0.0341,
1281
+ "step": 1560
1282
+ },
1283
+ {
1284
+ "epoch": 24.53125,
1285
+ "grad_norm": 0.3329981863498688,
1286
+ "learning_rate": 4.0509259259259265e-05,
1287
+ "loss": 0.022,
1288
+ "step": 1570
1289
+ },
1290
+ {
1291
+ "epoch": 24.6875,
1292
+ "grad_norm": 0.774336040019989,
1293
+ "learning_rate": 3.935185185185186e-05,
1294
+ "loss": 0.0304,
1295
+ "step": 1580
1296
+ },
1297
+ {
1298
+ "epoch": 24.84375,
1299
+ "grad_norm": 0.45676717162132263,
1300
+ "learning_rate": 3.8194444444444444e-05,
1301
+ "loss": 0.0213,
1302
+ "step": 1590
1303
+ },
1304
+ {
1305
+ "epoch": 25.0,
1306
+ "grad_norm": 0.18224991858005524,
1307
+ "learning_rate": 3.7037037037037037e-05,
1308
+ "loss": 0.0225,
1309
+ "step": 1600
1310
+ },
1311
+ {
1312
+ "epoch": 25.0,
1313
+ "eval_accuracy": 0.9628318584070796,
1314
+ "eval_f1": 0.908296943231441,
1315
+ "eval_loss": 0.11213955283164978,
1316
+ "eval_precision": 0.9162995594713657,
1317
+ "eval_recall": 0.9004329004329005,
1318
+ "eval_runtime": 1.05,
1319
+ "eval_samples_per_second": 107.622,
1320
+ "eval_steps_per_second": 14.286,
1321
+ "step": 1600
1322
+ },
1323
+ {
1324
+ "epoch": 25.15625,
1325
+ "grad_norm": 0.45673811435699463,
1326
+ "learning_rate": 3.587962962962963e-05,
1327
+ "loss": 0.0253,
1328
+ "step": 1610
1329
+ },
1330
+ {
1331
+ "epoch": 25.3125,
1332
+ "grad_norm": 0.09005212038755417,
1333
+ "learning_rate": 3.472222222222222e-05,
1334
+ "loss": 0.0127,
1335
+ "step": 1620
1336
+ },
1337
+ {
1338
+ "epoch": 25.46875,
1339
+ "grad_norm": 0.20682398974895477,
1340
+ "learning_rate": 3.3564814814814815e-05,
1341
+ "loss": 0.0231,
1342
+ "step": 1630
1343
+ },
1344
+ {
1345
+ "epoch": 25.625,
1346
+ "grad_norm": 0.7664525508880615,
1347
+ "learning_rate": 3.240740740740741e-05,
1348
+ "loss": 0.0174,
1349
+ "step": 1640
1350
+ },
1351
+ {
1352
+ "epoch": 25.78125,
1353
+ "grad_norm": 0.20978455245494843,
1354
+ "learning_rate": 3.125e-05,
1355
+ "loss": 0.0203,
1356
+ "step": 1650
1357
+ },
1358
+ {
1359
+ "epoch": 25.9375,
1360
+ "grad_norm": 0.5540274977684021,
1361
+ "learning_rate": 3.0092592592592593e-05,
1362
+ "loss": 0.0205,
1363
+ "step": 1660
1364
+ },
1365
+ {
1366
+ "epoch": 26.09375,
1367
+ "grad_norm": 0.1240416169166565,
1368
+ "learning_rate": 2.8935185185185186e-05,
1369
+ "loss": 0.0168,
1370
+ "step": 1670
1371
+ },
1372
+ {
1373
+ "epoch": 26.25,
1374
+ "grad_norm": 0.04385749623179436,
1375
+ "learning_rate": 2.777777777777778e-05,
1376
+ "loss": 0.0162,
1377
+ "step": 1680
1378
+ },
1379
+ {
1380
+ "epoch": 26.40625,
1381
+ "grad_norm": 0.1799972951412201,
1382
+ "learning_rate": 2.6620370370370372e-05,
1383
+ "loss": 0.0196,
1384
+ "step": 1690
1385
+ },
1386
+ {
1387
+ "epoch": 26.5625,
1388
+ "grad_norm": 0.24593585729599,
1389
+ "learning_rate": 2.5462962962962965e-05,
1390
+ "loss": 0.0292,
1391
+ "step": 1700
1392
+ },
1393
+ {
1394
+ "epoch": 26.5625,
1395
+ "eval_accuracy": 0.9619469026548673,
1396
+ "eval_f1": 0.9071274298056156,
1397
+ "eval_loss": 0.11493521183729172,
1398
+ "eval_precision": 0.9051724137931034,
1399
+ "eval_recall": 0.9090909090909091,
1400
+ "eval_runtime": 0.8719,
1401
+ "eval_samples_per_second": 129.607,
1402
+ "eval_steps_per_second": 17.204,
1403
+ "step": 1700
1404
+ },
1405
+ {
1406
+ "epoch": 26.71875,
1407
+ "grad_norm": 0.2344673126935959,
1408
+ "learning_rate": 2.4305555555555558e-05,
1409
+ "loss": 0.0177,
1410
+ "step": 1710
1411
+ },
1412
+ {
1413
+ "epoch": 26.875,
1414
+ "grad_norm": 0.263621062040329,
1415
+ "learning_rate": 2.314814814814815e-05,
1416
+ "loss": 0.0288,
1417
+ "step": 1720
1418
+ },
1419
+ {
1420
+ "epoch": 27.03125,
1421
+ "grad_norm": 0.27248746156692505,
1422
+ "learning_rate": 2.1990740740740743e-05,
1423
+ "loss": 0.0158,
1424
+ "step": 1730
1425
+ },
1426
+ {
1427
+ "epoch": 27.1875,
1428
+ "grad_norm": 0.35065901279449463,
1429
+ "learning_rate": 2.0833333333333336e-05,
1430
+ "loss": 0.0198,
1431
+ "step": 1740
1432
+ },
1433
+ {
1434
+ "epoch": 27.34375,
1435
+ "grad_norm": 0.23319651186466217,
1436
+ "learning_rate": 1.967592592592593e-05,
1437
+ "loss": 0.0208,
1438
+ "step": 1750
1439
+ },
1440
+ {
1441
+ "epoch": 27.5,
1442
+ "grad_norm": 0.15196481347084045,
1443
+ "learning_rate": 1.8518518518518518e-05,
1444
+ "loss": 0.0161,
1445
+ "step": 1760
1446
+ },
1447
+ {
1448
+ "epoch": 27.65625,
1449
+ "grad_norm": 0.17273353040218353,
1450
+ "learning_rate": 1.736111111111111e-05,
1451
+ "loss": 0.0213,
1452
+ "step": 1770
1453
+ },
1454
+ {
1455
+ "epoch": 27.8125,
1456
+ "grad_norm": 0.31511059403419495,
1457
+ "learning_rate": 1.6203703703703704e-05,
1458
+ "loss": 0.012,
1459
+ "step": 1780
1460
+ },
1461
+ {
1462
+ "epoch": 27.96875,
1463
+ "grad_norm": 0.09265203773975372,
1464
+ "learning_rate": 1.5046296296296297e-05,
1465
+ "loss": 0.0218,
1466
+ "step": 1790
1467
+ },
1468
+ {
1469
+ "epoch": 28.125,
1470
+ "grad_norm": 0.2638147473335266,
1471
+ "learning_rate": 1.388888888888889e-05,
1472
+ "loss": 0.0261,
1473
+ "step": 1800
1474
+ },
1475
+ {
1476
+ "epoch": 28.125,
1477
+ "eval_accuracy": 0.9619469026548673,
1478
+ "eval_f1": 0.9079229122055675,
1479
+ "eval_loss": 0.11067904531955719,
1480
+ "eval_precision": 0.8983050847457628,
1481
+ "eval_recall": 0.9177489177489178,
1482
+ "eval_runtime": 0.9722,
1483
+ "eval_samples_per_second": 116.235,
1484
+ "eval_steps_per_second": 15.429,
1485
+ "step": 1800
1486
+ },
1487
+ {
1488
+ "epoch": 28.28125,
1489
+ "grad_norm": 0.2734526991844177,
1490
+ "learning_rate": 1.2731481481481482e-05,
1491
+ "loss": 0.0175,
1492
+ "step": 1810
1493
+ },
1494
+ {
1495
+ "epoch": 28.4375,
1496
+ "grad_norm": 0.06026133522391319,
1497
+ "learning_rate": 1.1574074074074075e-05,
1498
+ "loss": 0.0168,
1499
+ "step": 1820
1500
+ },
1501
+ {
1502
+ "epoch": 28.59375,
1503
+ "grad_norm": 0.02611556649208069,
1504
+ "learning_rate": 1.0416666666666668e-05,
1505
+ "loss": 0.0177,
1506
+ "step": 1830
1507
+ },
1508
+ {
1509
+ "epoch": 28.75,
1510
+ "grad_norm": 0.23434928059577942,
1511
+ "learning_rate": 9.259259259259259e-06,
1512
+ "loss": 0.0252,
1513
+ "step": 1840
1514
+ },
1515
+ {
1516
+ "epoch": 28.90625,
1517
+ "grad_norm": 0.07623735815286636,
1518
+ "learning_rate": 8.101851851851852e-06,
1519
+ "loss": 0.0235,
1520
+ "step": 1850
1521
+ },
1522
+ {
1523
+ "epoch": 29.0625,
1524
+ "grad_norm": 0.05061192065477371,
1525
+ "learning_rate": 6.944444444444445e-06,
1526
+ "loss": 0.0185,
1527
+ "step": 1860
1528
+ },
1529
+ {
1530
+ "epoch": 29.21875,
1531
+ "grad_norm": 0.03355779871344566,
1532
+ "learning_rate": 5.787037037037038e-06,
1533
+ "loss": 0.0157,
1534
+ "step": 1870
1535
+ },
1536
+ {
1537
+ "epoch": 29.375,
1538
+ "grad_norm": 0.023396974429488182,
1539
+ "learning_rate": 4.6296296296296296e-06,
1540
+ "loss": 0.016,
1541
+ "step": 1880
1542
+ },
1543
+ {
1544
+ "epoch": 29.53125,
1545
+ "grad_norm": 0.31754446029663086,
1546
+ "learning_rate": 3.4722222222222224e-06,
1547
+ "loss": 0.0213,
1548
+ "step": 1890
1549
+ },
1550
+ {
1551
+ "epoch": 29.6875,
1552
+ "grad_norm": 0.04646694287657738,
1553
+ "learning_rate": 2.3148148148148148e-06,
1554
+ "loss": 0.0166,
1555
+ "step": 1900
1556
+ },
1557
+ {
1558
+ "epoch": 29.6875,
1559
+ "eval_accuracy": 0.9610619469026549,
1560
+ "eval_f1": 0.9051724137931034,
1561
+ "eval_loss": 0.11101004481315613,
1562
+ "eval_precision": 0.9012875536480687,
1563
+ "eval_recall": 0.9090909090909091,
1564
+ "eval_runtime": 0.8549,
1565
+ "eval_samples_per_second": 132.182,
1566
+ "eval_steps_per_second": 17.546,
1567
+ "step": 1900
1568
+ },
1569
+ {
1570
+ "epoch": 29.84375,
1571
+ "grad_norm": 0.05433151125907898,
1572
+ "learning_rate": 1.1574074074074074e-06,
1573
+ "loss": 0.0174,
1574
+ "step": 1910
1575
+ },
1576
+ {
1577
+ "epoch": 30.0,
1578
+ "grad_norm": 0.3085578382015228,
1579
+ "learning_rate": 0.0,
1580
+ "loss": 0.0208,
1581
+ "step": 1920
1582
+ },
1583
+ {
1584
+ "epoch": 30.0,
1585
+ "step": 1920,
1586
+ "total_flos": 2.352825493649326e+18,
1587
+ "train_loss": 0.05003170374160012,
1588
+ "train_runtime": 517.118,
1589
+ "train_samples_per_second": 58.71,
1590
+ "train_steps_per_second": 3.713
1591
  }
1592
  ],
1593
  "logging_steps": 10,
1594
+ "max_steps": 1920,
1595
  "num_input_tokens_seen": 0,
1596
+ "num_train_epochs": 30,
1597
  "save_steps": 500,
1598
  "stateful_callbacks": {
1599
  "TrainerControl": {
 
1607
  "attributes": {}
1608
  }
1609
  },
1610
+ "total_flos": 2.352825493649326e+18,
1611
  "train_batch_size": 16,
1612
  "trial_name": null,
1613
  "trial_params": null