DouglasBraga commited on
Commit
6d026a2
1 Parent(s): c51d1b2

End of training

Browse files
Files changed (5) hide show
  1. README.md +3 -3
  2. all_results.json +10 -10
  3. eval_results.json +6 -6
  4. train_results.json +5 -5
  5. trainer_state.json +309 -1905
README.md CHANGED
@@ -23,7 +23,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.692
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +33,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.8708
37
- - Accuracy: 0.692
38
 
39
  ## Model description
40
 
 
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
+ value: 0.7025
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
33
 
34
  This model is a fine-tuned version of [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224) on the imagefolder dataset.
35
  It achieves the following results on the evaluation set:
36
+ - Loss: 0.8305
37
+ - Accuracy: 0.7025
38
 
39
  ## Model description
40
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 9.984,
3
- "eval_accuracy": 0.80625,
4
- "eval_loss": 0.662419319152832,
5
- "eval_runtime": 157.5998,
6
- "eval_samples_per_second": 25.381,
7
- "eval_steps_per_second": 0.793,
8
- "total_flos": 9.926487761391452e+18,
9
- "train_loss": 0.24441122439427254,
10
- "train_runtime": 137433.0815,
11
- "train_samples_per_second": 2.911,
12
  "train_steps_per_second": 0.023
13
  }
 
1
  {
2
+ "epoch": 2.9952,
3
+ "eval_accuracy": 0.7025,
4
+ "eval_loss": 0.830498456954956,
5
+ "eval_runtime": 156.0429,
6
+ "eval_samples_per_second": 25.634,
7
+ "eval_steps_per_second": 0.801,
8
+ "total_flos": 2.9779463284174356e+18,
9
+ "train_loss": 0.36125070786374247,
10
+ "train_runtime": 41238.7802,
11
+ "train_samples_per_second": 2.91,
12
  "train_steps_per_second": 0.023
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.984,
3
- "eval_accuracy": 0.80625,
4
- "eval_loss": 0.662419319152832,
5
- "eval_runtime": 157.5998,
6
- "eval_samples_per_second": 25.381,
7
- "eval_steps_per_second": 0.793
8
  }
 
1
  {
2
+ "epoch": 2.9952,
3
+ "eval_accuracy": 0.7025,
4
+ "eval_loss": 0.830498456954956,
5
+ "eval_runtime": 156.0429,
6
+ "eval_samples_per_second": 25.634,
7
+ "eval_steps_per_second": 0.801
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.984,
3
- "total_flos": 9.926487761391452e+18,
4
- "train_loss": 0.24441122439427254,
5
- "train_runtime": 137433.0815,
6
- "train_samples_per_second": 2.911,
7
  "train_steps_per_second": 0.023
8
  }
 
1
  {
2
+ "epoch": 2.9952,
3
+ "total_flos": 2.9779463284174356e+18,
4
+ "train_loss": 0.36125070786374247,
5
+ "train_runtime": 41238.7802,
6
+ "train_samples_per_second": 2.91,
7
  "train_steps_per_second": 0.023
8
  }
trainer_state.json CHANGED
@@ -1,2301 +1,705 @@
1
  {
2
- "best_metric": 0.80625,
3
- "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-leukemia-08-2024.v1.2\\checkpoint-2500",
4
- "epoch": 9.984,
5
  "eval_steps": 500,
6
- "global_step": 3120,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.032,
13
- "grad_norm": 5.175816059112549,
14
- "learning_rate": 1.6025641025641025e-06,
15
- "loss": 0.701,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.064,
20
- "grad_norm": 4.15721321105957,
21
- "learning_rate": 3.205128205128205e-06,
22
- "loss": 0.6915,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.096,
27
- "grad_norm": 3.4217724800109863,
28
- "learning_rate": 4.807692307692308e-06,
29
- "loss": 0.6905,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.128,
34
- "grad_norm": 5.180055141448975,
35
- "learning_rate": 6.41025641025641e-06,
36
- "loss": 0.6768,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
- "grad_norm": 5.26149320602417,
42
- "learning_rate": 8.012820512820515e-06,
43
- "loss": 0.6593,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.192,
48
- "grad_norm": 9.300067901611328,
49
- "learning_rate": 9.615384615384616e-06,
50
- "loss": 0.6601,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.224,
55
- "grad_norm": 7.7086100578308105,
56
- "learning_rate": 1.1217948717948719e-05,
57
- "loss": 0.6399,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.256,
62
- "grad_norm": 9.903676986694336,
63
- "learning_rate": 1.282051282051282e-05,
64
- "loss": 0.6239,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.288,
69
- "grad_norm": 8.374137878417969,
70
- "learning_rate": 1.4423076923076923e-05,
71
- "loss": 0.6283,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.32,
76
- "grad_norm": 13.180610656738281,
77
- "learning_rate": 1.602564102564103e-05,
78
- "loss": 0.6182,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.352,
83
- "grad_norm": 16.700641632080078,
84
- "learning_rate": 1.762820512820513e-05,
85
- "loss": 0.6064,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.384,
90
- "grad_norm": 7.996766567230225,
91
- "learning_rate": 1.923076923076923e-05,
92
- "loss": 0.6113,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.416,
97
- "grad_norm": 11.852957725524902,
98
- "learning_rate": 2.0833333333333336e-05,
99
- "loss": 0.5819,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.448,
104
- "grad_norm": 14.05556869506836,
105
- "learning_rate": 2.2435897435897437e-05,
106
- "loss": 0.5803,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.48,
111
- "grad_norm": 9.655508041381836,
112
- "learning_rate": 2.4038461538461542e-05,
113
- "loss": 0.5554,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.512,
118
- "grad_norm": 8.562996864318848,
119
- "learning_rate": 2.564102564102564e-05,
120
- "loss": 0.5555,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.544,
125
- "grad_norm": 25.228897094726562,
126
- "learning_rate": 2.724358974358974e-05,
127
- "loss": 0.5462,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.576,
132
- "grad_norm": 28.388383865356445,
133
- "learning_rate": 2.8685897435897437e-05,
134
- "loss": 0.591,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.608,
139
- "grad_norm": 25.528270721435547,
140
- "learning_rate": 3.0288461538461538e-05,
141
- "loss": 0.5561,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.64,
146
- "grad_norm": 9.540146827697754,
147
- "learning_rate": 3.189102564102564e-05,
148
- "loss": 0.5586,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.672,
153
- "grad_norm": 12.264636993408203,
154
- "learning_rate": 3.3493589743589744e-05,
155
- "loss": 0.5514,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.704,
160
- "grad_norm": 6.773402214050293,
161
- "learning_rate": 3.5096153846153845e-05,
162
- "loss": 0.5146,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.736,
167
- "grad_norm": 11.84838581085205,
168
- "learning_rate": 3.6698717948717946e-05,
169
- "loss": 0.474,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.768,
174
- "grad_norm": 7.139883041381836,
175
- "learning_rate": 3.8301282051282054e-05,
176
- "loss": 0.461,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.8,
181
- "grad_norm": 7.588615417480469,
182
- "learning_rate": 3.9903846153846155e-05,
183
- "loss": 0.4871,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.832,
188
- "grad_norm": 10.481608390808105,
189
- "learning_rate": 4.150641025641026e-05,
190
- "loss": 0.4536,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.864,
195
- "grad_norm": 19.465951919555664,
196
- "learning_rate": 4.3108974358974364e-05,
197
- "loss": 0.4818,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.896,
202
- "grad_norm": 13.416147232055664,
203
- "learning_rate": 4.4711538461538466e-05,
204
- "loss": 0.433,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.928,
209
- "grad_norm": 8.518133163452148,
210
- "learning_rate": 4.615384615384616e-05,
211
- "loss": 0.4145,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.96,
216
- "grad_norm": 53.73661804199219,
217
- "learning_rate": 4.775641025641026e-05,
218
- "loss": 0.5544,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.992,
223
- "grad_norm": 12.88679313659668,
224
- "learning_rate": 4.935897435897436e-05,
225
- "loss": 0.471,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9984,
230
- "eval_accuracy": 0.6715,
231
- "eval_loss": 0.590698778629303,
232
- "eval_runtime": 168.7555,
233
- "eval_samples_per_second": 23.703,
234
- "eval_steps_per_second": 0.741,
235
  "step": 312
236
  },
237
  {
238
  "epoch": 1.024,
239
- "grad_norm": 17.80354881286621,
240
- "learning_rate": 4.98931623931624e-05,
241
- "loss": 0.5001,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.056,
246
- "grad_norm": 9.920671463012695,
247
- "learning_rate": 4.971509971509972e-05,
248
- "loss": 0.4407,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.088,
253
- "grad_norm": 15.50145149230957,
254
- "learning_rate": 4.9537037037037035e-05,
255
- "loss": 0.416,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.12,
260
- "grad_norm": 11.008417129516602,
261
- "learning_rate": 4.935897435897436e-05,
262
- "loss": 0.4409,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.152,
267
- "grad_norm": 17.966873168945312,
268
- "learning_rate": 4.9180911680911686e-05,
269
- "loss": 0.4154,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.184,
274
- "grad_norm": 35.332679748535156,
275
- "learning_rate": 4.9002849002849004e-05,
276
- "loss": 0.4375,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.216,
281
- "grad_norm": 13.41420841217041,
282
- "learning_rate": 4.882478632478633e-05,
283
- "loss": 0.4239,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.248,
288
- "grad_norm": 42.17304611206055,
289
- "learning_rate": 4.864672364672365e-05,
290
- "loss": 0.4071,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.28,
295
- "grad_norm": 30.35808753967285,
296
- "learning_rate": 4.846866096866097e-05,
297
- "loss": 0.3735,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.312,
302
- "grad_norm": 24.77705955505371,
303
- "learning_rate": 4.829059829059829e-05,
304
- "loss": 0.4135,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 1.3439999999999999,
309
- "grad_norm": 11.592647552490234,
310
- "learning_rate": 4.8112535612535616e-05,
311
- "loss": 0.3879,
312
  "step": 420
313
  },
314
  {
315
  "epoch": 1.376,
316
- "grad_norm": 13.36685562133789,
317
- "learning_rate": 4.7934472934472934e-05,
318
- "loss": 0.383,
319
  "step": 430
320
  },
321
  {
322
  "epoch": 1.408,
323
- "grad_norm": 9.121360778808594,
324
- "learning_rate": 4.775641025641026e-05,
325
- "loss": 0.3852,
326
  "step": 440
327
  },
328
  {
329
  "epoch": 1.44,
330
- "grad_norm": 20.881765365600586,
331
- "learning_rate": 4.7578347578347584e-05,
332
- "loss": 0.3584,
333
  "step": 450
334
  },
335
  {
336
  "epoch": 1.472,
337
- "grad_norm": 41.44031524658203,
338
- "learning_rate": 4.74002849002849e-05,
339
- "loss": 0.4047,
340
  "step": 460
341
  },
342
  {
343
  "epoch": 1.504,
344
- "grad_norm": 6.26457405090332,
345
- "learning_rate": 4.722222222222222e-05,
346
- "loss": 0.3623,
347
  "step": 470
348
  },
349
  {
350
  "epoch": 1.536,
351
- "grad_norm": 18.60125732421875,
352
- "learning_rate": 4.7044159544159546e-05,
353
- "loss": 0.3609,
354
  "step": 480
355
  },
356
  {
357
  "epoch": 1.568,
358
- "grad_norm": 17.937673568725586,
359
- "learning_rate": 4.686609686609687e-05,
360
- "loss": 0.3832,
361
  "step": 490
362
  },
363
  {
364
  "epoch": 1.6,
365
- "grad_norm": 11.792247772216797,
366
- "learning_rate": 4.668803418803419e-05,
367
- "loss": 0.3499,
368
  "step": 500
369
  },
370
  {
371
  "epoch": 1.6320000000000001,
372
- "grad_norm": 35.20759201049805,
373
- "learning_rate": 4.6509971509971515e-05,
374
- "loss": 0.3703,
375
  "step": 510
376
  },
377
  {
378
  "epoch": 1.6640000000000001,
379
- "grad_norm": 9.141999244689941,
380
- "learning_rate": 4.633190883190883e-05,
381
- "loss": 0.3433,
382
  "step": 520
383
  },
384
  {
385
  "epoch": 1.696,
386
- "grad_norm": 9.520153999328613,
387
- "learning_rate": 4.615384615384616e-05,
388
- "loss": 0.3638,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 1.728,
393
- "grad_norm": 20.68021011352539,
394
- "learning_rate": 4.5975783475783476e-05,
395
- "loss": 0.4226,
396
  "step": 540
397
  },
398
  {
399
  "epoch": 1.76,
400
- "grad_norm": Infinity,
401
- "learning_rate": 4.581552706552707e-05,
402
- "loss": 0.3537,
403
  "step": 550
404
  },
405
  {
406
  "epoch": 1.792,
407
- "grad_norm": 29.779680252075195,
408
- "learning_rate": 4.563746438746439e-05,
409
- "loss": 0.4041,
410
  "step": 560
411
  },
412
  {
413
  "epoch": 1.8239999999999998,
414
- "grad_norm": 36.394161224365234,
415
- "learning_rate": 4.545940170940171e-05,
416
- "loss": 0.3585,
417
  "step": 570
418
  },
419
  {
420
  "epoch": 1.8559999999999999,
421
- "grad_norm": 27.704442977905273,
422
- "learning_rate": 4.528133903133903e-05,
423
- "loss": 0.3484,
424
  "step": 580
425
  },
426
  {
427
  "epoch": 1.888,
428
- "grad_norm": 9.653308868408203,
429
- "learning_rate": 4.510327635327636e-05,
430
- "loss": 0.3234,
431
  "step": 590
432
  },
433
  {
434
  "epoch": 1.92,
435
- "grad_norm": 15.85473346710205,
436
- "learning_rate": 4.492521367521368e-05,
437
- "loss": 0.3486,
438
  "step": 600
439
  },
440
  {
441
  "epoch": 1.952,
442
- "grad_norm": 18.47919464111328,
443
- "learning_rate": 4.4747150997151e-05,
444
- "loss": 0.3643,
445
  "step": 610
446
  },
447
  {
448
  "epoch": 1.984,
449
- "grad_norm": 25.203697204589844,
450
- "learning_rate": 4.456908831908832e-05,
451
- "loss": 0.3376,
452
  "step": 620
453
  },
454
  {
455
  "epoch": 2.0,
456
- "eval_accuracy": 0.702,
457
- "eval_loss": 0.8903971910476685,
458
- "eval_runtime": 155.8146,
459
- "eval_samples_per_second": 25.672,
460
- "eval_steps_per_second": 0.802,
461
  "step": 625
462
  },
463
  {
464
  "epoch": 2.016,
465
- "grad_norm": 15.359530448913574,
466
- "learning_rate": 4.439102564102564e-05,
467
- "loss": 0.3328,
468
  "step": 630
469
  },
470
  {
471
  "epoch": 2.048,
472
- "grad_norm": 32.16645812988281,
473
- "learning_rate": 4.4212962962962966e-05,
474
- "loss": 0.3114,
475
  "step": 640
476
  },
477
  {
478
  "epoch": 2.08,
479
- "grad_norm": 8.913246154785156,
480
- "learning_rate": 4.4034900284900285e-05,
481
- "loss": 0.3503,
482
  "step": 650
483
  },
484
  {
485
  "epoch": 2.112,
486
- "grad_norm": 33.13675308227539,
487
- "learning_rate": 4.385683760683761e-05,
488
- "loss": 0.3231,
489
  "step": 660
490
  },
491
  {
492
  "epoch": 2.144,
493
- "grad_norm": 24.989564895629883,
494
- "learning_rate": 4.367877492877493e-05,
495
- "loss": 0.299,
496
  "step": 670
497
  },
498
  {
499
  "epoch": 2.176,
500
- "grad_norm": 19.014232635498047,
501
- "learning_rate": 4.350071225071225e-05,
502
- "loss": 0.3708,
503
  "step": 680
504
  },
505
  {
506
  "epoch": 2.208,
507
- "grad_norm": 12.924874305725098,
508
- "learning_rate": 4.332264957264958e-05,
509
- "loss": 0.2897,
510
  "step": 690
511
  },
512
  {
513
  "epoch": 2.24,
514
- "grad_norm": 8.708645820617676,
515
- "learning_rate": 4.31445868945869e-05,
516
- "loss": 0.2907,
517
  "step": 700
518
  },
519
  {
520
  "epoch": 2.2720000000000002,
521
- "grad_norm": 23.476911544799805,
522
- "learning_rate": 4.2966524216524215e-05,
523
- "loss": 0.3338,
524
  "step": 710
525
  },
526
  {
527
  "epoch": 2.304,
528
- "grad_norm": 12.567790985107422,
529
- "learning_rate": 4.278846153846154e-05,
530
- "loss": 0.3332,
531
  "step": 720
532
  },
533
  {
534
  "epoch": 2.336,
535
- "grad_norm": 9.736689567565918,
536
- "learning_rate": 4.2610398860398865e-05,
537
- "loss": 0.3076,
538
  "step": 730
539
  },
540
  {
541
  "epoch": 2.368,
542
- "grad_norm": 8.446788787841797,
543
- "learning_rate": 4.2432336182336184e-05,
544
- "loss": 0.3052,
545
  "step": 740
546
  },
547
  {
548
  "epoch": 2.4,
549
- "grad_norm": 24.329153060913086,
550
- "learning_rate": 4.225427350427351e-05,
551
- "loss": 0.3356,
552
  "step": 750
553
  },
554
  {
555
  "epoch": 2.432,
556
- "grad_norm": 6.862959861755371,
557
- "learning_rate": 4.207621082621083e-05,
558
- "loss": 0.381,
559
  "step": 760
560
  },
561
  {
562
  "epoch": 2.464,
563
- "grad_norm": 12.4033842086792,
564
- "learning_rate": 4.1898148148148145e-05,
565
- "loss": 0.2917,
566
  "step": 770
567
  },
568
  {
569
  "epoch": 2.496,
570
- "grad_norm": 11.243486404418945,
571
- "learning_rate": 4.172008547008547e-05,
572
- "loss": 0.2766,
573
  "step": 780
574
  },
575
  {
576
  "epoch": 2.528,
577
- "grad_norm": 28.551401138305664,
578
- "learning_rate": 4.1542022792022796e-05,
579
- "loss": 0.2965,
580
  "step": 790
581
  },
582
  {
583
  "epoch": 2.56,
584
- "grad_norm": 7.9521484375,
585
- "learning_rate": 4.1363960113960114e-05,
586
- "loss": 0.2912,
587
  "step": 800
588
  },
589
  {
590
  "epoch": 2.592,
591
- "grad_norm": 14.33103084564209,
592
- "learning_rate": 4.118589743589744e-05,
593
- "loss": 0.3085,
594
  "step": 810
595
  },
596
  {
597
  "epoch": 2.624,
598
- "grad_norm": 14.015169143676758,
599
- "learning_rate": 4.1007834757834764e-05,
600
- "loss": 0.2754,
601
  "step": 820
602
  },
603
  {
604
  "epoch": 2.656,
605
- "grad_norm": 27.55406379699707,
606
- "learning_rate": 4.082977207977208e-05,
607
- "loss": 0.2825,
608
  "step": 830
609
  },
610
  {
611
  "epoch": 2.6879999999999997,
612
- "grad_norm": 24.08679962158203,
613
- "learning_rate": 4.06517094017094e-05,
614
- "loss": 0.3177,
615
  "step": 840
616
  },
617
  {
618
  "epoch": 2.7199999999999998,
619
- "grad_norm": 15.628003120422363,
620
- "learning_rate": 4.0473646723646726e-05,
621
- "loss": 0.289,
622
  "step": 850
623
  },
624
  {
625
  "epoch": 2.752,
626
- "grad_norm": 33.23855972290039,
627
- "learning_rate": 4.0295584045584044e-05,
628
- "loss": 0.2952,
629
  "step": 860
630
  },
631
  {
632
  "epoch": 2.784,
633
- "grad_norm": 28.434730529785156,
634
- "learning_rate": 4.011752136752137e-05,
635
- "loss": 0.2608,
636
  "step": 870
637
  },
638
  {
639
  "epoch": 2.816,
640
- "grad_norm": 12.023883819580078,
641
- "learning_rate": 3.9939458689458694e-05,
642
- "loss": 0.2821,
643
  "step": 880
644
  },
645
  {
646
  "epoch": 2.848,
647
- "grad_norm": 18.148351669311523,
648
- "learning_rate": 3.976139601139601e-05,
649
- "loss": 0.2998,
650
  "step": 890
651
  },
652
  {
653
  "epoch": 2.88,
654
- "grad_norm": 52.181968688964844,
655
- "learning_rate": 3.958333333333333e-05,
656
- "loss": 0.3114,
657
  "step": 900
658
  },
659
  {
660
  "epoch": 2.912,
661
- "grad_norm": 39.47223663330078,
662
- "learning_rate": 3.940527065527066e-05,
663
- "loss": 0.3137,
664
  "step": 910
665
  },
666
  {
667
  "epoch": 2.944,
668
- "grad_norm": 13.708846092224121,
669
- "learning_rate": 3.922720797720798e-05,
670
- "loss": 0.2849,
671
  "step": 920
672
  },
673
  {
674
  "epoch": 2.976,
675
- "grad_norm": 7.4183735847473145,
676
- "learning_rate": 3.90491452991453e-05,
677
- "loss": 0.2266,
678
  "step": 930
679
  },
680
  {
681
- "epoch": 2.9984,
682
- "eval_accuracy": 0.556,
683
- "eval_loss": 1.8065074682235718,
684
- "eval_runtime": 155.7933,
685
- "eval_samples_per_second": 25.675,
686
- "eval_steps_per_second": 0.802,
687
- "step": 937
688
- },
689
- {
690
- "epoch": 3.008,
691
- "grad_norm": 51.52549743652344,
692
- "learning_rate": 3.8871082621082625e-05,
693
- "loss": 0.3457,
694
- "step": 940
695
- },
696
- {
697
- "epoch": 3.04,
698
- "grad_norm": 22.345596313476562,
699
- "learning_rate": 3.869301994301994e-05,
700
- "loss": 0.3204,
701
- "step": 950
702
- },
703
- {
704
- "epoch": 3.072,
705
- "grad_norm": 16.317461013793945,
706
- "learning_rate": 3.851495726495727e-05,
707
- "loss": 0.3101,
708
- "step": 960
709
- },
710
- {
711
- "epoch": 3.104,
712
- "grad_norm": 27.974485397338867,
713
- "learning_rate": 3.8336894586894586e-05,
714
- "loss": 0.2918,
715
- "step": 970
716
- },
717
- {
718
- "epoch": 3.136,
719
- "grad_norm": 23.675912857055664,
720
- "learning_rate": 3.815883190883191e-05,
721
- "loss": 0.2817,
722
- "step": 980
723
- },
724
- {
725
- "epoch": 3.168,
726
- "grad_norm": 30.348739624023438,
727
- "learning_rate": 3.798076923076923e-05,
728
- "loss": 0.2431,
729
- "step": 990
730
- },
731
- {
732
- "epoch": 3.2,
733
- "grad_norm": 11.50307846069336,
734
- "learning_rate": 3.7802706552706555e-05,
735
- "loss": 0.2584,
736
- "step": 1000
737
- },
738
- {
739
- "epoch": 3.232,
740
- "grad_norm": 19.67528533935547,
741
- "learning_rate": 3.762464387464388e-05,
742
- "loss": 0.2714,
743
- "step": 1010
744
- },
745
- {
746
- "epoch": 3.2640000000000002,
747
- "grad_norm": 22.57136344909668,
748
- "learning_rate": 3.74465811965812e-05,
749
- "loss": 0.2544,
750
- "step": 1020
751
- },
752
- {
753
- "epoch": 3.296,
754
- "grad_norm": 16.600223541259766,
755
- "learning_rate": 3.726851851851852e-05,
756
- "loss": 0.2469,
757
- "step": 1030
758
- },
759
- {
760
- "epoch": 3.328,
761
- "grad_norm": 13.466314315795898,
762
- "learning_rate": 3.709045584045584e-05,
763
- "loss": 0.3036,
764
- "step": 1040
765
- },
766
- {
767
- "epoch": 3.36,
768
- "grad_norm": 21.5009765625,
769
- "learning_rate": 3.691239316239317e-05,
770
- "loss": 0.2864,
771
- "step": 1050
772
- },
773
- {
774
- "epoch": 3.392,
775
- "grad_norm": 7.8777008056640625,
776
- "learning_rate": 3.6734330484330485e-05,
777
- "loss": 0.2907,
778
- "step": 1060
779
- },
780
- {
781
- "epoch": 3.424,
782
- "grad_norm": 14.131542205810547,
783
- "learning_rate": 3.655626780626781e-05,
784
- "loss": 0.2849,
785
- "step": 1070
786
- },
787
- {
788
- "epoch": 3.456,
789
- "grad_norm": 11.86939525604248,
790
- "learning_rate": 3.637820512820513e-05,
791
- "loss": 0.2659,
792
- "step": 1080
793
- },
794
- {
795
- "epoch": 3.488,
796
- "grad_norm": 8.989656448364258,
797
- "learning_rate": 3.6200142450142454e-05,
798
- "loss": 0.2714,
799
- "step": 1090
800
- },
801
- {
802
- "epoch": 3.52,
803
- "grad_norm": 22.53978729248047,
804
- "learning_rate": 3.602207977207977e-05,
805
- "loss": 0.2592,
806
- "step": 1100
807
- },
808
- {
809
- "epoch": 3.552,
810
- "grad_norm": 11.309256553649902,
811
- "learning_rate": 3.58440170940171e-05,
812
- "loss": 0.248,
813
- "step": 1110
814
- },
815
- {
816
- "epoch": 3.584,
817
- "grad_norm": 8.17591381072998,
818
- "learning_rate": 3.5665954415954415e-05,
819
- "loss": 0.2481,
820
- "step": 1120
821
- },
822
- {
823
- "epoch": 3.616,
824
- "grad_norm": 44.116905212402344,
825
- "learning_rate": 3.548789173789174e-05,
826
- "loss": 0.2424,
827
- "step": 1130
828
- },
829
- {
830
- "epoch": 3.648,
831
- "grad_norm": 15.891951560974121,
832
- "learning_rate": 3.5309829059829066e-05,
833
- "loss": 0.2467,
834
- "step": 1140
835
- },
836
- {
837
- "epoch": 3.68,
838
- "grad_norm": 9.102974891662598,
839
- "learning_rate": 3.5131766381766384e-05,
840
- "loss": 0.2456,
841
- "step": 1150
842
- },
843
- {
844
- "epoch": 3.7119999999999997,
845
- "grad_norm": 14.138261795043945,
846
- "learning_rate": 3.49537037037037e-05,
847
- "loss": 0.2479,
848
- "step": 1160
849
- },
850
- {
851
- "epoch": 3.7439999999999998,
852
- "grad_norm": 37.55530548095703,
853
- "learning_rate": 3.477564102564103e-05,
854
- "loss": 0.2604,
855
- "step": 1170
856
- },
857
- {
858
- "epoch": 3.776,
859
- "grad_norm": 20.72405242919922,
860
- "learning_rate": 3.4597578347578346e-05,
861
- "loss": 0.2392,
862
- "step": 1180
863
- },
864
- {
865
- "epoch": 3.808,
866
- "grad_norm": 9.120800018310547,
867
- "learning_rate": 3.441951566951567e-05,
868
- "loss": 0.2243,
869
- "step": 1190
870
- },
871
- {
872
- "epoch": 3.84,
873
- "grad_norm": 5.410057067871094,
874
- "learning_rate": 3.4241452991452996e-05,
875
- "loss": 0.2796,
876
- "step": 1200
877
- },
878
- {
879
- "epoch": 3.872,
880
- "grad_norm": 13.668625831604004,
881
- "learning_rate": 3.4063390313390314e-05,
882
- "loss": 0.2386,
883
- "step": 1210
884
- },
885
- {
886
- "epoch": 3.904,
887
- "grad_norm": 15.398672103881836,
888
- "learning_rate": 3.388532763532763e-05,
889
- "loss": 0.2461,
890
- "step": 1220
891
- },
892
- {
893
- "epoch": 3.936,
894
- "grad_norm": 21.709699630737305,
895
- "learning_rate": 3.3707264957264964e-05,
896
- "loss": 0.2225,
897
- "step": 1230
898
- },
899
- {
900
- "epoch": 3.968,
901
- "grad_norm": 6.977370262145996,
902
- "learning_rate": 3.352920227920228e-05,
903
- "loss": 0.223,
904
- "step": 1240
905
- },
906
- {
907
- "epoch": 4.0,
908
- "grad_norm": 8.210868835449219,
909
- "learning_rate": 3.33511396011396e-05,
910
- "loss": 0.2529,
911
- "step": 1250
912
- },
913
- {
914
- "epoch": 4.0,
915
- "eval_accuracy": 0.713,
916
- "eval_loss": 0.8170278668403625,
917
- "eval_runtime": 160.9705,
918
- "eval_samples_per_second": 24.849,
919
- "eval_steps_per_second": 0.777,
920
- "step": 1250
921
- },
922
- {
923
- "epoch": 4.032,
924
- "grad_norm": 10.20702075958252,
925
- "learning_rate": 3.3173076923076926e-05,
926
- "loss": 0.2091,
927
- "step": 1260
928
- },
929
- {
930
- "epoch": 4.064,
931
- "grad_norm": 10.824769020080566,
932
- "learning_rate": 3.2995014245014244e-05,
933
- "loss": 0.2298,
934
- "step": 1270
935
- },
936
- {
937
- "epoch": 4.096,
938
- "grad_norm": 10.660799026489258,
939
- "learning_rate": 3.281695156695157e-05,
940
- "loss": 0.2172,
941
- "step": 1280
942
- },
943
- {
944
- "epoch": 4.128,
945
- "grad_norm": 12.774826049804688,
946
- "learning_rate": 3.263888888888889e-05,
947
- "loss": 0.2204,
948
- "step": 1290
949
- },
950
- {
951
- "epoch": 4.16,
952
- "grad_norm": 9.919928550720215,
953
- "learning_rate": 3.246082621082621e-05,
954
- "loss": 0.2355,
955
- "step": 1300
956
- },
957
- {
958
- "epoch": 4.192,
959
- "grad_norm": 10.62849235534668,
960
- "learning_rate": 3.228276353276353e-05,
961
- "loss": 0.2046,
962
- "step": 1310
963
- },
964
- {
965
- "epoch": 4.224,
966
- "grad_norm": 11.318085670471191,
967
- "learning_rate": 3.2104700854700856e-05,
968
- "loss": 0.2058,
969
- "step": 1320
970
- },
971
- {
972
- "epoch": 4.256,
973
- "grad_norm": 23.96332359313965,
974
- "learning_rate": 3.192663817663818e-05,
975
- "loss": 0.2158,
976
- "step": 1330
977
- },
978
- {
979
- "epoch": 4.288,
980
- "grad_norm": 12.120290756225586,
981
- "learning_rate": 3.17485754985755e-05,
982
- "loss": 0.1955,
983
- "step": 1340
984
- },
985
- {
986
- "epoch": 4.32,
987
- "grad_norm": 22.462135314941406,
988
- "learning_rate": 3.157051282051282e-05,
989
- "loss": 0.2213,
990
- "step": 1350
991
- },
992
- {
993
- "epoch": 4.352,
994
- "grad_norm": 10.982671737670898,
995
- "learning_rate": 3.139245014245014e-05,
996
- "loss": 0.204,
997
- "step": 1360
998
- },
999
- {
1000
- "epoch": 4.384,
1001
- "grad_norm": 9.690351486206055,
1002
- "learning_rate": 3.121438746438747e-05,
1003
- "loss": 0.2072,
1004
- "step": 1370
1005
- },
1006
- {
1007
- "epoch": 4.416,
1008
- "grad_norm": 21.02212142944336,
1009
- "learning_rate": 3.103632478632479e-05,
1010
- "loss": 0.2084,
1011
- "step": 1380
1012
- },
1013
- {
1014
- "epoch": 4.448,
1015
- "grad_norm": 11.301972389221191,
1016
- "learning_rate": 3.085826210826211e-05,
1017
- "loss": 0.2046,
1018
- "step": 1390
1019
- },
1020
- {
1021
- "epoch": 4.48,
1022
- "grad_norm": 13.271730422973633,
1023
- "learning_rate": 3.068019943019943e-05,
1024
- "loss": 0.2341,
1025
- "step": 1400
1026
- },
1027
- {
1028
- "epoch": 4.5120000000000005,
1029
- "grad_norm": 12.261125564575195,
1030
- "learning_rate": 3.0502136752136755e-05,
1031
- "loss": 0.2151,
1032
- "step": 1410
1033
- },
1034
- {
1035
- "epoch": 4.5440000000000005,
1036
- "grad_norm": 5.64617919921875,
1037
- "learning_rate": 3.0324074074074077e-05,
1038
- "loss": 0.2046,
1039
- "step": 1420
1040
- },
1041
- {
1042
- "epoch": 4.576,
1043
- "grad_norm": 17.69122314453125,
1044
- "learning_rate": 3.01460113960114e-05,
1045
- "loss": 0.2058,
1046
- "step": 1430
1047
- },
1048
- {
1049
- "epoch": 4.608,
1050
- "grad_norm": 10.206043243408203,
1051
- "learning_rate": 2.9967948717948717e-05,
1052
- "loss": 0.2235,
1053
- "step": 1440
1054
- },
1055
- {
1056
- "epoch": 4.64,
1057
- "grad_norm": 13.48324203491211,
1058
- "learning_rate": 2.978988603988604e-05,
1059
- "loss": 0.1907,
1060
- "step": 1450
1061
- },
1062
- {
1063
- "epoch": 4.672,
1064
- "grad_norm": 7.78882360458374,
1065
- "learning_rate": 2.9611823361823364e-05,
1066
- "loss": 0.2012,
1067
- "step": 1460
1068
- },
1069
- {
1070
- "epoch": 4.704,
1071
- "grad_norm": 10.230093002319336,
1072
- "learning_rate": 2.9433760683760685e-05,
1073
- "loss": 0.1803,
1074
- "step": 1470
1075
- },
1076
- {
1077
- "epoch": 4.736,
1078
- "grad_norm": 37.21369934082031,
1079
- "learning_rate": 2.9255698005698007e-05,
1080
- "loss": 0.1995,
1081
- "step": 1480
1082
- },
1083
- {
1084
- "epoch": 4.768,
1085
- "grad_norm": 10.13237190246582,
1086
- "learning_rate": 2.907763532763533e-05,
1087
- "loss": 0.2096,
1088
- "step": 1490
1089
- },
1090
- {
1091
- "epoch": 4.8,
1092
- "grad_norm": 6.978107929229736,
1093
- "learning_rate": 2.8899572649572647e-05,
1094
- "loss": 0.1905,
1095
- "step": 1500
1096
- },
1097
- {
1098
- "epoch": 4.832,
1099
- "grad_norm": 17.814905166625977,
1100
- "learning_rate": 2.8721509971509976e-05,
1101
- "loss": 0.1818,
1102
- "step": 1510
1103
- },
1104
- {
1105
- "epoch": 4.864,
1106
- "grad_norm": 10.511373519897461,
1107
- "learning_rate": 2.8543447293447294e-05,
1108
- "loss": 0.2339,
1109
- "step": 1520
1110
- },
1111
- {
1112
- "epoch": 4.896,
1113
- "grad_norm": 16.811450958251953,
1114
- "learning_rate": 2.8365384615384616e-05,
1115
- "loss": 0.1969,
1116
- "step": 1530
1117
- },
1118
- {
1119
- "epoch": 4.928,
1120
- "grad_norm": 8.571528434753418,
1121
- "learning_rate": 2.8187321937321937e-05,
1122
- "loss": 0.1951,
1123
- "step": 1540
1124
- },
1125
- {
1126
- "epoch": 4.96,
1127
- "grad_norm": 15.971531867980957,
1128
- "learning_rate": 2.8009259259259263e-05,
1129
- "loss": 0.2012,
1130
- "step": 1550
1131
- },
1132
- {
1133
- "epoch": 4.992,
1134
- "grad_norm": 22.862905502319336,
1135
- "learning_rate": 2.7831196581196584e-05,
1136
- "loss": 0.1925,
1137
- "step": 1560
1138
- },
1139
- {
1140
- "epoch": 4.9984,
1141
- "eval_accuracy": 0.69075,
1142
- "eval_loss": 1.0642794370651245,
1143
- "eval_runtime": 164.1188,
1144
- "eval_samples_per_second": 24.373,
1145
- "eval_steps_per_second": 0.762,
1146
- "step": 1562
1147
- },
1148
- {
1149
- "epoch": 5.024,
1150
- "grad_norm": 14.107147216796875,
1151
- "learning_rate": 2.7653133903133903e-05,
1152
- "loss": 0.2248,
1153
- "step": 1570
1154
- },
1155
- {
1156
- "epoch": 5.056,
1157
- "grad_norm": 16.5964412689209,
1158
- "learning_rate": 2.7475071225071224e-05,
1159
- "loss": 0.1826,
1160
- "step": 1580
1161
- },
1162
- {
1163
- "epoch": 5.088,
1164
- "grad_norm": 9.056426048278809,
1165
- "learning_rate": 2.7297008547008546e-05,
1166
- "loss": 0.1975,
1167
- "step": 1590
1168
- },
1169
- {
1170
- "epoch": 5.12,
1171
- "grad_norm": 11.496504783630371,
1172
- "learning_rate": 2.711894586894587e-05,
1173
- "loss": 0.1818,
1174
- "step": 1600
1175
- },
1176
- {
1177
- "epoch": 5.152,
1178
- "grad_norm": 14.043261528015137,
1179
- "learning_rate": 2.6940883190883193e-05,
1180
- "loss": 0.1744,
1181
- "step": 1610
1182
- },
1183
- {
1184
- "epoch": 5.184,
1185
- "grad_norm": 8.685189247131348,
1186
- "learning_rate": 2.6762820512820515e-05,
1187
- "loss": 0.2042,
1188
- "step": 1620
1189
- },
1190
- {
1191
- "epoch": 5.216,
1192
- "grad_norm": 7.994570732116699,
1193
- "learning_rate": 2.6584757834757833e-05,
1194
- "loss": 0.1703,
1195
- "step": 1630
1196
- },
1197
- {
1198
- "epoch": 5.248,
1199
- "grad_norm": 14.493515968322754,
1200
- "learning_rate": 2.640669515669516e-05,
1201
- "loss": 0.1736,
1202
- "step": 1640
1203
- },
1204
- {
1205
- "epoch": 5.28,
1206
- "grad_norm": 7.479248523712158,
1207
- "learning_rate": 2.622863247863248e-05,
1208
- "loss": 0.1868,
1209
- "step": 1650
1210
- },
1211
- {
1212
- "epoch": 5.312,
1213
- "grad_norm": 12.55150318145752,
1214
- "learning_rate": 2.60505698005698e-05,
1215
- "loss": 0.2088,
1216
- "step": 1660
1217
- },
1218
- {
1219
- "epoch": 5.344,
1220
- "grad_norm": 8.034307479858398,
1221
- "learning_rate": 2.5872507122507123e-05,
1222
- "loss": 0.18,
1223
- "step": 1670
1224
- },
1225
- {
1226
- "epoch": 5.376,
1227
- "grad_norm": 7.336935997009277,
1228
- "learning_rate": 2.5694444444444445e-05,
1229
- "loss": 0.173,
1230
- "step": 1680
1231
- },
1232
- {
1233
- "epoch": 5.408,
1234
- "grad_norm": 7.364587306976318,
1235
- "learning_rate": 2.551638176638177e-05,
1236
- "loss": 0.1739,
1237
- "step": 1690
1238
- },
1239
- {
1240
- "epoch": 5.44,
1241
- "grad_norm": 12.251140594482422,
1242
- "learning_rate": 2.533831908831909e-05,
1243
- "loss": 0.1766,
1244
- "step": 1700
1245
- },
1246
- {
1247
- "epoch": 5.4719999999999995,
1248
- "grad_norm": 8.115775108337402,
1249
- "learning_rate": 2.516025641025641e-05,
1250
- "loss": 0.1862,
1251
- "step": 1710
1252
- },
1253
- {
1254
- "epoch": 5.504,
1255
- "grad_norm": 12.475907325744629,
1256
- "learning_rate": 2.4982193732193735e-05,
1257
- "loss": 0.1988,
1258
- "step": 1720
1259
- },
1260
- {
1261
- "epoch": 5.536,
1262
- "grad_norm": 7.327911376953125,
1263
- "learning_rate": 2.4804131054131057e-05,
1264
- "loss": 0.1828,
1265
- "step": 1730
1266
- },
1267
- {
1268
- "epoch": 5.568,
1269
- "grad_norm": 8.060572624206543,
1270
- "learning_rate": 2.462606837606838e-05,
1271
- "loss": 0.1692,
1272
- "step": 1740
1273
- },
1274
- {
1275
- "epoch": 5.6,
1276
- "grad_norm": 28.06664276123047,
1277
- "learning_rate": 2.44480056980057e-05,
1278
- "loss": 0.1345,
1279
- "step": 1750
1280
- },
1281
- {
1282
- "epoch": 5.632,
1283
- "grad_norm": 12.124792098999023,
1284
- "learning_rate": 2.426994301994302e-05,
1285
- "loss": 0.2026,
1286
- "step": 1760
1287
- },
1288
- {
1289
- "epoch": 5.664,
1290
- "grad_norm": 12.650993347167969,
1291
- "learning_rate": 2.4091880341880344e-05,
1292
- "loss": 0.1861,
1293
- "step": 1770
1294
- },
1295
- {
1296
- "epoch": 5.696,
1297
- "grad_norm": 15.481111526489258,
1298
- "learning_rate": 2.3913817663817665e-05,
1299
- "loss": 0.1963,
1300
- "step": 1780
1301
- },
1302
- {
1303
- "epoch": 5.728,
1304
- "grad_norm": 12.246734619140625,
1305
- "learning_rate": 2.3735754985754987e-05,
1306
- "loss": 0.162,
1307
- "step": 1790
1308
- },
1309
- {
1310
- "epoch": 5.76,
1311
- "grad_norm": 11.539756774902344,
1312
- "learning_rate": 2.355769230769231e-05,
1313
- "loss": 0.2141,
1314
- "step": 1800
1315
- },
1316
- {
1317
- "epoch": 5.792,
1318
- "grad_norm": 7.406561374664307,
1319
- "learning_rate": 2.337962962962963e-05,
1320
- "loss": 0.1877,
1321
- "step": 1810
1322
- },
1323
- {
1324
- "epoch": 5.824,
1325
- "grad_norm": 14.728952407836914,
1326
- "learning_rate": 2.3201566951566952e-05,
1327
- "loss": 0.1796,
1328
- "step": 1820
1329
- },
1330
- {
1331
- "epoch": 5.856,
1332
- "grad_norm": 7.532665729522705,
1333
- "learning_rate": 2.3023504273504274e-05,
1334
- "loss": 0.157,
1335
- "step": 1830
1336
- },
1337
- {
1338
- "epoch": 5.888,
1339
- "grad_norm": 16.279264450073242,
1340
- "learning_rate": 2.2845441595441596e-05,
1341
- "loss": 0.1836,
1342
- "step": 1840
1343
- },
1344
- {
1345
- "epoch": 5.92,
1346
- "grad_norm": 7.029083728790283,
1347
- "learning_rate": 2.2667378917378917e-05,
1348
- "loss": 0.1788,
1349
- "step": 1850
1350
- },
1351
- {
1352
- "epoch": 5.952,
1353
- "grad_norm": 7.0097174644470215,
1354
- "learning_rate": 2.2489316239316242e-05,
1355
- "loss": 0.1658,
1356
- "step": 1860
1357
- },
1358
- {
1359
- "epoch": 5.984,
1360
- "grad_norm": 8.254197120666504,
1361
- "learning_rate": 2.231125356125356e-05,
1362
- "loss": 0.177,
1363
- "step": 1870
1364
- },
1365
- {
1366
- "epoch": 6.0,
1367
- "eval_accuracy": 0.68425,
1368
- "eval_loss": 1.2558131217956543,
1369
- "eval_runtime": 148.6431,
1370
- "eval_samples_per_second": 26.91,
1371
- "eval_steps_per_second": 0.841,
1372
- "step": 1875
1373
- },
1374
- {
1375
- "epoch": 6.016,
1376
- "grad_norm": 10.65648078918457,
1377
- "learning_rate": 2.2133190883190886e-05,
1378
- "loss": 0.197,
1379
- "step": 1880
1380
- },
1381
- {
1382
- "epoch": 6.048,
1383
- "grad_norm": 12.431746482849121,
1384
- "learning_rate": 2.1955128205128208e-05,
1385
- "loss": 0.1876,
1386
- "step": 1890
1387
- },
1388
- {
1389
- "epoch": 6.08,
1390
- "grad_norm": 20.30584716796875,
1391
- "learning_rate": 2.177706552706553e-05,
1392
- "loss": 0.1651,
1393
- "step": 1900
1394
- },
1395
- {
1396
- "epoch": 6.112,
1397
- "grad_norm": 13.690505981445312,
1398
- "learning_rate": 2.159900284900285e-05,
1399
- "loss": 0.1646,
1400
- "step": 1910
1401
- },
1402
- {
1403
- "epoch": 6.144,
1404
- "grad_norm": 23.424381256103516,
1405
- "learning_rate": 2.142094017094017e-05,
1406
- "loss": 0.1732,
1407
- "step": 1920
1408
- },
1409
- {
1410
- "epoch": 6.176,
1411
- "grad_norm": 8.711691856384277,
1412
- "learning_rate": 2.1242877492877494e-05,
1413
- "loss": 0.1537,
1414
- "step": 1930
1415
- },
1416
- {
1417
- "epoch": 6.208,
1418
- "grad_norm": 10.551396369934082,
1419
- "learning_rate": 2.1064814814814816e-05,
1420
- "loss": 0.1512,
1421
- "step": 1940
1422
- },
1423
- {
1424
- "epoch": 6.24,
1425
- "grad_norm": 31.970632553100586,
1426
- "learning_rate": 2.0886752136752138e-05,
1427
- "loss": 0.1607,
1428
- "step": 1950
1429
- },
1430
- {
1431
- "epoch": 6.272,
1432
- "grad_norm": 14.962709426879883,
1433
- "learning_rate": 2.070868945868946e-05,
1434
- "loss": 0.1476,
1435
- "step": 1960
1436
- },
1437
- {
1438
- "epoch": 6.304,
1439
- "grad_norm": 11.659952163696289,
1440
- "learning_rate": 2.053062678062678e-05,
1441
- "loss": 0.1627,
1442
- "step": 1970
1443
- },
1444
- {
1445
- "epoch": 6.336,
1446
- "grad_norm": 8.625643730163574,
1447
- "learning_rate": 2.0352564102564103e-05,
1448
- "loss": 0.1644,
1449
- "step": 1980
1450
- },
1451
- {
1452
- "epoch": 6.368,
1453
- "grad_norm": 20.484344482421875,
1454
- "learning_rate": 2.0174501424501425e-05,
1455
- "loss": 0.1605,
1456
- "step": 1990
1457
- },
1458
- {
1459
- "epoch": 6.4,
1460
- "grad_norm": 8.137100219726562,
1461
- "learning_rate": 1.9996438746438746e-05,
1462
- "loss": 0.1482,
1463
- "step": 2000
1464
- },
1465
- {
1466
- "epoch": 6.432,
1467
- "grad_norm": 9.601692199707031,
1468
- "learning_rate": 1.9818376068376068e-05,
1469
- "loss": 0.1654,
1470
- "step": 2010
1471
- },
1472
- {
1473
- "epoch": 6.464,
1474
- "grad_norm": 10.012831687927246,
1475
- "learning_rate": 1.9640313390313393e-05,
1476
- "loss": 0.1925,
1477
- "step": 2020
1478
- },
1479
- {
1480
- "epoch": 6.496,
1481
- "grad_norm": 7.507844924926758,
1482
- "learning_rate": 1.946225071225071e-05,
1483
- "loss": 0.1735,
1484
- "step": 2030
1485
- },
1486
- {
1487
- "epoch": 6.5280000000000005,
1488
- "grad_norm": 19.662033081054688,
1489
- "learning_rate": 1.9284188034188037e-05,
1490
- "loss": 0.1547,
1491
- "step": 2040
1492
- },
1493
- {
1494
- "epoch": 6.5600000000000005,
1495
- "grad_norm": 7.522573947906494,
1496
- "learning_rate": 1.910612535612536e-05,
1497
- "loss": 0.1602,
1498
- "step": 2050
1499
- },
1500
- {
1501
- "epoch": 6.592,
1502
- "grad_norm": 9.965826034545898,
1503
- "learning_rate": 1.892806267806268e-05,
1504
- "loss": 0.1488,
1505
- "step": 2060
1506
- },
1507
- {
1508
- "epoch": 6.624,
1509
- "grad_norm": 6.560938358306885,
1510
- "learning_rate": 1.8750000000000002e-05,
1511
- "loss": 0.1439,
1512
- "step": 2070
1513
- },
1514
- {
1515
- "epoch": 6.656,
1516
- "grad_norm": 16.683101654052734,
1517
- "learning_rate": 1.857193732193732e-05,
1518
- "loss": 0.1384,
1519
- "step": 2080
1520
- },
1521
- {
1522
- "epoch": 6.688,
1523
- "grad_norm": 7.641848564147949,
1524
- "learning_rate": 1.8393874643874645e-05,
1525
- "loss": 0.1429,
1526
- "step": 2090
1527
- },
1528
- {
1529
- "epoch": 6.72,
1530
- "grad_norm": 11.998226165771484,
1531
- "learning_rate": 1.8215811965811967e-05,
1532
- "loss": 0.1485,
1533
- "step": 2100
1534
- },
1535
- {
1536
- "epoch": 6.752,
1537
- "grad_norm": 7.238443374633789,
1538
- "learning_rate": 1.803774928774929e-05,
1539
- "loss": 0.1544,
1540
- "step": 2110
1541
- },
1542
- {
1543
- "epoch": 6.784,
1544
- "grad_norm": 7.6015777587890625,
1545
- "learning_rate": 1.785968660968661e-05,
1546
- "loss": 0.1864,
1547
- "step": 2120
1548
- },
1549
- {
1550
- "epoch": 6.816,
1551
- "grad_norm": 12.038315773010254,
1552
- "learning_rate": 1.7681623931623932e-05,
1553
- "loss": 0.1479,
1554
- "step": 2130
1555
- },
1556
- {
1557
- "epoch": 6.848,
1558
- "grad_norm": 25.752042770385742,
1559
- "learning_rate": 1.7503561253561254e-05,
1560
- "loss": 0.1479,
1561
- "step": 2140
1562
- },
1563
- {
1564
- "epoch": 6.88,
1565
- "grad_norm": 12.058791160583496,
1566
- "learning_rate": 1.732549857549858e-05,
1567
- "loss": 0.1342,
1568
- "step": 2150
1569
- },
1570
- {
1571
- "epoch": 6.912,
1572
- "grad_norm": 18.634387969970703,
1573
- "learning_rate": 1.7147435897435897e-05,
1574
- "loss": 0.165,
1575
- "step": 2160
1576
- },
1577
- {
1578
- "epoch": 6.944,
1579
- "grad_norm": 8.238666534423828,
1580
- "learning_rate": 1.696937321937322e-05,
1581
- "loss": 0.1452,
1582
- "step": 2170
1583
- },
1584
- {
1585
- "epoch": 6.976,
1586
- "grad_norm": 8.537055015563965,
1587
- "learning_rate": 1.6791310541310544e-05,
1588
- "loss": 0.1563,
1589
- "step": 2180
1590
- },
1591
- {
1592
- "epoch": 6.9984,
1593
- "eval_accuracy": 0.7445,
1594
- "eval_loss": 0.9205208420753479,
1595
- "eval_runtime": 156.8331,
1596
- "eval_samples_per_second": 25.505,
1597
- "eval_steps_per_second": 0.797,
1598
- "step": 2187
1599
- },
1600
- {
1601
- "epoch": 7.008,
1602
- "grad_norm": 21.315475463867188,
1603
- "learning_rate": 1.6613247863247862e-05,
1604
- "loss": 0.1621,
1605
- "step": 2190
1606
- },
1607
- {
1608
- "epoch": 7.04,
1609
- "grad_norm": 4.930037975311279,
1610
- "learning_rate": 1.6435185185185187e-05,
1611
- "loss": 0.143,
1612
- "step": 2200
1613
- },
1614
- {
1615
- "epoch": 7.072,
1616
- "grad_norm": 20.74496078491211,
1617
- "learning_rate": 1.625712250712251e-05,
1618
- "loss": 0.1334,
1619
- "step": 2210
1620
- },
1621
- {
1622
- "epoch": 7.104,
1623
- "grad_norm": 32.969154357910156,
1624
- "learning_rate": 1.607905982905983e-05,
1625
- "loss": 0.1502,
1626
- "step": 2220
1627
- },
1628
- {
1629
- "epoch": 7.136,
1630
- "grad_norm": 10.03094482421875,
1631
- "learning_rate": 1.5900997150997153e-05,
1632
- "loss": 0.1611,
1633
- "step": 2230
1634
- },
1635
- {
1636
- "epoch": 7.168,
1637
- "grad_norm": 10.585143089294434,
1638
- "learning_rate": 1.572293447293447e-05,
1639
- "loss": 0.1583,
1640
- "step": 2240
1641
- },
1642
- {
1643
- "epoch": 7.2,
1644
- "grad_norm": 11.96883487701416,
1645
- "learning_rate": 1.5544871794871796e-05,
1646
- "loss": 0.1433,
1647
- "step": 2250
1648
- },
1649
- {
1650
- "epoch": 7.232,
1651
- "grad_norm": 17.068143844604492,
1652
- "learning_rate": 1.5366809116809118e-05,
1653
- "loss": 0.1641,
1654
- "step": 2260
1655
- },
1656
- {
1657
- "epoch": 7.264,
1658
- "grad_norm": 9.417765617370605,
1659
- "learning_rate": 1.518874643874644e-05,
1660
- "loss": 0.1266,
1661
- "step": 2270
1662
- },
1663
- {
1664
- "epoch": 7.296,
1665
- "grad_norm": 12.315200805664062,
1666
- "learning_rate": 1.5010683760683761e-05,
1667
- "loss": 0.1337,
1668
- "step": 2280
1669
- },
1670
- {
1671
- "epoch": 7.328,
1672
- "grad_norm": 20.546875,
1673
- "learning_rate": 1.4832621082621084e-05,
1674
- "loss": 0.1312,
1675
- "step": 2290
1676
- },
1677
- {
1678
- "epoch": 7.36,
1679
- "grad_norm": 7.2887468338012695,
1680
- "learning_rate": 1.4654558404558405e-05,
1681
- "loss": 0.1534,
1682
- "step": 2300
1683
- },
1684
- {
1685
- "epoch": 7.392,
1686
- "grad_norm": 8.704584121704102,
1687
- "learning_rate": 1.4476495726495728e-05,
1688
- "loss": 0.1317,
1689
- "step": 2310
1690
- },
1691
- {
1692
- "epoch": 7.424,
1693
- "grad_norm": 9.408025741577148,
1694
- "learning_rate": 1.429843304843305e-05,
1695
- "loss": 0.1336,
1696
- "step": 2320
1697
- },
1698
- {
1699
- "epoch": 7.456,
1700
- "grad_norm": 16.08376121520996,
1701
- "learning_rate": 1.412037037037037e-05,
1702
- "loss": 0.1361,
1703
- "step": 2330
1704
- },
1705
- {
1706
- "epoch": 7.4879999999999995,
1707
- "grad_norm": 13.386305809020996,
1708
- "learning_rate": 1.3942307692307693e-05,
1709
- "loss": 0.1254,
1710
- "step": 2340
1711
- },
1712
- {
1713
- "epoch": 7.52,
1714
- "grad_norm": 8.729687690734863,
1715
- "learning_rate": 1.3764245014245015e-05,
1716
- "loss": 0.1477,
1717
- "step": 2350
1718
- },
1719
- {
1720
- "epoch": 7.552,
1721
- "grad_norm": 7.066646099090576,
1722
- "learning_rate": 1.3586182336182338e-05,
1723
- "loss": 0.1266,
1724
- "step": 2360
1725
- },
1726
- {
1727
- "epoch": 7.584,
1728
- "grad_norm": 11.6298828125,
1729
- "learning_rate": 1.3408119658119658e-05,
1730
- "loss": 0.1303,
1731
- "step": 2370
1732
- },
1733
- {
1734
- "epoch": 7.616,
1735
- "grad_norm": 5.477276802062988,
1736
- "learning_rate": 1.3230056980056982e-05,
1737
- "loss": 0.126,
1738
- "step": 2380
1739
- },
1740
- {
1741
- "epoch": 7.648,
1742
- "grad_norm": 9.086950302124023,
1743
- "learning_rate": 1.3051994301994303e-05,
1744
- "loss": 0.1383,
1745
- "step": 2390
1746
- },
1747
- {
1748
- "epoch": 7.68,
1749
- "grad_norm": 7.193227291107178,
1750
- "learning_rate": 1.2873931623931623e-05,
1751
- "loss": 0.1371,
1752
- "step": 2400
1753
- },
1754
- {
1755
- "epoch": 7.712,
1756
- "grad_norm": 11.82983684539795,
1757
- "learning_rate": 1.2695868945868947e-05,
1758
- "loss": 0.1406,
1759
- "step": 2410
1760
- },
1761
- {
1762
- "epoch": 7.744,
1763
- "grad_norm": 9.807685852050781,
1764
- "learning_rate": 1.2517806267806267e-05,
1765
- "loss": 0.1349,
1766
- "step": 2420
1767
- },
1768
- {
1769
- "epoch": 7.776,
1770
- "grad_norm": 13.509228706359863,
1771
- "learning_rate": 1.233974358974359e-05,
1772
- "loss": 0.1324,
1773
- "step": 2430
1774
- },
1775
- {
1776
- "epoch": 7.808,
1777
- "grad_norm": 20.25900650024414,
1778
- "learning_rate": 1.2161680911680912e-05,
1779
- "loss": 0.1337,
1780
- "step": 2440
1781
- },
1782
- {
1783
- "epoch": 7.84,
1784
- "grad_norm": 8.688305854797363,
1785
- "learning_rate": 1.1983618233618234e-05,
1786
- "loss": 0.1206,
1787
- "step": 2450
1788
- },
1789
- {
1790
- "epoch": 7.872,
1791
- "grad_norm": 16.35320472717285,
1792
- "learning_rate": 1.1805555555555555e-05,
1793
- "loss": 0.133,
1794
- "step": 2460
1795
- },
1796
- {
1797
- "epoch": 7.904,
1798
- "grad_norm": 35.276092529296875,
1799
- "learning_rate": 1.1627492877492879e-05,
1800
- "loss": 0.1425,
1801
- "step": 2470
1802
- },
1803
- {
1804
- "epoch": 7.936,
1805
- "grad_norm": 16.226844787597656,
1806
- "learning_rate": 1.14494301994302e-05,
1807
- "loss": 0.1421,
1808
- "step": 2480
1809
- },
1810
- {
1811
- "epoch": 7.968,
1812
- "grad_norm": 4.65103006362915,
1813
- "learning_rate": 1.1271367521367522e-05,
1814
- "loss": 0.1225,
1815
- "step": 2490
1816
- },
1817
- {
1818
- "epoch": 8.0,
1819
- "grad_norm": 10.949529647827148,
1820
- "learning_rate": 1.1093304843304844e-05,
1821
- "loss": 0.1417,
1822
- "step": 2500
1823
- },
1824
- {
1825
- "epoch": 8.0,
1826
- "eval_accuracy": 0.80625,
1827
- "eval_loss": 0.662419319152832,
1828
- "eval_runtime": 156.8602,
1829
- "eval_samples_per_second": 25.5,
1830
- "eval_steps_per_second": 0.797,
1831
- "step": 2500
1832
- },
1833
- {
1834
- "epoch": 8.032,
1835
- "grad_norm": 14.452116966247559,
1836
- "learning_rate": 1.0915242165242166e-05,
1837
- "loss": 0.1422,
1838
- "step": 2510
1839
- },
1840
- {
1841
- "epoch": 8.064,
1842
- "grad_norm": 26.71641731262207,
1843
- "learning_rate": 1.0737179487179487e-05,
1844
- "loss": 0.1337,
1845
- "step": 2520
1846
- },
1847
- {
1848
- "epoch": 8.096,
1849
- "grad_norm": 8.862665176391602,
1850
- "learning_rate": 1.0559116809116809e-05,
1851
- "loss": 0.1349,
1852
- "step": 2530
1853
- },
1854
- {
1855
- "epoch": 8.128,
1856
- "grad_norm": 12.3915433883667,
1857
- "learning_rate": 1.038105413105413e-05,
1858
- "loss": 0.1479,
1859
- "step": 2540
1860
- },
1861
- {
1862
- "epoch": 8.16,
1863
- "grad_norm": 11.4617280960083,
1864
- "learning_rate": 1.0202991452991454e-05,
1865
- "loss": 0.1295,
1866
- "step": 2550
1867
- },
1868
- {
1869
- "epoch": 8.192,
1870
- "grad_norm": 19.975990295410156,
1871
- "learning_rate": 1.0024928774928776e-05,
1872
- "loss": 0.1382,
1873
- "step": 2560
1874
- },
1875
- {
1876
- "epoch": 8.224,
1877
- "grad_norm": 9.245658874511719,
1878
- "learning_rate": 9.846866096866097e-06,
1879
- "loss": 0.1308,
1880
- "step": 2570
1881
- },
1882
- {
1883
- "epoch": 8.256,
1884
- "grad_norm": 5.326711177825928,
1885
- "learning_rate": 9.66880341880342e-06,
1886
- "loss": 0.1023,
1887
- "step": 2580
1888
- },
1889
- {
1890
- "epoch": 8.288,
1891
- "grad_norm": 11.740446090698242,
1892
- "learning_rate": 9.490740740740741e-06,
1893
- "loss": 0.1262,
1894
- "step": 2590
1895
- },
1896
- {
1897
- "epoch": 8.32,
1898
- "grad_norm": 10.618571281433105,
1899
- "learning_rate": 9.312678062678064e-06,
1900
- "loss": 0.1306,
1901
- "step": 2600
1902
- },
1903
- {
1904
- "epoch": 8.352,
1905
- "grad_norm": 21.31345558166504,
1906
- "learning_rate": 9.134615384615384e-06,
1907
- "loss": 0.136,
1908
- "step": 2610
1909
- },
1910
- {
1911
- "epoch": 8.384,
1912
- "grad_norm": 6.4540863037109375,
1913
- "learning_rate": 8.956552706552706e-06,
1914
- "loss": 0.1063,
1915
- "step": 2620
1916
- },
1917
- {
1918
- "epoch": 8.416,
1919
- "grad_norm": 12.291879653930664,
1920
- "learning_rate": 8.77849002849003e-06,
1921
- "loss": 0.1222,
1922
- "step": 2630
1923
- },
1924
- {
1925
- "epoch": 8.448,
1926
- "grad_norm": 7.860499858856201,
1927
- "learning_rate": 8.600427350427351e-06,
1928
- "loss": 0.1152,
1929
- "step": 2640
1930
- },
1931
- {
1932
- "epoch": 8.48,
1933
- "grad_norm": 7.345857620239258,
1934
- "learning_rate": 8.422364672364673e-06,
1935
- "loss": 0.1194,
1936
- "step": 2650
1937
- },
1938
- {
1939
- "epoch": 8.512,
1940
- "grad_norm": 25.202924728393555,
1941
- "learning_rate": 8.244301994301995e-06,
1942
- "loss": 0.1214,
1943
- "step": 2660
1944
- },
1945
- {
1946
- "epoch": 8.544,
1947
- "grad_norm": 4.337811470031738,
1948
- "learning_rate": 8.066239316239316e-06,
1949
- "loss": 0.1316,
1950
- "step": 2670
1951
- },
1952
- {
1953
- "epoch": 8.576,
1954
- "grad_norm": 9.332151412963867,
1955
- "learning_rate": 7.88817663817664e-06,
1956
- "loss": 0.1208,
1957
- "step": 2680
1958
- },
1959
- {
1960
- "epoch": 8.608,
1961
- "grad_norm": 15.260229110717773,
1962
- "learning_rate": 7.71011396011396e-06,
1963
- "loss": 0.1223,
1964
- "step": 2690
1965
- },
1966
- {
1967
- "epoch": 8.64,
1968
- "grad_norm": 4.976259708404541,
1969
- "learning_rate": 7.532051282051282e-06,
1970
- "loss": 0.111,
1971
- "step": 2700
1972
- },
1973
- {
1974
- "epoch": 8.672,
1975
- "grad_norm": 18.121919631958008,
1976
- "learning_rate": 7.353988603988604e-06,
1977
- "loss": 0.1184,
1978
- "step": 2710
1979
- },
1980
- {
1981
- "epoch": 8.704,
1982
- "grad_norm": 15.61289119720459,
1983
- "learning_rate": 7.1759259259259266e-06,
1984
- "loss": 0.125,
1985
- "step": 2720
1986
- },
1987
- {
1988
- "epoch": 8.736,
1989
- "grad_norm": 14.504264831542969,
1990
- "learning_rate": 6.997863247863248e-06,
1991
- "loss": 0.1165,
1992
- "step": 2730
1993
- },
1994
- {
1995
- "epoch": 8.768,
1996
- "grad_norm": 10.373973846435547,
1997
- "learning_rate": 6.819800569800571e-06,
1998
- "loss": 0.1117,
1999
- "step": 2740
2000
- },
2001
- {
2002
- "epoch": 8.8,
2003
- "grad_norm": 32.67918014526367,
2004
- "learning_rate": 6.6417378917378925e-06,
2005
- "loss": 0.1345,
2006
- "step": 2750
2007
- },
2008
- {
2009
- "epoch": 8.832,
2010
- "grad_norm": 11.788604736328125,
2011
- "learning_rate": 6.463675213675214e-06,
2012
- "loss": 0.1194,
2013
- "step": 2760
2014
- },
2015
- {
2016
- "epoch": 8.864,
2017
- "grad_norm": 10.273115158081055,
2018
- "learning_rate": 6.285612535612535e-06,
2019
- "loss": 0.1089,
2020
- "step": 2770
2021
- },
2022
- {
2023
- "epoch": 8.896,
2024
- "grad_norm": 18.808115005493164,
2025
- "learning_rate": 6.1075498575498585e-06,
2026
- "loss": 0.1106,
2027
- "step": 2780
2028
- },
2029
- {
2030
- "epoch": 8.928,
2031
- "grad_norm": 8.078729629516602,
2032
- "learning_rate": 5.929487179487179e-06,
2033
- "loss": 0.1407,
2034
- "step": 2790
2035
- },
2036
- {
2037
- "epoch": 8.96,
2038
- "grad_norm": 6.521687984466553,
2039
- "learning_rate": 5.751424501424502e-06,
2040
- "loss": 0.1195,
2041
- "step": 2800
2042
- },
2043
- {
2044
- "epoch": 8.992,
2045
- "grad_norm": 7.088119029998779,
2046
- "learning_rate": 5.573361823361824e-06,
2047
- "loss": 0.1284,
2048
- "step": 2810
2049
- },
2050
- {
2051
- "epoch": 8.9984,
2052
- "eval_accuracy": 0.739,
2053
- "eval_loss": 1.1648138761520386,
2054
- "eval_runtime": 157.4642,
2055
- "eval_samples_per_second": 25.403,
2056
- "eval_steps_per_second": 0.794,
2057
- "step": 2812
2058
- },
2059
- {
2060
- "epoch": 9.024,
2061
- "grad_norm": 19.490074157714844,
2062
- "learning_rate": 5.395299145299146e-06,
2063
- "loss": 0.1084,
2064
- "step": 2820
2065
- },
2066
- {
2067
- "epoch": 9.056,
2068
- "grad_norm": 6.817428112030029,
2069
- "learning_rate": 5.217236467236467e-06,
2070
- "loss": 0.1194,
2071
- "step": 2830
2072
- },
2073
- {
2074
- "epoch": 9.088,
2075
- "grad_norm": 8.714266777038574,
2076
- "learning_rate": 5.03917378917379e-06,
2077
- "loss": 0.1126,
2078
- "step": 2840
2079
- },
2080
- {
2081
- "epoch": 9.12,
2082
- "grad_norm": 15.373977661132812,
2083
- "learning_rate": 4.861111111111111e-06,
2084
- "loss": 0.1147,
2085
- "step": 2850
2086
- },
2087
- {
2088
- "epoch": 9.152,
2089
- "grad_norm": 8.224656105041504,
2090
- "learning_rate": 4.683048433048434e-06,
2091
- "loss": 0.1023,
2092
- "step": 2860
2093
- },
2094
- {
2095
- "epoch": 9.184,
2096
- "grad_norm": 14.560908317565918,
2097
- "learning_rate": 4.504985754985755e-06,
2098
- "loss": 0.1156,
2099
- "step": 2870
2100
- },
2101
- {
2102
- "epoch": 9.216,
2103
- "grad_norm": 9.510125160217285,
2104
- "learning_rate": 4.326923076923077e-06,
2105
- "loss": 0.1123,
2106
- "step": 2880
2107
- },
2108
- {
2109
- "epoch": 9.248,
2110
- "grad_norm": 11.263538360595703,
2111
- "learning_rate": 4.148860398860399e-06,
2112
- "loss": 0.1126,
2113
- "step": 2890
2114
- },
2115
- {
2116
- "epoch": 9.28,
2117
- "grad_norm": 7.910732269287109,
2118
- "learning_rate": 3.970797720797722e-06,
2119
- "loss": 0.1075,
2120
- "step": 2900
2121
- },
2122
- {
2123
- "epoch": 9.312,
2124
- "grad_norm": 6.68363618850708,
2125
- "learning_rate": 3.7927350427350425e-06,
2126
- "loss": 0.1001,
2127
- "step": 2910
2128
- },
2129
- {
2130
- "epoch": 9.344,
2131
- "grad_norm": 7.76242733001709,
2132
- "learning_rate": 3.6146723646723646e-06,
2133
- "loss": 0.0995,
2134
- "step": 2920
2135
- },
2136
- {
2137
- "epoch": 9.376,
2138
- "grad_norm": 12.8687744140625,
2139
- "learning_rate": 3.4366096866096867e-06,
2140
- "loss": 0.1449,
2141
- "step": 2930
2142
- },
2143
- {
2144
- "epoch": 9.408,
2145
- "grad_norm": 12.708325386047363,
2146
- "learning_rate": 3.258547008547009e-06,
2147
- "loss": 0.1299,
2148
- "step": 2940
2149
- },
2150
- {
2151
- "epoch": 9.44,
2152
- "grad_norm": 14.307499885559082,
2153
- "learning_rate": 3.0804843304843306e-06,
2154
- "loss": 0.1259,
2155
- "step": 2950
2156
- },
2157
- {
2158
- "epoch": 9.472,
2159
- "grad_norm": 16.907052993774414,
2160
- "learning_rate": 2.9024216524216523e-06,
2161
- "loss": 0.1014,
2162
- "step": 2960
2163
- },
2164
- {
2165
- "epoch": 9.504,
2166
- "grad_norm": 20.69292640686035,
2167
- "learning_rate": 2.7243589743589744e-06,
2168
- "loss": 0.1059,
2169
- "step": 2970
2170
- },
2171
- {
2172
- "epoch": 9.536,
2173
- "grad_norm": 7.013891696929932,
2174
- "learning_rate": 2.546296296296296e-06,
2175
- "loss": 0.1025,
2176
- "step": 2980
2177
- },
2178
- {
2179
- "epoch": 9.568,
2180
- "grad_norm": 10.161940574645996,
2181
- "learning_rate": 2.3682336182336183e-06,
2182
- "loss": 0.1132,
2183
- "step": 2990
2184
- },
2185
- {
2186
- "epoch": 9.6,
2187
- "grad_norm": 9.757913589477539,
2188
- "learning_rate": 2.19017094017094e-06,
2189
- "loss": 0.1053,
2190
- "step": 3000
2191
- },
2192
- {
2193
- "epoch": 9.632,
2194
- "grad_norm": 13.03531265258789,
2195
- "learning_rate": 2.012108262108262e-06,
2196
- "loss": 0.0961,
2197
- "step": 3010
2198
- },
2199
- {
2200
- "epoch": 9.664,
2201
- "grad_norm": 8.091846466064453,
2202
- "learning_rate": 1.834045584045584e-06,
2203
- "loss": 0.0904,
2204
- "step": 3020
2205
- },
2206
- {
2207
- "epoch": 9.696,
2208
- "grad_norm": 6.219699382781982,
2209
- "learning_rate": 1.6559829059829062e-06,
2210
- "loss": 0.0947,
2211
- "step": 3030
2212
- },
2213
- {
2214
- "epoch": 9.728,
2215
- "grad_norm": 8.676128387451172,
2216
- "learning_rate": 1.477920227920228e-06,
2217
- "loss": 0.095,
2218
- "step": 3040
2219
- },
2220
- {
2221
- "epoch": 9.76,
2222
- "grad_norm": 12.65188980102539,
2223
- "learning_rate": 1.29985754985755e-06,
2224
- "loss": 0.1245,
2225
- "step": 3050
2226
- },
2227
- {
2228
- "epoch": 9.792,
2229
- "grad_norm": 9.408976554870605,
2230
- "learning_rate": 1.121794871794872e-06,
2231
- "loss": 0.112,
2232
- "step": 3060
2233
- },
2234
- {
2235
- "epoch": 9.824,
2236
- "grad_norm": 5.554480075836182,
2237
- "learning_rate": 9.437321937321938e-07,
2238
- "loss": 0.0919,
2239
- "step": 3070
2240
- },
2241
- {
2242
- "epoch": 9.856,
2243
- "grad_norm": 20.830698013305664,
2244
- "learning_rate": 7.656695156695157e-07,
2245
- "loss": 0.1212,
2246
- "step": 3080
2247
- },
2248
- {
2249
- "epoch": 9.888,
2250
- "grad_norm": 8.195901870727539,
2251
- "learning_rate": 5.876068376068376e-07,
2252
- "loss": 0.1122,
2253
- "step": 3090
2254
- },
2255
- {
2256
- "epoch": 9.92,
2257
- "grad_norm": 14.159358024597168,
2258
- "learning_rate": 4.0954415954415953e-07,
2259
- "loss": 0.1236,
2260
- "step": 3100
2261
- },
2262
- {
2263
- "epoch": 9.952,
2264
- "grad_norm": 8.97334098815918,
2265
- "learning_rate": 2.3148148148148148e-07,
2266
- "loss": 0.1078,
2267
- "step": 3110
2268
- },
2269
- {
2270
- "epoch": 9.984,
2271
- "grad_norm": 8.013399124145508,
2272
- "learning_rate": 5.341880341880343e-08,
2273
- "loss": 0.0805,
2274
- "step": 3120
2275
- },
2276
- {
2277
- "epoch": 9.984,
2278
- "eval_accuracy": 0.74475,
2279
- "eval_loss": 1.1793317794799805,
2280
- "eval_runtime": 151.3727,
2281
- "eval_samples_per_second": 26.425,
2282
- "eval_steps_per_second": 0.826,
2283
- "step": 3120
2284
  },
2285
  {
2286
- "epoch": 9.984,
2287
- "step": 3120,
2288
- "total_flos": 9.926487761391452e+18,
2289
- "train_loss": 0.24441122439427254,
2290
- "train_runtime": 137433.0815,
2291
- "train_samples_per_second": 2.911,
2292
  "train_steps_per_second": 0.023
2293
  }
2294
  ],
2295
  "logging_steps": 10,
2296
- "max_steps": 3120,
2297
  "num_input_tokens_seen": 0,
2298
- "num_train_epochs": 10,
2299
  "save_steps": 500,
2300
  "stateful_callbacks": {
2301
  "TrainerControl": {
@@ -2309,7 +713,7 @@
2309
  "attributes": {}
2310
  }
2311
  },
2312
- "total_flos": 9.926487761391452e+18,
2313
  "train_batch_size": 32,
2314
  "trial_name": null,
2315
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7025,
3
+ "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-leukemia-08-2024.v1.2\\checkpoint-625",
4
+ "epoch": 2.9952,
5
  "eval_steps": 500,
6
+ "global_step": 936,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.032,
13
+ "grad_norm": 5.205770969390869,
14
+ "learning_rate": 5.319148936170213e-06,
15
+ "loss": 0.6922,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.064,
20
+ "grad_norm": 5.621664524078369,
21
+ "learning_rate": 1.0638297872340426e-05,
22
+ "loss": 0.6735,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.096,
27
+ "grad_norm": 4.5251078605651855,
28
+ "learning_rate": 1.595744680851064e-05,
29
+ "loss": 0.6799,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.128,
34
+ "grad_norm": 6.694261074066162,
35
+ "learning_rate": 2.1276595744680852e-05,
36
+ "loss": 0.6703,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.16,
41
+ "grad_norm": 5.9614176750183105,
42
+ "learning_rate": 2.6595744680851064e-05,
43
+ "loss": 0.6335,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.192,
48
+ "grad_norm": 9.891090393066406,
49
+ "learning_rate": 3.191489361702128e-05,
50
+ "loss": 0.6442,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.224,
55
+ "grad_norm": 5.152803897857666,
56
+ "learning_rate": 3.723404255319149e-05,
57
+ "loss": 0.6192,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.256,
62
+ "grad_norm": 19.81842613220215,
63
+ "learning_rate": 4.2553191489361704e-05,
64
+ "loss": 0.6061,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.288,
69
+ "grad_norm": 12.223305702209473,
70
+ "learning_rate": 4.787234042553192e-05,
71
+ "loss": 0.5728,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.32,
76
+ "grad_norm": 9.165103912353516,
77
+ "learning_rate": 4.96437054631829e-05,
78
+ "loss": 0.532,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.352,
83
+ "grad_norm": 11.49010181427002,
84
+ "learning_rate": 4.90498812351544e-05,
85
+ "loss": 0.5192,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.384,
90
+ "grad_norm": 16.154022216796875,
91
+ "learning_rate": 4.851543942992874e-05,
92
+ "loss": 0.5428,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.416,
97
+ "grad_norm": 15.223276138305664,
98
+ "learning_rate": 4.792161520190024e-05,
99
+ "loss": 0.4874,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.448,
104
+ "grad_norm": 15.795281410217285,
105
+ "learning_rate": 4.732779097387174e-05,
106
+ "loss": 0.4917,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.48,
111
+ "grad_norm": 15.048002243041992,
112
+ "learning_rate": 4.673396674584323e-05,
113
+ "loss": 0.4914,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.512,
118
+ "grad_norm": 9.113037109375,
119
+ "learning_rate": 4.6140142517814724e-05,
120
+ "loss": 0.4794,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.544,
125
+ "grad_norm": 8.940505027770996,
126
+ "learning_rate": 4.5546318289786225e-05,
127
+ "loss": 0.467,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.576,
132
+ "grad_norm": 10.99445915222168,
133
+ "learning_rate": 4.501187648456057e-05,
134
+ "loss": 0.4666,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.608,
139
+ "grad_norm": 11.823775291442871,
140
+ "learning_rate": 4.441805225653207e-05,
141
+ "loss": 0.4256,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.64,
146
+ "grad_norm": 32.22935485839844,
147
+ "learning_rate": 4.382422802850357e-05,
148
+ "loss": 0.412,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.672,
153
+ "grad_norm": 31.271358489990234,
154
+ "learning_rate": 4.323040380047506e-05,
155
+ "loss": 0.493,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.704,
160
+ "grad_norm": 7.7791900634765625,
161
+ "learning_rate": 4.263657957244656e-05,
162
+ "loss": 0.4073,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.736,
167
+ "grad_norm": 10.995684623718262,
168
+ "learning_rate": 4.204275534441806e-05,
169
+ "loss": 0.3982,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.768,
174
+ "grad_norm": 17.266887664794922,
175
+ "learning_rate": 4.144893111638955e-05,
176
+ "loss": 0.3794,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.8,
181
+ "grad_norm": 7.906798362731934,
182
+ "learning_rate": 4.0855106888361044e-05,
183
+ "loss": 0.3911,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.832,
188
+ "grad_norm": 13.777398109436035,
189
+ "learning_rate": 4.0261282660332545e-05,
190
+ "loss": 0.3829,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.864,
195
+ "grad_norm": 15.481444358825684,
196
+ "learning_rate": 3.966745843230404e-05,
197
+ "loss": 0.3937,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 0.896,
202
+ "grad_norm": 11.324766159057617,
203
+ "learning_rate": 3.907363420427554e-05,
204
+ "loss": 0.3503,
205
  "step": 280
206
  },
207
  {
208
  "epoch": 0.928,
209
+ "grad_norm": 19.757225036621094,
210
+ "learning_rate": 3.847980997624703e-05,
211
+ "loss": 0.3484,
212
  "step": 290
213
  },
214
  {
215
  "epoch": 0.96,
216
+ "grad_norm": 9.243185997009277,
217
+ "learning_rate": 3.7885985748218526e-05,
218
+ "loss": 0.3803,
219
  "step": 300
220
  },
221
  {
222
  "epoch": 0.992,
223
+ "grad_norm": 9.08983325958252,
224
+ "learning_rate": 3.7292161520190026e-05,
225
+ "loss": 0.3704,
226
  "step": 310
227
  },
228
  {
229
  "epoch": 0.9984,
230
+ "eval_accuracy": 0.63875,
231
+ "eval_loss": 0.7880752682685852,
232
+ "eval_runtime": 187.0148,
233
+ "eval_samples_per_second": 21.389,
234
+ "eval_steps_per_second": 0.668,
235
  "step": 312
236
  },
237
  {
238
  "epoch": 1.024,
239
+ "grad_norm": 45.11425018310547,
240
+ "learning_rate": 3.669833729216152e-05,
241
+ "loss": 0.3855,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.056,
246
+ "grad_norm": 26.53282928466797,
247
+ "learning_rate": 3.6104513064133013e-05,
248
+ "loss": 0.3922,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.088,
253
+ "grad_norm": 9.646315574645996,
254
+ "learning_rate": 3.5510688836104514e-05,
255
+ "loss": 0.4238,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.12,
260
+ "grad_norm": 21.654006958007812,
261
+ "learning_rate": 3.4916864608076014e-05,
262
+ "loss": 0.3569,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.152,
267
+ "grad_norm": 10.348204612731934,
268
+ "learning_rate": 3.432304038004751e-05,
269
+ "loss": 0.3122,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.184,
274
+ "grad_norm": 9.6500883102417,
275
+ "learning_rate": 3.372921615201901e-05,
276
+ "loss": 0.327,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.216,
281
+ "grad_norm": 16.52385902404785,
282
+ "learning_rate": 3.31353919239905e-05,
283
+ "loss": 0.4007,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.248,
288
+ "grad_norm": 13.382709503173828,
289
+ "learning_rate": 3.2541567695961995e-05,
290
+ "loss": 0.3414,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.28,
295
+ "grad_norm": 58.999324798583984,
296
+ "learning_rate": 3.1947743467933496e-05,
297
+ "loss": 0.3501,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.312,
302
+ "grad_norm": 7.981963157653809,
303
+ "learning_rate": 3.135391923990499e-05,
304
+ "loss": 0.3822,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 1.3439999999999999,
309
+ "grad_norm": 9.790070533752441,
310
+ "learning_rate": 3.076009501187649e-05,
311
+ "loss": 0.3383,
312
  "step": 420
313
  },
314
  {
315
  "epoch": 1.376,
316
+ "grad_norm": 8.631843566894531,
317
+ "learning_rate": 3.0166270783847983e-05,
318
+ "loss": 0.335,
319
  "step": 430
320
  },
321
  {
322
  "epoch": 1.408,
323
+ "grad_norm": 9.060861587524414,
324
+ "learning_rate": 2.9572446555819477e-05,
325
+ "loss": 0.3282,
326
  "step": 440
327
  },
328
  {
329
  "epoch": 1.44,
330
+ "grad_norm": 15.991608619689941,
331
+ "learning_rate": 2.8978622327790977e-05,
332
+ "loss": 0.3037,
333
  "step": 450
334
  },
335
  {
336
  "epoch": 1.472,
337
+ "grad_norm": 35.242740631103516,
338
+ "learning_rate": 2.838479809976247e-05,
339
+ "loss": 0.3321,
340
  "step": 460
341
  },
342
  {
343
  "epoch": 1.504,
344
+ "grad_norm": 10.073174476623535,
345
+ "learning_rate": 2.7790973871733968e-05,
346
+ "loss": 0.3023,
347
  "step": 470
348
  },
349
  {
350
  "epoch": 1.536,
351
+ "grad_norm": 19.67376708984375,
352
+ "learning_rate": 2.7197149643705465e-05,
353
+ "loss": 0.3168,
354
  "step": 480
355
  },
356
  {
357
  "epoch": 1.568,
358
+ "grad_norm": 6.408135890960693,
359
+ "learning_rate": 2.6603325415676962e-05,
360
+ "loss": 0.306,
361
  "step": 490
362
  },
363
  {
364
  "epoch": 1.6,
365
+ "grad_norm": 8.863042831420898,
366
+ "learning_rate": 2.6009501187648455e-05,
367
+ "loss": 0.3103,
368
  "step": 500
369
  },
370
  {
371
  "epoch": 1.6320000000000001,
372
+ "grad_norm": 17.778520584106445,
373
+ "learning_rate": 2.5415676959619956e-05,
374
+ "loss": 0.3123,
375
  "step": 510
376
  },
377
  {
378
  "epoch": 1.6640000000000001,
379
+ "grad_norm": 11.152641296386719,
380
+ "learning_rate": 2.482185273159145e-05,
381
+ "loss": 0.304,
382
  "step": 520
383
  },
384
  {
385
  "epoch": 1.696,
386
+ "grad_norm": 18.749303817749023,
387
+ "learning_rate": 2.4228028503562946e-05,
388
+ "loss": 0.2905,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 1.728,
393
+ "grad_norm": 26.219440460205078,
394
+ "learning_rate": 2.3634204275534443e-05,
395
+ "loss": 0.3163,
396
  "step": 540
397
  },
398
  {
399
  "epoch": 1.76,
400
+ "grad_norm": 16.212156295776367,
401
+ "learning_rate": 2.3040380047505937e-05,
402
+ "loss": 0.3073,
403
  "step": 550
404
  },
405
  {
406
  "epoch": 1.792,
407
+ "grad_norm": 11.084125518798828,
408
+ "learning_rate": 2.2446555819477437e-05,
409
+ "loss": 0.3166,
410
  "step": 560
411
  },
412
  {
413
  "epoch": 1.8239999999999998,
414
+ "grad_norm": 12.52534008026123,
415
+ "learning_rate": 2.1852731591448934e-05,
416
+ "loss": 0.293,
417
  "step": 570
418
  },
419
  {
420
  "epoch": 1.8559999999999999,
421
+ "grad_norm": 20.346576690673828,
422
+ "learning_rate": 2.1258907363420428e-05,
423
+ "loss": 0.2948,
424
  "step": 580
425
  },
426
  {
427
  "epoch": 1.888,
428
+ "grad_norm": 9.408613204956055,
429
+ "learning_rate": 2.0665083135391925e-05,
430
+ "loss": 0.3081,
431
  "step": 590
432
  },
433
  {
434
  "epoch": 1.92,
435
+ "grad_norm": 27.265464782714844,
436
+ "learning_rate": 2.0071258907363422e-05,
437
+ "loss": 0.3084,
438
  "step": 600
439
  },
440
  {
441
  "epoch": 1.952,
442
+ "grad_norm": 8.094114303588867,
443
+ "learning_rate": 1.947743467933492e-05,
444
+ "loss": 0.3047,
445
  "step": 610
446
  },
447
  {
448
  "epoch": 1.984,
449
+ "grad_norm": 13.893214225769043,
450
+ "learning_rate": 1.8883610451306412e-05,
451
+ "loss": 0.2726,
452
  "step": 620
453
  },
454
  {
455
  "epoch": 2.0,
456
+ "eval_accuracy": 0.7025,
457
+ "eval_loss": 0.830498456954956,
458
+ "eval_runtime": 154.4758,
459
+ "eval_samples_per_second": 25.894,
460
+ "eval_steps_per_second": 0.809,
461
  "step": 625
462
  },
463
  {
464
  "epoch": 2.016,
465
+ "grad_norm": 5.93212890625,
466
+ "learning_rate": 1.828978622327791e-05,
467
+ "loss": 0.2716,
468
  "step": 630
469
  },
470
  {
471
  "epoch": 2.048,
472
+ "grad_norm": 33.18698501586914,
473
+ "learning_rate": 1.7695961995249406e-05,
474
+ "loss": 0.2845,
475
  "step": 640
476
  },
477
  {
478
  "epoch": 2.08,
479
+ "grad_norm": 23.357540130615234,
480
+ "learning_rate": 1.7102137767220903e-05,
481
+ "loss": 0.3093,
482
  "step": 650
483
  },
484
  {
485
  "epoch": 2.112,
486
+ "grad_norm": 14.696449279785156,
487
+ "learning_rate": 1.65083135391924e-05,
488
+ "loss": 0.2886,
489
  "step": 660
490
  },
491
  {
492
  "epoch": 2.144,
493
+ "grad_norm": 32.7164192199707,
494
+ "learning_rate": 1.5914489311163897e-05,
495
+ "loss": 0.2865,
496
  "step": 670
497
  },
498
  {
499
  "epoch": 2.176,
500
+ "grad_norm": 11.33267879486084,
501
+ "learning_rate": 1.5320665083135394e-05,
502
+ "loss": 0.2616,
503
  "step": 680
504
  },
505
  {
506
  "epoch": 2.208,
507
+ "grad_norm": 10.119343757629395,
508
+ "learning_rate": 1.4726840855106888e-05,
509
+ "loss": 0.2768,
510
  "step": 690
511
  },
512
  {
513
  "epoch": 2.24,
514
+ "grad_norm": 12.684237480163574,
515
+ "learning_rate": 1.4133016627078385e-05,
516
+ "loss": 0.2332,
517
  "step": 700
518
  },
519
  {
520
  "epoch": 2.2720000000000002,
521
+ "grad_norm": 17.10093879699707,
522
+ "learning_rate": 1.3539192399049882e-05,
523
+ "loss": 0.2648,
524
  "step": 710
525
  },
526
  {
527
  "epoch": 2.304,
528
+ "grad_norm": 22.309677124023438,
529
+ "learning_rate": 1.2945368171021377e-05,
530
+ "loss": 0.2794,
531
  "step": 720
532
  },
533
  {
534
  "epoch": 2.336,
535
+ "grad_norm": 15.592386245727539,
536
+ "learning_rate": 1.2351543942992874e-05,
537
+ "loss": 0.2826,
538
  "step": 730
539
  },
540
  {
541
  "epoch": 2.368,
542
+ "grad_norm": 10.027396202087402,
543
+ "learning_rate": 1.1757719714964371e-05,
544
+ "loss": 0.2662,
545
  "step": 740
546
  },
547
  {
548
  "epoch": 2.4,
549
+ "grad_norm": 7.71236515045166,
550
+ "learning_rate": 1.1163895486935868e-05,
551
+ "loss": 0.2802,
552
  "step": 750
553
  },
554
  {
555
  "epoch": 2.432,
556
+ "grad_norm": 8.326642036437988,
557
+ "learning_rate": 1.0570071258907365e-05,
558
+ "loss": 0.2644,
559
  "step": 760
560
  },
561
  {
562
  "epoch": 2.464,
563
+ "grad_norm": 17.820472717285156,
564
+ "learning_rate": 9.97624703087886e-06,
565
+ "loss": 0.2649,
566
  "step": 770
567
  },
568
  {
569
  "epoch": 2.496,
570
+ "grad_norm": 10.056536674499512,
571
+ "learning_rate": 9.382422802850356e-06,
572
+ "loss": 0.248,
573
  "step": 780
574
  },
575
  {
576
  "epoch": 2.528,
577
+ "grad_norm": 30.569965362548828,
578
+ "learning_rate": 8.788598574821852e-06,
579
+ "loss": 0.2303,
580
  "step": 790
581
  },
582
  {
583
  "epoch": 2.56,
584
+ "grad_norm": 7.8709797859191895,
585
+ "learning_rate": 8.19477434679335e-06,
586
+ "loss": 0.2537,
587
  "step": 800
588
  },
589
  {
590
  "epoch": 2.592,
591
+ "grad_norm": 15.642313957214355,
592
+ "learning_rate": 7.6009501187648464e-06,
593
+ "loss": 0.2726,
594
  "step": 810
595
  },
596
  {
597
  "epoch": 2.624,
598
+ "grad_norm": 8.542279243469238,
599
+ "learning_rate": 7.007125890736342e-06,
600
+ "loss": 0.27,
601
  "step": 820
602
  },
603
  {
604
  "epoch": 2.656,
605
+ "grad_norm": 7.526516914367676,
606
+ "learning_rate": 6.4133016627078396e-06,
607
+ "loss": 0.2742,
608
  "step": 830
609
  },
610
  {
611
  "epoch": 2.6879999999999997,
612
+ "grad_norm": 9.183687210083008,
613
+ "learning_rate": 5.819477434679335e-06,
614
+ "loss": 0.2477,
615
  "step": 840
616
  },
617
  {
618
  "epoch": 2.7199999999999998,
619
+ "grad_norm": 13.24605655670166,
620
+ "learning_rate": 5.225653206650832e-06,
621
+ "loss": 0.2462,
622
  "step": 850
623
  },
624
  {
625
  "epoch": 2.752,
626
+ "grad_norm": 14.802075386047363,
627
+ "learning_rate": 4.631828978622328e-06,
628
+ "loss": 0.2451,
629
  "step": 860
630
  },
631
  {
632
  "epoch": 2.784,
633
+ "grad_norm": 8.528770446777344,
634
+ "learning_rate": 4.038004750593825e-06,
635
+ "loss": 0.2221,
636
  "step": 870
637
  },
638
  {
639
  "epoch": 2.816,
640
+ "grad_norm": 12.269448280334473,
641
+ "learning_rate": 3.4441805225653207e-06,
642
+ "loss": 0.2522,
643
  "step": 880
644
  },
645
  {
646
  "epoch": 2.848,
647
+ "grad_norm": 30.11684226989746,
648
+ "learning_rate": 2.850356294536817e-06,
649
+ "loss": 0.2509,
650
  "step": 890
651
  },
652
  {
653
  "epoch": 2.88,
654
+ "grad_norm": 25.639724731445312,
655
+ "learning_rate": 2.2565320665083133e-06,
656
+ "loss": 0.2423,
657
  "step": 900
658
  },
659
  {
660
  "epoch": 2.912,
661
+ "grad_norm": 10.527947425842285,
662
+ "learning_rate": 1.6627078384798101e-06,
663
+ "loss": 0.2528,
664
  "step": 910
665
  },
666
  {
667
  "epoch": 2.944,
668
+ "grad_norm": 18.0466365814209,
669
+ "learning_rate": 1.0688836104513065e-06,
670
+ "loss": 0.257,
671
  "step": 920
672
  },
673
  {
674
  "epoch": 2.976,
675
+ "grad_norm": 12.547955513000488,
676
+ "learning_rate": 4.750593824228029e-07,
677
+ "loss": 0.2103,
678
  "step": 930
679
  },
680
  {
681
+ "epoch": 2.9952,
682
+ "eval_accuracy": 0.692,
683
+ "eval_loss": 0.870841920375824,
684
+ "eval_runtime": 155.4401,
685
+ "eval_samples_per_second": 25.733,
686
+ "eval_steps_per_second": 0.804,
687
+ "step": 936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
688
  },
689
  {
690
+ "epoch": 2.9952,
691
+ "step": 936,
692
+ "total_flos": 2.9779463284174356e+18,
693
+ "train_loss": 0.36125070786374247,
694
+ "train_runtime": 41238.7802,
695
+ "train_samples_per_second": 2.91,
696
  "train_steps_per_second": 0.023
697
  }
698
  ],
699
  "logging_steps": 10,
700
+ "max_steps": 936,
701
  "num_input_tokens_seen": 0,
702
+ "num_train_epochs": 3,
703
  "save_steps": 500,
704
  "stateful_callbacks": {
705
  "TrainerControl": {
 
713
  "attributes": {}
714
  }
715
  },
716
+ "total_flos": 2.9779463284174356e+18,
717
  "train_batch_size": 32,
718
  "trial_name": null,
719
  "trial_params": null