raks87 commited on
Commit
9556436
1 Parent(s): 12b4b3f

End of training

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 2.99,
3
- "eval_accuracy": 0.9064,
4
- "eval_loss": 0.2777732014656067,
5
- "eval_runtime": 126.0372,
6
- "eval_samples_per_second": 119.013,
7
- "eval_steps_per_second": 3.721,
8
  "total_flos": 1.0585264325663785e+18,
9
- "train_loss": 0.9103804560371371,
10
- "train_runtime": 753.233,
11
- "train_samples_per_second": 139.399,
12
- "train_steps_per_second": 1.087
13
  }
 
1
  {
2
  "epoch": 2.99,
3
+ "eval_accuracy": 0.10166666666666667,
4
+ "eval_loss": NaN,
5
+ "eval_runtime": 135.6867,
6
+ "eval_samples_per_second": 110.549,
7
+ "eval_steps_per_second": 3.456,
8
  "total_flos": 1.0585264325663785e+18,
9
+ "train_loss": 0.0,
10
+ "train_runtime": 811.0273,
11
+ "train_samples_per_second": 129.465,
12
+ "train_steps_per_second": 1.01
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.99,
3
- "eval_accuracy": 0.9064,
4
- "eval_loss": 0.2777732014656067,
5
- "eval_runtime": 126.0372,
6
- "eval_samples_per_second": 119.013,
7
- "eval_steps_per_second": 3.721
8
  }
 
1
  {
2
  "epoch": 2.99,
3
+ "eval_accuracy": 0.10166666666666667,
4
+ "eval_loss": NaN,
5
+ "eval_runtime": 135.6867,
6
+ "eval_samples_per_second": 110.549,
7
+ "eval_steps_per_second": 3.456
8
  }
runs/Apr07_14-53-27_9b51f9451925/events.out.tfevents.1712502568.9b51f9451925.6697.6 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:936dea077f72a96abdf489ed31dba84951ee3bac2625c560dbeca6a6348ccae9
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.99,
3
  "total_flos": 1.0585264325663785e+18,
4
- "train_loss": 0.9103804560371371,
5
- "train_runtime": 753.233,
6
- "train_samples_per_second": 139.399,
7
- "train_steps_per_second": 1.087
8
  }
 
1
  {
2
  "epoch": 2.99,
3
  "total_flos": 1.0585264325663785e+18,
4
+ "train_loss": 0.0,
5
+ "train_runtime": 811.0273,
6
+ "train_samples_per_second": 129.465,
7
+ "train_steps_per_second": 1.01
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.9064,
3
- "best_model_checkpoint": "resnet-18-finetuned-cifar10/checkpoint-819",
4
  "epoch": 2.9945155393053016,
5
  "eval_steps": 500,
6
  "global_step": 819,
@@ -10,606 +10,606 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
- "grad_norm": 16.46918296813965,
14
  "learning_rate": 6.0975609756097564e-06,
15
- "loss": 3.2109,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07,
20
- "grad_norm": 22.162967681884766,
21
  "learning_rate": 1.2195121951219513e-05,
22
- "loss": 2.929,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.11,
27
- "grad_norm": 18.4061336517334,
28
  "learning_rate": 1.8292682926829268e-05,
29
- "loss": 2.6339,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.15,
34
- "grad_norm": 14.292740821838379,
35
  "learning_rate": 2.4390243902439026e-05,
36
- "loss": 2.2668,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.18,
41
- "grad_norm": 15.263444900512695,
42
  "learning_rate": 3.048780487804878e-05,
43
- "loss": 1.947,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.22,
48
- "grad_norm": 18.372751235961914,
49
  "learning_rate": 3.6585365853658535e-05,
50
- "loss": 1.7001,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.26,
55
- "grad_norm": 14.107364654541016,
56
  "learning_rate": 4.26829268292683e-05,
57
- "loss": 1.4365,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.29,
62
- "grad_norm": 11.996474266052246,
63
  "learning_rate": 4.878048780487805e-05,
64
- "loss": 1.3653,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.33,
69
- "grad_norm": 13.601606369018555,
70
  "learning_rate": 4.94572591587517e-05,
71
- "loss": 1.2311,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.37,
76
- "grad_norm": 12.406356811523438,
77
  "learning_rate": 4.877883310719132e-05,
78
- "loss": 1.2409,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.4,
83
- "grad_norm": 10.42392349243164,
84
  "learning_rate": 4.810040705563094e-05,
85
- "loss": 1.1009,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.44,
90
- "grad_norm": 11.649271011352539,
91
  "learning_rate": 4.742198100407056e-05,
92
- "loss": 1.1132,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.48,
97
- "grad_norm": 10.590466499328613,
98
  "learning_rate": 4.674355495251018e-05,
99
- "loss": 1.0677,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.51,
104
- "grad_norm": 10.409879684448242,
105
  "learning_rate": 4.60651289009498e-05,
106
- "loss": 1.0262,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.55,
111
- "grad_norm": 11.766679763793945,
112
  "learning_rate": 4.5386702849389416e-05,
113
- "loss": 1.0399,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.59,
118
- "grad_norm": 10.652596473693848,
119
  "learning_rate": 4.470827679782904e-05,
120
- "loss": 1.0203,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.62,
125
- "grad_norm": 10.809771537780762,
126
  "learning_rate": 4.402985074626866e-05,
127
- "loss": 0.9633,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.66,
132
- "grad_norm": 12.764290809631348,
133
  "learning_rate": 4.335142469470828e-05,
134
- "loss": 0.9355,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.69,
139
- "grad_norm": 10.757213592529297,
140
  "learning_rate": 4.26729986431479e-05,
141
- "loss": 0.9337,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.73,
146
- "grad_norm": 11.284212112426758,
147
  "learning_rate": 4.199457259158752e-05,
148
- "loss": 0.9314,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.77,
153
- "grad_norm": 11.635641098022461,
154
  "learning_rate": 4.131614654002714e-05,
155
- "loss": 0.8976,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.8,
160
- "grad_norm": 10.869036674499512,
161
  "learning_rate": 4.063772048846676e-05,
162
- "loss": 0.9065,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.84,
167
- "grad_norm": 11.703560829162598,
168
  "learning_rate": 3.995929443690638e-05,
169
- "loss": 0.8444,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.88,
174
- "grad_norm": 10.33545207977295,
175
  "learning_rate": 3.9280868385345995e-05,
176
- "loss": 0.825,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.91,
181
- "grad_norm": 7.844155311584473,
182
  "learning_rate": 3.860244233378562e-05,
183
- "loss": 0.8369,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.95,
188
- "grad_norm": 9.158990859985352,
189
  "learning_rate": 3.792401628222524e-05,
190
- "loss": 0.8422,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.99,
195
- "grad_norm": 9.740525245666504,
196
  "learning_rate": 3.724559023066486e-05,
197
- "loss": 0.8502,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 1.0,
202
- "eval_accuracy": 0.8632,
203
- "eval_loss": 0.4063829481601715,
204
- "eval_runtime": 123.7138,
205
- "eval_samples_per_second": 121.248,
206
- "eval_steps_per_second": 3.791,
207
  "step": 273
208
  },
209
  {
210
  "epoch": 1.02,
211
- "grad_norm": 7.927298069000244,
212
  "learning_rate": 3.656716417910448e-05,
213
- "loss": 0.847,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.06,
218
- "grad_norm": 9.951263427734375,
219
  "learning_rate": 3.58887381275441e-05,
220
- "loss": 0.8189,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.1,
225
- "grad_norm": 7.741189479827881,
226
  "learning_rate": 3.521031207598372e-05,
227
- "loss": 0.804,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.13,
232
- "grad_norm": 8.110347747802734,
233
  "learning_rate": 3.453188602442334e-05,
234
- "loss": 0.7829,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.17,
239
- "grad_norm": 8.17768669128418,
240
  "learning_rate": 3.385345997286296e-05,
241
- "loss": 0.8124,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.21,
246
- "grad_norm": 8.95181655883789,
247
  "learning_rate": 3.3175033921302575e-05,
248
- "loss": 0.7392,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.24,
253
- "grad_norm": 9.440756797790527,
254
  "learning_rate": 3.24966078697422e-05,
255
- "loss": 0.8488,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.28,
260
- "grad_norm": 8.526937484741211,
261
  "learning_rate": 3.181818181818182e-05,
262
- "loss": 0.7525,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.32,
267
- "grad_norm": 8.15047836303711,
268
  "learning_rate": 3.113975576662144e-05,
269
- "loss": 0.7966,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.35,
274
- "grad_norm": 8.909910202026367,
275
  "learning_rate": 3.046132971506106e-05,
276
- "loss": 0.7682,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.39,
281
- "grad_norm": 9.869742393493652,
282
  "learning_rate": 2.9782903663500678e-05,
283
- "loss": 0.7143,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.43,
288
- "grad_norm": 9.056320190429688,
289
  "learning_rate": 2.91044776119403e-05,
290
- "loss": 0.7467,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.46,
295
- "grad_norm": 8.215534210205078,
296
  "learning_rate": 2.842605156037992e-05,
297
- "loss": 0.7589,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.5,
302
- "grad_norm": 8.56048583984375,
303
  "learning_rate": 2.7747625508819542e-05,
304
- "loss": 0.786,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 1.54,
309
- "grad_norm": 7.839065074920654,
310
  "learning_rate": 2.7069199457259158e-05,
311
- "loss": 0.7033,
312
  "step": 420
313
  },
314
  {
315
  "epoch": 1.57,
316
- "grad_norm": 8.194784164428711,
317
  "learning_rate": 2.639077340569878e-05,
318
- "loss": 0.6867,
319
  "step": 430
320
  },
321
  {
322
  "epoch": 1.61,
323
- "grad_norm": 7.5709547996521,
324
  "learning_rate": 2.57123473541384e-05,
325
- "loss": 0.7555,
326
  "step": 440
327
  },
328
  {
329
  "epoch": 1.65,
330
- "grad_norm": 8.456563949584961,
331
  "learning_rate": 2.5033921302578023e-05,
332
- "loss": 0.6939,
333
  "step": 450
334
  },
335
  {
336
  "epoch": 1.68,
337
- "grad_norm": 8.453351020812988,
338
  "learning_rate": 2.4355495251017642e-05,
339
- "loss": 0.6965,
340
  "step": 460
341
  },
342
  {
343
  "epoch": 1.72,
344
- "grad_norm": 7.400697231292725,
345
  "learning_rate": 2.367706919945726e-05,
346
- "loss": 0.7067,
347
  "step": 470
348
  },
349
  {
350
  "epoch": 1.76,
351
- "grad_norm": 8.554298400878906,
352
  "learning_rate": 2.299864314789688e-05,
353
- "loss": 0.712,
354
  "step": 480
355
  },
356
  {
357
  "epoch": 1.79,
358
- "grad_norm": 8.703104972839355,
359
  "learning_rate": 2.2320217096336503e-05,
360
- "loss": 0.7176,
361
  "step": 490
362
  },
363
  {
364
  "epoch": 1.83,
365
- "grad_norm": 8.274211883544922,
366
  "learning_rate": 2.164179104477612e-05,
367
- "loss": 0.7214,
368
  "step": 500
369
  },
370
  {
371
  "epoch": 1.86,
372
- "grad_norm": 10.009110450744629,
373
  "learning_rate": 2.0963364993215738e-05,
374
- "loss": 0.7518,
375
  "step": 510
376
  },
377
  {
378
  "epoch": 1.9,
379
- "grad_norm": 9.217292785644531,
380
  "learning_rate": 2.028493894165536e-05,
381
- "loss": 0.7125,
382
  "step": 520
383
  },
384
  {
385
  "epoch": 1.94,
386
- "grad_norm": 7.580327987670898,
387
  "learning_rate": 1.960651289009498e-05,
388
- "loss": 0.6731,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 1.97,
393
- "grad_norm": 7.33704137802124,
394
  "learning_rate": 1.89280868385346e-05,
395
- "loss": 0.6924,
396
  "step": 540
397
  },
398
  {
399
  "epoch": 2.0,
400
- "eval_accuracy": 0.8956,
401
- "eval_loss": 0.3149263858795166,
402
- "eval_runtime": 128.5203,
403
- "eval_samples_per_second": 116.713,
404
- "eval_steps_per_second": 3.649,
405
  "step": 547
406
  },
407
  {
408
  "epoch": 2.01,
409
- "grad_norm": 8.008723258972168,
410
  "learning_rate": 1.824966078697422e-05,
411
- "loss": 0.7104,
412
  "step": 550
413
  },
414
  {
415
  "epoch": 2.05,
416
- "grad_norm": 6.283710479736328,
417
  "learning_rate": 1.757123473541384e-05,
418
- "loss": 0.6671,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.08,
423
- "grad_norm": 7.36352014541626,
424
  "learning_rate": 1.689280868385346e-05,
425
- "loss": 0.6464,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.12,
430
- "grad_norm": 8.875202178955078,
431
  "learning_rate": 1.6214382632293083e-05,
432
- "loss": 0.6495,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.16,
437
- "grad_norm": 8.145440101623535,
438
  "learning_rate": 1.55359565807327e-05,
439
- "loss": 0.6609,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.19,
444
- "grad_norm": 8.108186721801758,
445
  "learning_rate": 1.485753052917232e-05,
446
- "loss": 0.7062,
447
  "step": 600
448
  },
449
  {
450
  "epoch": 2.23,
451
- "grad_norm": 7.363587856292725,
452
  "learning_rate": 1.417910447761194e-05,
453
- "loss": 0.6395,
454
  "step": 610
455
  },
456
  {
457
  "epoch": 2.27,
458
- "grad_norm": 7.521917343139648,
459
  "learning_rate": 1.3500678426051561e-05,
460
- "loss": 0.657,
461
  "step": 620
462
  },
463
  {
464
  "epoch": 2.3,
465
- "grad_norm": 7.513711452484131,
466
  "learning_rate": 1.282225237449118e-05,
467
- "loss": 0.6638,
468
  "step": 630
469
  },
470
  {
471
  "epoch": 2.34,
472
- "grad_norm": 8.112990379333496,
473
  "learning_rate": 1.2143826322930801e-05,
474
- "loss": 0.6968,
475
  "step": 640
476
  },
477
  {
478
  "epoch": 2.38,
479
- "grad_norm": 7.811177730560303,
480
  "learning_rate": 1.1465400271370422e-05,
481
- "loss": 0.7007,
482
  "step": 650
483
  },
484
  {
485
  "epoch": 2.41,
486
- "grad_norm": 7.438766956329346,
487
  "learning_rate": 1.0786974219810041e-05,
488
- "loss": 0.6458,
489
  "step": 660
490
  },
491
  {
492
  "epoch": 2.45,
493
- "grad_norm": 9.478456497192383,
494
  "learning_rate": 1.010854816824966e-05,
495
- "loss": 0.6534,
496
  "step": 670
497
  },
498
  {
499
  "epoch": 2.49,
500
- "grad_norm": 7.470586776733398,
501
  "learning_rate": 9.430122116689281e-06,
502
- "loss": 0.6625,
503
  "step": 680
504
  },
505
  {
506
  "epoch": 2.52,
507
- "grad_norm": 7.3036956787109375,
508
  "learning_rate": 8.751696065128902e-06,
509
- "loss": 0.6581,
510
  "step": 690
511
  },
512
  {
513
  "epoch": 2.56,
514
- "grad_norm": 7.530010223388672,
515
  "learning_rate": 8.073270013568522e-06,
516
- "loss": 0.6214,
517
  "step": 700
518
  },
519
  {
520
  "epoch": 2.6,
521
- "grad_norm": 6.2767333984375,
522
  "learning_rate": 7.394843962008141e-06,
523
- "loss": 0.6136,
524
  "step": 710
525
  },
526
  {
527
  "epoch": 2.63,
528
- "grad_norm": 8.302584648132324,
529
  "learning_rate": 6.716417910447762e-06,
530
- "loss": 0.6215,
531
  "step": 720
532
  },
533
  {
534
  "epoch": 2.67,
535
- "grad_norm": 7.744215965270996,
536
  "learning_rate": 6.037991858887382e-06,
537
- "loss": 0.6382,
538
  "step": 730
539
  },
540
  {
541
  "epoch": 2.71,
542
- "grad_norm": 7.538488864898682,
543
  "learning_rate": 5.359565807327002e-06,
544
- "loss": 0.6239,
545
  "step": 740
546
  },
547
  {
548
  "epoch": 2.74,
549
- "grad_norm": 7.9057488441467285,
550
  "learning_rate": 4.681139755766622e-06,
551
- "loss": 0.6413,
552
  "step": 750
553
  },
554
  {
555
  "epoch": 2.78,
556
- "grad_norm": 6.213993549346924,
557
  "learning_rate": 4.002713704206242e-06,
558
- "loss": 0.6153,
559
  "step": 760
560
  },
561
  {
562
  "epoch": 2.82,
563
- "grad_norm": 6.704429626464844,
564
  "learning_rate": 3.324287652645862e-06,
565
- "loss": 0.6502,
566
  "step": 770
567
  },
568
  {
569
  "epoch": 2.85,
570
- "grad_norm": 10.968541145324707,
571
  "learning_rate": 2.645861601085482e-06,
572
- "loss": 0.6439,
573
  "step": 780
574
  },
575
  {
576
  "epoch": 2.89,
577
- "grad_norm": 8.878872871398926,
578
  "learning_rate": 1.967435549525102e-06,
579
- "loss": 0.6751,
580
  "step": 790
581
  },
582
  {
583
  "epoch": 2.93,
584
- "grad_norm": 8.374876022338867,
585
  "learning_rate": 1.289009497964722e-06,
586
- "loss": 0.6586,
587
  "step": 800
588
  },
589
  {
590
  "epoch": 2.96,
591
- "grad_norm": 8.15294075012207,
592
  "learning_rate": 6.10583446404342e-07,
593
- "loss": 0.6714,
594
  "step": 810
595
  },
596
  {
597
  "epoch": 2.99,
598
- "eval_accuracy": 0.9064,
599
- "eval_loss": 0.2777732014656067,
600
- "eval_runtime": 127.5667,
601
- "eval_samples_per_second": 117.586,
602
- "eval_steps_per_second": 3.677,
603
  "step": 819
604
  },
605
  {
606
  "epoch": 2.99,
607
  "step": 819,
608
  "total_flos": 1.0585264325663785e+18,
609
- "train_loss": 0.9103804560371371,
610
- "train_runtime": 753.233,
611
- "train_samples_per_second": 139.399,
612
- "train_steps_per_second": 1.087
613
  }
614
  ],
615
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.10166666666666667,
3
+ "best_model_checkpoint": "resnet-18-finetuned-cifar10/checkpoint-273",
4
  "epoch": 2.9945155393053016,
5
  "eval_steps": 500,
6
  "global_step": 819,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
+ "grad_norm": NaN,
14
  "learning_rate": 6.0975609756097564e-06,
15
+ "loss": 0.0,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.07,
20
+ "grad_norm": NaN,
21
  "learning_rate": 1.2195121951219513e-05,
22
+ "loss": 0.0,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.11,
27
+ "grad_norm": NaN,
28
  "learning_rate": 1.8292682926829268e-05,
29
+ "loss": 0.0,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.15,
34
+ "grad_norm": NaN,
35
  "learning_rate": 2.4390243902439026e-05,
36
+ "loss": 0.0,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.18,
41
+ "grad_norm": NaN,
42
  "learning_rate": 3.048780487804878e-05,
43
+ "loss": 0.0,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.22,
48
+ "grad_norm": NaN,
49
  "learning_rate": 3.6585365853658535e-05,
50
+ "loss": 0.0,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.26,
55
+ "grad_norm": NaN,
56
  "learning_rate": 4.26829268292683e-05,
57
+ "loss": 0.0,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.29,
62
+ "grad_norm": NaN,
63
  "learning_rate": 4.878048780487805e-05,
64
+ "loss": 0.0,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.33,
69
+ "grad_norm": NaN,
70
  "learning_rate": 4.94572591587517e-05,
71
+ "loss": 0.0,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.37,
76
+ "grad_norm": NaN,
77
  "learning_rate": 4.877883310719132e-05,
78
+ "loss": 0.0,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.4,
83
+ "grad_norm": NaN,
84
  "learning_rate": 4.810040705563094e-05,
85
+ "loss": 0.0,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.44,
90
+ "grad_norm": NaN,
91
  "learning_rate": 4.742198100407056e-05,
92
+ "loss": 0.0,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.48,
97
+ "grad_norm": NaN,
98
  "learning_rate": 4.674355495251018e-05,
99
+ "loss": 0.0,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.51,
104
+ "grad_norm": NaN,
105
  "learning_rate": 4.60651289009498e-05,
106
+ "loss": 0.0,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.55,
111
+ "grad_norm": NaN,
112
  "learning_rate": 4.5386702849389416e-05,
113
+ "loss": 0.0,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.59,
118
+ "grad_norm": NaN,
119
  "learning_rate": 4.470827679782904e-05,
120
+ "loss": 0.0,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.62,
125
+ "grad_norm": NaN,
126
  "learning_rate": 4.402985074626866e-05,
127
+ "loss": 0.0,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.66,
132
+ "grad_norm": NaN,
133
  "learning_rate": 4.335142469470828e-05,
134
+ "loss": 0.0,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.69,
139
+ "grad_norm": NaN,
140
  "learning_rate": 4.26729986431479e-05,
141
+ "loss": 0.0,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.73,
146
+ "grad_norm": NaN,
147
  "learning_rate": 4.199457259158752e-05,
148
+ "loss": 0.0,
149
  "step": 200
150
  },
151
  {
152
  "epoch": 0.77,
153
+ "grad_norm": NaN,
154
  "learning_rate": 4.131614654002714e-05,
155
+ "loss": 0.0,
156
  "step": 210
157
  },
158
  {
159
  "epoch": 0.8,
160
+ "grad_norm": NaN,
161
  "learning_rate": 4.063772048846676e-05,
162
+ "loss": 0.0,
163
  "step": 220
164
  },
165
  {
166
  "epoch": 0.84,
167
+ "grad_norm": NaN,
168
  "learning_rate": 3.995929443690638e-05,
169
+ "loss": 0.0,
170
  "step": 230
171
  },
172
  {
173
  "epoch": 0.88,
174
+ "grad_norm": NaN,
175
  "learning_rate": 3.9280868385345995e-05,
176
+ "loss": 0.0,
177
  "step": 240
178
  },
179
  {
180
  "epoch": 0.91,
181
+ "grad_norm": NaN,
182
  "learning_rate": 3.860244233378562e-05,
183
+ "loss": 0.0,
184
  "step": 250
185
  },
186
  {
187
  "epoch": 0.95,
188
+ "grad_norm": NaN,
189
  "learning_rate": 3.792401628222524e-05,
190
+ "loss": 0.0,
191
  "step": 260
192
  },
193
  {
194
  "epoch": 0.99,
195
+ "grad_norm": NaN,
196
  "learning_rate": 3.724559023066486e-05,
197
+ "loss": 0.0,
198
  "step": 270
199
  },
200
  {
201
  "epoch": 1.0,
202
+ "eval_accuracy": 0.10166666666666667,
203
+ "eval_loss": NaN,
204
+ "eval_runtime": 135.8532,
205
+ "eval_samples_per_second": 110.413,
206
+ "eval_steps_per_second": 3.452,
207
  "step": 273
208
  },
209
  {
210
  "epoch": 1.02,
211
+ "grad_norm": NaN,
212
  "learning_rate": 3.656716417910448e-05,
213
+ "loss": 0.0,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.06,
218
+ "grad_norm": NaN,
219
  "learning_rate": 3.58887381275441e-05,
220
+ "loss": 0.0,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.1,
225
+ "grad_norm": NaN,
226
  "learning_rate": 3.521031207598372e-05,
227
+ "loss": 0.0,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.13,
232
+ "grad_norm": NaN,
233
  "learning_rate": 3.453188602442334e-05,
234
+ "loss": 0.0,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.17,
239
+ "grad_norm": NaN,
240
  "learning_rate": 3.385345997286296e-05,
241
+ "loss": 0.0,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.21,
246
+ "grad_norm": NaN,
247
  "learning_rate": 3.3175033921302575e-05,
248
+ "loss": 0.0,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.24,
253
+ "grad_norm": NaN,
254
  "learning_rate": 3.24966078697422e-05,
255
+ "loss": 0.0,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.28,
260
+ "grad_norm": NaN,
261
  "learning_rate": 3.181818181818182e-05,
262
+ "loss": 0.0,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.32,
267
+ "grad_norm": NaN,
268
  "learning_rate": 3.113975576662144e-05,
269
+ "loss": 0.0,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.35,
274
+ "grad_norm": NaN,
275
  "learning_rate": 3.046132971506106e-05,
276
+ "loss": 0.0,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.39,
281
+ "grad_norm": NaN,
282
  "learning_rate": 2.9782903663500678e-05,
283
+ "loss": 0.0,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.43,
288
+ "grad_norm": NaN,
289
  "learning_rate": 2.91044776119403e-05,
290
+ "loss": 0.0,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.46,
295
+ "grad_norm": NaN,
296
  "learning_rate": 2.842605156037992e-05,
297
+ "loss": 0.0,
298
  "step": 400
299
  },
300
  {
301
  "epoch": 1.5,
302
+ "grad_norm": NaN,
303
  "learning_rate": 2.7747625508819542e-05,
304
+ "loss": 0.0,
305
  "step": 410
306
  },
307
  {
308
  "epoch": 1.54,
309
+ "grad_norm": NaN,
310
  "learning_rate": 2.7069199457259158e-05,
311
+ "loss": 0.0,
312
  "step": 420
313
  },
314
  {
315
  "epoch": 1.57,
316
+ "grad_norm": NaN,
317
  "learning_rate": 2.639077340569878e-05,
318
+ "loss": 0.0,
319
  "step": 430
320
  },
321
  {
322
  "epoch": 1.61,
323
+ "grad_norm": NaN,
324
  "learning_rate": 2.57123473541384e-05,
325
+ "loss": 0.0,
326
  "step": 440
327
  },
328
  {
329
  "epoch": 1.65,
330
+ "grad_norm": NaN,
331
  "learning_rate": 2.5033921302578023e-05,
332
+ "loss": 0.0,
333
  "step": 450
334
  },
335
  {
336
  "epoch": 1.68,
337
+ "grad_norm": NaN,
338
  "learning_rate": 2.4355495251017642e-05,
339
+ "loss": 0.0,
340
  "step": 460
341
  },
342
  {
343
  "epoch": 1.72,
344
+ "grad_norm": NaN,
345
  "learning_rate": 2.367706919945726e-05,
346
+ "loss": 0.0,
347
  "step": 470
348
  },
349
  {
350
  "epoch": 1.76,
351
+ "grad_norm": NaN,
352
  "learning_rate": 2.299864314789688e-05,
353
+ "loss": 0.0,
354
  "step": 480
355
  },
356
  {
357
  "epoch": 1.79,
358
+ "grad_norm": NaN,
359
  "learning_rate": 2.2320217096336503e-05,
360
+ "loss": 0.0,
361
  "step": 490
362
  },
363
  {
364
  "epoch": 1.83,
365
+ "grad_norm": NaN,
366
  "learning_rate": 2.164179104477612e-05,
367
+ "loss": 0.0,
368
  "step": 500
369
  },
370
  {
371
  "epoch": 1.86,
372
+ "grad_norm": NaN,
373
  "learning_rate": 2.0963364993215738e-05,
374
+ "loss": 0.0,
375
  "step": 510
376
  },
377
  {
378
  "epoch": 1.9,
379
+ "grad_norm": NaN,
380
  "learning_rate": 2.028493894165536e-05,
381
+ "loss": 0.0,
382
  "step": 520
383
  },
384
  {
385
  "epoch": 1.94,
386
+ "grad_norm": NaN,
387
  "learning_rate": 1.960651289009498e-05,
388
+ "loss": 0.0,
389
  "step": 530
390
  },
391
  {
392
  "epoch": 1.97,
393
+ "grad_norm": NaN,
394
  "learning_rate": 1.89280868385346e-05,
395
+ "loss": 0.0,
396
  "step": 540
397
  },
398
  {
399
  "epoch": 2.0,
400
+ "eval_accuracy": 0.10166666666666667,
401
+ "eval_loss": NaN,
402
+ "eval_runtime": 132.0195,
403
+ "eval_samples_per_second": 113.62,
404
+ "eval_steps_per_second": 3.553,
405
  "step": 547
406
  },
407
  {
408
  "epoch": 2.01,
409
+ "grad_norm": NaN,
410
  "learning_rate": 1.824966078697422e-05,
411
+ "loss": 0.0,
412
  "step": 550
413
  },
414
  {
415
  "epoch": 2.05,
416
+ "grad_norm": NaN,
417
  "learning_rate": 1.757123473541384e-05,
418
+ "loss": 0.0,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.08,
423
+ "grad_norm": NaN,
424
  "learning_rate": 1.689280868385346e-05,
425
+ "loss": 0.0,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.12,
430
+ "grad_norm": NaN,
431
  "learning_rate": 1.6214382632293083e-05,
432
+ "loss": 0.0,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.16,
437
+ "grad_norm": NaN,
438
  "learning_rate": 1.55359565807327e-05,
439
+ "loss": 0.0,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.19,
444
+ "grad_norm": NaN,
445
  "learning_rate": 1.485753052917232e-05,
446
+ "loss": 0.0,
447
  "step": 600
448
  },
449
  {
450
  "epoch": 2.23,
451
+ "grad_norm": NaN,
452
  "learning_rate": 1.417910447761194e-05,
453
+ "loss": 0.0,
454
  "step": 610
455
  },
456
  {
457
  "epoch": 2.27,
458
+ "grad_norm": NaN,
459
  "learning_rate": 1.3500678426051561e-05,
460
+ "loss": 0.0,
461
  "step": 620
462
  },
463
  {
464
  "epoch": 2.3,
465
+ "grad_norm": NaN,
466
  "learning_rate": 1.282225237449118e-05,
467
+ "loss": 0.0,
468
  "step": 630
469
  },
470
  {
471
  "epoch": 2.34,
472
+ "grad_norm": NaN,
473
  "learning_rate": 1.2143826322930801e-05,
474
+ "loss": 0.0,
475
  "step": 640
476
  },
477
  {
478
  "epoch": 2.38,
479
+ "grad_norm": NaN,
480
  "learning_rate": 1.1465400271370422e-05,
481
+ "loss": 0.0,
482
  "step": 650
483
  },
484
  {
485
  "epoch": 2.41,
486
+ "grad_norm": NaN,
487
  "learning_rate": 1.0786974219810041e-05,
488
+ "loss": 0.0,
489
  "step": 660
490
  },
491
  {
492
  "epoch": 2.45,
493
+ "grad_norm": NaN,
494
  "learning_rate": 1.010854816824966e-05,
495
+ "loss": 0.0,
496
  "step": 670
497
  },
498
  {
499
  "epoch": 2.49,
500
+ "grad_norm": NaN,
501
  "learning_rate": 9.430122116689281e-06,
502
+ "loss": 0.0,
503
  "step": 680
504
  },
505
  {
506
  "epoch": 2.52,
507
+ "grad_norm": NaN,
508
  "learning_rate": 8.751696065128902e-06,
509
+ "loss": 0.0,
510
  "step": 690
511
  },
512
  {
513
  "epoch": 2.56,
514
+ "grad_norm": NaN,
515
  "learning_rate": 8.073270013568522e-06,
516
+ "loss": 0.0,
517
  "step": 700
518
  },
519
  {
520
  "epoch": 2.6,
521
+ "grad_norm": NaN,
522
  "learning_rate": 7.394843962008141e-06,
523
+ "loss": 0.0,
524
  "step": 710
525
  },
526
  {
527
  "epoch": 2.63,
528
+ "grad_norm": NaN,
529
  "learning_rate": 6.716417910447762e-06,
530
+ "loss": 0.0,
531
  "step": 720
532
  },
533
  {
534
  "epoch": 2.67,
535
+ "grad_norm": NaN,
536
  "learning_rate": 6.037991858887382e-06,
537
+ "loss": 0.0,
538
  "step": 730
539
  },
540
  {
541
  "epoch": 2.71,
542
+ "grad_norm": NaN,
543
  "learning_rate": 5.359565807327002e-06,
544
+ "loss": 0.0,
545
  "step": 740
546
  },
547
  {
548
  "epoch": 2.74,
549
+ "grad_norm": NaN,
550
  "learning_rate": 4.681139755766622e-06,
551
+ "loss": 0.0,
552
  "step": 750
553
  },
554
  {
555
  "epoch": 2.78,
556
+ "grad_norm": NaN,
557
  "learning_rate": 4.002713704206242e-06,
558
+ "loss": 0.0,
559
  "step": 760
560
  },
561
  {
562
  "epoch": 2.82,
563
+ "grad_norm": NaN,
564
  "learning_rate": 3.324287652645862e-06,
565
+ "loss": 0.0,
566
  "step": 770
567
  },
568
  {
569
  "epoch": 2.85,
570
+ "grad_norm": NaN,
571
  "learning_rate": 2.645861601085482e-06,
572
+ "loss": 0.0,
573
  "step": 780
574
  },
575
  {
576
  "epoch": 2.89,
577
+ "grad_norm": NaN,
578
  "learning_rate": 1.967435549525102e-06,
579
+ "loss": 0.0,
580
  "step": 790
581
  },
582
  {
583
  "epoch": 2.93,
584
+ "grad_norm": NaN,
585
  "learning_rate": 1.289009497964722e-06,
586
+ "loss": 0.0,
587
  "step": 800
588
  },
589
  {
590
  "epoch": 2.96,
591
+ "grad_norm": NaN,
592
  "learning_rate": 6.10583446404342e-07,
593
+ "loss": 0.0,
594
  "step": 810
595
  },
596
  {
597
  "epoch": 2.99,
598
+ "eval_accuracy": 0.10166666666666667,
599
+ "eval_loss": NaN,
600
+ "eval_runtime": 130.8726,
601
+ "eval_samples_per_second": 114.615,
602
+ "eval_steps_per_second": 3.584,
603
  "step": 819
604
  },
605
  {
606
  "epoch": 2.99,
607
  "step": 819,
608
  "total_flos": 1.0585264325663785e+18,
609
+ "train_loss": 0.0,
610
+ "train_runtime": 811.0273,
611
+ "train_samples_per_second": 129.465,
612
+ "train_steps_per_second": 1.01
613
  }
614
  ],
615
  "logging_steps": 10,