nemik commited on
Commit
94d3ef2
1 Parent(s): c4520ee

Training in progress, step 1000

Browse files
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 17.0,
3
+ "total_flos": 1.333267779734618e+18,
4
+ "train_loss": 0.16752693287151701,
5
+ "train_runtime": 275.2566,
6
+ "train_samples_per_second": 62.502,
7
+ "train_steps_per_second": 3.953
8
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:127e1abb846a26a6fd77233d8b218e9f64cb4778b63d772de076c4a190299d94
3
  size 343248584
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83430875cd067277b44370fd56095e9c2e9983de01bc2716a36fcc672a6cd55b
3
  size 343248584
runs/Jul25_04-38-12_d85b605ca0e9/events.out.tfevents.1721882318.d85b605ca0e9.546.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3e4ba31a2fb7209781b40bf758823c5a53fbeeaa3ee0a76a44f076161dd0635
3
+ size 31150
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 17.0,
3
+ "total_flos": 1.333267779734618e+18,
4
+ "train_loss": 0.16752693287151701,
5
+ "train_runtime": 275.2566,
6
+ "train_samples_per_second": 62.502,
7
+ "train_steps_per_second": 3.953
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,918 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.13913680613040924,
3
+ "best_model_checkpoint": "vit-base-patch16-224-in21k-finetuned_v2024-7-24-frost/checkpoint-1000",
4
+ "epoch": 17.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1088,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.15625,
13
+ "grad_norm": 0.388886958360672,
14
+ "learning_rate": 1.834862385321101e-05,
15
+ "loss": 0.6876,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.3125,
20
+ "grad_norm": 0.43717288970947266,
21
+ "learning_rate": 3.669724770642202e-05,
22
+ "loss": 0.6556,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.46875,
27
+ "grad_norm": 0.41058769822120667,
28
+ "learning_rate": 5.504587155963303e-05,
29
+ "loss": 0.5865,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.625,
34
+ "grad_norm": 0.37728703022003174,
35
+ "learning_rate": 7.339449541284404e-05,
36
+ "loss": 0.5101,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.78125,
41
+ "grad_norm": 0.3225114643573761,
42
+ "learning_rate": 9.174311926605506e-05,
43
+ "loss": 0.4621,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.9375,
48
+ "grad_norm": 0.27789732813835144,
49
+ "learning_rate": 0.00011009174311926606,
50
+ "loss": 0.4339,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 1.09375,
55
+ "grad_norm": 0.3035929501056671,
56
+ "learning_rate": 0.00012844036697247707,
57
+ "loss": 0.3999,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 1.25,
62
+ "grad_norm": 0.29363128542900085,
63
+ "learning_rate": 0.0001467889908256881,
64
+ "loss": 0.3808,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 1.40625,
69
+ "grad_norm": 0.2852456867694855,
70
+ "learning_rate": 0.0001651376146788991,
71
+ "loss": 0.356,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 1.5625,
76
+ "grad_norm": 0.2548869550228119,
77
+ "learning_rate": 0.00018348623853211012,
78
+ "loss": 0.3281,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 1.5625,
83
+ "eval_accuracy": 0.9008849557522124,
84
+ "eval_f1": 0.6956521739130436,
85
+ "eval_loss": 0.31767982244491577,
86
+ "eval_precision": 0.8767123287671232,
87
+ "eval_recall": 0.5765765765765766,
88
+ "eval_runtime": 0.9054,
89
+ "eval_samples_per_second": 124.801,
90
+ "eval_steps_per_second": 16.566,
91
+ "step": 100
92
+ },
93
+ {
94
+ "epoch": 1.71875,
95
+ "grad_norm": 0.34990283846855164,
96
+ "learning_rate": 0.00019979570990806946,
97
+ "loss": 0.3169,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 1.875,
102
+ "grad_norm": 0.4318304657936096,
103
+ "learning_rate": 0.00019775280898876404,
104
+ "loss": 0.307,
105
+ "step": 120
106
+ },
107
+ {
108
+ "epoch": 2.03125,
109
+ "grad_norm": 0.4618605077266693,
110
+ "learning_rate": 0.00019570990806945865,
111
+ "loss": 0.2988,
112
+ "step": 130
113
+ },
114
+ {
115
+ "epoch": 2.1875,
116
+ "grad_norm": 0.35013139247894287,
117
+ "learning_rate": 0.00019366700715015323,
118
+ "loss": 0.2962,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 2.34375,
123
+ "grad_norm": 0.2871937155723572,
124
+ "learning_rate": 0.00019162410623084782,
125
+ "loss": 0.2718,
126
+ "step": 150
127
+ },
128
+ {
129
+ "epoch": 2.5,
130
+ "grad_norm": 0.5972371101379395,
131
+ "learning_rate": 0.0001895812053115424,
132
+ "loss": 0.2724,
133
+ "step": 160
134
+ },
135
+ {
136
+ "epoch": 2.65625,
137
+ "grad_norm": 0.49075496196746826,
138
+ "learning_rate": 0.00018753830439223698,
139
+ "loss": 0.2681,
140
+ "step": 170
141
+ },
142
+ {
143
+ "epoch": 2.8125,
144
+ "grad_norm": 0.23726806044578552,
145
+ "learning_rate": 0.00018549540347293156,
146
+ "loss": 0.2527,
147
+ "step": 180
148
+ },
149
+ {
150
+ "epoch": 2.96875,
151
+ "grad_norm": 0.4829351305961609,
152
+ "learning_rate": 0.00018345250255362615,
153
+ "loss": 0.2484,
154
+ "step": 190
155
+ },
156
+ {
157
+ "epoch": 3.125,
158
+ "grad_norm": 0.3746371567249298,
159
+ "learning_rate": 0.00018140960163432076,
160
+ "loss": 0.2532,
161
+ "step": 200
162
+ },
163
+ {
164
+ "epoch": 3.125,
165
+ "eval_accuracy": 0.9176991150442478,
166
+ "eval_f1": 0.7832167832167832,
167
+ "eval_loss": 0.24235816299915314,
168
+ "eval_precision": 0.8115942028985508,
169
+ "eval_recall": 0.7567567567567568,
170
+ "eval_runtime": 0.8872,
171
+ "eval_samples_per_second": 127.371,
172
+ "eval_steps_per_second": 16.908,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 3.28125,
177
+ "grad_norm": 0.5272416472434998,
178
+ "learning_rate": 0.00017936670071501534,
179
+ "loss": 0.2438,
180
+ "step": 210
181
+ },
182
+ {
183
+ "epoch": 3.4375,
184
+ "grad_norm": 0.2519098222255707,
185
+ "learning_rate": 0.00017732379979570992,
186
+ "loss": 0.2241,
187
+ "step": 220
188
+ },
189
+ {
190
+ "epoch": 3.59375,
191
+ "grad_norm": 0.22354206442832947,
192
+ "learning_rate": 0.0001752808988764045,
193
+ "loss": 0.223,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 3.75,
198
+ "grad_norm": 0.32517552375793457,
199
+ "learning_rate": 0.0001732379979570991,
200
+ "loss": 0.2414,
201
+ "step": 240
202
+ },
203
+ {
204
+ "epoch": 3.90625,
205
+ "grad_norm": 0.32100722193717957,
206
+ "learning_rate": 0.00017119509703779367,
207
+ "loss": 0.2188,
208
+ "step": 250
209
+ },
210
+ {
211
+ "epoch": 4.0625,
212
+ "grad_norm": 0.5668182969093323,
213
+ "learning_rate": 0.00016915219611848828,
214
+ "loss": 0.1933,
215
+ "step": 260
216
+ },
217
+ {
218
+ "epoch": 4.21875,
219
+ "grad_norm": 0.42494627833366394,
220
+ "learning_rate": 0.00016710929519918286,
221
+ "loss": 0.2244,
222
+ "step": 270
223
+ },
224
+ {
225
+ "epoch": 4.375,
226
+ "grad_norm": 0.4030027687549591,
227
+ "learning_rate": 0.00016506639427987742,
228
+ "loss": 0.1939,
229
+ "step": 280
230
+ },
231
+ {
232
+ "epoch": 4.53125,
233
+ "grad_norm": 0.3662806749343872,
234
+ "learning_rate": 0.000163023493360572,
235
+ "loss": 0.2117,
236
+ "step": 290
237
+ },
238
+ {
239
+ "epoch": 4.6875,
240
+ "grad_norm": 0.23639629781246185,
241
+ "learning_rate": 0.0001609805924412666,
242
+ "loss": 0.1762,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 4.6875,
247
+ "eval_accuracy": 0.9407079646017699,
248
+ "eval_f1": 0.8452655889145496,
249
+ "eval_loss": 0.18491290509700775,
250
+ "eval_precision": 0.8672985781990521,
251
+ "eval_recall": 0.8243243243243243,
252
+ "eval_runtime": 1.2006,
253
+ "eval_samples_per_second": 94.116,
254
+ "eval_steps_per_second": 12.493,
255
+ "step": 300
256
+ },
257
+ {
258
+ "epoch": 4.84375,
259
+ "grad_norm": 0.26584741473197937,
260
+ "learning_rate": 0.0001589376915219612,
261
+ "loss": 0.1844,
262
+ "step": 310
263
+ },
264
+ {
265
+ "epoch": 5.0,
266
+ "grad_norm": 0.44927456974983215,
267
+ "learning_rate": 0.00015689479060265578,
268
+ "loss": 0.2022,
269
+ "step": 320
270
+ },
271
+ {
272
+ "epoch": 5.15625,
273
+ "grad_norm": 0.24379728734493256,
274
+ "learning_rate": 0.00015485188968335036,
275
+ "loss": 0.185,
276
+ "step": 330
277
+ },
278
+ {
279
+ "epoch": 5.3125,
280
+ "grad_norm": 0.34985587000846863,
281
+ "learning_rate": 0.00015280898876404494,
282
+ "loss": 0.1567,
283
+ "step": 340
284
+ },
285
+ {
286
+ "epoch": 5.46875,
287
+ "grad_norm": 0.325273334980011,
288
+ "learning_rate": 0.00015076608784473953,
289
+ "loss": 0.1658,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 5.625,
294
+ "grad_norm": 0.26209816336631775,
295
+ "learning_rate": 0.00014872318692543413,
296
+ "loss": 0.1739,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 5.78125,
301
+ "grad_norm": 0.32775962352752686,
302
+ "learning_rate": 0.00014668028600612872,
303
+ "loss": 0.1755,
304
+ "step": 370
305
+ },
306
+ {
307
+ "epoch": 5.9375,
308
+ "grad_norm": 0.2825620174407959,
309
+ "learning_rate": 0.0001446373850868233,
310
+ "loss": 0.1735,
311
+ "step": 380
312
+ },
313
+ {
314
+ "epoch": 6.09375,
315
+ "grad_norm": 0.18162801861763,
316
+ "learning_rate": 0.00014259448416751788,
317
+ "loss": 0.1694,
318
+ "step": 390
319
+ },
320
+ {
321
+ "epoch": 6.25,
322
+ "grad_norm": 0.3598109483718872,
323
+ "learning_rate": 0.00014055158324821247,
324
+ "loss": 0.1525,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 6.25,
329
+ "eval_accuracy": 0.9256637168141593,
330
+ "eval_f1": 0.8055555555555555,
331
+ "eval_loss": 0.18338988721370697,
332
+ "eval_precision": 0.8285714285714286,
333
+ "eval_recall": 0.7837837837837838,
334
+ "eval_runtime": 0.8607,
335
+ "eval_samples_per_second": 131.29,
336
+ "eval_steps_per_second": 17.428,
337
+ "step": 400
338
+ },
339
+ {
340
+ "epoch": 6.40625,
341
+ "grad_norm": 0.592587947845459,
342
+ "learning_rate": 0.00013850868232890705,
343
+ "loss": 0.1518,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 6.5625,
348
+ "grad_norm": 0.1838444322347641,
349
+ "learning_rate": 0.00013646578140960163,
350
+ "loss": 0.161,
351
+ "step": 420
352
+ },
353
+ {
354
+ "epoch": 6.71875,
355
+ "grad_norm": 0.326659232378006,
356
+ "learning_rate": 0.00013442288049029624,
357
+ "loss": 0.1603,
358
+ "step": 430
359
+ },
360
+ {
361
+ "epoch": 6.875,
362
+ "grad_norm": 0.45708832144737244,
363
+ "learning_rate": 0.00013237997957099082,
364
+ "loss": 0.1609,
365
+ "step": 440
366
+ },
367
+ {
368
+ "epoch": 7.03125,
369
+ "grad_norm": 0.17103692889213562,
370
+ "learning_rate": 0.0001303370786516854,
371
+ "loss": 0.1263,
372
+ "step": 450
373
+ },
374
+ {
375
+ "epoch": 7.1875,
376
+ "grad_norm": 0.40236711502075195,
377
+ "learning_rate": 0.00012829417773238,
378
+ "loss": 0.1386,
379
+ "step": 460
380
+ },
381
+ {
382
+ "epoch": 7.34375,
383
+ "grad_norm": 0.5935817360877991,
384
+ "learning_rate": 0.00012625127681307457,
385
+ "loss": 0.1419,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 7.5,
390
+ "grad_norm": 0.42183375358581543,
391
+ "learning_rate": 0.00012420837589376916,
392
+ "loss": 0.1296,
393
+ "step": 480
394
+ },
395
+ {
396
+ "epoch": 7.65625,
397
+ "grad_norm": 0.28589242696762085,
398
+ "learning_rate": 0.00012216547497446374,
399
+ "loss": 0.1298,
400
+ "step": 490
401
+ },
402
+ {
403
+ "epoch": 7.8125,
404
+ "grad_norm": 0.14183469116687775,
405
+ "learning_rate": 0.00012012257405515832,
406
+ "loss": 0.1447,
407
+ "step": 500
408
+ },
409
+ {
410
+ "epoch": 7.8125,
411
+ "eval_accuracy": 0.9415929203539823,
412
+ "eval_f1": 0.8472222222222222,
413
+ "eval_loss": 0.16118969023227692,
414
+ "eval_precision": 0.8714285714285714,
415
+ "eval_recall": 0.8243243243243243,
416
+ "eval_runtime": 0.9028,
417
+ "eval_samples_per_second": 125.167,
418
+ "eval_steps_per_second": 16.615,
419
+ "step": 500
420
+ },
421
+ {
422
+ "epoch": 7.96875,
423
+ "grad_norm": 0.4977594017982483,
424
+ "learning_rate": 0.0001180796731358529,
425
+ "loss": 0.1385,
426
+ "step": 510
427
+ },
428
+ {
429
+ "epoch": 8.125,
430
+ "grad_norm": 0.6004766225814819,
431
+ "learning_rate": 0.0001160367722165475,
432
+ "loss": 0.1532,
433
+ "step": 520
434
+ },
435
+ {
436
+ "epoch": 8.28125,
437
+ "grad_norm": 0.20785477757453918,
438
+ "learning_rate": 0.00011399387129724208,
439
+ "loss": 0.1329,
440
+ "step": 530
441
+ },
442
+ {
443
+ "epoch": 8.4375,
444
+ "grad_norm": 0.30308064818382263,
445
+ "learning_rate": 0.00011195097037793667,
446
+ "loss": 0.1351,
447
+ "step": 540
448
+ },
449
+ {
450
+ "epoch": 8.59375,
451
+ "grad_norm": 0.40658825635910034,
452
+ "learning_rate": 0.00010990806945863126,
453
+ "loss": 0.1289,
454
+ "step": 550
455
+ },
456
+ {
457
+ "epoch": 8.75,
458
+ "grad_norm": 0.15297789871692657,
459
+ "learning_rate": 0.00010786516853932584,
460
+ "loss": 0.1353,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 8.90625,
465
+ "grad_norm": 0.2919471859931946,
466
+ "learning_rate": 0.00010582226762002043,
467
+ "loss": 0.1215,
468
+ "step": 570
469
+ },
470
+ {
471
+ "epoch": 9.0625,
472
+ "grad_norm": 0.23703481256961823,
473
+ "learning_rate": 0.00010377936670071502,
474
+ "loss": 0.1107,
475
+ "step": 580
476
+ },
477
+ {
478
+ "epoch": 9.21875,
479
+ "grad_norm": 0.15953165292739868,
480
+ "learning_rate": 0.0001017364657814096,
481
+ "loss": 0.1349,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 9.375,
486
+ "grad_norm": 0.2888895869255066,
487
+ "learning_rate": 9.969356486210419e-05,
488
+ "loss": 0.1114,
489
+ "step": 600
490
+ },
491
+ {
492
+ "epoch": 9.375,
493
+ "eval_accuracy": 0.9433628318584071,
494
+ "eval_f1": 0.8545454545454546,
495
+ "eval_loss": 0.15215742588043213,
496
+ "eval_precision": 0.8623853211009175,
497
+ "eval_recall": 0.8468468468468469,
498
+ "eval_runtime": 0.8895,
499
+ "eval_samples_per_second": 127.032,
500
+ "eval_steps_per_second": 16.863,
501
+ "step": 600
502
+ },
503
+ {
504
+ "epoch": 9.53125,
505
+ "grad_norm": 0.23866873979568481,
506
+ "learning_rate": 9.765066394279879e-05,
507
+ "loss": 0.1097,
508
+ "step": 610
509
+ },
510
+ {
511
+ "epoch": 9.6875,
512
+ "grad_norm": 0.2734706997871399,
513
+ "learning_rate": 9.560776302349337e-05,
514
+ "loss": 0.1096,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 9.84375,
519
+ "grad_norm": 0.5498181581497192,
520
+ "learning_rate": 9.356486210418795e-05,
521
+ "loss": 0.1039,
522
+ "step": 630
523
+ },
524
+ {
525
+ "epoch": 10.0,
526
+ "grad_norm": 0.17813219130039215,
527
+ "learning_rate": 9.152196118488255e-05,
528
+ "loss": 0.1002,
529
+ "step": 640
530
+ },
531
+ {
532
+ "epoch": 10.15625,
533
+ "grad_norm": 0.34687340259552,
534
+ "learning_rate": 8.947906026557712e-05,
535
+ "loss": 0.1164,
536
+ "step": 650
537
+ },
538
+ {
539
+ "epoch": 10.3125,
540
+ "grad_norm": 0.18163448572158813,
541
+ "learning_rate": 8.743615934627171e-05,
542
+ "loss": 0.1038,
543
+ "step": 660
544
+ },
545
+ {
546
+ "epoch": 10.46875,
547
+ "grad_norm": 0.2275228351354599,
548
+ "learning_rate": 8.53932584269663e-05,
549
+ "loss": 0.1165,
550
+ "step": 670
551
+ },
552
+ {
553
+ "epoch": 10.625,
554
+ "grad_norm": 0.3568095862865448,
555
+ "learning_rate": 8.335035750766088e-05,
556
+ "loss": 0.0986,
557
+ "step": 680
558
+ },
559
+ {
560
+ "epoch": 10.78125,
561
+ "grad_norm": 0.2227988988161087,
562
+ "learning_rate": 8.130745658835548e-05,
563
+ "loss": 0.1139,
564
+ "step": 690
565
+ },
566
+ {
567
+ "epoch": 10.9375,
568
+ "grad_norm": 0.3007115423679352,
569
+ "learning_rate": 7.926455566905006e-05,
570
+ "loss": 0.1004,
571
+ "step": 700
572
+ },
573
+ {
574
+ "epoch": 10.9375,
575
+ "eval_accuracy": 0.9451327433628318,
576
+ "eval_f1": 0.8571428571428571,
577
+ "eval_loss": 0.15245945751667023,
578
+ "eval_precision": 0.8773584905660378,
579
+ "eval_recall": 0.8378378378378378,
580
+ "eval_runtime": 1.1963,
581
+ "eval_samples_per_second": 94.461,
582
+ "eval_steps_per_second": 12.539,
583
+ "step": 700
584
+ },
585
+ {
586
+ "epoch": 11.09375,
587
+ "grad_norm": 0.2825964689254761,
588
+ "learning_rate": 7.722165474974464e-05,
589
+ "loss": 0.1071,
590
+ "step": 710
591
+ },
592
+ {
593
+ "epoch": 11.25,
594
+ "grad_norm": 0.2866518795490265,
595
+ "learning_rate": 7.517875383043922e-05,
596
+ "loss": 0.0797,
597
+ "step": 720
598
+ },
599
+ {
600
+ "epoch": 11.40625,
601
+ "grad_norm": 0.40209805965423584,
602
+ "learning_rate": 7.313585291113382e-05,
603
+ "loss": 0.0919,
604
+ "step": 730
605
+ },
606
+ {
607
+ "epoch": 11.5625,
608
+ "grad_norm": 0.22357816994190216,
609
+ "learning_rate": 7.10929519918284e-05,
610
+ "loss": 0.0862,
611
+ "step": 740
612
+ },
613
+ {
614
+ "epoch": 11.71875,
615
+ "grad_norm": 0.21823176741600037,
616
+ "learning_rate": 6.905005107252299e-05,
617
+ "loss": 0.0921,
618
+ "step": 750
619
+ },
620
+ {
621
+ "epoch": 11.875,
622
+ "grad_norm": 0.37442561984062195,
623
+ "learning_rate": 6.700715015321757e-05,
624
+ "loss": 0.0926,
625
+ "step": 760
626
+ },
627
+ {
628
+ "epoch": 12.03125,
629
+ "grad_norm": 0.10888390988111496,
630
+ "learning_rate": 6.496424923391215e-05,
631
+ "loss": 0.0758,
632
+ "step": 770
633
+ },
634
+ {
635
+ "epoch": 12.1875,
636
+ "grad_norm": 0.30619969964027405,
637
+ "learning_rate": 6.292134831460675e-05,
638
+ "loss": 0.0824,
639
+ "step": 780
640
+ },
641
+ {
642
+ "epoch": 12.34375,
643
+ "grad_norm": 0.2507191598415375,
644
+ "learning_rate": 6.087844739530133e-05,
645
+ "loss": 0.0815,
646
+ "step": 790
647
+ },
648
+ {
649
+ "epoch": 12.5,
650
+ "grad_norm": 0.1795010268688202,
651
+ "learning_rate": 5.883554647599592e-05,
652
+ "loss": 0.0831,
653
+ "step": 800
654
+ },
655
+ {
656
+ "epoch": 12.5,
657
+ "eval_accuracy": 0.9513274336283186,
658
+ "eval_f1": 0.8741418764302059,
659
+ "eval_loss": 0.14417380094528198,
660
+ "eval_precision": 0.8883720930232558,
661
+ "eval_recall": 0.8603603603603603,
662
+ "eval_runtime": 0.9097,
663
+ "eval_samples_per_second": 124.22,
664
+ "eval_steps_per_second": 16.489,
665
+ "step": 800
666
+ },
667
+ {
668
+ "epoch": 12.65625,
669
+ "grad_norm": 0.3208577334880829,
670
+ "learning_rate": 5.67926455566905e-05,
671
+ "loss": 0.0759,
672
+ "step": 810
673
+ },
674
+ {
675
+ "epoch": 12.8125,
676
+ "grad_norm": 0.13223229348659515,
677
+ "learning_rate": 5.474974463738509e-05,
678
+ "loss": 0.087,
679
+ "step": 820
680
+ },
681
+ {
682
+ "epoch": 12.96875,
683
+ "grad_norm": 0.25973808765411377,
684
+ "learning_rate": 5.270684371807968e-05,
685
+ "loss": 0.0919,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 13.125,
690
+ "grad_norm": 0.2930574417114258,
691
+ "learning_rate": 5.0663942798774264e-05,
692
+ "loss": 0.0826,
693
+ "step": 840
694
+ },
695
+ {
696
+ "epoch": 13.28125,
697
+ "grad_norm": 0.28331613540649414,
698
+ "learning_rate": 4.862104187946885e-05,
699
+ "loss": 0.0703,
700
+ "step": 850
701
+ },
702
+ {
703
+ "epoch": 13.4375,
704
+ "grad_norm": 0.5403579473495483,
705
+ "learning_rate": 4.657814096016344e-05,
706
+ "loss": 0.0877,
707
+ "step": 860
708
+ },
709
+ {
710
+ "epoch": 13.59375,
711
+ "grad_norm": 0.24780268967151642,
712
+ "learning_rate": 4.453524004085802e-05,
713
+ "loss": 0.0783,
714
+ "step": 870
715
+ },
716
+ {
717
+ "epoch": 13.75,
718
+ "grad_norm": 0.43411558866500854,
719
+ "learning_rate": 4.24923391215526e-05,
720
+ "loss": 0.0809,
721
+ "step": 880
722
+ },
723
+ {
724
+ "epoch": 13.90625,
725
+ "grad_norm": 0.33748045563697815,
726
+ "learning_rate": 4.044943820224719e-05,
727
+ "loss": 0.0875,
728
+ "step": 890
729
+ },
730
+ {
731
+ "epoch": 14.0625,
732
+ "grad_norm": 0.18760047852993011,
733
+ "learning_rate": 3.840653728294178e-05,
734
+ "loss": 0.0654,
735
+ "step": 900
736
+ },
737
+ {
738
+ "epoch": 14.0625,
739
+ "eval_accuracy": 0.9495575221238938,
740
+ "eval_f1": 0.8689655172413793,
741
+ "eval_loss": 0.13779258728027344,
742
+ "eval_precision": 0.8873239436619719,
743
+ "eval_recall": 0.8513513513513513,
744
+ "eval_runtime": 0.8895,
745
+ "eval_samples_per_second": 127.038,
746
+ "eval_steps_per_second": 16.863,
747
+ "step": 900
748
+ },
749
+ {
750
+ "epoch": 14.21875,
751
+ "grad_norm": 0.15740624070167542,
752
+ "learning_rate": 3.6363636363636364e-05,
753
+ "loss": 0.0729,
754
+ "step": 910
755
+ },
756
+ {
757
+ "epoch": 14.375,
758
+ "grad_norm": 0.21466964483261108,
759
+ "learning_rate": 3.4320735444330954e-05,
760
+ "loss": 0.0613,
761
+ "step": 920
762
+ },
763
+ {
764
+ "epoch": 14.53125,
765
+ "grad_norm": 0.32680827379226685,
766
+ "learning_rate": 3.2277834525025536e-05,
767
+ "loss": 0.0636,
768
+ "step": 930
769
+ },
770
+ {
771
+ "epoch": 14.6875,
772
+ "grad_norm": 0.14597666263580322,
773
+ "learning_rate": 3.0234933605720123e-05,
774
+ "loss": 0.0758,
775
+ "step": 940
776
+ },
777
+ {
778
+ "epoch": 14.84375,
779
+ "grad_norm": 0.1691855937242508,
780
+ "learning_rate": 2.819203268641471e-05,
781
+ "loss": 0.0788,
782
+ "step": 950
783
+ },
784
+ {
785
+ "epoch": 15.0,
786
+ "grad_norm": 0.6366788148880005,
787
+ "learning_rate": 2.61491317671093e-05,
788
+ "loss": 0.0775,
789
+ "step": 960
790
+ },
791
+ {
792
+ "epoch": 15.15625,
793
+ "grad_norm": 0.4018935561180115,
794
+ "learning_rate": 2.410623084780388e-05,
795
+ "loss": 0.0616,
796
+ "step": 970
797
+ },
798
+ {
799
+ "epoch": 15.3125,
800
+ "grad_norm": 0.2861553728580475,
801
+ "learning_rate": 2.2063329928498467e-05,
802
+ "loss": 0.0648,
803
+ "step": 980
804
+ },
805
+ {
806
+ "epoch": 15.46875,
807
+ "grad_norm": 0.14629952609539032,
808
+ "learning_rate": 2.0020429009193057e-05,
809
+ "loss": 0.0687,
810
+ "step": 990
811
+ },
812
+ {
813
+ "epoch": 15.625,
814
+ "grad_norm": 0.18827380239963531,
815
+ "learning_rate": 1.797752808988764e-05,
816
+ "loss": 0.0583,
817
+ "step": 1000
818
+ },
819
+ {
820
+ "epoch": 15.625,
821
+ "eval_accuracy": 0.9530973451327434,
822
+ "eval_f1": 0.8798185941043084,
823
+ "eval_loss": 0.13913680613040924,
824
+ "eval_precision": 0.8858447488584474,
825
+ "eval_recall": 0.8738738738738738,
826
+ "eval_runtime": 1.128,
827
+ "eval_samples_per_second": 100.175,
828
+ "eval_steps_per_second": 13.298,
829
+ "step": 1000
830
+ },
831
+ {
832
+ "epoch": 15.78125,
833
+ "grad_norm": 0.4585842788219452,
834
+ "learning_rate": 1.593462717058223e-05,
835
+ "loss": 0.061,
836
+ "step": 1010
837
+ },
838
+ {
839
+ "epoch": 15.9375,
840
+ "grad_norm": 0.20938844978809357,
841
+ "learning_rate": 1.3891726251276812e-05,
842
+ "loss": 0.0682,
843
+ "step": 1020
844
+ },
845
+ {
846
+ "epoch": 16.09375,
847
+ "grad_norm": 0.1632617712020874,
848
+ "learning_rate": 1.18488253319714e-05,
849
+ "loss": 0.065,
850
+ "step": 1030
851
+ },
852
+ {
853
+ "epoch": 16.25,
854
+ "grad_norm": 0.25956493616104126,
855
+ "learning_rate": 9.805924412665986e-06,
856
+ "loss": 0.0761,
857
+ "step": 1040
858
+ },
859
+ {
860
+ "epoch": 16.40625,
861
+ "grad_norm": 0.17571082711219788,
862
+ "learning_rate": 7.763023493360572e-06,
863
+ "loss": 0.0676,
864
+ "step": 1050
865
+ },
866
+ {
867
+ "epoch": 16.5625,
868
+ "grad_norm": 0.1871979534626007,
869
+ "learning_rate": 5.720122574055159e-06,
870
+ "loss": 0.0663,
871
+ "step": 1060
872
+ },
873
+ {
874
+ "epoch": 16.71875,
875
+ "grad_norm": 0.18988612294197083,
876
+ "learning_rate": 3.677221654749745e-06,
877
+ "loss": 0.0608,
878
+ "step": 1070
879
+ },
880
+ {
881
+ "epoch": 16.875,
882
+ "grad_norm": 0.18478451669216156,
883
+ "learning_rate": 1.6343207354443311e-06,
884
+ "loss": 0.0511,
885
+ "step": 1080
886
+ },
887
+ {
888
+ "epoch": 17.0,
889
+ "step": 1088,
890
+ "total_flos": 1.333267779734618e+18,
891
+ "train_loss": 0.16752693287151701,
892
+ "train_runtime": 275.2566,
893
+ "train_samples_per_second": 62.502,
894
+ "train_steps_per_second": 3.953
895
+ }
896
+ ],
897
+ "logging_steps": 10,
898
+ "max_steps": 1088,
899
+ "num_input_tokens_seen": 0,
900
+ "num_train_epochs": 17,
901
+ "save_steps": 500,
902
+ "stateful_callbacks": {
903
+ "TrainerControl": {
904
+ "args": {
905
+ "should_epoch_stop": false,
906
+ "should_evaluate": false,
907
+ "should_log": false,
908
+ "should_save": true,
909
+ "should_training_stop": true
910
+ },
911
+ "attributes": {}
912
+ }
913
+ },
914
+ "total_flos": 1.333267779734618e+18,
915
+ "train_batch_size": 16,
916
+ "trial_name": null,
917
+ "trial_params": null
918
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a59cbf37d5ef3ab1e40df24d1382d3bb0caafc421913f24d4dca9e1b84788517
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0929cc3c8fd963acaabc5e79cef7ef6ed9cc06e7d16d1c8702a5eda3b45072d
3
  size 5176