{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 2085,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "learning_rate": 0.00014354066985645933,
      "loss": 4.6999,
      "step": 30
    },
    {
      "epoch": 0.09,
      "learning_rate": 0.00028708133971291867,
      "loss": 4.0234,
      "step": 60
    },
    {
      "epoch": 0.13,
      "learning_rate": 0.00043062200956937803,
      "loss": 3.84,
      "step": 90
    },
    {
      "epoch": 0.17,
      "learning_rate": 0.0005741626794258373,
      "loss": 3.7458,
      "step": 120
    },
    {
      "epoch": 0.22,
      "learning_rate": 0.0007177033492822966,
      "loss": 3.6694,
      "step": 150
    },
    {
      "epoch": 0.26,
      "learning_rate": 0.0008612440191387561,
      "loss": 3.6173,
      "step": 180
    },
    {
      "epoch": 0.3,
      "learning_rate": 0.000999999298909658,
      "loss": 3.5916,
      "step": 210
    },
    {
      "epoch": 0.35,
      "learning_rate": 0.0009993264033223551,
      "loss": 3.5461,
      "step": 240
    },
    {
      "epoch": 0.39,
      "learning_rate": 0.0009973935099776767,
      "loss": 3.5042,
      "step": 270
    },
    {
      "epoch": 0.43,
      "learning_rate": 0.0009942054963290549,
      "loss": 3.4956,
      "step": 300
    },
    {
      "epoch": 0.47,
      "learning_rate": 0.0009897704069942402,
      "loss": 3.5649,
      "step": 330
    },
    {
      "epoch": 0.52,
      "learning_rate": 0.0009840994334555549,
      "loss": 3.5746,
      "step": 360
    },
    {
      "epoch": 0.56,
      "learning_rate": 0.0009772068858193608,
      "loss": 3.5424,
      "step": 390
    },
    {
      "epoch": 0.6,
      "learning_rate": 0.000969110156706009,
      "loss": 3.567,
      "step": 420
    },
    {
      "epoch": 0.65,
      "learning_rate": 0.0009598296773613879,
      "loss": 3.4753,
      "step": 450
    },
    {
      "epoch": 0.69,
      "learning_rate": 0.0009493888661008194,
      "loss": 3.4324,
      "step": 480
    },
    {
      "epoch": 0.73,
      "learning_rate": 0.0009378140692153992,
      "loss": 3.4661,
      "step": 510
    },
    {
      "epoch": 0.78,
      "learning_rate": 0.0009251344944898958,
      "loss": 3.4081,
      "step": 540
    },
    {
      "epoch": 0.82,
      "learning_rate": 0.0009113821374999736,
      "loss": 3.3848,
      "step": 570
    },
    {
      "epoch": 0.86,
      "learning_rate": 0.0008965917008747158,
      "loss": 3.3552,
      "step": 600
    },
    {
      "epoch": 0.91,
      "learning_rate": 0.000880800506728183,
      "loss": 3.3352,
      "step": 630
    },
    {
      "epoch": 0.95,
      "learning_rate": 0.0008640484024809779,
      "loss": 3.2935,
      "step": 660
    },
    {
      "epoch": 0.99,
      "learning_rate": 0.0008463776603094606,
      "loss": 3.2754,
      "step": 690
    },
    {
      "epoch": 1.04,
      "learning_rate": 0.0008278328704763516,
      "loss": 3.2657,
      "step": 720
    },
    {
      "epoch": 1.08,
      "learning_rate": 0.0008084608288118838,
      "loss": 3.256,
      "step": 750
    },
    {
      "epoch": 1.12,
      "learning_rate": 0.0007883104186294383,
      "loss": 3.2201,
      "step": 780
    },
    {
      "epoch": 1.17,
      "learning_rate": 0.0007674324873736347,
      "loss": 3.1941,
      "step": 810
    },
    {
      "epoch": 1.21,
      "learning_rate": 0.0007458797183121429,
      "loss": 3.1856,
      "step": 840
    },
    {
      "epoch": 1.25,
      "learning_rate": 0.0007237064975949886,
      "loss": 3.1649,
      "step": 870
    },
    {
      "epoch": 1.29,
      "learning_rate": 0.0007009687770168125,
      "loss": 3.155,
      "step": 900
    },
    {
      "epoch": 1.34,
      "learning_rate": 0.0006777239328283909,
      "loss": 3.1357,
      "step": 930
    },
    {
      "epoch": 1.38,
      "learning_rate": 0.0006540306209536906,
      "loss": 3.1309,
      "step": 960
    },
    {
      "epoch": 1.42,
      "learning_rate": 0.0006299486289778033,
      "loss": 3.1091,
      "step": 990
    },
    {
      "epoch": 1.47,
      "learning_rate": 0.0006055387252792498,
      "loss": 3.0958,
      "step": 1020
    },
    {
      "epoch": 1.51,
      "learning_rate": 0.0005808625056873581,
      "loss": 3.0813,
      "step": 1050
    },
    {
      "epoch": 1.55,
      "learning_rate": 0.0005559822380516539,
      "loss": 3.0854,
      "step": 1080
    },
    {
      "epoch": 1.6,
      "learning_rate": 0.0005309607051154799,
      "loss": 3.068,
      "step": 1110
    },
    {
      "epoch": 1.64,
      "learning_rate": 0.0005058610460903332,
      "loss": 3.0664,
      "step": 1140
    },
    {
      "epoch": 1.68,
      "learning_rate": 0.00048074659733069516,
      "loss": 3.0627,
      "step": 1170
    },
    {
      "epoch": 1.73,
      "learning_rate": 0.00045568073251138903,
      "loss": 3.0376,
      "step": 1200
    },
    {
      "epoch": 1.77,
      "learning_rate": 0.0004307267027107653,
      "loss": 3.0274,
      "step": 1230
    },
    {
      "epoch": 1.81,
      "learning_rate": 0.00040594747680324574,
      "loss": 3.0212,
      "step": 1260
    },
    {
      "epoch": 1.86,
      "learning_rate": 0.0003814055825639795,
      "loss": 3.0101,
      "step": 1290
    },
    {
      "epoch": 1.9,
      "learning_rate": 0.000357162948886567,
      "loss": 3.0076,
      "step": 1320
    },
    {
      "epoch": 1.94,
      "learning_rate": 0.00033328074951199845,
      "loss": 3.0129,
      "step": 1350
    },
    {
      "epoch": 1.99,
      "learning_rate": 0.0003098192486631408,
      "loss": 2.9902,
      "step": 1380
    },
    {
      "epoch": 2.03,
      "learning_rate": 0.00028683764897429804,
      "loss": 2.9819,
      "step": 1410
    },
    {
      "epoch": 2.07,
      "learning_rate": 0.0002643939420995788,
      "loss": 2.9891,
      "step": 1440
    },
    {
      "epoch": 2.12,
      "learning_rate": 0.00024254476237704588,
      "loss": 2.9702,
      "step": 1470
    },
    {
      "epoch": 2.16,
      "learning_rate": 0.00022134524391790916,
      "loss": 2.972,
      "step": 1500
    },
    {
      "epoch": 2.2,
      "learning_rate": 0.00020084888148138487,
      "loss": 2.9733,
      "step": 1530
    },
    {
      "epoch": 2.24,
      "learning_rate": 0.00018110739548628618,
      "loss": 2.9688,
      "step": 1560
    },
    {
      "epoch": 2.29,
      "learning_rate": 0.0001621706014999767,
      "loss": 2.9629,
      "step": 1590
    },
    {
      "epoch": 2.33,
      "learning_rate": 0.00014408628453401574,
      "loss": 2.9609,
      "step": 1620
    },
    {
      "epoch": 2.37,
      "learning_rate": 0.00012690007846369856,
      "loss": 2.9461,
      "step": 1650
    },
    {
      "epoch": 2.42,
      "learning_rate": 0.00011065535087576234,
      "loss": 2.9493,
      "step": 1680
    },
    {
      "epoch": 2.46,
      "learning_rate": 9.539309363483478e-05,
      "loss": 2.9417,
      "step": 1710
    },
    {
      "epoch": 2.5,
      "learning_rate": 8.115181944476685e-05,
      "loss": 2.9459,
      "step": 1740
    },
    {
      "epoch": 2.55,
      "learning_rate": 6.796746466586756e-05,
      "loss": 2.9273,
      "step": 1770
    },
    {
      "epoch": 2.59,
      "learning_rate": 5.58732986332719e-05,
      "loss": 2.9397,
      "step": 1800
    },
    {
      "epoch": 2.63,
      "learning_rate": 4.4899839705266174e-05,
      "loss": 2.943,
      "step": 1830
    },
    {
      "epoch": 2.68,
      "learning_rate": 3.507477825341493e-05,
      "loss": 2.9381,
      "step": 1860
    },
    {
      "epoch": 2.72,
      "learning_rate": 2.642290678881504e-05,
      "loss": 2.9509,
      "step": 1890
    },
    {
      "epoch": 2.76,
      "learning_rate": 1.8966057400797153e-05,
      "loss": 2.9437,
      "step": 1920
    },
    {
      "epoch": 2.81,
      "learning_rate": 1.272304666594032e-05,
      "loss": 2.9374,
      "step": 1950
    },
    {
      "epoch": 2.85,
      "learning_rate": 7.709628166416128e-06,
      "loss": 2.9508,
      "step": 1980
    },
    {
      "epoch": 2.89,
      "learning_rate": 3.93845273747806e-06,
      "loss": 2.9407,
      "step": 2010
    },
    {
      "epoch": 2.94,
      "learning_rate": 1.4190365444071153e-06,
      "loss": 2.949,
      "step": 2040
    },
    {
      "epoch": 2.98,
      "learning_rate": 1.5773706946742295e-07,
      "loss": 2.9544,
      "step": 2070
    },
    {
      "epoch": 3.0,
      "step": 2085,
      "total_flos": 1.731497869485015e+17,
      "train_loss": 3.211610127181458,
      "train_runtime": 4482.5265,
      "train_samples_per_second": 74.379,
      "train_steps_per_second": 0.465
    }
  ],
  "logging_steps": 30,
  "max_steps": 2085,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 5000,
  "total_flos": 1.731497869485015e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}