{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.954337899543379, "eval_steps": 500, "global_step": 1090, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0091324200913242, "grad_norm": 382.0, "learning_rate": 1.8348623853211011e-06, "loss": 46.9033, "step": 1 }, { "epoch": 0.045662100456621, "grad_norm": 318.0, "learning_rate": 9.174311926605506e-06, "loss": 46.3618, "step": 5 }, { "epoch": 0.091324200913242, "grad_norm": 139.0, "learning_rate": 1.834862385321101e-05, "loss": 39.3883, "step": 10 }, { "epoch": 0.136986301369863, "grad_norm": 52.25, "learning_rate": 2.7522935779816515e-05, "loss": 29.4216, "step": 15 }, { "epoch": 0.182648401826484, "grad_norm": 20.25, "learning_rate": 3.669724770642202e-05, "loss": 24.7169, "step": 20 }, { "epoch": 0.228310502283105, "grad_norm": 10.0, "learning_rate": 4.587155963302753e-05, "loss": 21.4244, "step": 25 }, { "epoch": 0.273972602739726, "grad_norm": 5.0, "learning_rate": 5.504587155963303e-05, "loss": 19.7804, "step": 30 }, { "epoch": 0.319634703196347, "grad_norm": 4.6875, "learning_rate": 6.422018348623854e-05, "loss": 19.1075, "step": 35 }, { "epoch": 0.365296803652968, "grad_norm": 8.5, "learning_rate": 7.339449541284404e-05, "loss": 18.037, "step": 40 }, { "epoch": 0.410958904109589, "grad_norm": 18.75, "learning_rate": 8.256880733944955e-05, "loss": 17.2652, "step": 45 }, { "epoch": 0.45662100456621, "grad_norm": 29.875, "learning_rate": 9.174311926605506e-05, "loss": 14.4775, "step": 50 }, { "epoch": 0.502283105022831, "grad_norm": 37.75, "learning_rate": 0.00010091743119266055, "loss": 9.6302, "step": 55 }, { "epoch": 0.547945205479452, "grad_norm": 8.75, "learning_rate": 0.00011009174311926606, "loss": 4.0499, "step": 60 }, { "epoch": 0.593607305936073, "grad_norm": 7.03125, "learning_rate": 0.00011926605504587157, "loss": 2.4784, "step": 65 }, { "epoch": 0.639269406392694, "grad_norm": 2.546875, "learning_rate": 0.00012844036697247707, "loss": 2.0967, "step": 70 }, { "epoch": 0.684931506849315, "grad_norm": 0.83984375, "learning_rate": 0.00013761467889908258, "loss": 1.8211, "step": 75 }, { "epoch": 0.730593607305936, "grad_norm": 1.5546875, "learning_rate": 0.0001467889908256881, "loss": 1.6757, "step": 80 }, { "epoch": 0.776255707762557, "grad_norm": 1.0546875, "learning_rate": 0.0001559633027522936, "loss": 1.5691, "step": 85 }, { "epoch": 0.821917808219178, "grad_norm": 0.96484375, "learning_rate": 0.0001651376146788991, "loss": 1.4872, "step": 90 }, { "epoch": 0.867579908675799, "grad_norm": 0.90234375, "learning_rate": 0.00017431192660550458, "loss": 1.4228, "step": 95 }, { "epoch": 0.91324200913242, "grad_norm": 2.546875, "learning_rate": 0.00018348623853211012, "loss": 1.3778, "step": 100 }, { "epoch": 0.958904109589041, "grad_norm": 1.5390625, "learning_rate": 0.0001926605504587156, "loss": 1.3554, "step": 105 }, { "epoch": 0.9954337899543378, "eval_loss": 2.645094871520996, "eval_runtime": 0.2786, "eval_samples_per_second": 35.888, "eval_steps_per_second": 3.589, "step": 109 }, { "epoch": 1.004566210045662, "grad_norm": 0.9296875, "learning_rate": 0.00019999948721966259, "loss": 1.326, "step": 110 }, { "epoch": 1.0502283105022832, "grad_norm": 1.015625, "learning_rate": 0.00019998154046002822, "loss": 1.291, "step": 115 }, { "epoch": 1.095890410958904, "grad_norm": 1.0625, "learning_rate": 0.0001999379599421534, "loss": 1.2721, "step": 120 }, { "epoch": 1.1415525114155252, "grad_norm": 1.5546875, "learning_rate": 0.00019986875683942535, "loss": 1.2479, "step": 125 }, { "epoch": 1.187214611872146, "grad_norm": 3.375, "learning_rate": 0.00019977394889447524, "loss": 1.2491, "step": 130 }, { "epoch": 1.2328767123287672, "grad_norm": 1.1171875, "learning_rate": 0.00019965356041462955, "loss": 1.2212, "step": 135 }, { "epoch": 1.278538812785388, "grad_norm": 1.546875, "learning_rate": 0.00019950762226567781, "loss": 1.2246, "step": 140 }, { "epoch": 1.3242009132420092, "grad_norm": 1.984375, "learning_rate": 0.00019933617186395917, "loss": 1.2387, "step": 145 }, { "epoch": 1.36986301369863, "grad_norm": 2.46875, "learning_rate": 0.00019913925316676945, "loss": 1.203, "step": 150 }, { "epoch": 1.4155251141552512, "grad_norm": 0.93359375, "learning_rate": 0.00019891691666109113, "loss": 1.1869, "step": 155 }, { "epoch": 1.461187214611872, "grad_norm": 2.28125, "learning_rate": 0.00019866921935064906, "loss": 1.1858, "step": 160 }, { "epoch": 1.5068493150684932, "grad_norm": 1.296875, "learning_rate": 0.00019839622474129596, "loss": 1.1696, "step": 165 }, { "epoch": 1.5525114155251143, "grad_norm": 0.81640625, "learning_rate": 0.00019809800282473013, "loss": 1.1624, "step": 170 }, { "epoch": 1.5981735159817352, "grad_norm": 1.734375, "learning_rate": 0.0001977746300605507, "loss": 1.1494, "step": 175 }, { "epoch": 1.643835616438356, "grad_norm": 1.203125, "learning_rate": 0.00019742618935665476, "loss": 1.1314, "step": 180 }, { "epoch": 1.6894977168949772, "grad_norm": 2.078125, "learning_rate": 0.00019705277004798073, "loss": 1.1407, "step": 185 }, { "epoch": 1.7351598173515983, "grad_norm": 2.0625, "learning_rate": 0.0001966544678736044, "loss": 1.1287, "step": 190 }, { "epoch": 1.7808219178082192, "grad_norm": 0.91796875, "learning_rate": 0.00019623138495219292, "loss": 1.1407, "step": 195 }, { "epoch": 1.82648401826484, "grad_norm": 12.0, "learning_rate": 0.00019578362975582292, "loss": 1.1151, "step": 200 }, { "epoch": 1.8721461187214612, "grad_norm": 1.2890625, "learning_rate": 0.00019531131708217005, "loss": 1.1221, "step": 205 }, { "epoch": 1.9178082191780823, "grad_norm": 0.90625, "learning_rate": 0.0001948145680250766, "loss": 1.0982, "step": 210 }, { "epoch": 1.9634703196347032, "grad_norm": 0.8984375, "learning_rate": 0.00019429350994350483, "loss": 1.0898, "step": 215 }, { "epoch": 2.0, "eval_loss": 2.508340358734131, "eval_runtime": 0.2456, "eval_samples_per_second": 40.72, "eval_steps_per_second": 4.072, "step": 219 }, { "epoch": 2.009132420091324, "grad_norm": 2.59375, "learning_rate": 0.00019374827642888398, "loss": 1.1068, "step": 220 }, { "epoch": 2.0547945205479454, "grad_norm": 4.09375, "learning_rate": 0.0001931790072708596, "loss": 1.0932, "step": 225 }, { "epoch": 2.1004566210045663, "grad_norm": 1.5, "learning_rate": 0.00019258584842145343, "loss": 1.1001, "step": 230 }, { "epoch": 2.146118721461187, "grad_norm": 11.25, "learning_rate": 0.00019196895195764362, "loss": 1.1001, "step": 235 }, { "epoch": 2.191780821917808, "grad_norm": 1.9375, "learning_rate": 0.0001913284760423745, "loss": 1.1046, "step": 240 }, { "epoch": 2.237442922374429, "grad_norm": 2.828125, "learning_rate": 0.00019066458488400584, "loss": 1.0795, "step": 245 }, { "epoch": 2.2831050228310503, "grad_norm": 1.1953125, "learning_rate": 0.00018997744869421246, "loss": 1.0767, "step": 250 }, { "epoch": 2.328767123287671, "grad_norm": 4.875, "learning_rate": 0.00018926724364434446, "loss": 1.059, "step": 255 }, { "epoch": 2.374429223744292, "grad_norm": 3.171875, "learning_rate": 0.0001885341518202595, "loss": 1.0695, "step": 260 }, { "epoch": 2.4200913242009134, "grad_norm": 0.734375, "learning_rate": 0.00018777836117563892, "loss": 1.0709, "step": 265 }, { "epoch": 2.4657534246575343, "grad_norm": 7.59375, "learning_rate": 0.00018700006548379898, "loss": 1.0677, "step": 270 }, { "epoch": 2.5114155251141552, "grad_norm": 0.984375, "learning_rate": 0.0001861994642880105, "loss": 1.0693, "step": 275 }, { "epoch": 2.557077625570776, "grad_norm": 0.95703125, "learning_rate": 0.00018537676285033887, "loss": 1.0508, "step": 280 }, { "epoch": 2.602739726027397, "grad_norm": 0.578125, "learning_rate": 0.0001845321720990181, "loss": 1.0449, "step": 285 }, { "epoch": 2.6484018264840183, "grad_norm": 1.015625, "learning_rate": 0.00018366590857437184, "loss": 1.0562, "step": 290 }, { "epoch": 2.6940639269406392, "grad_norm": 1.734375, "learning_rate": 0.00018277819437329576, "loss": 1.0428, "step": 295 }, { "epoch": 2.73972602739726, "grad_norm": 1.53125, "learning_rate": 0.00018186925709231532, "loss": 1.0321, "step": 300 }, { "epoch": 2.7853881278538815, "grad_norm": 1.0703125, "learning_rate": 0.0001809393297692334, "loss": 1.0253, "step": 305 }, { "epoch": 2.8310502283105023, "grad_norm": 3.1875, "learning_rate": 0.0001799886508233829, "loss": 1.0377, "step": 310 }, { "epoch": 2.8767123287671232, "grad_norm": 3.484375, "learning_rate": 0.0001790174639944997, "loss": 1.0359, "step": 315 }, { "epoch": 2.922374429223744, "grad_norm": 6.09375, "learning_rate": 0.00017802601828023138, "loss": 1.0428, "step": 320 }, { "epoch": 2.968036529680365, "grad_norm": 3.03125, "learning_rate": 0.00017701456787229804, "loss": 1.0434, "step": 325 }, { "epoch": 2.9954337899543377, "eval_loss": 2.480058193206787, "eval_runtime": 0.2581, "eval_samples_per_second": 38.741, "eval_steps_per_second": 3.874, "step": 328 }, { "epoch": 3.0136986301369864, "grad_norm": 7.625, "learning_rate": 0.0001759833720913214, "loss": 1.0302, "step": 330 }, { "epoch": 3.0593607305936072, "grad_norm": 1.7890625, "learning_rate": 0.00017493269532033883, "loss": 1.0273, "step": 335 }, { "epoch": 3.105022831050228, "grad_norm": 1.71875, "learning_rate": 0.0001738628069370195, "loss": 1.0212, "step": 340 }, { "epoch": 3.1506849315068495, "grad_norm": 1.3515625, "learning_rate": 0.00017277398124460023, "loss": 1.013, "step": 345 }, { "epoch": 3.1963470319634704, "grad_norm": 2.390625, "learning_rate": 0.000171666497401558, "loss": 1.0077, "step": 350 }, { "epoch": 3.2420091324200913, "grad_norm": 0.921875, "learning_rate": 0.0001705406393500381, "loss": 1.0111, "step": 355 }, { "epoch": 3.287671232876712, "grad_norm": 1.1875, "learning_rate": 0.00016939669574305566, "loss": 1.0047, "step": 360 }, { "epoch": 3.3333333333333335, "grad_norm": 0.97265625, "learning_rate": 0.0001682349598704892, "loss": 0.9977, "step": 365 }, { "epoch": 3.3789954337899544, "grad_norm": 1.09375, "learning_rate": 0.00016705572958388576, "loss": 0.9914, "step": 370 }, { "epoch": 3.4246575342465753, "grad_norm": 1.046875, "learning_rate": 0.00016585930722009601, "loss": 1.0012, "step": 375 }, { "epoch": 3.470319634703196, "grad_norm": 1.7421875, "learning_rate": 0.00016464599952375998, "loss": 0.9888, "step": 380 }, { "epoch": 3.5159817351598175, "grad_norm": 0.73828125, "learning_rate": 0.000163416117568662, "loss": 1.0036, "step": 385 }, { "epoch": 3.5616438356164384, "grad_norm": 2.515625, "learning_rate": 0.0001621699766779763, "loss": 0.9963, "step": 390 }, { "epoch": 3.6073059360730593, "grad_norm": 1.203125, "learning_rate": 0.00016090789634342278, "loss": 0.9955, "step": 395 }, { "epoch": 3.65296803652968, "grad_norm": 1.53125, "learning_rate": 0.00015963020014335438, "loss": 0.9953, "step": 400 }, { "epoch": 3.6986301369863015, "grad_norm": 2.015625, "learning_rate": 0.0001583372156597961, "loss": 0.9959, "step": 405 }, { "epoch": 3.7442922374429224, "grad_norm": 1.75, "learning_rate": 0.00015702927439445826, "loss": 0.9906, "step": 410 }, { "epoch": 3.7899543378995433, "grad_norm": 1.4453125, "learning_rate": 0.00015570671168374438, "loss": 0.9849, "step": 415 }, { "epoch": 3.8356164383561646, "grad_norm": 2.234375, "learning_rate": 0.00015436986661277577, "loss": 0.9697, "step": 420 }, { "epoch": 3.8812785388127855, "grad_norm": 1.5, "learning_rate": 0.0001530190819284555, "loss": 0.979, "step": 425 }, { "epoch": 3.9269406392694064, "grad_norm": 10.75, "learning_rate": 0.00015165470395159313, "loss": 0.9715, "step": 430 }, { "epoch": 3.9726027397260273, "grad_norm": 2.71875, "learning_rate": 0.0001502770824881133, "loss": 0.9864, "step": 435 }, { "epoch": 4.0, "eval_loss": 2.474334239959717, "eval_runtime": 0.2363, "eval_samples_per_second": 42.318, "eval_steps_per_second": 4.232, "step": 438 }, { "epoch": 4.018264840182648, "grad_norm": 1.3046875, "learning_rate": 0.00014888657073937076, "loss": 0.9764, "step": 440 }, { "epoch": 4.063926940639269, "grad_norm": 0.97265625, "learning_rate": 0.00014748352521159493, "loss": 0.9564, "step": 445 }, { "epoch": 4.109589041095891, "grad_norm": 0.7265625, "learning_rate": 0.0001460683056244869, "loss": 0.9573, "step": 450 }, { "epoch": 4.155251141552512, "grad_norm": 11.5625, "learning_rate": 0.00014464127481899312, "loss": 0.957, "step": 455 }, { "epoch": 4.200913242009133, "grad_norm": 0.91796875, "learning_rate": 0.00014320279866427796, "loss": 0.9596, "step": 460 }, { "epoch": 4.2465753424657535, "grad_norm": 2.03125, "learning_rate": 0.00014175324596392075, "loss": 0.9647, "step": 465 }, { "epoch": 4.292237442922374, "grad_norm": 1.359375, "learning_rate": 0.00014029298836135988, "loss": 0.9632, "step": 470 }, { "epoch": 4.337899543378995, "grad_norm": 5.09375, "learning_rate": 0.00013882240024460927, "loss": 0.9664, "step": 475 }, { "epoch": 4.383561643835616, "grad_norm": 2.96875, "learning_rate": 0.0001373418586502706, "loss": 0.964, "step": 480 }, { "epoch": 4.429223744292237, "grad_norm": 1.2421875, "learning_rate": 0.0001358517431668672, "loss": 0.9531, "step": 485 }, { "epoch": 4.474885844748858, "grad_norm": 1.046875, "learning_rate": 0.00013435243583752294, "loss": 0.958, "step": 490 }, { "epoch": 4.52054794520548, "grad_norm": 0.6953125, "learning_rate": 0.00013284432106201233, "loss": 0.9514, "step": 495 }, { "epoch": 4.566210045662101, "grad_norm": 0.7265625, "learning_rate": 0.00013132778549820618, "loss": 0.9588, "step": 500 }, { "epoch": 4.6118721461187215, "grad_norm": 0.93359375, "learning_rate": 0.00012980321796293836, "loss": 0.9494, "step": 505 }, { "epoch": 4.657534246575342, "grad_norm": 0.91796875, "learning_rate": 0.00012827100933231905, "loss": 0.9508, "step": 510 }, { "epoch": 4.703196347031963, "grad_norm": 1.0703125, "learning_rate": 0.00012673155244151985, "loss": 0.9557, "step": 515 }, { "epoch": 4.748858447488584, "grad_norm": 0.75, "learning_rate": 0.000125185241984057, "loss": 0.9508, "step": 520 }, { "epoch": 4.794520547945205, "grad_norm": 0.76953125, "learning_rate": 0.00012363247441059776, "loss": 0.9562, "step": 525 }, { "epoch": 4.840182648401827, "grad_norm": 1.078125, "learning_rate": 0.00012207364782731655, "loss": 0.9542, "step": 530 }, { "epoch": 4.885844748858448, "grad_norm": 2.1875, "learning_rate": 0.00012050916189382646, "loss": 0.9606, "step": 535 }, { "epoch": 4.931506849315069, "grad_norm": 1.84375, "learning_rate": 0.00011893941772071249, "loss": 0.9424, "step": 540 }, { "epoch": 4.9771689497716896, "grad_norm": 1.9375, "learning_rate": 0.00011736481776669306, "loss": 0.9371, "step": 545 }, { "epoch": 4.995433789954338, "eval_loss": 2.485384464263916, "eval_runtime": 0.2567, "eval_samples_per_second": 38.96, "eval_steps_per_second": 3.896, "step": 547 }, { "epoch": 5.0228310502283104, "grad_norm": 0.99609375, "learning_rate": 0.0001157857657354354, "loss": 0.9249, "step": 550 }, { "epoch": 5.068493150684931, "grad_norm": 2.53125, "learning_rate": 0.00011420266647205231, "loss": 0.9271, "step": 555 }, { "epoch": 5.114155251141552, "grad_norm": 12.0, "learning_rate": 0.00011261592585930576, "loss": 0.9329, "step": 560 }, { "epoch": 5.159817351598173, "grad_norm": 0.7890625, "learning_rate": 0.00011102595071354472, "loss": 0.9238, "step": 565 }, { "epoch": 5.205479452054795, "grad_norm": 6.3125, "learning_rate": 0.00010943314868040364, "loss": 0.9134, "step": 570 }, { "epoch": 5.251141552511416, "grad_norm": 0.69921875, "learning_rate": 0.00010783792813028827, "loss": 0.91, "step": 575 }, { "epoch": 5.296803652968037, "grad_norm": 4.375, "learning_rate": 0.00010624069805367559, "loss": 0.9193, "step": 580 }, { "epoch": 5.342465753424658, "grad_norm": 0.984375, "learning_rate": 0.00010464186795625482, "loss": 0.9101, "step": 585 }, { "epoch": 5.3881278538812785, "grad_norm": 3.1875, "learning_rate": 0.00010304184775393642, "loss": 0.9122, "step": 590 }, { "epoch": 5.433789954337899, "grad_norm": 0.90625, "learning_rate": 0.00010144104766775572, "loss": 0.9126, "step": 595 }, { "epoch": 5.47945205479452, "grad_norm": 1.171875, "learning_rate": 9.983987811869862e-05, "loss": 0.9177, "step": 600 }, { "epoch": 5.525114155251142, "grad_norm": 0.53515625, "learning_rate": 9.823874962247564e-05, "loss": 0.9089, "step": 605 }, { "epoch": 5.570776255707763, "grad_norm": 1.0078125, "learning_rate": 9.663807268427198e-05, "loss": 0.9112, "step": 610 }, { "epoch": 5.616438356164384, "grad_norm": 0.69140625, "learning_rate": 9.503825769350017e-05, "loss": 0.9142, "step": 615 }, { "epoch": 5.662100456621005, "grad_norm": 0.8359375, "learning_rate": 9.343971481858246e-05, "loss": 0.9068, "step": 620 }, { "epoch": 5.707762557077626, "grad_norm": 0.92578125, "learning_rate": 9.184285390178978e-05, "loss": 0.9134, "step": 625 }, { "epoch": 5.7534246575342465, "grad_norm": 1.890625, "learning_rate": 9.024808435416434e-05, "loss": 0.9106, "step": 630 }, { "epoch": 5.799086757990867, "grad_norm": 0.90234375, "learning_rate": 8.865581505055291e-05, "loss": 0.9108, "step": 635 }, { "epoch": 5.844748858447488, "grad_norm": 1.6015625, "learning_rate": 8.706645422477739e-05, "loss": 0.9027, "step": 640 }, { "epoch": 5.890410958904109, "grad_norm": 4.90625, "learning_rate": 8.548040936496989e-05, "loss": 0.9217, "step": 645 }, { "epoch": 5.936073059360731, "grad_norm": 7.09375, "learning_rate": 8.389808710909881e-05, "loss": 0.9227, "step": 650 }, { "epoch": 5.981735159817352, "grad_norm": 6.625, "learning_rate": 8.231989314071317e-05, "loss": 0.9157, "step": 655 }, { "epoch": 6.0, "eval_loss": 2.46421480178833, "eval_runtime": 0.2356, "eval_samples_per_second": 42.441, "eval_steps_per_second": 4.244, "step": 657 }, { "epoch": 6.027397260273973, "grad_norm": 0.7109375, "learning_rate": 8.07462320849313e-05, "loss": 0.902, "step": 660 }, { "epoch": 6.073059360730594, "grad_norm": 1.0703125, "learning_rate": 7.917750740470117e-05, "loss": 0.8855, "step": 665 }, { "epoch": 6.1187214611872145, "grad_norm": 1.25, "learning_rate": 7.761412129735852e-05, "loss": 0.9014, "step": 670 }, { "epoch": 6.164383561643835, "grad_norm": 0.984375, "learning_rate": 7.605647459150961e-05, "loss": 0.8863, "step": 675 }, { "epoch": 6.210045662100456, "grad_norm": 1.296875, "learning_rate": 7.450496664426477e-05, "loss": 0.8804, "step": 680 }, { "epoch": 6.255707762557078, "grad_norm": 1.1171875, "learning_rate": 7.295999523884921e-05, "loss": 0.8795, "step": 685 }, { "epoch": 6.301369863013699, "grad_norm": 0.890625, "learning_rate": 7.142195648261747e-05, "loss": 0.8855, "step": 690 }, { "epoch": 6.34703196347032, "grad_norm": 0.671875, "learning_rate": 6.989124470549745e-05, "loss": 0.8799, "step": 695 }, { "epoch": 6.392694063926941, "grad_norm": 0.796875, "learning_rate": 6.83682523588902e-05, "loss": 0.8731, "step": 700 }, { "epoch": 6.438356164383562, "grad_norm": 2.015625, "learning_rate": 6.685336991505122e-05, "loss": 0.8818, "step": 705 }, { "epoch": 6.4840182648401825, "grad_norm": 1.4140625, "learning_rate": 6.534698576697939e-05, "loss": 0.8792, "step": 710 }, { "epoch": 6.529680365296803, "grad_norm": 0.8125, "learning_rate": 6.384948612883873e-05, "loss": 0.8713, "step": 715 }, { "epoch": 6.575342465753424, "grad_norm": 0.53515625, "learning_rate": 6.2361254936939e-05, "loss": 0.8762, "step": 720 }, { "epoch": 6.621004566210045, "grad_norm": 0.515625, "learning_rate": 6.088267375130023e-05, "loss": 0.8708, "step": 725 }, { "epoch": 6.666666666666667, "grad_norm": 0.70703125, "learning_rate": 5.941412165782645e-05, "loss": 0.8797, "step": 730 }, { "epoch": 6.712328767123288, "grad_norm": 1.21875, "learning_rate": 5.79559751711138e-05, "loss": 0.8634, "step": 735 }, { "epoch": 6.757990867579909, "grad_norm": 0.81640625, "learning_rate": 5.650860813791785e-05, "loss": 0.872, "step": 740 }, { "epoch": 6.80365296803653, "grad_norm": 0.625, "learning_rate": 5.507239164130501e-05, "loss": 0.8661, "step": 745 }, { "epoch": 6.8493150684931505, "grad_norm": 0.828125, "learning_rate": 5.364769390551225e-05, "loss": 0.8744, "step": 750 }, { "epoch": 6.894977168949771, "grad_norm": 1.5234375, "learning_rate": 5.2234880201540284e-05, "loss": 0.8662, "step": 755 }, { "epoch": 6.940639269406392, "grad_norm": 0.94140625, "learning_rate": 5.0834312753503124e-05, "loss": 0.8764, "step": 760 }, { "epoch": 6.986301369863014, "grad_norm": 0.4609375, "learning_rate": 4.9446350645759885e-05, "loss": 0.8657, "step": 765 }, { "epoch": 6.995433789954338, "eval_loss": 2.5075883865356445, "eval_runtime": 0.2561, "eval_samples_per_second": 39.047, "eval_steps_per_second": 3.905, "step": 766 }, { "epoch": 7.031963470319635, "grad_norm": 0.408203125, "learning_rate": 4.807134973085036e-05, "loss": 0.8614, "step": 770 }, { "epoch": 7.077625570776256, "grad_norm": 0.56640625, "learning_rate": 4.6709662538260267e-05, "loss": 0.8457, "step": 775 }, { "epoch": 7.123287671232877, "grad_norm": 0.447265625, "learning_rate": 4.53616381840377e-05, "loss": 0.8502, "step": 780 }, { "epoch": 7.168949771689498, "grad_norm": 0.64453125, "learning_rate": 4.402762228128531e-05, "loss": 0.8536, "step": 785 }, { "epoch": 7.2146118721461185, "grad_norm": 0.78515625, "learning_rate": 4.2707956851550016e-05, "loss": 0.8531, "step": 790 }, { "epoch": 7.260273972602739, "grad_norm": 0.68359375, "learning_rate": 4.140298023713416e-05, "loss": 0.8609, "step": 795 }, { "epoch": 7.30593607305936, "grad_norm": 0.671875, "learning_rate": 4.011302701434937e-05, "loss": 0.8529, "step": 800 }, { "epoch": 7.351598173515982, "grad_norm": 0.96875, "learning_rate": 3.8838427907736476e-05, "loss": 0.8566, "step": 805 }, { "epoch": 7.397260273972603, "grad_norm": 0.6171875, "learning_rate": 3.757950970527249e-05, "loss": 0.8508, "step": 810 }, { "epoch": 7.442922374429224, "grad_norm": 0.435546875, "learning_rate": 3.633659517458736e-05, "loss": 0.8513, "step": 815 }, { "epoch": 7.488584474885845, "grad_norm": 0.466796875, "learning_rate": 3.5110002980210975e-05, "loss": 0.8499, "step": 820 }, { "epoch": 7.534246575342466, "grad_norm": 0.43359375, "learning_rate": 3.3900047601872596e-05, "loss": 0.8493, "step": 825 }, { "epoch": 7.579908675799087, "grad_norm": 0.6171875, "learning_rate": 3.270703925387279e-05, "loss": 0.851, "step": 830 }, { "epoch": 7.6255707762557075, "grad_norm": 0.5234375, "learning_rate": 3.153128380554941e-05, "loss": 0.8452, "step": 835 }, { "epoch": 7.671232876712329, "grad_norm": 0.43359375, "learning_rate": 3.037308270285709e-05, "loss": 0.862, "step": 840 }, { "epoch": 7.71689497716895, "grad_norm": 0.625, "learning_rate": 2.923273289108115e-05, "loss": 0.8487, "step": 845 }, { "epoch": 7.762557077625571, "grad_norm": 0.39453125, "learning_rate": 2.8110526738705344e-05, "loss": 0.8516, "step": 850 }, { "epoch": 7.808219178082192, "grad_norm": 0.546875, "learning_rate": 2.7006751962452882e-05, "loss": 0.8541, "step": 855 }, { "epoch": 7.853881278538813, "grad_norm": 0.419921875, "learning_rate": 2.592169155352031e-05, "loss": 0.8486, "step": 860 }, { "epoch": 7.899543378995434, "grad_norm": 0.86328125, "learning_rate": 2.485562370502279e-05, "loss": 0.8402, "step": 865 }, { "epoch": 7.945205479452055, "grad_norm": 0.412109375, "learning_rate": 2.3808821740669606e-05, "loss": 0.8474, "step": 870 }, { "epoch": 7.9908675799086755, "grad_norm": 0.40625, "learning_rate": 2.2781554044688015e-05, "loss": 0.8393, "step": 875 }, { "epoch": 8.0, "eval_loss": 2.515906810760498, "eval_runtime": 0.2401, "eval_samples_per_second": 41.645, "eval_steps_per_second": 4.164, "step": 876 }, { "epoch": 8.036529680365296, "grad_norm": 0.5390625, "learning_rate": 2.1774083993013718e-05, "loss": 0.8404, "step": 880 }, { "epoch": 8.082191780821917, "grad_norm": 0.5234375, "learning_rate": 2.078666988576504e-05, "loss": 0.837, "step": 885 }, { "epoch": 8.127853881278538, "grad_norm": 0.58203125, "learning_rate": 1.9819564881018983e-05, "loss": 0.8372, "step": 890 }, { "epoch": 8.173515981735159, "grad_norm": 0.486328125, "learning_rate": 1.887301692990494e-05, "loss": 0.846, "step": 895 }, { "epoch": 8.219178082191782, "grad_norm": 0.439453125, "learning_rate": 1.7947268713034127e-05, "loss": 0.8461, "step": 900 }, { "epoch": 8.264840182648403, "grad_norm": 0.4140625, "learning_rate": 1.7042557578279626e-05, "loss": 0.8373, "step": 905 }, { "epoch": 8.310502283105023, "grad_norm": 0.46484375, "learning_rate": 1.6159115479924257e-05, "loss": 0.8422, "step": 910 }, { "epoch": 8.356164383561644, "grad_norm": 0.4453125, "learning_rate": 1.529716891919074e-05, "loss": 0.8403, "step": 915 }, { "epoch": 8.401826484018265, "grad_norm": 0.419921875, "learning_rate": 1.4456938886170412e-05, "loss": 0.8343, "step": 920 }, { "epoch": 8.447488584474886, "grad_norm": 0.43359375, "learning_rate": 1.3638640803164516e-05, "loss": 0.8355, "step": 925 }, { "epoch": 8.493150684931507, "grad_norm": 0.41796875, "learning_rate": 1.2842484469453365e-05, "loss": 0.841, "step": 930 }, { "epoch": 8.538812785388128, "grad_norm": 0.455078125, "learning_rate": 1.2068674007506786e-05, "loss": 0.8396, "step": 935 }, { "epoch": 8.584474885844749, "grad_norm": 0.40625, "learning_rate": 1.1317407810650372e-05, "loss": 0.8377, "step": 940 }, { "epoch": 8.63013698630137, "grad_norm": 0.392578125, "learning_rate": 1.058887849220026e-05, "loss": 0.8348, "step": 945 }, { "epoch": 8.67579908675799, "grad_norm": 0.396484375, "learning_rate": 9.883272836080116e-06, "loss": 0.8388, "step": 950 }, { "epoch": 8.721461187214611, "grad_norm": 0.396484375, "learning_rate": 9.200771748932513e-06, "loss": 0.8366, "step": 955 }, { "epoch": 8.767123287671232, "grad_norm": 0.416015625, "learning_rate": 8.541550213737171e-06, "loss": 0.8436, "step": 960 }, { "epoch": 8.812785388127853, "grad_norm": 0.455078125, "learning_rate": 7.905777244947954e-06, "loss": 0.8409, "step": 965 }, { "epoch": 8.858447488584474, "grad_norm": 0.4140625, "learning_rate": 7.293615845160196e-06, "loss": 0.8377, "step": 970 }, { "epoch": 8.904109589041095, "grad_norm": 0.408203125, "learning_rate": 6.705222963319191e-06, "loss": 0.8425, "step": 975 }, { "epoch": 8.949771689497716, "grad_norm": 0.419921875, "learning_rate": 6.140749454480932e-06, "loss": 0.8371, "step": 980 }, { "epoch": 8.995433789954339, "grad_norm": 0.408203125, "learning_rate": 5.6003400411351325e-06, "loss": 0.8462, "step": 985 }, { "epoch": 8.995433789954339, "eval_loss": 2.518533706665039, "eval_runtime": 0.2546, "eval_samples_per_second": 39.282, "eval_steps_per_second": 3.928, "step": 985 }, { "epoch": 9.04109589041096, "grad_norm": 0.392578125, "learning_rate": 5.0841332761005e-06, "loss": 0.8404, "step": 990 }, { "epoch": 9.08675799086758, "grad_norm": 0.416015625, "learning_rate": 4.592261507001993e-06, "loss": 0.8303, "step": 995 }, { "epoch": 9.132420091324201, "grad_norm": 0.42578125, "learning_rate": 4.124850842338779e-06, "loss": 0.8325, "step": 1000 }, { "epoch": 9.178082191780822, "grad_norm": 0.4140625, "learning_rate": 3.6820211191520125e-06, "loss": 0.8353, "step": 1005 }, { "epoch": 9.223744292237443, "grad_norm": 0.419921875, "learning_rate": 3.263885872300343e-06, "loss": 0.8347, "step": 1010 }, { "epoch": 9.269406392694064, "grad_norm": 0.3984375, "learning_rate": 2.8705523053513816e-06, "loss": 0.8329, "step": 1015 }, { "epoch": 9.315068493150685, "grad_norm": 0.421875, "learning_rate": 2.502121263096224e-06, "loss": 0.8369, "step": 1020 }, { "epoch": 9.360730593607306, "grad_norm": 0.419921875, "learning_rate": 2.1586872056944428e-06, "loss": 0.8324, "step": 1025 }, { "epoch": 9.406392694063927, "grad_norm": 0.412109375, "learning_rate": 1.840338184455881e-06, "loss": 0.8383, "step": 1030 }, { "epoch": 9.452054794520548, "grad_norm": 0.400390625, "learning_rate": 1.5471558192656777e-06, "loss": 0.8315, "step": 1035 }, { "epoch": 9.497716894977168, "grad_norm": 0.435546875, "learning_rate": 1.2792152776580968e-06, "loss": 0.8437, "step": 1040 }, { "epoch": 9.54337899543379, "grad_norm": 0.419921875, "learning_rate": 1.036585255544764e-06, "loss": 0.8418, "step": 1045 }, { "epoch": 9.58904109589041, "grad_norm": 0.400390625, "learning_rate": 8.193279596020121e-07, "loss": 0.8346, "step": 1050 }, { "epoch": 9.634703196347033, "grad_norm": 0.40234375, "learning_rate": 6.274990913221035e-07, "loss": 0.8415, "step": 1055 }, { "epoch": 9.680365296803654, "grad_norm": 0.416015625, "learning_rate": 4.6114783273213393e-07, "loss": 0.8339, "step": 1060 }, { "epoch": 9.726027397260275, "grad_norm": 0.439453125, "learning_rate": 3.203168337845508e-07, "loss": 0.8331, "step": 1065 }, { "epoch": 9.771689497716896, "grad_norm": 0.400390625, "learning_rate": 2.05042201422323e-07, "loss": 0.8458, "step": 1070 }, { "epoch": 9.817351598173516, "grad_norm": 0.41015625, "learning_rate": 1.1535349032167908e-07, "loss": 0.8444, "step": 1075 }, { "epoch": 9.863013698630137, "grad_norm": 0.400390625, "learning_rate": 5.127369531473525e-08, "loss": 0.8486, "step": 1080 }, { "epoch": 9.908675799086758, "grad_norm": 0.39453125, "learning_rate": 1.2819245493955744e-08, "loss": 0.8473, "step": 1085 }, { "epoch": 9.954337899543379, "grad_norm": 0.439453125, "learning_rate": 0.0, "loss": 0.8359, "step": 1090 }, { "epoch": 9.954337899543379, "eval_loss": 2.515676259994507, "eval_runtime": 0.2343, "eval_samples_per_second": 42.683, "eval_steps_per_second": 4.268, "step": 1090 }, { "epoch": 9.954337899543379, "step": 1090, "total_flos": 3.327732991202951e+18, "train_loss": 2.136770288659892, "train_runtime": 2636.8816, "train_samples_per_second": 26.554, "train_steps_per_second": 0.413 } ], "logging_steps": 5, "max_steps": 1090, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 3.327732991202951e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }