diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,8682 +1,1642 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 15.0, + "epoch": 9.954337899543379, "eval_steps": 500, - "global_step": 6090, + "global_step": 1090, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0024630541871921183, - "grad_norm": 414.0, - "learning_rate": 3.284072249589491e-07, - "loss": 48.2481, + "epoch": 0.0091324200913242, + "grad_norm": 382.0, + "learning_rate": 1.8348623853211011e-06, + "loss": 46.9033, "step": 1 }, { - "epoch": 0.012315270935960592, - "grad_norm": 498.0, - "learning_rate": 1.6420361247947457e-06, - "loss": 47.2765, + "epoch": 0.045662100456621, + "grad_norm": 318.0, + "learning_rate": 9.174311926605506e-06, + "loss": 46.3618, "step": 5 }, { - "epoch": 0.024630541871921183, - "grad_norm": 346.0, - "learning_rate": 3.2840722495894914e-06, - "loss": 47.1113, + "epoch": 0.091324200913242, + "grad_norm": 139.0, + "learning_rate": 1.834862385321101e-05, + "loss": 39.3883, "step": 10 }, { - "epoch": 0.03694581280788178, - "grad_norm": 366.0, - "learning_rate": 4.926108374384237e-06, - "loss": 45.8102, + "epoch": 0.136986301369863, + "grad_norm": 52.25, + "learning_rate": 2.7522935779816515e-05, + "loss": 29.4216, "step": 15 }, { - "epoch": 0.04926108374384237, - "grad_norm": 358.0, - "learning_rate": 6.568144499178983e-06, - "loss": 40.5213, + "epoch": 0.182648401826484, + "grad_norm": 20.25, + "learning_rate": 3.669724770642202e-05, + "loss": 24.7169, "step": 20 }, { - "epoch": 0.06157635467980296, - "grad_norm": 288.0, - "learning_rate": 8.210180623973727e-06, - "loss": 35.9014, + "epoch": 0.228310502283105, + "grad_norm": 10.0, + "learning_rate": 4.587155963302753e-05, + "loss": 21.4244, "step": 25 }, { - "epoch": 0.07389162561576355, - "grad_norm": 198.0, - "learning_rate": 9.852216748768475e-06, - "loss": 31.5535, + "epoch": 0.273972602739726, + "grad_norm": 5.0, + "learning_rate": 5.504587155963303e-05, + "loss": 19.7804, "step": 30 }, { - "epoch": 0.08620689655172414, - "grad_norm": 71.0, - "learning_rate": 1.1494252873563218e-05, - "loss": 26.5323, + "epoch": 0.319634703196347, + "grad_norm": 4.6875, + "learning_rate": 6.422018348623854e-05, + "loss": 19.1075, "step": 35 }, { - "epoch": 0.09852216748768473, - "grad_norm": 50.0, - "learning_rate": 1.3136288998357965e-05, - "loss": 24.6833, + "epoch": 0.365296803652968, + "grad_norm": 8.5, + "learning_rate": 7.339449541284404e-05, + "loss": 18.037, "step": 40 }, { - "epoch": 0.11083743842364532, - "grad_norm": 34.75, - "learning_rate": 1.4778325123152711e-05, - "loss": 23.2877, + "epoch": 0.410958904109589, + "grad_norm": 18.75, + "learning_rate": 8.256880733944955e-05, + "loss": 17.2652, "step": 45 }, { - "epoch": 0.12315270935960591, - "grad_norm": 22.75, - "learning_rate": 1.6420361247947455e-05, - "loss": 22.3972, + "epoch": 0.45662100456621, + "grad_norm": 29.875, + "learning_rate": 9.174311926605506e-05, + "loss": 14.4775, "step": 50 }, { - "epoch": 0.1354679802955665, - "grad_norm": 16.25, - "learning_rate": 1.8062397372742202e-05, - "loss": 20.5332, + "epoch": 0.502283105022831, + "grad_norm": 37.75, + "learning_rate": 0.00010091743119266055, + "loss": 9.6302, "step": 55 }, { - "epoch": 0.1477832512315271, - "grad_norm": 8.6875, - "learning_rate": 1.970443349753695e-05, - "loss": 19.9415, + "epoch": 0.547945205479452, + "grad_norm": 8.75, + "learning_rate": 0.00011009174311926606, + 
"loss": 4.0499, "step": 60 }, { - "epoch": 0.16009852216748768, - "grad_norm": 7.09375, - "learning_rate": 2.1346469622331693e-05, - "loss": 19.4214, + "epoch": 0.593607305936073, + "grad_norm": 7.03125, + "learning_rate": 0.00011926605504587157, + "loss": 2.4784, "step": 65 }, { - "epoch": 0.1724137931034483, - "grad_norm": 8.25, - "learning_rate": 2.2988505747126437e-05, - "loss": 18.7164, + "epoch": 0.639269406392694, + "grad_norm": 2.546875, + "learning_rate": 0.00012844036697247707, + "loss": 2.0967, "step": 70 }, { - "epoch": 0.18472906403940886, - "grad_norm": 10.375, - "learning_rate": 2.4630541871921184e-05, - "loss": 18.4288, + "epoch": 0.684931506849315, + "grad_norm": 0.83984375, + "learning_rate": 0.00013761467889908258, + "loss": 1.8211, "step": 75 }, { - "epoch": 0.19704433497536947, - "grad_norm": 12.9375, - "learning_rate": 2.627257799671593e-05, - "loss": 17.7133, + "epoch": 0.730593607305936, + "grad_norm": 1.5546875, + "learning_rate": 0.0001467889908256881, + "loss": 1.6757, "step": 80 }, { - "epoch": 0.20935960591133004, - "grad_norm": 22.5, - "learning_rate": 2.7914614121510675e-05, - "loss": 17.1272, + "epoch": 0.776255707762557, + "grad_norm": 1.0546875, + "learning_rate": 0.0001559633027522936, + "loss": 1.5691, "step": 85 }, { - "epoch": 0.22167487684729065, - "grad_norm": 34.5, - "learning_rate": 2.9556650246305422e-05, - "loss": 16.1901, + "epoch": 0.821917808219178, + "grad_norm": 0.96484375, + "learning_rate": 0.0001651376146788991, + "loss": 1.4872, "step": 90 }, { - "epoch": 0.23399014778325122, - "grad_norm": 56.25, - "learning_rate": 3.119868637110017e-05, - "loss": 13.4454, + "epoch": 0.867579908675799, + "grad_norm": 0.90234375, + "learning_rate": 0.00017431192660550458, + "loss": 1.4228, "step": 95 }, { - "epoch": 0.24630541871921183, - "grad_norm": 68.0, - "learning_rate": 3.284072249589491e-05, - "loss": 10.339, + "epoch": 0.91324200913242, + "grad_norm": 2.546875, + "learning_rate": 0.00018348623853211012, + "loss": 1.3778, "step": 100 }, { - "epoch": 0.25862068965517243, - "grad_norm": 43.25, - "learning_rate": 3.4482758620689657e-05, - "loss": 5.5131, + "epoch": 0.958904109589041, + "grad_norm": 1.5390625, + "learning_rate": 0.0001926605504587156, + "loss": 1.3554, "step": 105 }, { - "epoch": 0.270935960591133, - "grad_norm": 21.25, - "learning_rate": 3.6124794745484404e-05, - "loss": 2.7823, + "epoch": 0.9954337899543378, + "eval_loss": 2.645094871520996, + "eval_runtime": 0.2786, + "eval_samples_per_second": 35.888, + "eval_steps_per_second": 3.589, + "step": 109 + }, + { + "epoch": 1.004566210045662, + "grad_norm": 0.9296875, + "learning_rate": 0.00019999948721966259, + "loss": 1.326, "step": 110 }, { - "epoch": 0.2832512315270936, - "grad_norm": 3.578125, - "learning_rate": 3.7766830870279144e-05, - "loss": 2.3041, + "epoch": 1.0502283105022832, + "grad_norm": 1.015625, + "learning_rate": 0.00019998154046002822, + "loss": 1.291, "step": 115 }, { - "epoch": 0.2955665024630542, - "grad_norm": 4.3125, - "learning_rate": 3.94088669950739e-05, - "loss": 2.0115, + "epoch": 1.095890410958904, + "grad_norm": 1.0625, + "learning_rate": 0.0001999379599421534, + "loss": 1.2721, "step": 120 }, { - "epoch": 0.3078817733990148, - "grad_norm": 2.296875, - "learning_rate": 4.105090311986864e-05, - "loss": 1.8402, + "epoch": 1.1415525114155252, + "grad_norm": 1.5546875, + "learning_rate": 0.00019986875683942535, + "loss": 1.2479, "step": 125 }, { - "epoch": 0.32019704433497537, - "grad_norm": 2.1875, - "learning_rate": 4.2692939244663386e-05, - "loss": 1.6992, 
+ "epoch": 1.187214611872146, + "grad_norm": 3.375, + "learning_rate": 0.00019977394889447524, + "loss": 1.2491, "step": 130 }, { - "epoch": 0.33251231527093594, - "grad_norm": 5.53125, - "learning_rate": 4.433497536945813e-05, - "loss": 1.5865, + "epoch": 1.2328767123287672, + "grad_norm": 1.1171875, + "learning_rate": 0.00019965356041462955, + "loss": 1.2212, "step": 135 }, { - "epoch": 0.3448275862068966, - "grad_norm": 2.40625, - "learning_rate": 4.597701149425287e-05, - "loss": 1.5333, + "epoch": 1.278538812785388, + "grad_norm": 1.546875, + "learning_rate": 0.00019950762226567781, + "loss": 1.2246, "step": 140 }, { - "epoch": 0.35714285714285715, - "grad_norm": 4.03125, - "learning_rate": 4.761904761904762e-05, - "loss": 1.4848, + "epoch": 1.3242009132420092, + "grad_norm": 1.984375, + "learning_rate": 0.00019933617186395917, + "loss": 1.2387, "step": 145 }, { - "epoch": 0.3694581280788177, - "grad_norm": 2.375, - "learning_rate": 4.926108374384237e-05, - "loss": 1.4199, + "epoch": 1.36986301369863, + "grad_norm": 2.46875, + "learning_rate": 0.00019913925316676945, + "loss": 1.203, "step": 150 }, { - "epoch": 0.3817733990147783, - "grad_norm": 4.3125, - "learning_rate": 5.0903119868637115e-05, - "loss": 1.3649, + "epoch": 1.4155251141552512, + "grad_norm": 0.93359375, + "learning_rate": 0.00019891691666109113, + "loss": 1.1869, "step": 155 }, { - "epoch": 0.39408866995073893, - "grad_norm": 20.875, - "learning_rate": 5.254515599343186e-05, - "loss": 1.333, + "epoch": 1.461187214611872, + "grad_norm": 2.28125, + "learning_rate": 0.00019866921935064906, + "loss": 1.1858, "step": 160 }, { - "epoch": 0.4064039408866995, - "grad_norm": 2.515625, - "learning_rate": 5.41871921182266e-05, - "loss": 1.2964, + "epoch": 1.5068493150684932, + "grad_norm": 1.296875, + "learning_rate": 0.00019839622474129596, + "loss": 1.1696, "step": 165 }, { - "epoch": 0.4187192118226601, - "grad_norm": 5.03125, - "learning_rate": 5.582922824302135e-05, - "loss": 1.2699, + "epoch": 1.5525114155251143, + "grad_norm": 0.81640625, + "learning_rate": 0.00019809800282473013, + "loss": 1.1624, "step": 170 }, { - "epoch": 0.43103448275862066, - "grad_norm": 2.59375, - "learning_rate": 5.747126436781609e-05, - "loss": 1.241, + "epoch": 1.5981735159817352, + "grad_norm": 1.734375, + "learning_rate": 0.0001977746300605507, + "loss": 1.1494, "step": 175 }, { - "epoch": 0.4433497536945813, - "grad_norm": 4.1875, - "learning_rate": 5.9113300492610844e-05, - "loss": 1.2204, + "epoch": 1.643835616438356, + "grad_norm": 1.203125, + "learning_rate": 0.00019742618935665476, + "loss": 1.1314, "step": 180 }, { - "epoch": 0.45566502463054187, - "grad_norm": 4.125, - "learning_rate": 6.075533661740559e-05, - "loss": 1.2022, + "epoch": 1.6894977168949772, + "grad_norm": 2.078125, + "learning_rate": 0.00019705277004798073, + "loss": 1.1407, "step": 185 }, { - "epoch": 0.46798029556650245, - "grad_norm": 5.03125, - "learning_rate": 6.239737274220034e-05, - "loss": 1.1749, + "epoch": 1.7351598173515983, + "grad_norm": 2.0625, + "learning_rate": 0.0001966544678736044, + "loss": 1.1287, "step": 190 }, { - "epoch": 0.4802955665024631, - "grad_norm": 1.6796875, - "learning_rate": 6.403940886699507e-05, - "loss": 1.1671, + "epoch": 1.7808219178082192, + "grad_norm": 0.91796875, + "learning_rate": 0.00019623138495219292, + "loss": 1.1407, "step": 195 }, { - "epoch": 0.49261083743842365, - "grad_norm": 3.25, - "learning_rate": 6.568144499178982e-05, - "loss": 1.1441, + "epoch": 1.82648401826484, + "grad_norm": 12.0, + "learning_rate": 
0.00019578362975582292, + "loss": 1.1151, "step": 200 }, { - "epoch": 0.5049261083743842, - "grad_norm": 5.96875, - "learning_rate": 6.732348111658457e-05, - "loss": 1.1313, + "epoch": 1.8721461187214612, + "grad_norm": 1.2890625, + "learning_rate": 0.00019531131708217005, + "loss": 1.1221, "step": 205 }, { - "epoch": 0.5172413793103449, - "grad_norm": 5.75, - "learning_rate": 6.896551724137931e-05, - "loss": 1.1001, + "epoch": 1.9178082191780823, + "grad_norm": 0.90625, + "learning_rate": 0.0001948145680250766, + "loss": 1.0982, "step": 210 }, { - "epoch": 0.5295566502463054, - "grad_norm": 1.78125, - "learning_rate": 7.060755336617406e-05, - "loss": 1.1061, + "epoch": 1.9634703196347032, + "grad_norm": 0.8984375, + "learning_rate": 0.00019429350994350483, + "loss": 1.0898, "step": 215 }, { - "epoch": 0.541871921182266, - "grad_norm": 1.6484375, - "learning_rate": 7.224958949096881e-05, - "loss": 1.0903, + "epoch": 2.0, + "eval_loss": 2.508340358734131, + "eval_runtime": 0.2456, + "eval_samples_per_second": 40.72, + "eval_steps_per_second": 4.072, + "step": 219 + }, + { + "epoch": 2.009132420091324, + "grad_norm": 2.59375, + "learning_rate": 0.00019374827642888398, + "loss": 1.1068, "step": 220 }, { - "epoch": 0.5541871921182266, - "grad_norm": 1.8515625, - "learning_rate": 7.389162561576355e-05, - "loss": 1.0691, + "epoch": 2.0547945205479454, + "grad_norm": 4.09375, + "learning_rate": 0.0001931790072708596, + "loss": 1.0932, "step": 225 }, { - "epoch": 0.5665024630541872, - "grad_norm": 1.53125, - "learning_rate": 7.553366174055829e-05, - "loss": 1.0464, + "epoch": 2.1004566210045663, + "grad_norm": 1.5, + "learning_rate": 0.00019258584842145343, + "loss": 1.1001, "step": 230 }, { - "epoch": 0.5788177339901478, - "grad_norm": 2.0, - "learning_rate": 7.717569786535304e-05, - "loss": 1.0537, + "epoch": 2.146118721461187, + "grad_norm": 11.25, + "learning_rate": 0.00019196895195764362, + "loss": 1.1001, "step": 235 }, { - "epoch": 0.5911330049261084, - "grad_norm": 1.8515625, - "learning_rate": 7.88177339901478e-05, - "loss": 1.0279, + "epoch": 2.191780821917808, + "grad_norm": 1.9375, + "learning_rate": 0.0001913284760423745, + "loss": 1.1046, "step": 240 }, { - "epoch": 0.603448275862069, - "grad_norm": 2.5625, - "learning_rate": 8.045977011494253e-05, - "loss": 1.0285, + "epoch": 2.237442922374429, + "grad_norm": 2.828125, + "learning_rate": 0.00019066458488400584, + "loss": 1.0795, "step": 245 }, { - "epoch": 0.6157635467980296, - "grad_norm": 10.125, - "learning_rate": 8.210180623973728e-05, - "loss": 1.043, + "epoch": 2.2831050228310503, + "grad_norm": 1.1953125, + "learning_rate": 0.00018997744869421246, + "loss": 1.0767, "step": 250 }, { - "epoch": 0.6280788177339901, - "grad_norm": 1.984375, - "learning_rate": 8.374384236453202e-05, - "loss": 1.0256, + "epoch": 2.328767123287671, + "grad_norm": 4.875, + "learning_rate": 0.00018926724364434446, + "loss": 1.059, "step": 255 }, { - "epoch": 0.6403940886699507, - "grad_norm": 3.328125, - "learning_rate": 8.538587848932677e-05, - "loss": 1.0294, + "epoch": 2.374429223744292, + "grad_norm": 3.171875, + "learning_rate": 0.0001885341518202595, + "loss": 1.0695, "step": 260 }, { - "epoch": 0.6527093596059114, - "grad_norm": 4.4375, - "learning_rate": 8.702791461412152e-05, - "loss": 1.0111, + "epoch": 2.4200913242009134, + "grad_norm": 0.734375, + "learning_rate": 0.00018777836117563892, + "loss": 1.0709, "step": 265 }, { - "epoch": 0.6650246305418719, - "grad_norm": 20.125, - "learning_rate": 8.866995073891627e-05, - "loss": 1.022, + 
"epoch": 2.4657534246575343, + "grad_norm": 7.59375, + "learning_rate": 0.00018700006548379898, + "loss": 1.0677, "step": 270 }, { - "epoch": 0.6773399014778325, - "grad_norm": 5.96875, - "learning_rate": 9.031198686371101e-05, - "loss": 1.0246, + "epoch": 2.5114155251141552, + "grad_norm": 0.984375, + "learning_rate": 0.0001861994642880105, + "loss": 1.0693, "step": 275 }, { - "epoch": 0.6896551724137931, - "grad_norm": 1.5625, - "learning_rate": 9.195402298850575e-05, - "loss": 1.0236, + "epoch": 2.557077625570776, + "grad_norm": 0.95703125, + "learning_rate": 0.00018537676285033887, + "loss": 1.0508, "step": 280 }, { - "epoch": 0.7019704433497537, - "grad_norm": 2.59375, - "learning_rate": 9.35960591133005e-05, - "loss": 1.0164, + "epoch": 2.602739726027397, + "grad_norm": 0.578125, + "learning_rate": 0.0001845321720990181, + "loss": 1.0449, "step": 285 }, { - "epoch": 0.7142857142857143, - "grad_norm": 2.640625, - "learning_rate": 9.523809523809524e-05, - "loss": 1.0113, + "epoch": 2.6484018264840183, + "grad_norm": 1.015625, + "learning_rate": 0.00018366590857437184, + "loss": 1.0562, "step": 290 }, { - "epoch": 0.7266009852216748, - "grad_norm": 1.140625, - "learning_rate": 9.688013136288999e-05, - "loss": 1.0018, + "epoch": 2.6940639269406392, + "grad_norm": 1.734375, + "learning_rate": 0.00018277819437329576, + "loss": 1.0428, "step": 295 }, { - "epoch": 0.7389162561576355, - "grad_norm": 1.09375, - "learning_rate": 9.852216748768474e-05, - "loss": 0.9837, + "epoch": 2.73972602739726, + "grad_norm": 1.53125, + "learning_rate": 0.00018186925709231532, + "loss": 1.0321, "step": 300 }, { - "epoch": 0.7512315270935961, - "grad_norm": 5.21875, - "learning_rate": 0.0001001642036124795, - "loss": 1.013, + "epoch": 2.7853881278538815, + "grad_norm": 1.0703125, + "learning_rate": 0.0001809393297692334, + "loss": 1.0253, "step": 305 }, { - "epoch": 0.7635467980295566, - "grad_norm": 2.15625, - "learning_rate": 0.00010180623973727423, - "loss": 1.0111, + "epoch": 2.8310502283105023, + "grad_norm": 3.1875, + "learning_rate": 0.0001799886508233829, + "loss": 1.0377, "step": 310 }, { - "epoch": 0.7758620689655172, - "grad_norm": 3.125, - "learning_rate": 0.00010344827586206898, - "loss": 0.997, + "epoch": 2.8767123287671232, + "grad_norm": 3.484375, + "learning_rate": 0.0001790174639944997, + "loss": 1.0359, "step": 315 }, { - "epoch": 0.7881773399014779, - "grad_norm": 1.3515625, - "learning_rate": 0.00010509031198686372, - "loss": 0.9858, + "epoch": 2.922374429223744, + "grad_norm": 6.09375, + "learning_rate": 0.00017802601828023138, + "loss": 1.0428, "step": 320 }, { - "epoch": 0.8004926108374384, - "grad_norm": 3.0, - "learning_rate": 0.00010673234811165847, - "loss": 0.9788, + "epoch": 2.968036529680365, + "grad_norm": 3.03125, + "learning_rate": 0.00017701456787229804, + "loss": 1.0434, "step": 325 }, { - "epoch": 0.812807881773399, - "grad_norm": 1.40625, - "learning_rate": 0.0001083743842364532, - "loss": 0.9844, + "epoch": 2.9954337899543377, + "eval_loss": 2.480058193206787, + "eval_runtime": 0.2581, + "eval_samples_per_second": 38.741, + "eval_steps_per_second": 3.874, + "step": 328 + }, + { + "epoch": 3.0136986301369864, + "grad_norm": 7.625, + "learning_rate": 0.0001759833720913214, + "loss": 1.0302, "step": 330 }, { - "epoch": 0.8251231527093597, - "grad_norm": 1.390625, - "learning_rate": 0.00011001642036124795, - "loss": 0.9594, + "epoch": 3.0593607305936072, + "grad_norm": 1.7890625, + "learning_rate": 0.00017493269532033883, + "loss": 1.0273, "step": 335 }, { - "epoch": 
0.8374384236453202, - "grad_norm": 2.578125, - "learning_rate": 0.0001116584564860427, - "loss": 0.9913, + "epoch": 3.105022831050228, + "grad_norm": 1.71875, + "learning_rate": 0.0001738628069370195, + "loss": 1.0212, "step": 340 }, { - "epoch": 0.8497536945812808, - "grad_norm": 1.9375, - "learning_rate": 0.00011330049261083743, - "loss": 0.9689, + "epoch": 3.1506849315068495, + "grad_norm": 1.3515625, + "learning_rate": 0.00017277398124460023, + "loss": 1.013, "step": 345 }, { - "epoch": 0.8620689655172413, - "grad_norm": 4.5625, - "learning_rate": 0.00011494252873563218, - "loss": 0.9669, + "epoch": 3.1963470319634704, + "grad_norm": 2.390625, + "learning_rate": 0.000171666497401558, + "loss": 1.0077, "step": 350 }, { - "epoch": 0.874384236453202, - "grad_norm": 2.171875, - "learning_rate": 0.00011658456486042693, - "loss": 0.9903, + "epoch": 3.2420091324200913, + "grad_norm": 0.921875, + "learning_rate": 0.0001705406393500381, + "loss": 1.0111, "step": 355 }, { - "epoch": 0.8866995073891626, - "grad_norm": 80.5, - "learning_rate": 0.00011822660098522169, - "loss": 0.9542, + "epoch": 3.287671232876712, + "grad_norm": 1.1875, + "learning_rate": 0.00016939669574305566, + "loss": 1.0047, "step": 360 }, { - "epoch": 0.8990147783251231, - "grad_norm": 5.8125, - "learning_rate": 0.00011986863711001643, - "loss": 0.9595, + "epoch": 3.3333333333333335, + "grad_norm": 0.97265625, + "learning_rate": 0.0001682349598704892, + "loss": 0.9977, "step": 365 }, { - "epoch": 0.9113300492610837, - "grad_norm": 3.671875, - "learning_rate": 0.00012151067323481118, - "loss": 0.952, + "epoch": 3.3789954337899544, + "grad_norm": 1.09375, + "learning_rate": 0.00016705572958388576, + "loss": 0.9914, "step": 370 }, { - "epoch": 0.9236453201970444, - "grad_norm": 1.9140625, - "learning_rate": 0.00012315270935960593, - "loss": 0.9749, + "epoch": 3.4246575342465753, + "grad_norm": 1.046875, + "learning_rate": 0.00016585930722009601, + "loss": 1.0012, "step": 375 }, { - "epoch": 0.9359605911330049, - "grad_norm": 1.78125, - "learning_rate": 0.00012479474548440068, - "loss": 0.9496, + "epoch": 3.470319634703196, + "grad_norm": 1.7421875, + "learning_rate": 0.00016464599952375998, + "loss": 0.9888, "step": 380 }, { - "epoch": 0.9482758620689655, - "grad_norm": 3.640625, - "learning_rate": 0.0001264367816091954, - "loss": 0.9454, + "epoch": 3.5159817351598175, + "grad_norm": 0.73828125, + "learning_rate": 0.000163416117568662, + "loss": 1.0036, "step": 385 }, { - "epoch": 0.9605911330049262, - "grad_norm": 1.0234375, - "learning_rate": 0.00012807881773399014, - "loss": 0.9526, + "epoch": 3.5616438356164384, + "grad_norm": 2.515625, + "learning_rate": 0.0001621699766779763, + "loss": 0.9963, "step": 390 }, { - "epoch": 0.9729064039408867, - "grad_norm": 3.703125, - "learning_rate": 0.0001297208538587849, - "loss": 0.9354, + "epoch": 3.6073059360730593, + "grad_norm": 1.203125, + "learning_rate": 0.00016090789634342278, + "loss": 0.9955, "step": 395 }, { - "epoch": 0.9852216748768473, - "grad_norm": 1.6328125, - "learning_rate": 0.00013136288998357964, - "loss": 0.9335, + "epoch": 3.65296803652968, + "grad_norm": 1.53125, + "learning_rate": 0.00015963020014335438, + "loss": 0.9953, "step": 400 }, { - "epoch": 0.9975369458128078, - "grad_norm": 1.046875, - "learning_rate": 0.00013300492610837438, - "loss": 0.9333, + "epoch": 3.6986301369863015, + "grad_norm": 2.015625, + "learning_rate": 0.0001583372156597961, + "loss": 0.9959, "step": 405 }, { - "epoch": 1.0, - "eval_loss": 2.444850206375122, - "eval_runtime": 2.0491, - 
"eval_samples_per_second": 4.88, - "eval_steps_per_second": 0.976, - "step": 406 - }, - { - "epoch": 1.0098522167487685, - "grad_norm": 1.2578125, - "learning_rate": 0.00013464696223316913, - "loss": 0.9413, + "epoch": 3.7442922374429224, + "grad_norm": 1.75, + "learning_rate": 0.00015702927439445826, + "loss": 0.9906, "step": 410 }, { - "epoch": 1.022167487684729, - "grad_norm": 2.765625, - "learning_rate": 0.00013628899835796388, - "loss": 0.9055, + "epoch": 3.7899543378995433, + "grad_norm": 1.4453125, + "learning_rate": 0.00015570671168374438, + "loss": 0.9849, "step": 415 }, { - "epoch": 1.0344827586206897, - "grad_norm": 5.1875, - "learning_rate": 0.00013793103448275863, - "loss": 0.9148, + "epoch": 3.8356164383561646, + "grad_norm": 2.234375, + "learning_rate": 0.00015436986661277577, + "loss": 0.9697, "step": 420 }, { - "epoch": 1.0467980295566504, - "grad_norm": 0.96484375, - "learning_rate": 0.00013957307060755337, - "loss": 0.8961, + "epoch": 3.8812785388127855, + "grad_norm": 1.5, + "learning_rate": 0.0001530190819284555, + "loss": 0.979, "step": 425 }, { - "epoch": 1.0591133004926108, - "grad_norm": 1.40625, - "learning_rate": 0.00014121510673234812, - "loss": 0.9097, + "epoch": 3.9269406392694064, + "grad_norm": 10.75, + "learning_rate": 0.00015165470395159313, + "loss": 0.9715, "step": 430 }, { - "epoch": 1.0714285714285714, - "grad_norm": 2.578125, - "learning_rate": 0.00014285714285714287, - "loss": 0.9046, + "epoch": 3.9726027397260273, + "grad_norm": 2.71875, + "learning_rate": 0.0001502770824881133, + "loss": 0.9864, "step": 435 }, { - "epoch": 1.083743842364532, - "grad_norm": 0.921875, - "learning_rate": 0.00014449917898193762, - "loss": 0.8812, + "epoch": 4.0, + "eval_loss": 2.474334239959717, + "eval_runtime": 0.2363, + "eval_samples_per_second": 42.318, + "eval_steps_per_second": 4.232, + "step": 438 + }, + { + "epoch": 4.018264840182648, + "grad_norm": 1.3046875, + "learning_rate": 0.00014888657073937076, + "loss": 0.9764, "step": 440 }, { - "epoch": 1.0960591133004927, - "grad_norm": 1.859375, - "learning_rate": 0.00014614121510673236, - "loss": 0.9016, + "epoch": 4.063926940639269, + "grad_norm": 0.97265625, + "learning_rate": 0.00014748352521159493, + "loss": 0.9564, "step": 445 }, { - "epoch": 1.1083743842364533, - "grad_norm": 0.8515625, - "learning_rate": 0.0001477832512315271, - "loss": 0.8977, + "epoch": 4.109589041095891, + "grad_norm": 0.7265625, + "learning_rate": 0.0001460683056244869, + "loss": 0.9573, "step": 450 }, { - "epoch": 1.1206896551724137, - "grad_norm": 1.0546875, - "learning_rate": 0.00014942528735632183, - "loss": 0.8917, + "epoch": 4.155251141552512, + "grad_norm": 11.5625, + "learning_rate": 0.00014464127481899312, + "loss": 0.957, "step": 455 }, { - "epoch": 1.1330049261083743, - "grad_norm": 2.125, - "learning_rate": 0.00015106732348111658, - "loss": 0.8861, + "epoch": 4.200913242009133, + "grad_norm": 0.91796875, + "learning_rate": 0.00014320279866427796, + "loss": 0.9596, "step": 460 }, { - "epoch": 1.145320197044335, - "grad_norm": 0.8203125, - "learning_rate": 0.00015270935960591132, - "loss": 0.8908, + "epoch": 4.2465753424657535, + "grad_norm": 2.03125, + "learning_rate": 0.00014175324596392075, + "loss": 0.9647, "step": 465 }, { - "epoch": 1.1576354679802956, - "grad_norm": 2.8125, - "learning_rate": 0.00015435139573070607, - "loss": 0.8853, + "epoch": 4.292237442922374, + "grad_norm": 1.359375, + "learning_rate": 0.00014029298836135988, + "loss": 0.9632, "step": 470 }, { - "epoch": 1.1699507389162562, - "grad_norm": 3.125, - 
"learning_rate": 0.00015599343185550085, - "loss": 0.8877, + "epoch": 4.337899543378995, + "grad_norm": 5.09375, + "learning_rate": 0.00013882240024460927, + "loss": 0.9664, "step": 475 }, { - "epoch": 1.1822660098522166, - "grad_norm": 2.328125, - "learning_rate": 0.0001576354679802956, - "loss": 0.8836, + "epoch": 4.383561643835616, + "grad_norm": 2.96875, + "learning_rate": 0.0001373418586502706, + "loss": 0.964, "step": 480 }, { - "epoch": 1.1945812807881773, - "grad_norm": 3.15625, - "learning_rate": 0.0001592775041050903, - "loss": 0.8755, + "epoch": 4.429223744292237, + "grad_norm": 1.2421875, + "learning_rate": 0.0001358517431668672, + "loss": 0.9531, "step": 485 }, { - "epoch": 1.206896551724138, - "grad_norm": 1.7890625, - "learning_rate": 0.00016091954022988506, - "loss": 0.8794, + "epoch": 4.474885844748858, + "grad_norm": 1.046875, + "learning_rate": 0.00013435243583752294, + "loss": 0.958, "step": 490 }, { - "epoch": 1.2192118226600985, - "grad_norm": 2.015625, - "learning_rate": 0.0001625615763546798, - "loss": 0.9029, + "epoch": 4.52054794520548, + "grad_norm": 0.6953125, + "learning_rate": 0.00013284432106201233, + "loss": 0.9514, "step": 495 }, { - "epoch": 1.2315270935960592, - "grad_norm": 1.9765625, - "learning_rate": 0.00016420361247947455, - "loss": 0.9035, + "epoch": 4.566210045662101, + "grad_norm": 0.7265625, + "learning_rate": 0.00013132778549820618, + "loss": 0.9588, "step": 500 }, { - "epoch": 1.2438423645320198, - "grad_norm": 7.96875, - "learning_rate": 0.0001658456486042693, - "loss": 0.9059, + "epoch": 4.6118721461187215, + "grad_norm": 0.93359375, + "learning_rate": 0.00012980321796293836, + "loss": 0.9494, "step": 505 }, { - "epoch": 1.2561576354679804, - "grad_norm": 8.25, - "learning_rate": 0.00016748768472906405, - "loss": 0.9183, + "epoch": 4.657534246575342, + "grad_norm": 0.91796875, + "learning_rate": 0.00012827100933231905, + "loss": 0.9508, "step": 510 }, { - "epoch": 1.2684729064039408, - "grad_norm": 2.90625, - "learning_rate": 0.0001691297208538588, - "loss": 0.9074, + "epoch": 4.703196347031963, + "grad_norm": 1.0703125, + "learning_rate": 0.00012673155244151985, + "loss": 0.9557, "step": 515 }, { - "epoch": 1.2807881773399015, - "grad_norm": 2.109375, - "learning_rate": 0.00017077175697865354, - "loss": 0.8946, + "epoch": 4.748858447488584, + "grad_norm": 0.75, + "learning_rate": 0.000125185241984057, + "loss": 0.9508, "step": 520 }, { - "epoch": 1.293103448275862, - "grad_norm": 5.03125, - "learning_rate": 0.00017241379310344826, - "loss": 0.8911, + "epoch": 4.794520547945205, + "grad_norm": 0.76953125, + "learning_rate": 0.00012363247441059776, + "loss": 0.9562, "step": 525 }, { - "epoch": 1.3054187192118227, - "grad_norm": 1.40625, - "learning_rate": 0.00017405582922824304, - "loss": 0.8732, + "epoch": 4.840182648401827, + "grad_norm": 1.078125, + "learning_rate": 0.00012207364782731655, + "loss": 0.9542, "step": 530 }, { - "epoch": 1.3177339901477834, - "grad_norm": 1.1796875, - "learning_rate": 0.00017569786535303778, - "loss": 0.8752, + "epoch": 4.885844748858448, + "grad_norm": 2.1875, + "learning_rate": 0.00012050916189382646, + "loss": 0.9606, "step": 535 }, { - "epoch": 1.3300492610837438, - "grad_norm": 1.546875, - "learning_rate": 0.00017733990147783253, - "loss": 0.8822, + "epoch": 4.931506849315069, + "grad_norm": 1.84375, + "learning_rate": 0.00011893941772071249, + "loss": 0.9424, "step": 540 }, { - "epoch": 1.3423645320197044, - "grad_norm": 1.0, - "learning_rate": 0.00017898193760262728, - "loss": 0.8715, + "epoch": 
4.9771689497716896, + "grad_norm": 1.9375, + "learning_rate": 0.00011736481776669306, + "loss": 0.9371, "step": 545 }, { - "epoch": 1.354679802955665, - "grad_norm": 0.91796875, - "learning_rate": 0.00018062397372742203, - "loss": 0.8754, + "epoch": 4.995433789954338, + "eval_loss": 2.485384464263916, + "eval_runtime": 0.2567, + "eval_samples_per_second": 38.96, + "eval_steps_per_second": 3.896, + "step": 547 + }, + { + "epoch": 5.0228310502283104, + "grad_norm": 0.99609375, + "learning_rate": 0.0001157857657354354, + "loss": 0.9249, "step": 550 }, { - "epoch": 1.3669950738916257, - "grad_norm": 1.171875, - "learning_rate": 0.00018226600985221675, - "loss": 0.8664, + "epoch": 5.068493150684931, + "grad_norm": 2.53125, + "learning_rate": 0.00011420266647205231, + "loss": 0.9271, "step": 555 }, { - "epoch": 1.3793103448275863, - "grad_norm": 1.0859375, - "learning_rate": 0.0001839080459770115, - "loss": 0.8896, + "epoch": 5.114155251141552, + "grad_norm": 12.0, + "learning_rate": 0.00011261592585930576, + "loss": 0.9329, "step": 560 }, { - "epoch": 1.3916256157635467, - "grad_norm": 1.21875, - "learning_rate": 0.00018555008210180624, - "loss": 0.8547, + "epoch": 5.159817351598173, + "grad_norm": 0.7890625, + "learning_rate": 0.00011102595071354472, + "loss": 0.9238, "step": 565 }, { - "epoch": 1.4039408866995073, - "grad_norm": 0.92578125, - "learning_rate": 0.000187192118226601, - "loss": 0.8671, + "epoch": 5.205479452054795, + "grad_norm": 6.3125, + "learning_rate": 0.00010943314868040364, + "loss": 0.9134, "step": 570 }, { - "epoch": 1.416256157635468, - "grad_norm": 1.109375, - "learning_rate": 0.00018883415435139573, - "loss": 0.8716, + "epoch": 5.251141552511416, + "grad_norm": 0.69921875, + "learning_rate": 0.00010783792813028827, + "loss": 0.91, "step": 575 }, { - "epoch": 1.4285714285714286, - "grad_norm": 2.125, - "learning_rate": 0.00019047619047619048, - "loss": 0.8814, + "epoch": 5.296803652968037, + "grad_norm": 4.375, + "learning_rate": 0.00010624069805367559, + "loss": 0.9193, "step": 580 }, { - "epoch": 1.4408866995073892, - "grad_norm": 2.609375, - "learning_rate": 0.00019211822660098523, - "loss": 0.8788, + "epoch": 5.342465753424658, + "grad_norm": 0.984375, + "learning_rate": 0.00010464186795625482, + "loss": 0.9101, "step": 585 }, { - "epoch": 1.4532019704433496, - "grad_norm": 1.515625, - "learning_rate": 0.00019376026272577998, - "loss": 0.8676, + "epoch": 5.3881278538812785, + "grad_norm": 3.1875, + "learning_rate": 0.00010304184775393642, + "loss": 0.9122, "step": 590 }, { - "epoch": 1.4655172413793103, - "grad_norm": 1.390625, - "learning_rate": 0.00019540229885057472, - "loss": 0.8878, + "epoch": 5.433789954337899, + "grad_norm": 0.90625, + "learning_rate": 0.00010144104766775572, + "loss": 0.9126, "step": 595 }, { - "epoch": 1.477832512315271, + "epoch": 5.47945205479452, "grad_norm": 1.171875, - "learning_rate": 0.00019704433497536947, - "loss": 0.8862, + "learning_rate": 9.983987811869862e-05, + "loss": 0.9177, "step": 600 }, { - "epoch": 1.4901477832512315, - "grad_norm": 1.0859375, - "learning_rate": 0.00019868637110016422, - "loss": 0.8774, + "epoch": 5.525114155251142, + "grad_norm": 0.53515625, + "learning_rate": 9.823874962247564e-05, + "loss": 0.9089, "step": 605 }, { - "epoch": 1.5024630541871922, - "grad_norm": 1.5703125, - "learning_rate": 0.00019999998357330727, - "loss": 0.8948, + "epoch": 5.570776255707763, + "grad_norm": 1.0078125, + "learning_rate": 9.663807268427198e-05, + "loss": 0.9112, "step": 610 }, { - "epoch": 1.5147783251231526, - 
"grad_norm": 5.6875, - "learning_rate": 0.00019999940863962815, - "loss": 0.8775, + "epoch": 5.616438356164384, + "grad_norm": 0.69140625, + "learning_rate": 9.503825769350017e-05, + "loss": 0.9142, "step": 615 }, { - "epoch": 1.5270935960591134, - "grad_norm": 0.92578125, - "learning_rate": 0.00019999801237670888, - "loss": 0.8641, + "epoch": 5.662100456621005, + "grad_norm": 0.8359375, + "learning_rate": 9.343971481858246e-05, + "loss": 0.9068, "step": 620 }, { - "epoch": 1.5394088669950738, - "grad_norm": 3.828125, - "learning_rate": 0.00019999579479601748, - "loss": 0.854, + "epoch": 5.707762557077626, + "grad_norm": 0.92578125, + "learning_rate": 9.184285390178978e-05, + "loss": 0.9134, "step": 625 }, { - "epoch": 1.5517241379310345, - "grad_norm": 1.9375, - "learning_rate": 0.00019999275591576766, - "loss": 0.8795, + "epoch": 5.7534246575342465, + "grad_norm": 1.890625, + "learning_rate": 9.024808435416434e-05, + "loss": 0.9106, "step": 630 }, { - "epoch": 1.564039408866995, - "grad_norm": 1.140625, - "learning_rate": 0.00019998889576091885, - "loss": 0.8771, + "epoch": 5.799086757990867, + "grad_norm": 0.90234375, + "learning_rate": 8.865581505055291e-05, + "loss": 0.9108, "step": 635 }, { - "epoch": 1.5763546798029555, - "grad_norm": 1.453125, - "learning_rate": 0.00019998421436317573, - "loss": 0.8707, + "epoch": 5.844748858447488, + "grad_norm": 1.6015625, + "learning_rate": 8.706645422477739e-05, + "loss": 0.9027, "step": 640 }, { - "epoch": 1.5886699507389164, - "grad_norm": 2.203125, - "learning_rate": 0.00019997871176098827, - "loss": 0.8698, + "epoch": 5.890410958904109, + "grad_norm": 4.90625, + "learning_rate": 8.548040936496989e-05, + "loss": 0.9217, "step": 645 }, { - "epoch": 1.6009852216748768, - "grad_norm": 0.7890625, - "learning_rate": 0.0001999723879995512, - "loss": 0.8751, + "epoch": 5.936073059360731, + "grad_norm": 7.09375, + "learning_rate": 8.389808710909881e-05, + "loss": 0.9227, "step": 650 }, { - "epoch": 1.6133004926108374, - "grad_norm": 1.5234375, - "learning_rate": 0.00019996524313080377, - "loss": 0.8577, + "epoch": 5.981735159817352, + "grad_norm": 6.625, + "learning_rate": 8.231989314071317e-05, + "loss": 0.9157, "step": 655 }, { - "epoch": 1.625615763546798, - "grad_norm": 1.15625, - "learning_rate": 0.00019995727721342914, - "loss": 0.8654, + "epoch": 6.0, + "eval_loss": 2.46421480178833, + "eval_runtime": 0.2356, + "eval_samples_per_second": 42.441, + "eval_steps_per_second": 4.244, + "step": 657 + }, + { + "epoch": 6.027397260273973, + "grad_norm": 0.7109375, + "learning_rate": 8.07462320849313e-05, + "loss": 0.902, "step": 660 }, { - "epoch": 1.6379310344827587, - "grad_norm": 1.515625, - "learning_rate": 0.00019994849031285415, - "loss": 0.8664, + "epoch": 6.073059360730594, + "grad_norm": 1.0703125, + "learning_rate": 7.917750740470117e-05, + "loss": 0.8855, "step": 665 }, { - "epoch": 1.6502463054187193, - "grad_norm": 3.328125, - "learning_rate": 0.00019993888250124866, - "loss": 0.8505, + "epoch": 6.1187214611872145, + "grad_norm": 1.25, + "learning_rate": 7.761412129735852e-05, + "loss": 0.9014, "step": 670 }, { - "epoch": 1.6625615763546797, - "grad_norm": 2.84375, - "learning_rate": 0.00019992845385752485, - "loss": 0.8557, + "epoch": 6.164383561643835, + "grad_norm": 0.984375, + "learning_rate": 7.605647459150961e-05, + "loss": 0.8863, "step": 675 }, { - "epoch": 1.6748768472906403, - "grad_norm": 1.6875, - "learning_rate": 0.0001999172044673367, - "loss": 0.8677, + "epoch": 6.210045662100456, + "grad_norm": 1.296875, + 
"learning_rate": 7.450496664426477e-05, + "loss": 0.8804, "step": 680 }, { - "epoch": 1.687192118226601, - "grad_norm": 1.703125, - "learning_rate": 0.00019990513442307933, - "loss": 0.8597, + "epoch": 6.255707762557078, + "grad_norm": 1.1171875, + "learning_rate": 7.295999523884921e-05, + "loss": 0.8795, "step": 685 }, { - "epoch": 1.6995073891625616, - "grad_norm": 2.421875, - "learning_rate": 0.00019989224382388813, - "loss": 0.8773, + "epoch": 6.301369863013699, + "grad_norm": 0.890625, + "learning_rate": 7.142195648261747e-05, + "loss": 0.8855, "step": 690 }, { - "epoch": 1.7118226600985222, - "grad_norm": 5.0625, - "learning_rate": 0.00019987853277563794, - "loss": 0.8669, + "epoch": 6.34703196347032, + "grad_norm": 0.671875, + "learning_rate": 6.989124470549745e-05, + "loss": 0.8799, "step": 695 }, { - "epoch": 1.7241379310344827, - "grad_norm": 1.53125, - "learning_rate": 0.00019986400139094236, - "loss": 0.8735, + "epoch": 6.392694063926941, + "grad_norm": 0.796875, + "learning_rate": 6.83682523588902e-05, + "loss": 0.8731, "step": 700 }, { - "epoch": 1.7364532019704435, - "grad_norm": 2.671875, - "learning_rate": 0.00019984864978915253, - "loss": 0.8679, + "epoch": 6.438356164383562, + "grad_norm": 2.015625, + "learning_rate": 6.685336991505122e-05, + "loss": 0.8818, "step": 705 }, { - "epoch": 1.748768472906404, - "grad_norm": 2.609375, - "learning_rate": 0.00019983247809635644, - "loss": 0.8775, + "epoch": 6.4840182648401825, + "grad_norm": 1.4140625, + "learning_rate": 6.534698576697939e-05, + "loss": 0.8792, "step": 710 }, { - "epoch": 1.7610837438423645, - "grad_norm": 1.671875, - "learning_rate": 0.00019981548644537766, - "loss": 0.8518, + "epoch": 6.529680365296803, + "grad_norm": 0.8125, + "learning_rate": 6.384948612883873e-05, + "loss": 0.8713, "step": 715 }, { - "epoch": 1.7733990147783252, - "grad_norm": 3.765625, - "learning_rate": 0.00019979767497577445, - "loss": 0.8451, + "epoch": 6.575342465753424, + "grad_norm": 0.53515625, + "learning_rate": 6.2361254936939e-05, + "loss": 0.8762, "step": 720 }, { - "epoch": 1.7857142857142856, - "grad_norm": 0.921875, - "learning_rate": 0.0001997790438338385, - "loss": 0.8589, + "epoch": 6.621004566210045, + "grad_norm": 0.515625, + "learning_rate": 6.088267375130023e-05, + "loss": 0.8708, "step": 725 }, { - "epoch": 1.7980295566502464, - "grad_norm": 2.375, - "learning_rate": 0.0001997595931725937, - "loss": 0.8545, + "epoch": 6.666666666666667, + "grad_norm": 0.70703125, + "learning_rate": 5.941412165782645e-05, + "loss": 0.8797, "step": 730 }, { - "epoch": 1.8103448275862069, - "grad_norm": 1.7265625, - "learning_rate": 0.000199739323151795, - "loss": 0.8611, + "epoch": 6.712328767123288, + "grad_norm": 1.21875, + "learning_rate": 5.79559751711138e-05, + "loss": 0.8634, "step": 735 }, { - "epoch": 1.8226600985221675, - "grad_norm": 2.90625, - "learning_rate": 0.00019971823393792693, - "loss": 0.8287, + "epoch": 6.757990867579909, + "grad_norm": 0.81640625, + "learning_rate": 5.650860813791785e-05, + "loss": 0.872, "step": 740 }, { - "epoch": 1.8349753694581281, - "grad_norm": 2.984375, - "learning_rate": 0.00019969632570420248, - "loss": 0.8651, + "epoch": 6.80365296803653, + "grad_norm": 0.625, + "learning_rate": 5.507239164130501e-05, + "loss": 0.8661, "step": 745 }, { - "epoch": 1.8472906403940885, - "grad_norm": 3.234375, - "learning_rate": 0.00019967359863056134, - "loss": 0.8459, + "epoch": 6.8493150684931505, + "grad_norm": 0.828125, + "learning_rate": 5.364769390551225e-05, + "loss": 0.8744, "step": 750 }, { - 
"epoch": 1.8596059113300494, - "grad_norm": 0.8203125, - "learning_rate": 0.0001996500529036688, - "loss": 0.8434, + "epoch": 6.894977168949771, + "grad_norm": 1.5234375, + "learning_rate": 5.2234880201540284e-05, + "loss": 0.8662, "step": 755 }, { - "epoch": 1.8719211822660098, - "grad_norm": 1.7734375, - "learning_rate": 0.0001996256887169139, - "loss": 0.85, + "epoch": 6.940639269406392, + "grad_norm": 0.94140625, + "learning_rate": 5.0834312753503124e-05, + "loss": 0.8764, "step": 760 }, { - "epoch": 1.8842364532019704, - "grad_norm": 2.953125, - "learning_rate": 0.00019960050627040806, - "loss": 0.8468, + "epoch": 6.986301369863014, + "grad_norm": 0.4609375, + "learning_rate": 4.9446350645759885e-05, + "loss": 0.8657, "step": 765 }, { - "epoch": 1.896551724137931, - "grad_norm": 2.0, - "learning_rate": 0.00019957450577098322, - "loss": 0.8825, + "epoch": 6.995433789954338, + "eval_loss": 2.5075883865356445, + "eval_runtime": 0.2561, + "eval_samples_per_second": 39.047, + "eval_steps_per_second": 3.905, + "step": 766 + }, + { + "epoch": 7.031963470319635, + "grad_norm": 0.408203125, + "learning_rate": 4.807134973085036e-05, + "loss": 0.8614, "step": 770 }, { - "epoch": 1.9088669950738915, - "grad_norm": 1.53125, - "learning_rate": 0.00019954768743219044, - "loss": 0.8443, + "epoch": 7.077625570776256, + "grad_norm": 0.56640625, + "learning_rate": 4.6709662538260267e-05, + "loss": 0.8457, "step": 775 }, { - "epoch": 1.9211822660098523, - "grad_norm": 1.6640625, - "learning_rate": 0.0001995200514742978, - "loss": 0.8399, + "epoch": 7.123287671232877, + "grad_norm": 0.447265625, + "learning_rate": 4.53616381840377e-05, + "loss": 0.8502, "step": 780 }, { - "epoch": 1.9334975369458127, - "grad_norm": 2.328125, - "learning_rate": 0.00019949159812428889, - "loss": 0.8544, + "epoch": 7.168949771689498, + "grad_norm": 0.64453125, + "learning_rate": 4.402762228128531e-05, + "loss": 0.8536, "step": 785 }, { - "epoch": 1.9458128078817734, - "grad_norm": 0.8125, - "learning_rate": 0.00019946232761586073, - "loss": 0.8334, + "epoch": 7.2146118721461185, + "grad_norm": 0.78515625, + "learning_rate": 4.2707956851550016e-05, + "loss": 0.8531, "step": 790 }, { - "epoch": 1.958128078817734, - "grad_norm": 0.984375, - "learning_rate": 0.0001994322401894221, - "loss": 0.8379, + "epoch": 7.260273972602739, + "grad_norm": 0.68359375, + "learning_rate": 4.140298023713416e-05, + "loss": 0.8609, "step": 795 }, { - "epoch": 1.9704433497536946, - "grad_norm": 1.0, - "learning_rate": 0.00019940133609209118, - "loss": 0.8502, + "epoch": 7.30593607305936, + "grad_norm": 0.671875, + "learning_rate": 4.011302701434937e-05, + "loss": 0.8529, "step": 800 }, { - "epoch": 1.9827586206896552, - "grad_norm": 1.46875, - "learning_rate": 0.00019936961557769385, - "loss": 0.8428, + "epoch": 7.351598173515982, + "grad_norm": 0.96875, + "learning_rate": 3.8838427907736476e-05, + "loss": 0.8566, "step": 805 }, { - "epoch": 1.9950738916256157, - "grad_norm": 2.140625, - "learning_rate": 0.00019933707890676158, - "loss": 0.8399, + "epoch": 7.397260273972603, + "grad_norm": 0.6171875, + "learning_rate": 3.757950970527249e-05, + "loss": 0.8508, "step": 810 }, { - "epoch": 2.0, - "eval_loss": 2.3512892723083496, - "eval_runtime": 2.0459, - "eval_samples_per_second": 4.888, - "eval_steps_per_second": 0.978, - "step": 812 - }, - { - "epoch": 2.0073891625615765, - "grad_norm": 11.5, - "learning_rate": 0.00019930372634652913, - "loss": 0.8116, + "epoch": 7.442922374429224, + "grad_norm": 0.435546875, + "learning_rate": 
3.633659517458736e-05, + "loss": 0.8513, "step": 815 }, { - "epoch": 2.019704433497537, - "grad_norm": 1.46875, - "learning_rate": 0.00019926955817093243, - "loss": 0.7747, + "epoch": 7.488584474885845, + "grad_norm": 0.466796875, + "learning_rate": 3.5110002980210975e-05, + "loss": 0.8499, "step": 820 }, { - "epoch": 2.0320197044334973, - "grad_norm": 1.078125, - "learning_rate": 0.00019923457466060636, - "loss": 0.7982, + "epoch": 7.534246575342466, + "grad_norm": 0.43359375, + "learning_rate": 3.3900047601872596e-05, + "loss": 0.8493, "step": 825 }, { - "epoch": 2.044334975369458, - "grad_norm": 1.5859375, - "learning_rate": 0.0001991987761028824, - "loss": 0.7727, + "epoch": 7.579908675799087, + "grad_norm": 0.6171875, + "learning_rate": 3.270703925387279e-05, + "loss": 0.851, "step": 830 }, { - "epoch": 2.0566502463054186, - "grad_norm": 1.953125, - "learning_rate": 0.0001991621627917864, - "loss": 0.7962, + "epoch": 7.6255707762557075, + "grad_norm": 0.5234375, + "learning_rate": 3.153128380554941e-05, + "loss": 0.8452, "step": 835 }, { - "epoch": 2.0689655172413794, - "grad_norm": 0.9765625, - "learning_rate": 0.00019912473502803582, - "loss": 0.783, + "epoch": 7.671232876712329, + "grad_norm": 0.43359375, + "learning_rate": 3.037308270285709e-05, + "loss": 0.862, "step": 840 }, { - "epoch": 2.08128078817734, - "grad_norm": 1.28125, - "learning_rate": 0.00019908649311903775, - "loss": 0.7814, + "epoch": 7.71689497716895, + "grad_norm": 0.625, + "learning_rate": 2.923273289108115e-05, + "loss": 0.8487, "step": 845 }, { - "epoch": 2.0935960591133007, - "grad_norm": 2.09375, - "learning_rate": 0.00019904743737888603, - "loss": 0.791, + "epoch": 7.762557077625571, + "grad_norm": 0.39453125, + "learning_rate": 2.8110526738705344e-05, + "loss": 0.8516, "step": 850 }, { - "epoch": 2.105911330049261, - "grad_norm": 1.3828125, - "learning_rate": 0.0001990075681283587, - "loss": 0.7831, + "epoch": 7.808219178082192, + "grad_norm": 0.546875, + "learning_rate": 2.7006751962452882e-05, + "loss": 0.8541, "step": 855 }, { - "epoch": 2.1182266009852215, - "grad_norm": 1.3984375, - "learning_rate": 0.00019896688569491557, - "loss": 0.778, + "epoch": 7.853881278538813, + "grad_norm": 0.419921875, + "learning_rate": 2.592169155352031e-05, + "loss": 0.8486, "step": 860 }, { - "epoch": 2.1305418719211824, - "grad_norm": 0.77734375, - "learning_rate": 0.00019892539041269533, - "loss": 0.7664, + "epoch": 7.899543378995434, + "grad_norm": 0.86328125, + "learning_rate": 2.485562370502279e-05, + "loss": 0.8402, "step": 865 }, { - "epoch": 2.142857142857143, - "grad_norm": 1.265625, - "learning_rate": 0.00019888308262251285, - "loss": 0.7849, + "epoch": 7.945205479452055, + "grad_norm": 0.412109375, + "learning_rate": 2.3808821740669606e-05, + "loss": 0.8474, "step": 870 }, { - "epoch": 2.1551724137931036, - "grad_norm": 1.4453125, - "learning_rate": 0.0001988399626718565, - "loss": 0.7774, + "epoch": 7.9908675799086755, + "grad_norm": 0.40625, + "learning_rate": 2.2781554044688015e-05, + "loss": 0.8393, "step": 875 }, { - "epoch": 2.167487684729064, - "grad_norm": 2.078125, - "learning_rate": 0.00019879603091488504, - "loss": 0.7803, + "epoch": 8.0, + "eval_loss": 2.515906810760498, + "eval_runtime": 0.2401, + "eval_samples_per_second": 41.645, + "eval_steps_per_second": 4.164, + "step": 876 + }, + { + "epoch": 8.036529680365296, + "grad_norm": 0.5390625, + "learning_rate": 2.1774083993013718e-05, + "loss": 0.8404, "step": 880 }, { - "epoch": 2.1798029556650245, - "grad_norm": 2.515625, - "learning_rate": 
0.00019875128771242506, - "loss": 0.7782, + "epoch": 8.082191780821917, + "grad_norm": 0.5234375, + "learning_rate": 2.078666988576504e-05, + "loss": 0.837, "step": 885 }, { - "epoch": 2.1921182266009853, - "grad_norm": 1.015625, - "learning_rate": 0.0001987057334319677, - "loss": 0.7883, + "epoch": 8.127853881278538, + "grad_norm": 0.58203125, + "learning_rate": 1.9819564881018983e-05, + "loss": 0.8372, "step": 890 }, { - "epoch": 2.2044334975369457, - "grad_norm": 1.140625, - "learning_rate": 0.0001986593684476658, - "loss": 0.7829, + "epoch": 8.173515981735159, + "grad_norm": 0.486328125, + "learning_rate": 1.887301692990494e-05, + "loss": 0.846, "step": 895 }, { - "epoch": 2.2167487684729066, - "grad_norm": 0.76171875, - "learning_rate": 0.00019861219314033077, - "loss": 0.78, + "epoch": 8.219178082191782, + "grad_norm": 0.439453125, + "learning_rate": 1.7947268713034127e-05, + "loss": 0.8461, "step": 900 }, { - "epoch": 2.229064039408867, - "grad_norm": 0.8125, - "learning_rate": 0.00019856420789742953, - "loss": 0.7865, + "epoch": 8.264840182648403, + "grad_norm": 0.4140625, + "learning_rate": 1.7042557578279626e-05, + "loss": 0.8373, "step": 905 }, { - "epoch": 2.2413793103448274, - "grad_norm": 1.0859375, - "learning_rate": 0.00019851541311308123, - "loss": 0.7731, + "epoch": 8.310502283105023, + "grad_norm": 0.46484375, + "learning_rate": 1.6159115479924257e-05, + "loss": 0.8422, "step": 910 }, { - "epoch": 2.2536945812807883, - "grad_norm": 1.109375, - "learning_rate": 0.000198465809188054, - "loss": 0.7705, + "epoch": 8.356164383561644, + "grad_norm": 0.4453125, + "learning_rate": 1.529716891919074e-05, + "loss": 0.8403, "step": 915 }, { - "epoch": 2.2660098522167487, - "grad_norm": 0.87890625, - "learning_rate": 0.00019841539652976192, - "loss": 0.7815, + "epoch": 8.401826484018265, + "grad_norm": 0.419921875, + "learning_rate": 1.4456938886170412e-05, + "loss": 0.8343, "step": 920 }, { - "epoch": 2.2783251231527095, - "grad_norm": 1.3984375, - "learning_rate": 0.00019836417555226129, - "loss": 0.781, + "epoch": 8.447488584474886, + "grad_norm": 0.43359375, + "learning_rate": 1.3638640803164516e-05, + "loss": 0.8355, "step": 925 }, { - "epoch": 2.29064039408867, - "grad_norm": 1.0703125, - "learning_rate": 0.0001983121466762474, - "loss": 0.7929, + "epoch": 8.493150684931507, + "grad_norm": 0.41796875, + "learning_rate": 1.2842484469453365e-05, + "loss": 0.841, "step": 930 }, { - "epoch": 2.302955665024631, - "grad_norm": 1.6328125, - "learning_rate": 0.0001982593103290512, - "loss": 0.7992, + "epoch": 8.538812785388128, + "grad_norm": 0.455078125, + "learning_rate": 1.2068674007506786e-05, + "loss": 0.8396, "step": 935 }, { - "epoch": 2.315270935960591, - "grad_norm": 1.2421875, - "learning_rate": 0.00019820566694463566, - "loss": 0.8023, + "epoch": 8.584474885844749, + "grad_norm": 0.40625, + "learning_rate": 1.1317407810650372e-05, + "loss": 0.8377, "step": 940 }, { - "epoch": 2.3275862068965516, - "grad_norm": 1.09375, - "learning_rate": 0.00019815121696359212, - "loss": 0.7887, + "epoch": 8.63013698630137, + "grad_norm": 0.392578125, + "learning_rate": 1.058887849220026e-05, + "loss": 0.8348, "step": 945 }, { - "epoch": 2.3399014778325125, - "grad_norm": 0.90625, - "learning_rate": 0.0001980959608331369, - "loss": 0.7807, + "epoch": 8.67579908675799, + "grad_norm": 0.396484375, + "learning_rate": 9.883272836080116e-06, + "loss": 0.8388, "step": 950 }, { - "epoch": 2.352216748768473, - "grad_norm": 0.6640625, - "learning_rate": 0.00019803989900710734, - "loss": 0.7934, + 
"epoch": 8.721461187214611, + "grad_norm": 0.396484375, + "learning_rate": 9.200771748932513e-06, + "loss": 0.8366, "step": 955 }, { - "epoch": 2.3645320197044333, - "grad_norm": 1.4453125, - "learning_rate": 0.00019798303194595846, - "loss": 0.7867, + "epoch": 8.767123287671232, + "grad_norm": 0.416015625, + "learning_rate": 8.541550213737171e-06, + "loss": 0.8436, "step": 960 }, { - "epoch": 2.376847290640394, - "grad_norm": 0.78515625, - "learning_rate": 0.0001979253601167588, - "loss": 0.7571, + "epoch": 8.812785388127853, + "grad_norm": 0.455078125, + "learning_rate": 7.905777244947954e-06, + "loss": 0.8409, "step": 965 }, { - "epoch": 2.3891625615763545, - "grad_norm": 1.640625, - "learning_rate": 0.00019786688399318664, - "loss": 0.7879, + "epoch": 8.858447488584474, + "grad_norm": 0.4140625, + "learning_rate": 7.293615845160196e-06, + "loss": 0.8377, "step": 970 }, { - "epoch": 2.4014778325123154, - "grad_norm": 1.9140625, - "learning_rate": 0.00019780760405552645, - "loss": 0.7975, + "epoch": 8.904109589041095, + "grad_norm": 0.408203125, + "learning_rate": 6.705222963319191e-06, + "loss": 0.8425, "step": 975 }, { - "epoch": 2.413793103448276, - "grad_norm": 1.1171875, - "learning_rate": 0.00019774752079066452, - "loss": 0.7924, + "epoch": 8.949771689497716, + "grad_norm": 0.419921875, + "learning_rate": 6.140749454480932e-06, + "loss": 0.8371, "step": 980 }, { - "epoch": 2.4261083743842367, - "grad_norm": 2.34375, - "learning_rate": 0.0001976866346920852, - "loss": 0.7733, + "epoch": 8.995433789954339, + "grad_norm": 0.408203125, + "learning_rate": 5.6003400411351325e-06, + "loss": 0.8462, + "step": 985 + }, + { + "epoch": 8.995433789954339, + "eval_loss": 2.518533706665039, + "eval_runtime": 0.2546, + "eval_samples_per_second": 39.282, + "eval_steps_per_second": 3.928, "step": 985 }, { - "epoch": 2.438423645320197, - "grad_norm": 1.03125, - "learning_rate": 0.00019762494625986677, - "loss": 0.7886, + "epoch": 9.04109589041096, + "grad_norm": 0.392578125, + "learning_rate": 5.0841332761005e-06, + "loss": 0.8404, "step": 990 }, { - "epoch": 2.4507389162561575, - "grad_norm": 0.9921875, - "learning_rate": 0.00019756245600067738, - "loss": 0.7751, + "epoch": 9.08675799086758, + "grad_norm": 0.416015625, + "learning_rate": 4.592261507001993e-06, + "loss": 0.8303, "step": 995 }, { - "epoch": 2.4630541871921183, - "grad_norm": 0.81640625, - "learning_rate": 0.00019749916442777078, - "loss": 0.7845, + "epoch": 9.132420091324201, + "grad_norm": 0.42578125, + "learning_rate": 4.124850842338779e-06, + "loss": 0.8325, "step": 1000 }, { - "epoch": 2.4753694581280787, - "grad_norm": 1.0234375, - "learning_rate": 0.00019743507206098233, - "loss": 0.7755, + "epoch": 9.178082191780822, + "grad_norm": 0.4140625, + "learning_rate": 3.6820211191520125e-06, + "loss": 0.8353, "step": 1005 }, { - "epoch": 2.4876847290640396, - "grad_norm": 0.87890625, - "learning_rate": 0.00019737017942672442, - "loss": 0.7751, + "epoch": 9.223744292237443, + "grad_norm": 0.419921875, + "learning_rate": 3.263885872300343e-06, + "loss": 0.8347, "step": 1010 }, { - "epoch": 2.5, - "grad_norm": 1.515625, - "learning_rate": 0.00019730448705798239, - "loss": 0.7804, + "epoch": 9.269406392694064, + "grad_norm": 0.3984375, + "learning_rate": 2.8705523053513816e-06, + "loss": 0.8329, "step": 1015 }, { - "epoch": 2.512315270935961, - "grad_norm": 1.140625, - "learning_rate": 0.00019723799549431007, - "loss": 0.7793, + "epoch": 9.315068493150685, + "grad_norm": 0.421875, + "learning_rate": 2.502121263096224e-06, + "loss": 
0.8369, "step": 1020 }, { - "epoch": 2.5246305418719213, - "grad_norm": 0.65234375, - "learning_rate": 0.00019717070528182538, - "loss": 0.7916, + "epoch": 9.360730593607306, + "grad_norm": 0.419921875, + "learning_rate": 2.1586872056944428e-06, + "loss": 0.8324, "step": 1025 }, { - "epoch": 2.5369458128078817, - "grad_norm": 1.0078125, - "learning_rate": 0.00019710261697320573, - "loss": 0.7927, + "epoch": 9.406392694063927, + "grad_norm": 0.412109375, + "learning_rate": 1.840338184455881e-06, + "loss": 0.8383, "step": 1030 }, { - "epoch": 2.5492610837438425, - "grad_norm": 1.0703125, - "learning_rate": 0.00019703373112768365, - "loss": 0.7879, + "epoch": 9.452054794520548, + "grad_norm": 0.400390625, + "learning_rate": 1.5471558192656777e-06, + "loss": 0.8315, "step": 1035 }, { - "epoch": 2.561576354679803, - "grad_norm": 0.88671875, - "learning_rate": 0.00019696404831104204, - "loss": 0.7921, + "epoch": 9.497716894977168, + "grad_norm": 0.435546875, + "learning_rate": 1.2792152776580968e-06, + "loss": 0.8437, "step": 1040 }, { - "epoch": 2.5738916256157633, - "grad_norm": 1.6953125, - "learning_rate": 0.00019689356909560965, - "loss": 0.7809, + "epoch": 9.54337899543379, + "grad_norm": 0.419921875, + "learning_rate": 1.036585255544764e-06, + "loss": 0.8418, "step": 1045 }, { - "epoch": 2.586206896551724, - "grad_norm": 1.546875, - "learning_rate": 0.00019682229406025635, - "loss": 0.8084, + "epoch": 9.58904109589041, + "grad_norm": 0.400390625, + "learning_rate": 8.193279596020121e-07, + "loss": 0.8346, "step": 1050 }, { - "epoch": 2.5985221674876846, - "grad_norm": 0.92578125, - "learning_rate": 0.00019675022379038822, - "loss": 0.7885, + "epoch": 9.634703196347033, + "grad_norm": 0.40234375, + "learning_rate": 6.274990913221035e-07, + "loss": 0.8415, "step": 1055 }, { - "epoch": 2.6108374384236455, - "grad_norm": 0.640625, - "learning_rate": 0.00019667735887794304, - "loss": 0.7935, + "epoch": 9.680365296803654, + "grad_norm": 0.416015625, + "learning_rate": 4.6114783273213393e-07, + "loss": 0.8339, "step": 1060 }, { - "epoch": 2.623152709359606, - "grad_norm": 1.0078125, - "learning_rate": 0.00019660369992138517, - "loss": 0.7833, + "epoch": 9.726027397260275, + "grad_norm": 0.439453125, + "learning_rate": 3.203168337845508e-07, + "loss": 0.8331, "step": 1065 }, { - "epoch": 2.6354679802955667, - "grad_norm": 0.72265625, - "learning_rate": 0.0001965292475257007, - "loss": 0.7838, + "epoch": 9.771689497716896, + "grad_norm": 0.400390625, + "learning_rate": 2.05042201422323e-07, + "loss": 0.8458, "step": 1070 }, { - "epoch": 2.647783251231527, - "grad_norm": 0.890625, - "learning_rate": 0.00019645400230239256, - "loss": 0.7751, + "epoch": 9.817351598173516, + "grad_norm": 0.41015625, + "learning_rate": 1.1535349032167908e-07, + "loss": 0.8444, "step": 1075 }, { - "epoch": 2.6600985221674875, - "grad_norm": 0.94921875, - "learning_rate": 0.0001963779648694754, - "loss": 0.781, + "epoch": 9.863013698630137, + "grad_norm": 0.400390625, + "learning_rate": 5.127369531473525e-08, + "loss": 0.8486, "step": 1080 }, { - "epoch": 2.6724137931034484, - "grad_norm": 1.046875, - "learning_rate": 0.00019630113585147063, - "loss": 0.7858, + "epoch": 9.908675799086758, + "grad_norm": 0.39453125, + "learning_rate": 1.2819245493955744e-08, + "loss": 0.8473, "step": 1085 }, { - "epoch": 2.684729064039409, - "grad_norm": 0.734375, - "learning_rate": 0.00019622351587940116, - "loss": 0.7882, + "epoch": 9.954337899543379, + "grad_norm": 0.439453125, + "learning_rate": 0.0, + "loss": 0.8359, "step": 1090 }, 
{ - "epoch": 2.697044334975369, - "grad_norm": 0.9453125, - "learning_rate": 0.00019614510559078625, - "loss": 0.7935, - "step": 1095 - }, - { - "epoch": 2.70935960591133, - "grad_norm": 1.2734375, - "learning_rate": 0.00019606590562963637, - "loss": 0.7805, - "step": 1100 - }, - { - "epoch": 2.7216748768472905, - "grad_norm": 1.0625, - "learning_rate": 0.00019598591664644782, - "loss": 0.7713, - "step": 1105 - }, - { - "epoch": 2.7339901477832513, - "grad_norm": 1.0546875, - "learning_rate": 0.00019590513929819734, - "loss": 0.7954, - "step": 1110 - }, - { - "epoch": 2.7463054187192117, - "grad_norm": 0.96484375, - "learning_rate": 0.0001958235742483369, - "loss": 0.7897, - "step": 1115 - }, - { - "epoch": 2.7586206896551726, - "grad_norm": 0.7890625, - "learning_rate": 0.00019574122216678799, - "loss": 0.7809, - "step": 1120 - }, - { - "epoch": 2.770935960591133, - "grad_norm": 1.171875, - "learning_rate": 0.0001956580837299364, - "loss": 0.7773, - "step": 1125 - }, - { - "epoch": 2.7832512315270934, - "grad_norm": 0.984375, - "learning_rate": 0.00019557415962062643, - "loss": 0.7958, - "step": 1130 - }, - { - "epoch": 2.7955665024630543, - "grad_norm": 1.078125, - "learning_rate": 0.0001954894505281554, - "loss": 0.7782, - "step": 1135 - }, - { - "epoch": 2.8078817733990147, - "grad_norm": 1.2890625, - "learning_rate": 0.000195403957148268, - "loss": 0.7704, - "step": 1140 - }, - { - "epoch": 2.8201970443349755, - "grad_norm": 0.703125, - "learning_rate": 0.0001953176801831505, - "loss": 0.7934, - "step": 1145 - }, - { - "epoch": 2.832512315270936, - "grad_norm": 0.96484375, - "learning_rate": 0.000195230620341425, - "loss": 0.7692, - "step": 1150 - }, - { - "epoch": 2.844827586206897, - "grad_norm": 0.72265625, - "learning_rate": 0.0001951427783381437, - "loss": 0.7775, - "step": 1155 - }, - { - "epoch": 2.857142857142857, - "grad_norm": 0.89453125, - "learning_rate": 0.0001950541548947829, - "loss": 0.7742, - "step": 1160 - }, - { - "epoch": 2.8694581280788176, - "grad_norm": 0.73828125, - "learning_rate": 0.0001949647507392372, - "loss": 0.7732, - "step": 1165 - }, - { - "epoch": 2.8817733990147785, - "grad_norm": 0.828125, - "learning_rate": 0.0001948745666058134, - "loss": 0.7918, - "step": 1170 - }, - { - "epoch": 2.894088669950739, - "grad_norm": 1.6875, - "learning_rate": 0.00019478360323522446, - "loss": 0.7684, - "step": 1175 - }, - { - "epoch": 2.9064039408866993, - "grad_norm": 0.9296875, - "learning_rate": 0.00019469186137458365, - "loss": 0.785, - "step": 1180 - }, - { - "epoch": 2.91871921182266, - "grad_norm": 1.1875, - "learning_rate": 0.00019459934177739813, - "loss": 0.7781, - "step": 1185 - }, - { - "epoch": 2.9310344827586206, - "grad_norm": 3.625, - "learning_rate": 0.0001945060452035629, - "loss": 0.7878, - "step": 1190 - }, - { - "epoch": 2.9433497536945814, - "grad_norm": 1.0703125, - "learning_rate": 0.0001944119724193545, - "loss": 0.7709, - "step": 1195 - }, - { - "epoch": 2.955665024630542, - "grad_norm": 1.609375, - "learning_rate": 0.00019431712419742484, - "loss": 0.7821, - "step": 1200 - }, - { - "epoch": 2.9679802955665027, - "grad_norm": 0.98828125, - "learning_rate": 0.00019422150131679467, - "loss": 0.7925, - "step": 1205 - }, - { - "epoch": 2.980295566502463, - "grad_norm": 1.0390625, - "learning_rate": 0.00019412510456284733, - "loss": 0.7867, - "step": 1210 - }, - { - "epoch": 2.9926108374384235, - "grad_norm": 1.2421875, - "learning_rate": 0.00019402793472732217, - "loss": 0.7898, - "step": 1215 - }, - { - "epoch": 3.0, - "eval_loss": 
2.431116819381714, - "eval_runtime": 2.0437, - "eval_samples_per_second": 4.893, - "eval_steps_per_second": 0.979, - "step": 1218 - }, - { - "epoch": 3.0049261083743843, - "grad_norm": 0.984375, - "learning_rate": 0.00019392999260830828, - "loss": 0.7424, - "step": 1220 - }, - { - "epoch": 3.0172413793103448, - "grad_norm": 1.0078125, - "learning_rate": 0.0001938312790102376, - "loss": 0.6782, - "step": 1225 - }, - { - "epoch": 3.0295566502463056, - "grad_norm": 0.6796875, - "learning_rate": 0.00019373179474387858, - "loss": 0.7011, - "step": 1230 - }, - { - "epoch": 3.041871921182266, - "grad_norm": 0.96875, - "learning_rate": 0.0001936315406263295, - "loss": 0.6979, - "step": 1235 - }, - { - "epoch": 3.0541871921182264, - "grad_norm": 0.91796875, - "learning_rate": 0.0001935305174810115, - "loss": 0.6945, - "step": 1240 - }, - { - "epoch": 3.0665024630541873, - "grad_norm": 1.359375, - "learning_rate": 0.0001934287261376622, - "loss": 0.7051, - "step": 1245 - }, - { - "epoch": 3.0788177339901477, - "grad_norm": 0.9375, - "learning_rate": 0.00019332616743232857, - "loss": 0.7047, - "step": 1250 - }, - { - "epoch": 3.0911330049261085, - "grad_norm": 1.2109375, - "learning_rate": 0.00019322284220736022, - "loss": 0.7062, - "step": 1255 - }, - { - "epoch": 3.103448275862069, - "grad_norm": 0.97265625, - "learning_rate": 0.00019311875131140246, - "loss": 0.6851, - "step": 1260 - }, - { - "epoch": 3.1157635467980294, - "grad_norm": 0.78125, - "learning_rate": 0.00019301389559938934, - "loss": 0.7069, - "step": 1265 - }, - { - "epoch": 3.12807881773399, - "grad_norm": 0.81640625, - "learning_rate": 0.00019290827593253655, - "loss": 0.6772, - "step": 1270 - }, - { - "epoch": 3.1403940886699506, - "grad_norm": 0.7109375, - "learning_rate": 0.00019280189317833445, - "loss": 0.6882, - "step": 1275 - }, - { - "epoch": 3.1527093596059115, - "grad_norm": 0.70703125, - "learning_rate": 0.00019269474821054084, - "loss": 0.6889, - "step": 1280 - }, - { - "epoch": 3.165024630541872, - "grad_norm": 0.80078125, - "learning_rate": 0.00019258684190917388, - "loss": 0.6993, - "step": 1285 - }, - { - "epoch": 3.1773399014778327, - "grad_norm": 0.94921875, - "learning_rate": 0.00019247817516050483, - "loss": 0.7013, - "step": 1290 - }, - { - "epoch": 3.189655172413793, - "grad_norm": 0.79296875, - "learning_rate": 0.00019236874885705075, - "loss": 0.6971, - "step": 1295 - }, - { - "epoch": 3.2019704433497536, - "grad_norm": 0.8359375, - "learning_rate": 0.00019225856389756718, - "loss": 0.7069, - "step": 1300 - }, - { - "epoch": 3.2142857142857144, - "grad_norm": 1.3984375, - "learning_rate": 0.00019214762118704076, - "loss": 0.7119, - "step": 1305 - }, - { - "epoch": 3.226600985221675, - "grad_norm": 0.78125, - "learning_rate": 0.00019203592163668184, - "loss": 0.7073, - "step": 1310 - }, - { - "epoch": 3.2389162561576357, - "grad_norm": 0.94140625, - "learning_rate": 0.0001919234661639168, - "loss": 0.7074, - "step": 1315 - }, - { - "epoch": 3.251231527093596, - "grad_norm": 0.65625, - "learning_rate": 0.0001918102556923809, - "loss": 0.7039, - "step": 1320 - }, - { - "epoch": 3.2635467980295565, - "grad_norm": 0.8671875, - "learning_rate": 0.00019169629115191027, - "loss": 0.7059, - "step": 1325 - }, - { - "epoch": 3.2758620689655173, - "grad_norm": 0.97265625, - "learning_rate": 0.0001915815734785346, - "loss": 0.7008, - "step": 1330 - }, - { - "epoch": 3.2881773399014778, - "grad_norm": 1.0859375, - "learning_rate": 0.0001914661036144692, - "loss": 0.7061, - "step": 1335 - }, - { - "epoch": 
3.3004926108374386, - "grad_norm": 1.6640625, - "learning_rate": 0.00019134988250810747, - "loss": 0.7122, - "step": 1340 - }, - { - "epoch": 3.312807881773399, - "grad_norm": 0.9140625, - "learning_rate": 0.00019123291111401299, - "loss": 0.7056, - "step": 1345 - }, - { - "epoch": 3.3251231527093594, - "grad_norm": 2.109375, - "learning_rate": 0.00019111519039291167, - "loss": 0.6945, - "step": 1350 - }, - { - "epoch": 3.3374384236453203, - "grad_norm": 1.140625, - "learning_rate": 0.00019099672131168397, - "loss": 0.7014, - "step": 1355 - }, - { - "epoch": 3.3497536945812807, - "grad_norm": 0.98046875, - "learning_rate": 0.0001908775048433568, - "loss": 0.7036, - "step": 1360 - }, - { - "epoch": 3.3620689655172415, - "grad_norm": 0.8359375, - "learning_rate": 0.00019075754196709572, - "loss": 0.6991, - "step": 1365 - }, - { - "epoch": 3.374384236453202, - "grad_norm": 0.74609375, - "learning_rate": 0.0001906368336681967, - "loss": 0.7058, - "step": 1370 - }, - { - "epoch": 3.386699507389163, - "grad_norm": 1.015625, - "learning_rate": 0.00019051538093807816, - "loss": 0.6984, - "step": 1375 - }, - { - "epoch": 3.399014778325123, - "grad_norm": 1.734375, - "learning_rate": 0.0001903931847742728, - "loss": 0.7033, - "step": 1380 - }, - { - "epoch": 3.4113300492610836, - "grad_norm": 0.78125, - "learning_rate": 0.00019027024618041937, - "loss": 0.7006, - "step": 1385 - }, - { - "epoch": 3.4236453201970445, - "grad_norm": 0.8671875, - "learning_rate": 0.00019014656616625448, - "loss": 0.7249, - "step": 1390 - }, - { - "epoch": 3.435960591133005, - "grad_norm": 0.98828125, - "learning_rate": 0.00019002214574760423, - "loss": 0.6966, - "step": 1395 - }, - { - "epoch": 3.4482758620689653, - "grad_norm": 1.296875, - "learning_rate": 0.0001898969859463759, - "loss": 0.6932, - "step": 1400 - }, - { - "epoch": 3.460591133004926, - "grad_norm": 1.2578125, - "learning_rate": 0.00018977108779054974, - "loss": 0.7123, - "step": 1405 - }, - { - "epoch": 3.4729064039408866, - "grad_norm": 1.1328125, - "learning_rate": 0.0001896444523141701, - "loss": 0.7085, - "step": 1410 - }, - { - "epoch": 3.4852216748768474, - "grad_norm": 0.828125, - "learning_rate": 0.0001895170805573374, - "loss": 0.7035, - "step": 1415 - }, - { - "epoch": 3.497536945812808, - "grad_norm": 3.046875, - "learning_rate": 0.00018938897356619928, - "loss": 0.7039, - "step": 1420 - }, - { - "epoch": 3.5098522167487687, - "grad_norm": 1.140625, - "learning_rate": 0.00018926013239294216, - "loss": 0.7166, - "step": 1425 - }, - { - "epoch": 3.522167487684729, - "grad_norm": 0.79296875, - "learning_rate": 0.00018913055809578253, - "loss": 0.7053, - "step": 1430 - }, - { - "epoch": 3.5344827586206895, - "grad_norm": 0.8515625, - "learning_rate": 0.00018900025173895822, - "loss": 0.699, - "step": 1435 - }, - { - "epoch": 3.5467980295566504, - "grad_norm": 0.69140625, - "learning_rate": 0.00018886921439271984, - "loss": 0.6875, - "step": 1440 - }, - { - "epoch": 3.5591133004926108, - "grad_norm": 1.0078125, - "learning_rate": 0.0001887374471333218, - "loss": 0.7138, - "step": 1445 - }, - { - "epoch": 3.571428571428571, - "grad_norm": 1.0703125, - "learning_rate": 0.00018860495104301345, - "loss": 0.7016, - "step": 1450 - }, - { - "epoch": 3.583743842364532, - "grad_norm": 0.73046875, - "learning_rate": 0.00018847172721003043, - "loss": 0.7158, - "step": 1455 - }, - { - "epoch": 3.596059113300493, - "grad_norm": 0.80078125, - "learning_rate": 0.00018833777672858543, - "loss": 0.7027, - "step": 1460 - }, - { - "epoch": 3.6083743842364533, - 
"grad_norm": 0.71484375, - "learning_rate": 0.0001882031006988595, - "loss": 0.7279, - "step": 1465 - }, - { - "epoch": 3.6206896551724137, - "grad_norm": 0.76171875, - "learning_rate": 0.00018806770022699278, - "loss": 0.7183, - "step": 1470 - }, - { - "epoch": 3.6330049261083746, - "grad_norm": 0.97265625, - "learning_rate": 0.00018793157642507552, - "loss": 0.7126, - "step": 1475 - }, - { - "epoch": 3.645320197044335, - "grad_norm": 1.3671875, - "learning_rate": 0.00018779473041113885, - "loss": 0.7076, - "step": 1480 - }, - { - "epoch": 3.6576354679802954, - "grad_norm": 1.546875, - "learning_rate": 0.0001876571633091458, - "loss": 0.7159, - "step": 1485 - }, - { - "epoch": 3.6699507389162562, - "grad_norm": 0.8984375, - "learning_rate": 0.00018751887624898195, - "loss": 0.7121, - "step": 1490 - }, - { - "epoch": 3.6822660098522166, - "grad_norm": 1.8828125, - "learning_rate": 0.00018737987036644596, - "loss": 0.7062, - "step": 1495 - }, - { - "epoch": 3.6945812807881775, - "grad_norm": 1.5703125, - "learning_rate": 0.00018724014680324057, - "loss": 0.706, - "step": 1500 - }, - { - "epoch": 3.706896551724138, - "grad_norm": 0.984375, - "learning_rate": 0.00018709970670696308, - "loss": 0.7015, - "step": 1505 - }, - { - "epoch": 3.7192118226600988, - "grad_norm": 1.1640625, - "learning_rate": 0.00018695855123109588, - "loss": 0.7154, - "step": 1510 - }, - { - "epoch": 3.731527093596059, - "grad_norm": 0.85546875, - "learning_rate": 0.00018681668153499697, - "loss": 0.7154, - "step": 1515 - }, - { - "epoch": 3.7438423645320196, - "grad_norm": 1.375, - "learning_rate": 0.00018667409878389064, - "loss": 0.7063, - "step": 1520 - }, - { - "epoch": 3.7561576354679804, - "grad_norm": 1.203125, - "learning_rate": 0.00018653080414885755, - "loss": 0.7167, - "step": 1525 - }, - { - "epoch": 3.768472906403941, - "grad_norm": 0.90234375, - "learning_rate": 0.00018638679880682543, - "loss": 0.7125, - "step": 1530 - }, - { - "epoch": 3.7807881773399012, - "grad_norm": 1.203125, - "learning_rate": 0.00018624208394055924, - "loss": 0.707, - "step": 1535 - }, - { - "epoch": 3.793103448275862, - "grad_norm": 1.15625, - "learning_rate": 0.00018609666073865158, - "loss": 0.7092, - "step": 1540 - }, - { - "epoch": 3.8054187192118225, - "grad_norm": 0.9765625, - "learning_rate": 0.00018595053039551274, - "loss": 0.7163, - "step": 1545 - }, - { - "epoch": 3.8177339901477834, - "grad_norm": 1.4609375, - "learning_rate": 0.00018580369411136104, - "loss": 0.7119, - "step": 1550 - }, - { - "epoch": 3.8300492610837438, - "grad_norm": 1.0546875, - "learning_rate": 0.00018565615309221295, - "loss": 0.7175, - "step": 1555 - }, - { - "epoch": 3.8423645320197046, - "grad_norm": 1.4609375, - "learning_rate": 0.00018550790854987323, - "loss": 0.7004, - "step": 1560 - }, - { - "epoch": 3.854679802955665, - "grad_norm": 1.0546875, - "learning_rate": 0.00018535896170192482, - "loss": 0.7061, - "step": 1565 - }, - { - "epoch": 3.8669950738916254, - "grad_norm": 0.8671875, - "learning_rate": 0.00018520931377171893, - "loss": 0.7183, - "step": 1570 - }, - { - "epoch": 3.8793103448275863, - "grad_norm": 1.2734375, - "learning_rate": 0.00018505896598836508, - "loss": 0.7094, - "step": 1575 - }, - { - "epoch": 3.8916256157635467, - "grad_norm": 2.015625, - "learning_rate": 0.00018490791958672084, - "loss": 0.7055, - "step": 1580 - }, - { - "epoch": 3.903940886699507, - "grad_norm": 0.91796875, - "learning_rate": 0.00018475617580738187, - "loss": 0.7137, - "step": 1585 - }, - { - "epoch": 3.916256157635468, - "grad_norm": 
0.83984375, - "learning_rate": 0.00018460373589667154, - "loss": 0.6981, - "step": 1590 - }, - { - "epoch": 3.928571428571429, - "grad_norm": 0.74609375, - "learning_rate": 0.0001844506011066308, - "loss": 0.7196, - "step": 1595 - }, - { - "epoch": 3.9408866995073892, - "grad_norm": 0.84765625, - "learning_rate": 0.0001842967726950079, - "loss": 0.7185, - "step": 1600 - }, - { - "epoch": 3.9532019704433496, - "grad_norm": 0.73828125, - "learning_rate": 0.00018414225192524806, - "loss": 0.713, - "step": 1605 - }, - { - "epoch": 3.9655172413793105, - "grad_norm": 1.296875, - "learning_rate": 0.00018398704006648302, - "loss": 0.7047, - "step": 1610 - }, - { - "epoch": 3.977832512315271, - "grad_norm": 0.83984375, - "learning_rate": 0.00018383113839352068, - "loss": 0.7008, - "step": 1615 - }, - { - "epoch": 3.9901477832512313, - "grad_norm": 0.87109375, - "learning_rate": 0.00018367454818683473, - "loss": 0.7295, - "step": 1620 - }, - { - "epoch": 4.0, - "eval_loss": 2.596930742263794, - "eval_runtime": 2.044, - "eval_samples_per_second": 4.892, - "eval_steps_per_second": 0.978, - "step": 1624 - }, - { - "epoch": 4.002463054187192, - "grad_norm": 0.640625, - "learning_rate": 0.0001835172707325538, - "loss": 0.6924, - "step": 1625 - }, - { - "epoch": 4.014778325123153, - "grad_norm": 0.9375, - "learning_rate": 0.00018335930732245136, - "loss": 0.6128, - "step": 1630 - }, - { - "epoch": 4.027093596059113, - "grad_norm": 0.76953125, - "learning_rate": 0.00018320065925393468, - "loss": 0.6085, - "step": 1635 - }, - { - "epoch": 4.039408866995074, - "grad_norm": 0.8203125, - "learning_rate": 0.00018304132783003452, - "loss": 0.612, - "step": 1640 - }, - { - "epoch": 4.051724137931035, - "grad_norm": 1.03125, - "learning_rate": 0.00018288131435939412, - "loss": 0.6157, - "step": 1645 - }, - { - "epoch": 4.064039408866995, - "grad_norm": 0.87109375, - "learning_rate": 0.00018272062015625872, - "loss": 0.6039, - "step": 1650 - }, - { - "epoch": 4.0763546798029555, - "grad_norm": 1.3203125, - "learning_rate": 0.00018255924654046458, - "loss": 0.6103, - "step": 1655 - }, - { - "epoch": 4.088669950738916, - "grad_norm": 0.98046875, - "learning_rate": 0.00018239719483742822, - "loss": 0.6045, - "step": 1660 - }, - { - "epoch": 4.100985221674877, - "grad_norm": 0.99609375, - "learning_rate": 0.0001822344663781356, - "loss": 0.6083, - "step": 1665 - }, - { - "epoch": 4.113300492610837, - "grad_norm": 1.4140625, - "learning_rate": 0.00018207106249913094, - "loss": 0.6176, - "step": 1670 - }, - { - "epoch": 4.125615763546798, - "grad_norm": 0.86328125, - "learning_rate": 0.00018190698454250605, - "loss": 0.622, - "step": 1675 - }, - { - "epoch": 4.137931034482759, - "grad_norm": 1.25, - "learning_rate": 0.00018174223385588917, - "loss": 0.6116, - "step": 1680 - }, - { - "epoch": 4.150246305418719, - "grad_norm": 1.15625, - "learning_rate": 0.00018157681179243386, - "loss": 0.6116, - "step": 1685 - }, - { - "epoch": 4.16256157635468, - "grad_norm": 1.1796875, - "learning_rate": 0.00018141071971080792, - "loss": 0.6245, - "step": 1690 - }, - { - "epoch": 4.174876847290641, - "grad_norm": 1.1015625, - "learning_rate": 0.00018124395897518224, - "loss": 0.6307, - "step": 1695 - }, - { - "epoch": 4.187192118226601, - "grad_norm": 1.078125, - "learning_rate": 0.00018107653095521958, - "loss": 0.6107, - "step": 1700 - }, - { - "epoch": 4.199507389162561, - "grad_norm": 0.78125, - "learning_rate": 0.00018090843702606337, - "loss": 0.6168, - "step": 1705 - }, - { - "epoch": 4.211822660098522, - "grad_norm": 1.2265625, 
- "learning_rate": 0.0001807396785683264, - "loss": 0.6196, - "step": 1710 - }, - { - "epoch": 4.224137931034483, - "grad_norm": 0.77734375, - "learning_rate": 0.0001805702569680794, - "loss": 0.6114, - "step": 1715 - }, - { - "epoch": 4.236453201970443, - "grad_norm": 0.9296875, - "learning_rate": 0.00018040017361683976, - "loss": 0.6232, - "step": 1720 - }, - { - "epoch": 4.248768472906404, - "grad_norm": 0.86328125, - "learning_rate": 0.00018022942991156, - "loss": 0.6153, - "step": 1725 - }, - { - "epoch": 4.261083743842365, - "grad_norm": 0.80078125, - "learning_rate": 0.00018005802725461643, - "loss": 0.6203, - "step": 1730 - }, - { - "epoch": 4.273399014778325, - "grad_norm": 1.0390625, - "learning_rate": 0.0001798859670537975, - "loss": 0.6221, - "step": 1735 - }, - { - "epoch": 4.285714285714286, - "grad_norm": 1.2890625, - "learning_rate": 0.00017971325072229226, - "loss": 0.6332, - "step": 1740 - }, - { - "epoch": 4.298029556650246, - "grad_norm": 0.875, - "learning_rate": 0.0001795398796786789, - "loss": 0.6262, - "step": 1745 - }, - { - "epoch": 4.310344827586207, - "grad_norm": 1.0625, - "learning_rate": 0.00017936585534691291, - "loss": 0.6301, - "step": 1750 - }, - { - "epoch": 4.322660098522167, - "grad_norm": 1.03125, - "learning_rate": 0.0001791911791563154, - "loss": 0.6247, - "step": 1755 - }, - { - "epoch": 4.334975369458128, - "grad_norm": 0.953125, - "learning_rate": 0.00017901585254156155, - "loss": 0.6376, - "step": 1760 - }, - { - "epoch": 4.347290640394089, - "grad_norm": 1.203125, - "learning_rate": 0.00017883987694266863, - "loss": 0.6311, - "step": 1765 - }, - { - "epoch": 4.359605911330049, - "grad_norm": 1.328125, - "learning_rate": 0.00017866325380498416, - "loss": 0.6346, - "step": 1770 - }, - { - "epoch": 4.37192118226601, - "grad_norm": 0.97265625, - "learning_rate": 0.0001784859845791743, - "loss": 0.6289, - "step": 1775 - }, - { - "epoch": 4.384236453201971, - "grad_norm": 1.234375, - "learning_rate": 0.00017830807072121156, - "loss": 0.625, - "step": 1780 - }, - { - "epoch": 4.396551724137931, - "grad_norm": 0.8984375, - "learning_rate": 0.00017812951369236316, - "loss": 0.6233, - "step": 1785 - }, - { - "epoch": 4.4088669950738915, - "grad_norm": 0.83984375, - "learning_rate": 0.00017795031495917884, - "loss": 0.642, - "step": 1790 - }, - { - "epoch": 4.421182266009852, - "grad_norm": 1.0078125, - "learning_rate": 0.00017777047599347893, - "loss": 0.6285, - "step": 1795 - }, - { - "epoch": 4.433497536945813, - "grad_norm": 0.90234375, - "learning_rate": 0.00017758999827234212, - "loss": 0.6342, - "step": 1800 - }, - { - "epoch": 4.445812807881773, - "grad_norm": 2.015625, - "learning_rate": 0.00017740888327809354, - "loss": 0.6302, - "step": 1805 - }, - { - "epoch": 4.458128078817734, - "grad_norm": 1.0703125, - "learning_rate": 0.00017722713249829236, - "loss": 0.6198, - "step": 1810 - }, - { - "epoch": 4.470443349753695, - "grad_norm": 3.390625, - "learning_rate": 0.00017704474742571969, - "loss": 0.6205, - "step": 1815 - }, - { - "epoch": 4.482758620689655, - "grad_norm": 1.4296875, - "learning_rate": 0.00017686172955836633, - "loss": 0.6286, - "step": 1820 - }, - { - "epoch": 4.495073891625616, - "grad_norm": 0.9140625, - "learning_rate": 0.00017667808039942043, - "loss": 0.635, - "step": 1825 - }, - { - "epoch": 4.5073891625615765, - "grad_norm": 0.81640625, - "learning_rate": 0.00017649380145725517, - "loss": 0.6237, - "step": 1830 - }, - { - "epoch": 4.519704433497537, - "grad_norm": 0.9765625, - "learning_rate": 0.0001763088942454163, - 
"loss": 0.6266, - "step": 1835 - }, - { - "epoch": 4.532019704433497, - "grad_norm": 1.6015625, - "learning_rate": 0.00017612336028260982, - "loss": 0.6381, - "step": 1840 - }, - { - "epoch": 4.544334975369458, - "grad_norm": 1.0859375, - "learning_rate": 0.00017593720109268944, - "loss": 0.6474, - "step": 1845 - }, - { - "epoch": 4.556650246305419, - "grad_norm": 0.82421875, - "learning_rate": 0.00017575041820464405, - "loss": 0.6381, - "step": 1850 - }, - { - "epoch": 4.568965517241379, - "grad_norm": 0.7734375, - "learning_rate": 0.00017556301315258517, - "loss": 0.6341, - "step": 1855 - }, - { - "epoch": 4.58128078817734, - "grad_norm": 0.9921875, - "learning_rate": 0.00017537498747573443, - "loss": 0.6398, - "step": 1860 - }, - { - "epoch": 4.593596059113301, - "grad_norm": 1.4453125, - "learning_rate": 0.00017518634271841083, - "loss": 0.6445, - "step": 1865 - }, - { - "epoch": 4.605911330049262, - "grad_norm": 0.85546875, - "learning_rate": 0.00017499708043001807, - "loss": 0.6446, - "step": 1870 - }, - { - "epoch": 4.6182266009852215, - "grad_norm": 0.8359375, - "learning_rate": 0.00017480720216503183, - "loss": 0.642, - "step": 1875 - }, - { - "epoch": 4.630541871921182, - "grad_norm": 0.8984375, - "learning_rate": 0.0001746167094829871, - "loss": 0.6425, - "step": 1880 - }, - { - "epoch": 4.642857142857143, - "grad_norm": 1.4765625, - "learning_rate": 0.00017442560394846516, - "loss": 0.6315, - "step": 1885 - }, - { - "epoch": 4.655172413793103, - "grad_norm": 0.8828125, - "learning_rate": 0.000174233887131081, - "loss": 0.6423, - "step": 1890 - }, - { - "epoch": 4.667487684729064, - "grad_norm": 0.734375, - "learning_rate": 0.00017404156060547016, - "loss": 0.6354, - "step": 1895 - }, - { - "epoch": 4.679802955665025, - "grad_norm": 0.76953125, - "learning_rate": 0.000173848625951276, - "loss": 0.6404, - "step": 1900 - }, - { - "epoch": 4.692118226600985, - "grad_norm": 0.72265625, - "learning_rate": 0.0001736550847531366, - "loss": 0.6432, - "step": 1905 - }, - { - "epoch": 4.704433497536946, - "grad_norm": 0.85546875, - "learning_rate": 0.00017346093860067186, - "loss": 0.6478, - "step": 1910 - }, - { - "epoch": 4.716748768472907, - "grad_norm": 0.96484375, - "learning_rate": 0.00017326618908847024, - "loss": 0.6551, - "step": 1915 - }, - { - "epoch": 4.7290640394088665, - "grad_norm": 0.83203125, - "learning_rate": 0.00017307083781607595, - "loss": 0.6346, - "step": 1920 - }, - { - "epoch": 4.741379310344827, - "grad_norm": 0.85546875, - "learning_rate": 0.00017287488638797563, - "loss": 0.6477, - "step": 1925 - }, - { - "epoch": 4.753694581280788, - "grad_norm": 0.7734375, - "learning_rate": 0.0001726783364135851, - "loss": 0.6516, - "step": 1930 - }, - { - "epoch": 4.766009852216749, - "grad_norm": 0.796875, - "learning_rate": 0.00017248118950723634, - "loss": 0.6469, - "step": 1935 - }, - { - "epoch": 4.778325123152709, - "grad_norm": 0.80078125, - "learning_rate": 0.00017228344728816413, - "loss": 0.6438, - "step": 1940 - }, - { - "epoch": 4.79064039408867, - "grad_norm": 0.94140625, - "learning_rate": 0.00017208511138049274, - "loss": 0.6484, - "step": 1945 - }, - { - "epoch": 4.802955665024631, - "grad_norm": 0.76953125, - "learning_rate": 0.00017188618341322254, - "loss": 0.6488, - "step": 1950 - }, - { - "epoch": 4.815270935960591, - "grad_norm": 1.0703125, - "learning_rate": 0.0001716866650202169, - "loss": 0.6311, - "step": 1955 - }, - { - "epoch": 4.827586206896552, - "grad_norm": 0.83984375, - "learning_rate": 0.00017148655784018829, - "loss": 0.6517, - "step": 
1960 - }, - { - "epoch": 4.8399014778325125, - "grad_norm": 0.8359375, - "learning_rate": 0.00017128586351668524, - "loss": 0.6483, - "step": 1965 - }, - { - "epoch": 4.852216748768473, - "grad_norm": 0.91015625, - "learning_rate": 0.00017108458369807864, - "loss": 0.6471, - "step": 1970 - }, - { - "epoch": 4.864532019704433, - "grad_norm": 0.81640625, - "learning_rate": 0.00017088272003754832, - "loss": 0.641, - "step": 1975 - }, - { - "epoch": 4.876847290640394, - "grad_norm": 0.89453125, - "learning_rate": 0.00017068027419306936, - "loss": 0.6464, - "step": 1980 - }, - { - "epoch": 4.889162561576355, - "grad_norm": 1.0859375, - "learning_rate": 0.00017047724782739846, - "loss": 0.6405, - "step": 1985 - }, - { - "epoch": 4.901477832512315, - "grad_norm": 0.78125, - "learning_rate": 0.0001702736426080604, - "loss": 0.6536, - "step": 1990 - }, - { - "epoch": 4.913793103448276, - "grad_norm": 1.2890625, - "learning_rate": 0.00017006946020733425, - "loss": 0.6359, - "step": 1995 - }, - { - "epoch": 4.926108374384237, - "grad_norm": 0.9921875, - "learning_rate": 0.00016986470230223973, - "loss": 0.6482, - "step": 2000 - }, - { - "epoch": 4.9384236453201975, - "grad_norm": 0.9453125, - "learning_rate": 0.00016965937057452333, - "loss": 0.6431, - "step": 2005 - }, - { - "epoch": 4.9507389162561575, - "grad_norm": 0.7421875, - "learning_rate": 0.00016945346671064452, - "loss": 0.6546, - "step": 2010 - }, - { - "epoch": 4.963054187192118, - "grad_norm": 1.078125, - "learning_rate": 0.00016924699240176194, - "loss": 0.6462, - "step": 2015 - }, - { - "epoch": 4.975369458128079, - "grad_norm": 1.1171875, - "learning_rate": 0.00016903994934371953, - "loss": 0.6374, - "step": 2020 - }, - { - "epoch": 4.987684729064039, - "grad_norm": 1.75, - "learning_rate": 0.00016883233923703248, - "loss": 0.6569, - "step": 2025 - }, - { - "epoch": 5.0, - "grad_norm": 0.73828125, - "learning_rate": 0.0001686241637868734, - "loss": 0.6377, - "step": 2030 - }, - { - "epoch": 5.0, - "eval_loss": 2.8600502014160156, - "eval_runtime": 2.042, - "eval_samples_per_second": 4.897, - "eval_steps_per_second": 0.979, - "step": 2030 - }, - { - "epoch": 5.012315270935961, - "grad_norm": 0.9296875, - "learning_rate": 0.00016841542470305817, - "loss": 0.5387, - "step": 2035 - }, - { - "epoch": 5.024630541871921, - "grad_norm": 1.046875, - "learning_rate": 0.00016820612370003221, - "loss": 0.5418, - "step": 2040 - }, - { - "epoch": 5.036945812807882, - "grad_norm": 0.80078125, - "learning_rate": 0.0001679962624968559, - "loss": 0.5381, - "step": 2045 - }, - { - "epoch": 5.0492610837438425, - "grad_norm": 0.87109375, - "learning_rate": 0.00016778584281719095, - "loss": 0.5311, - "step": 2050 - }, - { - "epoch": 5.061576354679803, - "grad_norm": 0.84765625, - "learning_rate": 0.00016757486638928587, - "loss": 0.5428, - "step": 2055 - }, - { - "epoch": 5.073891625615763, - "grad_norm": 1.0859375, - "learning_rate": 0.00016736333494596196, - "loss": 0.5433, - "step": 2060 - }, - { - "epoch": 5.086206896551724, - "grad_norm": 0.77734375, - "learning_rate": 0.00016715125022459922, - "loss": 0.5428, - "step": 2065 - }, - { - "epoch": 5.098522167487685, - "grad_norm": 0.828125, - "learning_rate": 0.00016693861396712168, - "loss": 0.5395, - "step": 2070 - }, - { - "epoch": 5.110837438423645, - "grad_norm": 0.86328125, - "learning_rate": 0.00016672542791998344, - "loss": 0.5527, - "step": 2075 - }, - { - "epoch": 5.123152709359606, - "grad_norm": 0.984375, - "learning_rate": 0.0001665116938341542, - "loss": 0.5529, - "step": 2080 - }, - { - 
"epoch": 5.135467980295567, - "grad_norm": 1.078125, - "learning_rate": 0.00016629741346510496, - "loss": 0.5512, - "step": 2085 - }, - { - "epoch": 5.147783251231527, - "grad_norm": 0.953125, - "learning_rate": 0.00016608258857279333, - "loss": 0.5472, - "step": 2090 - }, - { - "epoch": 5.1600985221674875, - "grad_norm": 0.93359375, - "learning_rate": 0.0001658672209216495, - "loss": 0.55, - "step": 2095 - }, - { - "epoch": 5.172413793103448, - "grad_norm": 0.875, - "learning_rate": 0.00016565131228056133, - "loss": 0.5459, - "step": 2100 - }, - { - "epoch": 5.184729064039409, - "grad_norm": 0.84765625, - "learning_rate": 0.0001654348644228602, - "loss": 0.5552, - "step": 2105 - }, - { - "epoch": 5.197044334975369, - "grad_norm": 0.87109375, - "learning_rate": 0.00016521787912630612, - "loss": 0.5465, - "step": 2110 - }, - { - "epoch": 5.20935960591133, - "grad_norm": 1.0078125, - "learning_rate": 0.00016500035817307334, - "loss": 0.5581, - "step": 2115 - }, - { - "epoch": 5.221674876847291, - "grad_norm": 0.78125, - "learning_rate": 0.00016478230334973556, - "loss": 0.5535, - "step": 2120 - }, - { - "epoch": 5.233990147783251, - "grad_norm": 0.8671875, - "learning_rate": 0.00016456371644725146, - "loss": 0.5604, - "step": 2125 - }, - { - "epoch": 5.246305418719212, - "grad_norm": 0.90234375, - "learning_rate": 0.0001643445992609498, - "loss": 0.5551, - "step": 2130 - }, - { - "epoch": 5.258620689655173, - "grad_norm": 0.94921875, - "learning_rate": 0.0001641249535905147, - "loss": 0.5443, - "step": 2135 - }, - { - "epoch": 5.2709359605911335, - "grad_norm": 0.87109375, - "learning_rate": 0.00016390478123997094, - "loss": 0.5544, - "step": 2140 - }, - { - "epoch": 5.283251231527093, - "grad_norm": 0.96484375, - "learning_rate": 0.00016368408401766916, - "loss": 0.571, - "step": 2145 - }, - { - "epoch": 5.295566502463054, - "grad_norm": 0.81640625, - "learning_rate": 0.0001634628637362709, - "loss": 0.5501, - "step": 2150 - }, - { - "epoch": 5.307881773399015, - "grad_norm": 0.89453125, - "learning_rate": 0.0001632411222127337, - "loss": 0.5537, - "step": 2155 - }, - { - "epoch": 5.320197044334975, - "grad_norm": 0.90234375, - "learning_rate": 0.0001630188612682963, - "loss": 0.563, - "step": 2160 - }, - { - "epoch": 5.332512315270936, - "grad_norm": 0.859375, - "learning_rate": 0.00016279608272846372, - "loss": 0.5663, - "step": 2165 - }, - { - "epoch": 5.344827586206897, - "grad_norm": 0.9140625, - "learning_rate": 0.00016257278842299197, - "loss": 0.5587, - "step": 2170 - }, - { - "epoch": 5.357142857142857, - "grad_norm": 0.94140625, - "learning_rate": 0.00016234898018587337, - "loss": 0.5627, - "step": 2175 - }, - { - "epoch": 5.369458128078818, - "grad_norm": 0.89453125, - "learning_rate": 0.00016212465985532124, - "loss": 0.5699, - "step": 2180 - }, - { - "epoch": 5.3817733990147785, - "grad_norm": 1.0546875, - "learning_rate": 0.000161899829273755, - "loss": 0.5686, - "step": 2185 - }, - { - "epoch": 5.394088669950739, - "grad_norm": 1.4296875, - "learning_rate": 0.00016167449028778484, - "loss": 0.5679, - "step": 2190 - }, - { - "epoch": 5.406403940886699, - "grad_norm": 0.7421875, - "learning_rate": 0.00016144864474819666, - "loss": 0.5715, - "step": 2195 - }, - { - "epoch": 5.41871921182266, - "grad_norm": 1.0546875, - "learning_rate": 0.0001612222945099369, - "loss": 0.5646, - "step": 2200 - }, - { - "epoch": 5.431034482758621, - "grad_norm": 0.89453125, - "learning_rate": 0.0001609954414320973, - "loss": 0.5586, - "step": 2205 - }, - { - "epoch": 5.443349753694581, - 
"grad_norm": 1.078125, - "learning_rate": 0.00016076808737789947, - "loss": 0.5675, - "step": 2210 - }, - { - "epoch": 5.455665024630542, - "grad_norm": 0.80078125, - "learning_rate": 0.00016054023421467983, - "loss": 0.5632, - "step": 2215 - }, - { - "epoch": 5.467980295566503, - "grad_norm": 0.97265625, - "learning_rate": 0.0001603118838138741, - "loss": 0.564, - "step": 2220 - }, - { - "epoch": 5.480295566502463, - "grad_norm": 0.875, - "learning_rate": 0.00016008303805100193, - "loss": 0.5735, - "step": 2225 - }, - { - "epoch": 5.4926108374384235, - "grad_norm": 0.90234375, - "learning_rate": 0.00015985369880565164, - "loss": 0.5583, - "step": 2230 - }, - { - "epoch": 5.504926108374384, - "grad_norm": 1.1015625, - "learning_rate": 0.00015962386796146462, - "loss": 0.5686, - "step": 2235 - }, - { - "epoch": 5.517241379310345, - "grad_norm": 0.96875, - "learning_rate": 0.00015939354740612, - "loss": 0.571, - "step": 2240 - }, - { - "epoch": 5.529556650246305, - "grad_norm": 1.375, - "learning_rate": 0.000159162739031319, - "loss": 0.5735, - "step": 2245 - }, - { - "epoch": 5.541871921182266, - "grad_norm": 1.3125, - "learning_rate": 0.00015893144473276953, - "loss": 0.5691, - "step": 2250 - }, - { - "epoch": 5.554187192118227, - "grad_norm": 0.87890625, - "learning_rate": 0.0001586996664101705, - "loss": 0.5663, - "step": 2255 - }, - { - "epoch": 5.566502463054187, - "grad_norm": 1.3671875, - "learning_rate": 0.00015846740596719636, - "loss": 0.5684, - "step": 2260 - }, - { - "epoch": 5.578817733990148, - "grad_norm": 1.0703125, - "learning_rate": 0.00015823466531148124, - "loss": 0.575, - "step": 2265 - }, - { - "epoch": 5.5911330049261085, - "grad_norm": 0.94921875, - "learning_rate": 0.00015800144635460354, - "loss": 0.5814, - "step": 2270 - }, - { - "epoch": 5.603448275862069, - "grad_norm": 0.8125, - "learning_rate": 0.0001577677510120701, - "loss": 0.5778, - "step": 2275 - }, - { - "epoch": 5.615763546798029, - "grad_norm": 1.109375, - "learning_rate": 0.00015753358120330042, - "loss": 0.5688, - "step": 2280 - }, - { - "epoch": 5.62807881773399, - "grad_norm": 1.0859375, - "learning_rate": 0.00015729893885161098, - "loss": 0.5771, - "step": 2285 - }, - { - "epoch": 5.640394088669951, - "grad_norm": 0.87890625, - "learning_rate": 0.00015706382588419945, - "loss": 0.5723, - "step": 2290 - }, - { - "epoch": 5.652709359605911, - "grad_norm": 1.015625, - "learning_rate": 0.00015682824423212877, - "loss": 0.5782, - "step": 2295 - }, - { - "epoch": 5.665024630541872, - "grad_norm": 1.0234375, - "learning_rate": 0.0001565921958303114, - "loss": 0.5776, - "step": 2300 - }, - { - "epoch": 5.677339901477833, - "grad_norm": 0.85546875, - "learning_rate": 0.00015635568261749332, - "loss": 0.5721, - "step": 2305 - }, - { - "epoch": 5.689655172413794, - "grad_norm": 0.8515625, - "learning_rate": 0.00015611870653623825, - "loss": 0.5643, - "step": 2310 - }, - { - "epoch": 5.701970443349754, - "grad_norm": 0.90625, - "learning_rate": 0.0001558812695329115, - "loss": 0.574, - "step": 2315 - }, - { - "epoch": 5.714285714285714, - "grad_norm": 0.88671875, - "learning_rate": 0.00015564337355766412, - "loss": 0.5795, - "step": 2320 - }, - { - "epoch": 5.726600985221674, - "grad_norm": 1.0234375, - "learning_rate": 0.00015540502056441688, - "loss": 0.5779, - "step": 2325 - }, - { - "epoch": 5.738916256157635, - "grad_norm": 0.953125, - "learning_rate": 0.00015516621251084422, - "loss": 0.5754, - "step": 2330 - }, - { - "epoch": 5.751231527093596, - "grad_norm": 1.4921875, - "learning_rate": 
0.00015492695135835811, - "loss": 0.5769, - "step": 2335 - }, - { - "epoch": 5.763546798029557, - "grad_norm": 1.03125, - "learning_rate": 0.00015468723907209193, - "loss": 0.5842, - "step": 2340 - }, - { - "epoch": 5.775862068965517, - "grad_norm": 0.96875, - "learning_rate": 0.00015444707762088443, - "loss": 0.5806, - "step": 2345 - }, - { - "epoch": 5.788177339901478, - "grad_norm": 0.81640625, - "learning_rate": 0.0001542064689772636, - "loss": 0.5817, - "step": 2350 - }, - { - "epoch": 5.800492610837439, - "grad_norm": 0.97265625, - "learning_rate": 0.00015396541511743012, - "loss": 0.5767, - "step": 2355 - }, - { - "epoch": 5.812807881773399, - "grad_norm": 1.0546875, - "learning_rate": 0.00015372391802124163, - "loss": 0.5726, - "step": 2360 - }, - { - "epoch": 5.825123152709359, - "grad_norm": 0.83203125, - "learning_rate": 0.00015348197967219606, - "loss": 0.5731, - "step": 2365 - }, - { - "epoch": 5.83743842364532, - "grad_norm": 0.8515625, - "learning_rate": 0.00015323960205741561, - "loss": 0.5873, - "step": 2370 - }, - { - "epoch": 5.849753694581281, - "grad_norm": 1.0, - "learning_rate": 0.00015299678716763028, - "loss": 0.5801, - "step": 2375 - }, - { - "epoch": 5.862068965517241, - "grad_norm": 2.0625, - "learning_rate": 0.00015275353699716155, - "loss": 0.5826, - "step": 2380 - }, - { - "epoch": 5.874384236453202, - "grad_norm": 0.9140625, - "learning_rate": 0.00015250985354390596, - "loss": 0.5893, - "step": 2385 - }, - { - "epoch": 5.886699507389163, - "grad_norm": 0.859375, - "learning_rate": 0.00015226573880931888, - "loss": 0.5772, - "step": 2390 - }, - { - "epoch": 5.899014778325123, - "grad_norm": 1.0546875, - "learning_rate": 0.0001520211947983978, - "loss": 0.5838, - "step": 2395 - }, - { - "epoch": 5.911330049261084, - "grad_norm": 0.7890625, - "learning_rate": 0.0001517762235196661, - "loss": 0.5719, - "step": 2400 - }, - { - "epoch": 5.9236453201970445, - "grad_norm": 0.8125, - "learning_rate": 0.0001515308269851564, - "loss": 0.5741, - "step": 2405 - }, - { - "epoch": 5.935960591133005, - "grad_norm": 0.84375, - "learning_rate": 0.0001512850072103941, - "loss": 0.5831, - "step": 2410 - }, - { - "epoch": 5.948275862068965, - "grad_norm": 0.88671875, - "learning_rate": 0.00015103876621438086, - "loss": 0.5768, - "step": 2415 - }, - { - "epoch": 5.960591133004926, - "grad_norm": 0.8359375, - "learning_rate": 0.00015079210601957793, - "loss": 0.5875, - "step": 2420 - }, - { - "epoch": 5.972906403940887, - "grad_norm": 0.92578125, - "learning_rate": 0.00015054502865188957, - "loss": 0.5797, - "step": 2425 - }, - { - "epoch": 5.985221674876847, - "grad_norm": 1.203125, - "learning_rate": 0.00015029753614064645, - "loss": 0.5835, - "step": 2430 - }, - { - "epoch": 5.997536945812808, - "grad_norm": 1.296875, - "learning_rate": 0.000150049630518589, - "loss": 0.5891, - "step": 2435 - }, - { - "epoch": 6.0, - "eval_loss": 3.1032605171203613, - "eval_runtime": 2.0442, - "eval_samples_per_second": 4.892, - "eval_steps_per_second": 0.978, - "step": 2436 - }, - { - "epoch": 6.009852216748769, - "grad_norm": 0.8515625, - "learning_rate": 0.0001498013138218506, - "loss": 0.4998, - "step": 2440 - }, - { - "epoch": 6.022167487684729, - "grad_norm": 0.88671875, - "learning_rate": 0.00014955258808994096, - "loss": 0.4797, - "step": 2445 - }, - { - "epoch": 6.0344827586206895, - "grad_norm": 0.890625, - "learning_rate": 0.00014930345536572924, - "loss": 0.4731, - "step": 2450 - }, - { - "epoch": 6.04679802955665, - "grad_norm": 1.3671875, - "learning_rate": 0.00014905391769542758, 
- "loss": 0.4802, - "step": 2455 - }, - { - "epoch": 6.059113300492611, - "grad_norm": 1.2890625, - "learning_rate": 0.00014880397712857386, - "loss": 0.4708, - "step": 2460 - }, - { - "epoch": 6.071428571428571, - "grad_norm": 0.91015625, - "learning_rate": 0.00014855363571801523, - "loss": 0.4703, - "step": 2465 - }, - { - "epoch": 6.083743842364532, - "grad_norm": 0.859375, - "learning_rate": 0.000148302895519891, - "loss": 0.4726, - "step": 2470 - }, - { - "epoch": 6.096059113300493, - "grad_norm": 0.83203125, - "learning_rate": 0.00014805175859361594, - "loss": 0.4733, - "step": 2475 - }, - { - "epoch": 6.108374384236453, - "grad_norm": 0.8828125, - "learning_rate": 0.0001478002270018633, - "loss": 0.4742, - "step": 2480 - }, - { - "epoch": 6.120689655172414, - "grad_norm": 0.96875, - "learning_rate": 0.00014754830281054777, - "loss": 0.4884, - "step": 2485 - }, - { - "epoch": 6.1330049261083746, - "grad_norm": 0.97265625, - "learning_rate": 0.00014729598808880861, - "loss": 0.4851, - "step": 2490 - }, - { - "epoch": 6.1453201970443345, - "grad_norm": 0.953125, - "learning_rate": 0.0001470432849089927, - "loss": 0.4755, - "step": 2495 - }, - { - "epoch": 6.157635467980295, - "grad_norm": 0.89453125, - "learning_rate": 0.00014679019534663738, - "loss": 0.4823, - "step": 2500 - }, - { - "epoch": 6.169950738916256, - "grad_norm": 1.1015625, - "learning_rate": 0.00014653672148045357, - "loss": 0.4781, - "step": 2505 - }, - { - "epoch": 6.182266009852217, - "grad_norm": 0.921875, - "learning_rate": 0.00014628286539230848, - "loss": 0.4878, - "step": 2510 - }, - { - "epoch": 6.194581280788177, - "grad_norm": 0.9140625, - "learning_rate": 0.00014602862916720884, - "loss": 0.489, - "step": 2515 - }, - { - "epoch": 6.206896551724138, - "grad_norm": 1.421875, - "learning_rate": 0.00014577401489328335, - "loss": 0.4913, - "step": 2520 - }, - { - "epoch": 6.219211822660099, - "grad_norm": 1.046875, - "learning_rate": 0.00014551902466176592, - "loss": 0.4934, - "step": 2525 - }, - { - "epoch": 6.231527093596059, - "grad_norm": 1.2109375, - "learning_rate": 0.00014526366056697825, - "loss": 0.4908, - "step": 2530 - }, - { - "epoch": 6.24384236453202, - "grad_norm": 1.0390625, - "learning_rate": 0.0001450079247063127, - "loss": 0.4894, - "step": 2535 - }, - { - "epoch": 6.25615763546798, - "grad_norm": 0.91015625, - "learning_rate": 0.0001447518191802151, - "loss": 0.4942, - "step": 2540 - }, - { - "epoch": 6.268472906403941, - "grad_norm": 1.2109375, - "learning_rate": 0.00014449534609216748, - "loss": 0.4894, - "step": 2545 - }, - { - "epoch": 6.280788177339901, - "grad_norm": 1.03125, - "learning_rate": 0.00014423850754867075, - "loss": 0.4937, - "step": 2550 - }, - { - "epoch": 6.293103448275862, - "grad_norm": 1.0625, - "learning_rate": 0.00014398130565922742, - "loss": 0.4905, - "step": 2555 - }, - { - "epoch": 6.305418719211823, - "grad_norm": 0.88671875, - "learning_rate": 0.00014372374253632437, - "loss": 0.4901, - "step": 2560 - }, - { - "epoch": 6.317733990147783, - "grad_norm": 0.96484375, - "learning_rate": 0.0001434658202954153, - "loss": 0.4944, - "step": 2565 - }, - { - "epoch": 6.330049261083744, - "grad_norm": 0.91015625, - "learning_rate": 0.0001432075410549035, - "loss": 0.4931, - "step": 2570 - }, - { - "epoch": 6.342364532019705, - "grad_norm": 1.015625, - "learning_rate": 0.0001429489069361245, - "loss": 0.5, - "step": 2575 - }, - { - "epoch": 6.3546798029556655, - "grad_norm": 0.89453125, - "learning_rate": 0.00014268992006332846, - "loss": 0.4955, - "step": 2580 - }, - { - 
"epoch": 6.3669950738916254, - "grad_norm": 0.83984375, - "learning_rate": 0.0001424305825636629, - "loss": 0.5065, - "step": 2585 - }, - { - "epoch": 6.379310344827586, - "grad_norm": 0.8984375, - "learning_rate": 0.0001421708965671551, - "loss": 0.4944, - "step": 2590 - }, - { - "epoch": 6.391625615763547, - "grad_norm": 0.92578125, - "learning_rate": 0.0001419108642066947, - "loss": 0.4923, - "step": 2595 - }, - { - "epoch": 6.403940886699507, - "grad_norm": 1.2265625, - "learning_rate": 0.00014165048761801617, - "loss": 0.5143, - "step": 2600 - }, - { - "epoch": 6.416256157635468, - "grad_norm": 1.0078125, - "learning_rate": 0.0001413897689396812, - "loss": 0.4983, - "step": 2605 - }, - { - "epoch": 6.428571428571429, - "grad_norm": 0.83203125, - "learning_rate": 0.00014112871031306119, - "loss": 0.4954, - "step": 2610 - }, - { - "epoch": 6.440886699507389, - "grad_norm": 1.0390625, - "learning_rate": 0.0001408673138823196, - "loss": 0.4951, - "step": 2615 - }, - { - "epoch": 6.45320197044335, - "grad_norm": 0.92578125, - "learning_rate": 0.00014060558179439456, - "loss": 0.4964, - "step": 2620 - }, - { - "epoch": 6.4655172413793105, - "grad_norm": 1.0, - "learning_rate": 0.00014034351619898088, - "loss": 0.5015, - "step": 2625 - }, - { - "epoch": 6.477832512315271, - "grad_norm": 1.0390625, - "learning_rate": 0.00014008111924851264, - "loss": 0.4904, - "step": 2630 - }, - { - "epoch": 6.490147783251231, - "grad_norm": 0.9921875, - "learning_rate": 0.00013981839309814547, - "loss": 0.5047, - "step": 2635 - }, - { - "epoch": 6.502463054187192, - "grad_norm": 1.015625, - "learning_rate": 0.00013955533990573886, - "loss": 0.4973, - "step": 2640 - }, - { - "epoch": 6.514778325123153, - "grad_norm": 0.8515625, - "learning_rate": 0.00013929196183183834, - "loss": 0.492, - "step": 2645 - }, - { - "epoch": 6.527093596059113, - "grad_norm": 1.1328125, - "learning_rate": 0.00013902826103965788, - "loss": 0.4996, - "step": 2650 - }, - { - "epoch": 6.539408866995074, - "grad_norm": 0.95703125, - "learning_rate": 0.00013876423969506194, - "loss": 0.5011, - "step": 2655 - }, - { - "epoch": 6.551724137931035, - "grad_norm": 0.9765625, - "learning_rate": 0.0001384998999665479, - "loss": 0.4946, - "step": 2660 - }, - { - "epoch": 6.564039408866995, - "grad_norm": 0.953125, - "learning_rate": 0.00013823524402522804, - "loss": 0.5043, - "step": 2665 - }, - { - "epoch": 6.5763546798029555, - "grad_norm": 1.0234375, - "learning_rate": 0.00013797027404481184, - "loss": 0.512, - "step": 2670 - }, - { - "epoch": 6.588669950738916, - "grad_norm": 0.90625, - "learning_rate": 0.00013770499220158816, - "loss": 0.4999, - "step": 2675 - }, - { - "epoch": 6.600985221674877, - "grad_norm": 0.890625, - "learning_rate": 0.0001374394006744072, - "loss": 0.5051, - "step": 2680 - }, - { - "epoch": 6.613300492610837, - "grad_norm": 1.015625, - "learning_rate": 0.0001371735016446627, - "loss": 0.5081, - "step": 2685 - }, - { - "epoch": 6.625615763546798, - "grad_norm": 0.88671875, - "learning_rate": 0.0001369072972962741, - "loss": 0.5027, - "step": 2690 - }, - { - "epoch": 6.637931034482759, - "grad_norm": 0.890625, - "learning_rate": 0.00013664078981566843, - "loss": 0.5062, - "step": 2695 - }, - { - "epoch": 6.650246305418719, - "grad_norm": 0.99609375, - "learning_rate": 0.00013637398139176255, - "loss": 0.5036, - "step": 2700 - }, - { - "epoch": 6.66256157635468, - "grad_norm": 0.859375, - "learning_rate": 0.00013610687421594498, - "loss": 0.4933, - "step": 2705 - }, - { - "epoch": 6.674876847290641, - "grad_norm": 
0.99609375, - "learning_rate": 0.00013583947048205808, - "loss": 0.5036, - "step": 2710 - }, - { - "epoch": 6.687192118226601, - "grad_norm": 0.8828125, - "learning_rate": 0.00013557177238637986, - "loss": 0.5176, - "step": 2715 - }, - { - "epoch": 6.699507389162561, - "grad_norm": 1.0234375, - "learning_rate": 0.00013530378212760606, - "loss": 0.5205, - "step": 2720 - }, - { - "epoch": 6.711822660098522, - "grad_norm": 1.03125, - "learning_rate": 0.00013503550190683205, - "loss": 0.5051, - "step": 2725 - }, - { - "epoch": 6.724137931034483, - "grad_norm": 0.9921875, - "learning_rate": 0.00013476693392753476, - "loss": 0.5086, - "step": 2730 - }, - { - "epoch": 6.736453201970443, - "grad_norm": 0.9375, - "learning_rate": 0.0001344980803955546, - "loss": 0.5025, - "step": 2735 - }, - { - "epoch": 6.748768472906404, - "grad_norm": 0.96484375, - "learning_rate": 0.00013422894351907726, - "loss": 0.5062, - "step": 2740 - }, - { - "epoch": 6.761083743842365, - "grad_norm": 0.9375, - "learning_rate": 0.00013395952550861572, - "loss": 0.5228, - "step": 2745 - }, - { - "epoch": 6.773399014778326, - "grad_norm": 0.87109375, - "learning_rate": 0.00013368982857699192, - "loss": 0.5049, - "step": 2750 - }, - { - "epoch": 6.785714285714286, - "grad_norm": 1.1015625, - "learning_rate": 0.00013341985493931877, - "loss": 0.5163, - "step": 2755 - }, - { - "epoch": 6.798029556650246, - "grad_norm": 1.0078125, - "learning_rate": 0.00013314960681298175, - "loss": 0.5118, - "step": 2760 - }, - { - "epoch": 6.810344827586206, - "grad_norm": 0.9375, - "learning_rate": 0.0001328790864176209, - "loss": 0.508, - "step": 2765 - }, - { - "epoch": 6.822660098522167, - "grad_norm": 1.03125, - "learning_rate": 0.00013260829597511246, - "loss": 0.5117, - "step": 2770 - }, - { - "epoch": 6.834975369458128, - "grad_norm": 1.0546875, - "learning_rate": 0.0001323372377095507, - "loss": 0.5178, - "step": 2775 - }, - { - "epoch": 6.847290640394089, - "grad_norm": 1.0234375, - "learning_rate": 0.0001320659138472295, - "loss": 0.51, - "step": 2780 - }, - { - "epoch": 6.859605911330049, - "grad_norm": 1.015625, - "learning_rate": 0.00013179432661662434, - "loss": 0.5201, - "step": 2785 - }, - { - "epoch": 6.87192118226601, - "grad_norm": 0.94921875, - "learning_rate": 0.0001315224782483737, - "loss": 0.5145, - "step": 2790 - }, - { - "epoch": 6.884236453201971, - "grad_norm": 1.0703125, - "learning_rate": 0.00013125037097526097, - "loss": 0.5196, - "step": 2795 - }, - { - "epoch": 6.896551724137931, - "grad_norm": 0.875, - "learning_rate": 0.00013097800703219586, - "loss": 0.5215, - "step": 2800 - }, - { - "epoch": 6.9088669950738915, - "grad_norm": 1.09375, - "learning_rate": 0.00013070538865619642, - "loss": 0.5076, - "step": 2805 - }, - { - "epoch": 6.921182266009852, - "grad_norm": 0.890625, - "learning_rate": 0.00013043251808637026, - "loss": 0.5065, - "step": 2810 - }, - { - "epoch": 6.933497536945813, - "grad_norm": 1.0625, - "learning_rate": 0.00013015939756389643, - "loss": 0.5052, - "step": 2815 - }, - { - "epoch": 6.945812807881773, - "grad_norm": 0.8671875, - "learning_rate": 0.00012988602933200689, - "loss": 0.5103, - "step": 2820 - }, - { - "epoch": 6.958128078817734, - "grad_norm": 0.92578125, - "learning_rate": 0.00012961241563596817, - "loss": 0.5088, - "step": 2825 - }, - { - "epoch": 6.970443349753695, - "grad_norm": 1.109375, - "learning_rate": 0.00012933855872306285, - "loss": 0.5151, - "step": 2830 - }, - { - "epoch": 6.982758620689655, - "grad_norm": 0.8984375, - "learning_rate": 0.0001290644608425711, - 
"loss": 0.5133, - "step": 2835 - }, - { - "epoch": 6.995073891625616, - "grad_norm": 0.95703125, - "learning_rate": 0.0001287901242457523, - "loss": 0.5095, - "step": 2840 - }, - { - "epoch": 7.0, - "eval_loss": 3.5735390186309814, - "eval_runtime": 2.0433, - "eval_samples_per_second": 4.894, - "eval_steps_per_second": 0.979, - "step": 2842 - }, - { - "epoch": 7.0073891625615765, - "grad_norm": 0.8046875, - "learning_rate": 0.00012851555118582644, - "loss": 0.4549, - "step": 2845 - }, - { - "epoch": 7.019704433497537, - "grad_norm": 1.0703125, - "learning_rate": 0.0001282407439179557, - "loss": 0.4114, - "step": 2850 - }, - { - "epoch": 7.032019704433497, - "grad_norm": 0.90625, - "learning_rate": 0.00012796570469922588, - "loss": 0.416, - "step": 2855 - }, - { - "epoch": 7.044334975369458, - "grad_norm": 1.0703125, - "learning_rate": 0.00012769043578862786, - "loss": 0.4192, - "step": 2860 - }, - { - "epoch": 7.056650246305419, - "grad_norm": 1.1875, - "learning_rate": 0.00012741493944703905, - "loss": 0.4192, - "step": 2865 - }, - { - "epoch": 7.068965517241379, - "grad_norm": 0.99609375, - "learning_rate": 0.0001271392179372048, - "loss": 0.4203, - "step": 2870 - }, - { - "epoch": 7.08128078817734, - "grad_norm": 0.953125, - "learning_rate": 0.00012686327352371999, - "loss": 0.4141, - "step": 2875 - }, - { - "epoch": 7.093596059113301, - "grad_norm": 1.1015625, - "learning_rate": 0.0001265871084730101, - "loss": 0.4252, - "step": 2880 - }, - { - "epoch": 7.105911330049261, - "grad_norm": 1.171875, - "learning_rate": 0.0001263107250533128, - "loss": 0.4253, - "step": 2885 - }, - { - "epoch": 7.1182266009852215, - "grad_norm": 1.0625, - "learning_rate": 0.0001260341255346595, - "loss": 0.4182, - "step": 2890 - }, - { - "epoch": 7.130541871921182, - "grad_norm": 0.9296875, - "learning_rate": 0.00012575731218885625, - "loss": 0.4157, - "step": 2895 - }, - { - "epoch": 7.142857142857143, - "grad_norm": 1.140625, - "learning_rate": 0.0001254802872894655, - "loss": 0.4293, - "step": 2900 - }, - { - "epoch": 7.155172413793103, - "grad_norm": 0.87890625, - "learning_rate": 0.00012520305311178716, - "loss": 0.4264, - "step": 2905 - }, - { - "epoch": 7.167487684729064, - "grad_norm": 0.96484375, - "learning_rate": 0.00012492561193284008, - "loss": 0.4378, - "step": 2910 - }, - { - "epoch": 7.179802955665025, - "grad_norm": 0.99609375, - "learning_rate": 0.00012464796603134327, - "loss": 0.428, - "step": 2915 - }, - { - "epoch": 7.192118226600985, - "grad_norm": 0.94921875, - "learning_rate": 0.0001243701176876972, - "loss": 0.4227, - "step": 2920 - }, - { - "epoch": 7.204433497536946, - "grad_norm": 1.046875, - "learning_rate": 0.00012409206918396503, - "loss": 0.4171, - "step": 2925 - }, - { - "epoch": 7.216748768472907, - "grad_norm": 0.9375, - "learning_rate": 0.00012381382280385393, - "loss": 0.422, - "step": 2930 - }, - { - "epoch": 7.2290640394088665, - "grad_norm": 0.9609375, - "learning_rate": 0.00012353538083269633, - "loss": 0.4248, - "step": 2935 - }, - { - "epoch": 7.241379310344827, - "grad_norm": 0.98828125, - "learning_rate": 0.00012325674555743106, - "loss": 0.4217, - "step": 2940 - }, - { - "epoch": 7.253694581280788, - "grad_norm": 1.4609375, - "learning_rate": 0.0001229779192665846, - "loss": 0.4328, - "step": 2945 - }, - { - "epoch": 7.266009852216749, - "grad_norm": 0.9765625, - "learning_rate": 0.0001226989042502524, - "loss": 0.4311, - "step": 2950 - }, - { - "epoch": 7.278325123152709, - "grad_norm": 1.578125, - "learning_rate": 0.0001224197028000799, - "loss": 0.4311, - 
"step": 2955 - }, - { - "epoch": 7.29064039408867, - "grad_norm": 1.0078125, - "learning_rate": 0.00012214031720924384, - "loss": 0.4364, - "step": 2960 - }, - { - "epoch": 7.302955665024631, - "grad_norm": 1.0546875, - "learning_rate": 0.0001218607497724333, - "loss": 0.4302, - "step": 2965 - }, - { - "epoch": 7.315270935960591, - "grad_norm": 1.03125, - "learning_rate": 0.000121581002785831, - "loss": 0.4323, - "step": 2970 - }, - { - "epoch": 7.327586206896552, - "grad_norm": 1.0078125, - "learning_rate": 0.0001213010785470943, - "loss": 0.4299, - "step": 2975 - }, - { - "epoch": 7.3399014778325125, - "grad_norm": 0.9609375, - "learning_rate": 0.0001210209793553364, - "loss": 0.4333, - "step": 2980 - }, - { - "epoch": 7.352216748768473, - "grad_norm": 1.0234375, - "learning_rate": 0.00012074070751110751, - "loss": 0.4451, - "step": 2985 - }, - { - "epoch": 7.364532019704433, - "grad_norm": 0.96484375, - "learning_rate": 0.00012046026531637587, - "loss": 0.4346, - "step": 2990 - }, - { - "epoch": 7.376847290640394, - "grad_norm": 0.98828125, - "learning_rate": 0.00012017965507450877, - "loss": 0.4395, - "step": 2995 - }, - { - "epoch": 7.389162561576355, - "grad_norm": 1.0625, - "learning_rate": 0.00011989887909025388, - "loss": 0.4322, - "step": 3000 - }, - { - "epoch": 7.401477832512315, - "grad_norm": 0.9453125, - "learning_rate": 0.00011961793966972004, - "loss": 0.4329, - "step": 3005 - }, - { - "epoch": 7.413793103448276, - "grad_norm": 1.0625, - "learning_rate": 0.00011933683912035856, - "loss": 0.4367, - "step": 3010 - }, - { - "epoch": 7.426108374384237, - "grad_norm": 0.98046875, - "learning_rate": 0.00011905557975094406, - "loss": 0.4352, - "step": 3015 - }, - { - "epoch": 7.4384236453201975, - "grad_norm": 0.9765625, - "learning_rate": 0.00011877416387155565, - "loss": 0.4357, - "step": 3020 - }, - { - "epoch": 7.4507389162561575, - "grad_norm": 0.96875, - "learning_rate": 0.0001184925937935579, - "loss": 0.4374, - "step": 3025 - }, - { - "epoch": 7.463054187192118, - "grad_norm": 0.9140625, - "learning_rate": 0.00011821087182958186, - "loss": 0.4334, - "step": 3030 - }, - { - "epoch": 7.475369458128079, - "grad_norm": 0.87890625, - "learning_rate": 0.00011792900029350611, - "loss": 0.4384, - "step": 3035 - }, - { - "epoch": 7.487684729064039, - "grad_norm": 0.92578125, - "learning_rate": 0.00011764698150043767, - "loss": 0.4402, - "step": 3040 - }, - { - "epoch": 7.5, - "grad_norm": 0.984375, - "learning_rate": 0.00011736481776669306, - "loss": 0.4362, - "step": 3045 - }, - { - "epoch": 7.512315270935961, - "grad_norm": 0.98828125, - "learning_rate": 0.00011708251140977918, - "loss": 0.4403, - "step": 3050 - }, - { - "epoch": 7.524630541871921, - "grad_norm": 1.0859375, - "learning_rate": 0.00011680006474837446, - "loss": 0.4471, - "step": 3055 - }, - { - "epoch": 7.536945812807882, - "grad_norm": 0.9609375, - "learning_rate": 0.0001165174801023096, - "loss": 0.4502, - "step": 3060 - }, - { - "epoch": 7.5492610837438425, - "grad_norm": 1.25, - "learning_rate": 0.0001162347597925487, - "loss": 0.4433, - "step": 3065 - }, - { - "epoch": 7.5615763546798025, - "grad_norm": 1.015625, - "learning_rate": 0.00011595190614117002, - "loss": 0.4485, - "step": 3070 - }, - { - "epoch": 7.573891625615763, - "grad_norm": 1.0703125, - "learning_rate": 0.00011566892147134705, - "loss": 0.445, - "step": 3075 - }, - { - "epoch": 7.586206896551724, - "grad_norm": 1.0546875, - "learning_rate": 0.00011538580810732938, - "loss": 0.4421, - "step": 3080 - }, - { - "epoch": 7.598522167487685, - 
"grad_norm": 1.0234375, - "learning_rate": 0.00011510256837442359, - "loss": 0.4356, - "step": 3085 - }, - { - "epoch": 7.610837438423645, - "grad_norm": 0.94140625, - "learning_rate": 0.00011481920459897417, - "loss": 0.4466, - "step": 3090 - }, - { - "epoch": 7.623152709359606, - "grad_norm": 0.92578125, - "learning_rate": 0.00011453571910834449, - "loss": 0.4517, - "step": 3095 - }, - { - "epoch": 7.635467980295567, - "grad_norm": 0.90625, - "learning_rate": 0.00011425211423089748, - "loss": 0.4484, - "step": 3100 - }, - { - "epoch": 7.647783251231527, - "grad_norm": 0.90625, - "learning_rate": 0.00011396839229597674, - "loss": 0.4412, - "step": 3105 - }, - { - "epoch": 7.6600985221674875, - "grad_norm": 0.953125, - "learning_rate": 0.0001136845556338872, - "loss": 0.4366, - "step": 3110 - }, - { - "epoch": 7.672413793103448, - "grad_norm": 0.94140625, - "learning_rate": 0.00011340060657587623, - "loss": 0.4506, - "step": 3115 - }, - { - "epoch": 7.684729064039409, - "grad_norm": 0.9453125, - "learning_rate": 0.00011311654745411425, - "loss": 0.4297, - "step": 3120 - }, - { - "epoch": 7.697044334975369, - "grad_norm": 1.0234375, - "learning_rate": 0.00011283238060167563, - "loss": 0.4487, - "step": 3125 - }, - { - "epoch": 7.70935960591133, - "grad_norm": 1.1015625, - "learning_rate": 0.00011254810835251963, - "loss": 0.4494, - "step": 3130 - }, - { - "epoch": 7.721674876847291, - "grad_norm": 1.015625, - "learning_rate": 0.00011226373304147123, - "loss": 0.4459, - "step": 3135 - }, - { - "epoch": 7.733990147783251, - "grad_norm": 1.015625, - "learning_rate": 0.00011197925700420173, - "loss": 0.4441, - "step": 3140 - }, - { - "epoch": 7.746305418719212, - "grad_norm": 1.109375, - "learning_rate": 0.0001116946825772099, - "loss": 0.4444, - "step": 3145 - }, - { - "epoch": 7.758620689655173, - "grad_norm": 1.0390625, - "learning_rate": 0.00011141001209780249, - "loss": 0.4515, - "step": 3150 - }, - { - "epoch": 7.7709359605911335, - "grad_norm": 1.1171875, - "learning_rate": 0.00011112524790407524, - "loss": 0.4422, - "step": 3155 - }, - { - "epoch": 7.783251231527093, - "grad_norm": 0.90625, - "learning_rate": 0.00011084039233489354, - "loss": 0.454, - "step": 3160 - }, - { - "epoch": 7.795566502463054, - "grad_norm": 0.9296875, - "learning_rate": 0.00011055544772987335, - "loss": 0.4388, - "step": 3165 - }, - { - "epoch": 7.807881773399015, - "grad_norm": 0.9375, - "learning_rate": 0.0001102704164293618, - "loss": 0.4458, - "step": 3170 - }, - { - "epoch": 7.820197044334975, - "grad_norm": 1.078125, - "learning_rate": 0.00010998530077441824, - "loss": 0.4454, - "step": 3175 - }, - { - "epoch": 7.832512315270936, - "grad_norm": 0.9609375, - "learning_rate": 0.0001097001031067947, - "loss": 0.4486, - "step": 3180 - }, - { - "epoch": 7.844827586206897, - "grad_norm": 1.0078125, - "learning_rate": 0.00010941482576891689, - "loss": 0.4508, - "step": 3185 - }, - { - "epoch": 7.857142857142857, - "grad_norm": 0.97265625, - "learning_rate": 0.00010912947110386484, - "loss": 0.4569, - "step": 3190 - }, - { - "epoch": 7.869458128078818, - "grad_norm": 0.9921875, - "learning_rate": 0.00010884404145535372, - "loss": 0.4459, - "step": 3195 - }, - { - "epoch": 7.8817733990147785, - "grad_norm": 1.1015625, - "learning_rate": 0.00010855853916771457, - "loss": 0.4539, - "step": 3200 - }, - { - "epoch": 7.894088669950738, - "grad_norm": 0.9609375, - "learning_rate": 0.00010827296658587503, - "loss": 0.4598, - "step": 3205 - }, - { - "epoch": 7.906403940886699, - "grad_norm": 1.0234375, - "learning_rate": 
0.00010798732605534006, - "loss": 0.4465, - "step": 3210 - }, - { - "epoch": 7.91871921182266, - "grad_norm": 1.0859375, - "learning_rate": 0.00010770161992217271, - "loss": 0.4448, - "step": 3215 - }, - { - "epoch": 7.931034482758621, - "grad_norm": 1.0078125, - "learning_rate": 0.00010741585053297494, - "loss": 0.4485, - "step": 3220 - }, - { - "epoch": 7.943349753694581, - "grad_norm": 0.95703125, - "learning_rate": 0.00010713002023486816, - "loss": 0.4487, - "step": 3225 - }, - { - "epoch": 7.955665024630542, - "grad_norm": 0.94921875, - "learning_rate": 0.00010684413137547404, - "loss": 0.4432, - "step": 3230 - }, - { - "epoch": 7.967980295566503, - "grad_norm": 0.9765625, - "learning_rate": 0.00010655818630289525, - "loss": 0.4517, - "step": 3235 - }, - { - "epoch": 7.980295566502463, - "grad_norm": 0.98046875, - "learning_rate": 0.00010627218736569624, - "loss": 0.4366, - "step": 3240 - }, - { - "epoch": 7.9926108374384235, - "grad_norm": 0.9375, - "learning_rate": 0.00010598613691288372, - "loss": 0.4514, - "step": 3245 - }, - { - "epoch": 8.0, - "eval_loss": 3.931917190551758, - "eval_runtime": 2.0448, - "eval_samples_per_second": 4.89, - "eval_steps_per_second": 0.978, - "step": 3248 - }, - { - "epoch": 8.004926108374384, - "grad_norm": 0.80859375, - "learning_rate": 0.00010570003729388767, - "loss": 0.4202, - "step": 3250 - }, - { - "epoch": 8.017241379310345, - "grad_norm": 1.0625, - "learning_rate": 0.00010541389085854176, - "loss": 0.369, - "step": 3255 - }, - { - "epoch": 8.029556650246306, - "grad_norm": 0.91015625, - "learning_rate": 0.00010512769995706426, - "loss": 0.3781, - "step": 3260 - }, - { - "epoch": 8.041871921182265, - "grad_norm": 0.98046875, - "learning_rate": 0.0001048414669400386, - "loss": 0.3687, - "step": 3265 - }, - { - "epoch": 8.054187192118226, - "grad_norm": 1.0859375, - "learning_rate": 0.00010455519415839415, - "loss": 0.3635, - "step": 3270 - }, - { - "epoch": 8.066502463054187, - "grad_norm": 0.859375, - "learning_rate": 0.00010426888396338688, - "loss": 0.3642, - "step": 3275 - }, - { - "epoch": 8.078817733990148, - "grad_norm": 0.90234375, - "learning_rate": 0.00010398253870658006, - "loss": 0.3715, - "step": 3280 - }, - { - "epoch": 8.091133004926109, - "grad_norm": 0.984375, - "learning_rate": 0.00010369616073982491, - "loss": 0.3667, - "step": 3285 - }, - { - "epoch": 8.10344827586207, - "grad_norm": 0.9296875, - "learning_rate": 0.00010340975241524132, - "loss": 0.373, - "step": 3290 - }, - { - "epoch": 8.11576354679803, - "grad_norm": 0.92578125, - "learning_rate": 0.00010312331608519847, - "loss": 0.3717, - "step": 3295 - }, - { - "epoch": 8.12807881773399, - "grad_norm": 0.98828125, - "learning_rate": 0.00010283685410229571, - "loss": 0.3777, - "step": 3300 - }, - { - "epoch": 8.14039408866995, - "grad_norm": 0.9609375, - "learning_rate": 0.0001025503688193429, - "loss": 0.3782, - "step": 3305 - }, - { - "epoch": 8.152709359605911, - "grad_norm": 0.9375, - "learning_rate": 0.00010226386258934138, - "loss": 0.3792, - "step": 3310 - }, - { - "epoch": 8.165024630541872, - "grad_norm": 0.96875, - "learning_rate": 0.00010197733776546447, - "loss": 0.3745, - "step": 3315 - }, - { - "epoch": 8.177339901477833, - "grad_norm": 0.91015625, - "learning_rate": 0.00010169079670103831, - "loss": 0.378, - "step": 3320 - }, - { - "epoch": 8.189655172413794, - "grad_norm": 1.0234375, - "learning_rate": 0.00010140424174952232, - "loss": 0.3783, - "step": 3325 - }, - { - "epoch": 8.201970443349754, - "grad_norm": 1.0, - "learning_rate": 
0.00010111767526449004, - "loss": 0.3788, - "step": 3330 - }, - { - "epoch": 8.214285714285714, - "grad_norm": 1.03125, - "learning_rate": 0.00010083109959960973, - "loss": 0.3752, - "step": 3335 - }, - { - "epoch": 8.226600985221674, - "grad_norm": 0.97265625, - "learning_rate": 0.00010054451710862498, - "loss": 0.3794, - "step": 3340 - }, - { - "epoch": 8.238916256157635, - "grad_norm": 1.0390625, - "learning_rate": 0.00010025793014533558, - "loss": 0.381, - "step": 3345 - }, - { - "epoch": 8.251231527093596, - "grad_norm": 0.91796875, - "learning_rate": 9.997134106357796e-05, - "loss": 0.3834, - "step": 3350 - }, - { - "epoch": 8.263546798029557, - "grad_norm": 0.95703125, - "learning_rate": 9.968475221720594e-05, - "loss": 0.3791, - "step": 3355 - }, - { - "epoch": 8.275862068965518, - "grad_norm": 1.015625, - "learning_rate": 9.939816596007146e-05, - "loss": 0.377, - "step": 3360 - }, - { - "epoch": 8.288177339901479, - "grad_norm": 1.0703125, - "learning_rate": 9.911158464600517e-05, - "loss": 0.3764, - "step": 3365 - }, - { - "epoch": 8.300492610837438, - "grad_norm": 0.97265625, - "learning_rate": 9.88250106287971e-05, - "loss": 0.3786, - "step": 3370 - }, - { - "epoch": 8.312807881773399, - "grad_norm": 0.98046875, - "learning_rate": 9.853844626217737e-05, - "loss": 0.3722, - "step": 3375 - }, - { - "epoch": 8.32512315270936, - "grad_norm": 0.96875, - "learning_rate": 9.825189389979683e-05, - "loss": 0.3847, - "step": 3380 - }, - { - "epoch": 8.33743842364532, - "grad_norm": 1.046875, - "learning_rate": 9.796535589520777e-05, - "loss": 0.377, - "step": 3385 - }, - { - "epoch": 8.349753694581281, - "grad_norm": 1.078125, - "learning_rate": 9.767883460184443e-05, - "loss": 0.3799, - "step": 3390 - }, - { - "epoch": 8.362068965517242, - "grad_norm": 1.03125, - "learning_rate": 9.739233237300402e-05, - "loss": 0.3878, - "step": 3395 - }, - { - "epoch": 8.374384236453203, - "grad_norm": 0.9765625, - "learning_rate": 9.710585156182695e-05, - "loss": 0.3799, - "step": 3400 - }, - { - "epoch": 8.386699507389162, - "grad_norm": 0.97265625, - "learning_rate": 9.681939452127784e-05, - "loss": 0.3811, - "step": 3405 - }, - { - "epoch": 8.399014778325123, - "grad_norm": 1.0078125, - "learning_rate": 9.653296360412602e-05, - "loss": 0.3893, - "step": 3410 - }, - { - "epoch": 8.411330049261084, - "grad_norm": 0.9453125, - "learning_rate": 9.624656116292628e-05, - "loss": 0.3812, - "step": 3415 - }, - { - "epoch": 8.423645320197044, - "grad_norm": 1.015625, - "learning_rate": 9.596018954999953e-05, - "loss": 0.3866, - "step": 3420 - }, - { - "epoch": 8.435960591133005, - "grad_norm": 1.0546875, - "learning_rate": 9.567385111741345e-05, - "loss": 0.3845, - "step": 3425 - }, - { - "epoch": 8.448275862068966, - "grad_norm": 0.98046875, - "learning_rate": 9.538754821696323e-05, - "loss": 0.3843, - "step": 3430 - }, - { - "epoch": 8.460591133004925, - "grad_norm": 1.1875, - "learning_rate": 9.510128320015224e-05, - "loss": 0.3884, - "step": 3435 - }, - { - "epoch": 8.472906403940886, - "grad_norm": 0.94921875, - "learning_rate": 9.481505841817265e-05, - "loss": 0.3778, - "step": 3440 - }, - { - "epoch": 8.485221674876847, - "grad_norm": 0.99609375, - "learning_rate": 9.452887622188619e-05, - "loss": 0.3829, - "step": 3445 - }, - { - "epoch": 8.497536945812808, - "grad_norm": 1.1484375, - "learning_rate": 9.424273896180482e-05, - "loss": 0.3807, - "step": 3450 - }, - { - "epoch": 8.509852216748769, - "grad_norm": 1.0625, - "learning_rate": 9.395664898807142e-05, - "loss": 0.3887, - "step": 3455 - }, - 
{ - "epoch": 8.52216748768473, - "grad_norm": 0.94140625, - "learning_rate": 9.367060865044049e-05, - "loss": 0.386, - "step": 3460 - }, - { - "epoch": 8.53448275862069, - "grad_norm": 0.953125, - "learning_rate": 9.338462029825886e-05, - "loss": 0.3859, - "step": 3465 - }, - { - "epoch": 8.54679802955665, - "grad_norm": 0.984375, - "learning_rate": 9.309868628044633e-05, - "loss": 0.3855, - "step": 3470 - }, - { - "epoch": 8.55911330049261, - "grad_norm": 0.96484375, - "learning_rate": 9.281280894547657e-05, - "loss": 0.3803, - "step": 3475 - }, - { - "epoch": 8.571428571428571, - "grad_norm": 1.03125, - "learning_rate": 9.252699064135758e-05, - "loss": 0.3793, - "step": 3480 - }, - { - "epoch": 8.583743842364532, - "grad_norm": 0.98046875, - "learning_rate": 9.224123371561252e-05, - "loss": 0.3772, - "step": 3485 - }, - { - "epoch": 8.596059113300493, - "grad_norm": 1.0078125, - "learning_rate": 9.19555405152605e-05, - "loss": 0.3885, - "step": 3490 - }, - { - "epoch": 8.608374384236454, - "grad_norm": 1.015625, - "learning_rate": 9.166991338679715e-05, - "loss": 0.3886, - "step": 3495 - }, - { - "epoch": 8.620689655172415, - "grad_norm": 0.9296875, - "learning_rate": 9.138435467617548e-05, - "loss": 0.3765, - "step": 3500 - }, - { - "epoch": 8.633004926108374, - "grad_norm": 0.984375, - "learning_rate": 9.109886672878653e-05, - "loss": 0.3931, - "step": 3505 - }, - { - "epoch": 8.645320197044335, - "grad_norm": 0.984375, - "learning_rate": 9.081345188944019e-05, - "loss": 0.3872, - "step": 3510 - }, - { - "epoch": 8.657635467980295, - "grad_norm": 0.95703125, - "learning_rate": 9.052811250234579e-05, - "loss": 0.3802, - "step": 3515 - }, - { - "epoch": 8.669950738916256, - "grad_norm": 1.0390625, - "learning_rate": 9.024285091109309e-05, - "loss": 0.3886, - "step": 3520 - }, - { - "epoch": 8.682266009852217, - "grad_norm": 0.96484375, - "learning_rate": 8.995766945863277e-05, - "loss": 0.3814, - "step": 3525 - }, - { - "epoch": 8.694581280788178, - "grad_norm": 1.0234375, - "learning_rate": 8.967257048725733e-05, - "loss": 0.3886, - "step": 3530 - }, - { - "epoch": 8.706896551724139, - "grad_norm": 1.0703125, - "learning_rate": 8.938755633858186e-05, - "loss": 0.3889, - "step": 3535 - }, - { - "epoch": 8.719211822660098, - "grad_norm": 0.95703125, - "learning_rate": 8.91026293535247e-05, - "loss": 0.3838, - "step": 3540 - }, - { - "epoch": 8.731527093596059, - "grad_norm": 1.0, - "learning_rate": 8.881779187228836e-05, - "loss": 0.3868, - "step": 3545 - }, - { - "epoch": 8.74384236453202, - "grad_norm": 1.0234375, - "learning_rate": 8.853304623434021e-05, - "loss": 0.3863, - "step": 3550 - }, - { - "epoch": 8.75615763546798, - "grad_norm": 1.0703125, - "learning_rate": 8.82483947783932e-05, - "loss": 0.3873, - "step": 3555 - }, - { - "epoch": 8.768472906403941, - "grad_norm": 1.1328125, - "learning_rate": 8.796383984238688e-05, - "loss": 0.3944, - "step": 3560 - }, - { - "epoch": 8.780788177339902, - "grad_norm": 1.0703125, - "learning_rate": 8.767938376346792e-05, - "loss": 0.3927, - "step": 3565 - }, - { - "epoch": 8.793103448275861, - "grad_norm": 1.03125, - "learning_rate": 8.739502887797107e-05, - "loss": 0.3865, - "step": 3570 - }, - { - "epoch": 8.805418719211822, - "grad_norm": 1.125, - "learning_rate": 8.71107775214e-05, - "loss": 0.3827, - "step": 3575 - }, - { - "epoch": 8.817733990147783, - "grad_norm": 1.015625, - "learning_rate": 8.682663202840802e-05, - "loss": 0.3934, - "step": 3580 - }, - { - "epoch": 8.830049261083744, - "grad_norm": 1.015625, - "learning_rate": 
8.654259473277892e-05, - "loss": 0.3939, - "step": 3585 - }, - { - "epoch": 8.842364532019705, - "grad_norm": 1.0234375, - "learning_rate": 8.625866796740787e-05, - "loss": 0.3933, - "step": 3590 - }, - { - "epoch": 8.854679802955665, - "grad_norm": 1.0625, - "learning_rate": 8.597485406428219e-05, - "loss": 0.3861, - "step": 3595 - }, - { - "epoch": 8.866995073891626, - "grad_norm": 1.0390625, - "learning_rate": 8.569115535446228e-05, - "loss": 0.3888, - "step": 3600 - }, - { - "epoch": 8.879310344827585, - "grad_norm": 0.984375, - "learning_rate": 8.540757416806236e-05, - "loss": 0.3876, - "step": 3605 - }, - { - "epoch": 8.891625615763546, - "grad_norm": 0.97265625, - "learning_rate": 8.51241128342314e-05, - "loss": 0.3868, - "step": 3610 - }, - { - "epoch": 8.903940886699507, - "grad_norm": 0.9765625, - "learning_rate": 8.484077368113399e-05, - "loss": 0.3812, - "step": 3615 - }, - { - "epoch": 8.916256157635468, - "grad_norm": 1.03125, - "learning_rate": 8.45575590359312e-05, - "loss": 0.3845, - "step": 3620 - }, - { - "epoch": 8.928571428571429, - "grad_norm": 0.9609375, - "learning_rate": 8.427447122476148e-05, - "loss": 0.3873, - "step": 3625 - }, - { - "epoch": 8.94088669950739, - "grad_norm": 0.91015625, - "learning_rate": 8.399151257272156e-05, - "loss": 0.3944, - "step": 3630 - }, - { - "epoch": 8.95320197044335, - "grad_norm": 0.9765625, - "learning_rate": 8.37086854038473e-05, - "loss": 0.3888, - "step": 3635 - }, - { - "epoch": 8.96551724137931, - "grad_norm": 1.0625, - "learning_rate": 8.342599204109472e-05, - "loss": 0.3815, - "step": 3640 - }, - { - "epoch": 8.97783251231527, - "grad_norm": 1.03125, - "learning_rate": 8.314343480632078e-05, - "loss": 0.387, - "step": 3645 - }, - { - "epoch": 8.990147783251231, - "grad_norm": 0.96875, - "learning_rate": 8.286101602026437e-05, - "loss": 0.3872, - "step": 3650 - }, - { - "epoch": 9.0, - "eval_loss": 4.438603401184082, - "eval_runtime": 2.0448, - "eval_samples_per_second": 4.89, - "eval_steps_per_second": 0.978, - "step": 3654 - }, - { - "epoch": 9.002463054187192, - "grad_norm": 0.890625, - "learning_rate": 8.257873800252732e-05, - "loss": 0.3883, - "step": 3655 - }, - { - "epoch": 9.014778325123153, - "grad_norm": 1.015625, - "learning_rate": 8.229660307155518e-05, - "loss": 0.3218, - "step": 3660 - }, - { - "epoch": 9.027093596059114, - "grad_norm": 0.9375, - "learning_rate": 8.20146135446184e-05, - "loss": 0.3247, - "step": 3665 - }, - { - "epoch": 9.039408866995075, - "grad_norm": 0.91796875, - "learning_rate": 8.173277173779305e-05, - "loss": 0.3305, - "step": 3670 - }, - { - "epoch": 9.051724137931034, - "grad_norm": 0.953125, - "learning_rate": 8.145107996594206e-05, - "loss": 0.3281, - "step": 3675 - }, - { - "epoch": 9.064039408866995, - "grad_norm": 1.0546875, - "learning_rate": 8.116954054269591e-05, - "loss": 0.3265, - "step": 3680 - }, - { - "epoch": 9.076354679802956, - "grad_norm": 0.9296875, - "learning_rate": 8.088815578043398e-05, - "loss": 0.3299, - "step": 3685 - }, - { - "epoch": 9.088669950738916, - "grad_norm": 1.0546875, - "learning_rate": 8.060692799026522e-05, - "loss": 0.3352, - "step": 3690 - }, - { - "epoch": 9.100985221674877, - "grad_norm": 1.0234375, - "learning_rate": 8.032585948200937e-05, - "loss": 0.3329, - "step": 3695 - }, - { - "epoch": 9.113300492610838, - "grad_norm": 0.9375, - "learning_rate": 8.004495256417792e-05, - "loss": 0.3284, - "step": 3700 - }, - { - "epoch": 9.125615763546797, - "grad_norm": 0.9765625, - "learning_rate": 7.976420954395518e-05, - "loss": 0.3326, - "step": 
3705 - }, - { - "epoch": 9.137931034482758, - "grad_norm": 0.9921875, - "learning_rate": 7.948363272717926e-05, - "loss": 0.33, - "step": 3710 - }, - { - "epoch": 9.150246305418719, - "grad_norm": 1.0078125, - "learning_rate": 7.920322441832326e-05, - "loss": 0.3383, - "step": 3715 - }, - { - "epoch": 9.16256157635468, - "grad_norm": 1.03125, - "learning_rate": 7.892298692047621e-05, - "loss": 0.3309, - "step": 3720 - }, - { - "epoch": 9.17487684729064, - "grad_norm": 0.94140625, - "learning_rate": 7.864292253532427e-05, - "loss": 0.3284, - "step": 3725 - }, - { - "epoch": 9.187192118226601, - "grad_norm": 0.94140625, - "learning_rate": 7.83630335631317e-05, - "loss": 0.3315, - "step": 3730 - }, - { - "epoch": 9.199507389162562, - "grad_norm": 0.96484375, - "learning_rate": 7.808332230272209e-05, - "loss": 0.3373, - "step": 3735 - }, - { - "epoch": 9.211822660098521, - "grad_norm": 0.88671875, - "learning_rate": 7.780379105145934e-05, - "loss": 0.3277, - "step": 3740 - }, - { - "epoch": 9.224137931034482, - "grad_norm": 1.0859375, - "learning_rate": 7.752444210522898e-05, - "loss": 0.3281, - "step": 3745 - }, - { - "epoch": 9.236453201970443, - "grad_norm": 0.94921875, - "learning_rate": 7.724527775841914e-05, - "loss": 0.3263, - "step": 3750 - }, - { - "epoch": 9.248768472906404, - "grad_norm": 1.09375, - "learning_rate": 7.696630030390179e-05, - "loss": 0.3406, - "step": 3755 - }, - { - "epoch": 9.261083743842365, - "grad_norm": 0.97265625, - "learning_rate": 7.668751203301384e-05, - "loss": 0.3377, - "step": 3760 - }, - { - "epoch": 9.273399014778326, - "grad_norm": 1.046875, - "learning_rate": 7.64089152355385e-05, - "loss": 0.3383, - "step": 3765 - }, - { - "epoch": 9.285714285714286, - "grad_norm": 0.98828125, - "learning_rate": 7.613051219968623e-05, - "loss": 0.3396, - "step": 3770 - }, - { - "epoch": 9.298029556650246, - "grad_norm": 0.9296875, - "learning_rate": 7.585230521207608e-05, - "loss": 0.3349, - "step": 3775 - }, - { - "epoch": 9.310344827586206, - "grad_norm": 0.99609375, - "learning_rate": 7.55742965577169e-05, - "loss": 0.3306, - "step": 3780 - }, - { - "epoch": 9.322660098522167, - "grad_norm": 0.96875, - "learning_rate": 7.529648851998857e-05, - "loss": 0.3359, - "step": 3785 - }, - { - "epoch": 9.334975369458128, - "grad_norm": 1.015625, - "learning_rate": 7.501888338062323e-05, - "loss": 0.3442, - "step": 3790 - }, - { - "epoch": 9.347290640394089, - "grad_norm": 0.95703125, - "learning_rate": 7.474148341968652e-05, - "loss": 0.3426, - "step": 3795 - }, - { - "epoch": 9.35960591133005, - "grad_norm": 1.0390625, - "learning_rate": 7.446429091555889e-05, - "loss": 0.3405, - "step": 3800 - }, - { - "epoch": 9.37192118226601, - "grad_norm": 1.1953125, - "learning_rate": 7.418730814491697e-05, - "loss": 0.3418, - "step": 3805 - }, - { - "epoch": 9.38423645320197, - "grad_norm": 1.1171875, - "learning_rate": 7.391053738271466e-05, - "loss": 0.3317, - "step": 3810 - }, - { - "epoch": 9.39655172413793, - "grad_norm": 1.046875, - "learning_rate": 7.363398090216459e-05, - "loss": 0.3297, - "step": 3815 - }, - { - "epoch": 9.408866995073891, - "grad_norm": 0.98828125, - "learning_rate": 7.335764097471944e-05, - "loss": 0.3373, - "step": 3820 - }, - { - "epoch": 9.421182266009852, - "grad_norm": 0.953125, - "learning_rate": 7.308151987005326e-05, - "loss": 0.3337, - "step": 3825 - }, - { - "epoch": 9.433497536945813, - "grad_norm": 1.0546875, - "learning_rate": 7.28056198560428e-05, - "loss": 0.3383, - "step": 3830 - }, - { - "epoch": 9.445812807881774, - "grad_norm": 
1.0078125, - "learning_rate": 7.2529943198749e-05, - "loss": 0.3356, - "step": 3835 - }, - { - "epoch": 9.458128078817733, - "grad_norm": 0.9921875, - "learning_rate": 7.225449216239821e-05, - "loss": 0.3315, - "step": 3840 - }, - { - "epoch": 9.470443349753694, - "grad_norm": 0.953125, - "learning_rate": 7.19792690093637e-05, - "loss": 0.3397, - "step": 3845 - }, - { - "epoch": 9.482758620689655, - "grad_norm": 1.109375, - "learning_rate": 7.170427600014712e-05, - "loss": 0.3378, - "step": 3850 - }, - { - "epoch": 9.495073891625616, - "grad_norm": 1.015625, - "learning_rate": 7.142951539335981e-05, - "loss": 0.3439, - "step": 3855 - }, - { - "epoch": 9.507389162561577, - "grad_norm": 1.046875, - "learning_rate": 7.115498944570427e-05, - "loss": 0.3331, - "step": 3860 - }, - { - "epoch": 9.519704433497537, - "grad_norm": 0.9375, - "learning_rate": 7.088070041195576e-05, - "loss": 0.3355, - "step": 3865 - }, - { - "epoch": 9.532019704433498, - "grad_norm": 1.109375, - "learning_rate": 7.060665054494362e-05, - "loss": 0.3301, - "step": 3870 - }, - { - "epoch": 9.544334975369457, - "grad_norm": 0.98046875, - "learning_rate": 7.033284209553286e-05, - "loss": 0.3329, - "step": 3875 - }, - { - "epoch": 9.556650246305418, - "grad_norm": 1.0078125, - "learning_rate": 7.005927731260562e-05, - "loss": 0.344, - "step": 3880 - }, - { - "epoch": 9.568965517241379, - "grad_norm": 0.98828125, - "learning_rate": 6.978595844304271e-05, - "loss": 0.3358, - "step": 3885 - }, - { - "epoch": 9.58128078817734, - "grad_norm": 0.9765625, - "learning_rate": 6.95128877317053e-05, - "loss": 0.3448, - "step": 3890 - }, - { - "epoch": 9.5935960591133, - "grad_norm": 1.03125, - "learning_rate": 6.924006742141618e-05, - "loss": 0.3338, - "step": 3895 - }, - { - "epoch": 9.605911330049262, - "grad_norm": 1.078125, - "learning_rate": 6.89674997529416e-05, - "loss": 0.3386, - "step": 3900 - }, - { - "epoch": 9.618226600985222, - "grad_norm": 1.0234375, - "learning_rate": 6.869518696497275e-05, - "loss": 0.3285, - "step": 3905 - }, - { - "epoch": 9.630541871921181, - "grad_norm": 1.078125, - "learning_rate": 6.842313129410741e-05, - "loss": 0.3447, - "step": 3910 - }, - { - "epoch": 9.642857142857142, - "grad_norm": 1.0625, - "learning_rate": 6.815133497483157e-05, - "loss": 0.3342, - "step": 3915 - }, - { - "epoch": 9.655172413793103, - "grad_norm": 1.0078125, - "learning_rate": 6.787980023950108e-05, - "loss": 0.3458, - "step": 3920 - }, - { - "epoch": 9.667487684729064, - "grad_norm": 0.953125, - "learning_rate": 6.760852931832328e-05, - "loss": 0.344, - "step": 3925 - }, - { - "epoch": 9.679802955665025, - "grad_norm": 1.0078125, - "learning_rate": 6.733752443933878e-05, - "loss": 0.3366, - "step": 3930 - }, - { - "epoch": 9.692118226600986, - "grad_norm": 0.9921875, - "learning_rate": 6.706678782840304e-05, - "loss": 0.3404, - "step": 3935 - }, - { - "epoch": 9.704433497536947, - "grad_norm": 0.96484375, - "learning_rate": 6.679632170916816e-05, - "loss": 0.3398, - "step": 3940 - }, - { - "epoch": 9.716748768472906, - "grad_norm": 0.98046875, - "learning_rate": 6.65261283030646e-05, - "loss": 0.3331, - "step": 3945 - }, - { - "epoch": 9.729064039408867, - "grad_norm": 0.98046875, - "learning_rate": 6.625620982928293e-05, - "loss": 0.3396, - "step": 3950 - }, - { - "epoch": 9.741379310344827, - "grad_norm": 0.9609375, - "learning_rate": 6.598656850475562e-05, - "loss": 0.336, - "step": 3955 - }, - { - "epoch": 9.753694581280788, - "grad_norm": 1.0078125, - "learning_rate": 6.571720654413877e-05, - "loss": 0.3416, - 
"step": 3960 - }, - { - "epoch": 9.766009852216749, - "grad_norm": 1.015625, - "learning_rate": 6.544812615979404e-05, - "loss": 0.3381, - "step": 3965 - }, - { - "epoch": 9.77832512315271, - "grad_norm": 1.0078125, - "learning_rate": 6.517932956177038e-05, - "loss": 0.3398, - "step": 3970 - }, - { - "epoch": 9.790640394088669, - "grad_norm": 0.9453125, - "learning_rate": 6.491081895778588e-05, - "loss": 0.3411, - "step": 3975 - }, - { - "epoch": 9.80295566502463, - "grad_norm": 1.09375, - "learning_rate": 6.464259655320973e-05, - "loss": 0.3405, - "step": 3980 - }, - { - "epoch": 9.81527093596059, - "grad_norm": 1.078125, - "learning_rate": 6.437466455104395e-05, - "loss": 0.343, - "step": 3985 - }, - { - "epoch": 9.827586206896552, - "grad_norm": 0.98828125, - "learning_rate": 6.410702515190543e-05, - "loss": 0.343, - "step": 3990 - }, - { - "epoch": 9.839901477832512, - "grad_norm": 0.98828125, - "learning_rate": 6.383968055400784e-05, - "loss": 0.3316, - "step": 3995 - }, - { - "epoch": 9.852216748768473, - "grad_norm": 0.9609375, - "learning_rate": 6.357263295314349e-05, - "loss": 0.3403, - "step": 4000 - }, - { - "epoch": 9.864532019704434, - "grad_norm": 0.9921875, - "learning_rate": 6.330588454266542e-05, - "loss": 0.3392, - "step": 4005 - }, - { - "epoch": 9.876847290640395, - "grad_norm": 1.0625, - "learning_rate": 6.303943751346922e-05, - "loss": 0.3406, - "step": 4010 - }, - { - "epoch": 9.889162561576354, - "grad_norm": 0.984375, - "learning_rate": 6.277329405397525e-05, - "loss": 0.3404, - "step": 4015 - }, - { - "epoch": 9.901477832512315, - "grad_norm": 0.96484375, - "learning_rate": 6.250745635011048e-05, - "loss": 0.3386, - "step": 4020 - }, - { - "epoch": 9.913793103448276, - "grad_norm": 0.97265625, - "learning_rate": 6.22419265852906e-05, - "loss": 0.3342, - "step": 4025 - }, - { - "epoch": 9.926108374384237, - "grad_norm": 0.99609375, - "learning_rate": 6.19767069404021e-05, - "loss": 0.3351, - "step": 4030 - }, - { - "epoch": 9.938423645320198, - "grad_norm": 0.9453125, - "learning_rate": 6.171179959378437e-05, - "loss": 0.3506, - "step": 4035 - }, - { - "epoch": 9.950738916256158, - "grad_norm": 1.0234375, - "learning_rate": 6.144720672121177e-05, - "loss": 0.3362, - "step": 4040 - }, - { - "epoch": 9.963054187192117, - "grad_norm": 1.1015625, - "learning_rate": 6.118293049587578e-05, - "loss": 0.3403, - "step": 4045 - }, - { - "epoch": 9.975369458128078, - "grad_norm": 1.078125, - "learning_rate": 6.0918973088367116e-05, - "loss": 0.3397, - "step": 4050 - }, - { - "epoch": 9.98768472906404, - "grad_norm": 1.0625, - "learning_rate": 6.0655336666658034e-05, - "loss": 0.3303, - "step": 4055 - }, - { - "epoch": 10.0, - "grad_norm": 1.0, - "learning_rate": 6.039202339608432e-05, - "loss": 0.3371, - "step": 4060 - }, - { - "epoch": 10.0, - "eval_loss": 4.856110572814941, - "eval_runtime": 2.0429, - "eval_samples_per_second": 4.895, - "eval_steps_per_second": 0.979, - "step": 4060 - }, - { - "epoch": 10.01231527093596, - "grad_norm": 0.93359375, - "learning_rate": 6.012903543932766e-05, - "loss": 0.2947, - "step": 4065 - }, - { - "epoch": 10.024630541871922, - "grad_norm": 0.98046875, - "learning_rate": 5.986637495639782e-05, - "loss": 0.301, - "step": 4070 - }, - { - "epoch": 10.036945812807883, - "grad_norm": 1.046875, - "learning_rate": 5.960404410461488e-05, - "loss": 0.2953, - "step": 4075 - }, - { - "epoch": 10.049261083743842, - "grad_norm": 0.890625, - "learning_rate": 5.934204503859158e-05, - "loss": 0.2988, - "step": 4080 - }, - { - "epoch": 10.061576354679802, 
- "grad_norm": 0.87109375, - "learning_rate": 5.90803799102156e-05, - "loss": 0.2913, - "step": 4085 - }, - { - "epoch": 10.073891625615763, - "grad_norm": 0.9609375, - "learning_rate": 5.881905086863181e-05, - "loss": 0.2937, - "step": 4090 - }, - { - "epoch": 10.086206896551724, - "grad_norm": 0.875, - "learning_rate": 5.8558060060224817e-05, - "loss": 0.2975, - "step": 4095 - }, - { - "epoch": 10.098522167487685, - "grad_norm": 0.8828125, - "learning_rate": 5.829740962860109e-05, - "loss": 0.299, - "step": 4100 - }, - { - "epoch": 10.110837438423646, - "grad_norm": 1.0234375, - "learning_rate": 5.803710171457145e-05, - "loss": 0.3064, - "step": 4105 - }, - { - "epoch": 10.123152709359607, - "grad_norm": 0.97265625, - "learning_rate": 5.777713845613364e-05, - "loss": 0.2937, - "step": 4110 - }, - { - "epoch": 10.135467980295566, - "grad_norm": 1.0234375, - "learning_rate": 5.751752198845444e-05, - "loss": 0.2982, - "step": 4115 - }, - { - "epoch": 10.147783251231527, - "grad_norm": 1.03125, - "learning_rate": 5.725825444385251e-05, - "loss": 0.3011, - "step": 4120 - }, - { - "epoch": 10.160098522167488, - "grad_norm": 0.9375, - "learning_rate": 5.699933795178052e-05, - "loss": 0.3046, - "step": 4125 - }, - { - "epoch": 10.172413793103448, - "grad_norm": 1.0078125, - "learning_rate": 5.6740774638807935e-05, - "loss": 0.2991, - "step": 4130 - }, - { - "epoch": 10.18472906403941, - "grad_norm": 0.953125, - "learning_rate": 5.6482566628603425e-05, - "loss": 0.2996, - "step": 4135 - }, - { - "epoch": 10.19704433497537, - "grad_norm": 1.140625, - "learning_rate": 5.622471604191746e-05, - "loss": 0.2981, - "step": 4140 - }, - { - "epoch": 10.20935960591133, - "grad_norm": 0.91796875, - "learning_rate": 5.59672249965647e-05, - "loss": 0.3016, - "step": 4145 - }, - { - "epoch": 10.22167487684729, - "grad_norm": 1.015625, - "learning_rate": 5.571009560740704e-05, - "loss": 0.3014, - "step": 4150 - }, - { - "epoch": 10.233990147783251, - "grad_norm": 0.94921875, - "learning_rate": 5.545332998633572e-05, - "loss": 0.296, - "step": 4155 - }, - { - "epoch": 10.246305418719212, - "grad_norm": 0.91796875, - "learning_rate": 5.5196930242254407e-05, - "loss": 0.2972, - "step": 4160 - }, - { - "epoch": 10.258620689655173, - "grad_norm": 1.0625, - "learning_rate": 5.494089848106156e-05, - "loss": 0.3073, - "step": 4165 - }, - { - "epoch": 10.270935960591133, - "grad_norm": 0.90625, - "learning_rate": 5.46852368056334e-05, - "loss": 0.3061, - "step": 4170 - }, - { - "epoch": 10.283251231527094, - "grad_norm": 0.93359375, - "learning_rate": 5.4429947315806376e-05, - "loss": 0.2959, - "step": 4175 - }, - { - "epoch": 10.295566502463053, - "grad_norm": 0.9765625, - "learning_rate": 5.417503210836015e-05, - "loss": 0.3049, - "step": 4180 - }, - { - "epoch": 10.307881773399014, - "grad_norm": 1.0, - "learning_rate": 5.392049327700026e-05, - "loss": 0.3025, - "step": 4185 - }, - { - "epoch": 10.320197044334975, - "grad_norm": 1.015625, - "learning_rate": 5.366633291234087e-05, - "loss": 0.304, - "step": 4190 - }, - { - "epoch": 10.332512315270936, - "grad_norm": 1.0546875, - "learning_rate": 5.341255310188775e-05, - "loss": 0.2992, - "step": 4195 - }, - { - "epoch": 10.344827586206897, - "grad_norm": 1.0625, - "learning_rate": 5.3159155930021e-05, - "loss": 0.3041, - "step": 4200 - }, - { - "epoch": 10.357142857142858, - "grad_norm": 0.90625, - "learning_rate": 5.290614347797802e-05, - "loss": 0.3002, - "step": 4205 - }, - { - "epoch": 10.369458128078819, - "grad_norm": 1.0234375, - "learning_rate": 
5.265351782383629e-05, - "loss": 0.3046, - "step": 4210 - }, - { - "epoch": 10.381773399014778, - "grad_norm": 0.9921875, - "learning_rate": 5.2401281042496494e-05, - "loss": 0.3028, - "step": 4215 - }, - { - "epoch": 10.394088669950738, - "grad_norm": 0.8828125, - "learning_rate": 5.214943520566531e-05, - "loss": 0.3023, - "step": 4220 - }, - { - "epoch": 10.4064039408867, - "grad_norm": 1.03125, - "learning_rate": 5.18979823818385e-05, - "loss": 0.3114, - "step": 4225 - }, - { - "epoch": 10.41871921182266, - "grad_norm": 1.1171875, - "learning_rate": 5.164692463628378e-05, - "loss": 0.3008, - "step": 4230 - }, - { - "epoch": 10.431034482758621, - "grad_norm": 0.93359375, - "learning_rate": 5.13962640310241e-05, - "loss": 0.3022, - "step": 4235 - }, - { - "epoch": 10.443349753694582, - "grad_norm": 1.0078125, - "learning_rate": 5.1146002624820386e-05, - "loss": 0.308, - "step": 4240 - }, - { - "epoch": 10.455665024630543, - "grad_norm": 0.8828125, - "learning_rate": 5.0896142473154987e-05, - "loss": 0.314, - "step": 4245 - }, - { - "epoch": 10.467980295566502, - "grad_norm": 1.1796875, - "learning_rate": 5.064668562821444e-05, - "loss": 0.3085, - "step": 4250 - }, - { - "epoch": 10.480295566502463, - "grad_norm": 1.0078125, - "learning_rate": 5.039763413887291e-05, - "loss": 0.3102, - "step": 4255 - }, - { - "epoch": 10.492610837438423, - "grad_norm": 0.9609375, - "learning_rate": 5.014899005067524e-05, - "loss": 0.3058, - "step": 4260 - }, - { - "epoch": 10.504926108374384, - "grad_norm": 1.03125, - "learning_rate": 4.990075540582003e-05, - "loss": 0.3001, - "step": 4265 - }, - { - "epoch": 10.517241379310345, - "grad_norm": 1.1171875, - "learning_rate": 4.9652932243143146e-05, - "loss": 0.3018, - "step": 4270 - }, - { - "epoch": 10.529556650246306, - "grad_norm": 1.0234375, - "learning_rate": 4.940552259810063e-05, - "loss": 0.312, - "step": 4275 - }, - { - "epoch": 10.541871921182267, - "grad_norm": 0.87890625, - "learning_rate": 4.915852850275233e-05, - "loss": 0.3086, - "step": 4280 - }, - { - "epoch": 10.554187192118226, - "grad_norm": 1.03125, - "learning_rate": 4.891195198574491e-05, - "loss": 0.3002, - "step": 4285 - }, - { - "epoch": 10.566502463054187, - "grad_norm": 1.0234375, - "learning_rate": 4.866579507229545e-05, - "loss": 0.307, - "step": 4290 - }, - { - "epoch": 10.578817733990148, - "grad_norm": 0.9609375, - "learning_rate": 4.8420059784174485e-05, - "loss": 0.3018, - "step": 4295 - }, - { - "epoch": 10.591133004926109, - "grad_norm": 0.92578125, - "learning_rate": 4.8174748139689905e-05, - "loss": 0.3035, - "step": 4300 - }, - { - "epoch": 10.60344827586207, - "grad_norm": 0.9765625, - "learning_rate": 4.792986215366976e-05, - "loss": 0.3052, - "step": 4305 - }, - { - "epoch": 10.61576354679803, - "grad_norm": 0.90625, - "learning_rate": 4.768540383744622e-05, - "loss": 0.3069, - "step": 4310 - }, - { - "epoch": 10.62807881773399, - "grad_norm": 0.95703125, - "learning_rate": 4.744137519883872e-05, - "loss": 0.3031, - "step": 4315 - }, - { - "epoch": 10.64039408866995, - "grad_norm": 0.9609375, - "learning_rate": 4.7197778242137755e-05, - "loss": 0.3124, - "step": 4320 - }, - { - "epoch": 10.652709359605911, - "grad_norm": 0.91015625, - "learning_rate": 4.6954614968088115e-05, - "loss": 0.3012, - "step": 4325 - }, - { - "epoch": 10.665024630541872, - "grad_norm": 0.95703125, - "learning_rate": 4.6711887373872754e-05, - "loss": 0.2978, - "step": 4330 - }, - { - "epoch": 10.677339901477833, - "grad_norm": 0.92578125, - "learning_rate": 4.646959745309609e-05, - "loss": 
0.3075, - "step": 4335 - }, - { - "epoch": 10.689655172413794, - "grad_norm": 1.0390625, - "learning_rate": 4.62277471957679e-05, - "loss": 0.3036, - "step": 4340 - }, - { - "epoch": 10.701970443349754, - "grad_norm": 0.890625, - "learning_rate": 4.598633858828681e-05, - "loss": 0.3051, - "step": 4345 - }, - { - "epoch": 10.714285714285714, - "grad_norm": 1.09375, - "learning_rate": 4.574537361342407e-05, - "loss": 0.3028, - "step": 4350 - }, - { - "epoch": 10.726600985221674, - "grad_norm": 0.9921875, - "learning_rate": 4.5504854250307085e-05, - "loss": 0.3109, - "step": 4355 - }, - { - "epoch": 10.738916256157635, - "grad_norm": 0.91796875, - "learning_rate": 4.526478247440349e-05, - "loss": 0.2968, - "step": 4360 - }, - { - "epoch": 10.751231527093596, - "grad_norm": 0.953125, - "learning_rate": 4.502516025750455e-05, - "loss": 0.3105, - "step": 4365 - }, - { - "epoch": 10.763546798029557, - "grad_norm": 1.0546875, - "learning_rate": 4.4785989567709316e-05, - "loss": 0.3039, - "step": 4370 - }, - { - "epoch": 10.775862068965518, - "grad_norm": 0.9609375, - "learning_rate": 4.454727236940814e-05, - "loss": 0.3047, - "step": 4375 - }, - { - "epoch": 10.788177339901479, - "grad_norm": 0.91015625, - "learning_rate": 4.430901062326681e-05, - "loss": 0.3062, - "step": 4380 - }, - { - "epoch": 10.800492610837438, - "grad_norm": 1.0078125, - "learning_rate": 4.407120628621032e-05, - "loss": 0.3066, - "step": 4385 - }, - { - "epoch": 10.812807881773399, - "grad_norm": 0.90234375, - "learning_rate": 4.3833861311406697e-05, - "loss": 0.3024, - "step": 4390 - }, - { - "epoch": 10.82512315270936, - "grad_norm": 0.9140625, - "learning_rate": 4.359697764825123e-05, - "loss": 0.3068, - "step": 4395 - }, - { - "epoch": 10.83743842364532, - "grad_norm": 0.9765625, - "learning_rate": 4.336055724235013e-05, - "loss": 0.3091, - "step": 4400 - }, - { - "epoch": 10.849753694581281, - "grad_norm": 0.88671875, - "learning_rate": 4.312460203550489e-05, - "loss": 0.3067, - "step": 4405 - }, - { - "epoch": 10.862068965517242, - "grad_norm": 1.1015625, - "learning_rate": 4.288911396569599e-05, - "loss": 0.3093, - "step": 4410 - }, - { - "epoch": 10.874384236453203, - "grad_norm": 0.96875, - "learning_rate": 4.265409496706733e-05, - "loss": 0.3003, - "step": 4415 - }, - { - "epoch": 10.886699507389162, - "grad_norm": 0.93359375, - "learning_rate": 4.241954696990995e-05, - "loss": 0.3017, - "step": 4420 - }, - { - "epoch": 10.899014778325123, - "grad_norm": 1.015625, - "learning_rate": 4.21854719006467e-05, - "loss": 0.3075, - "step": 4425 - }, - { - "epoch": 10.911330049261084, - "grad_norm": 0.90234375, - "learning_rate": 4.1951871681815804e-05, - "loss": 0.3098, - "step": 4430 - }, - { - "epoch": 10.923645320197044, - "grad_norm": 0.9296875, - "learning_rate": 4.1718748232055595e-05, - "loss": 0.3021, - "step": 4435 - }, - { - "epoch": 10.935960591133005, - "grad_norm": 0.984375, - "learning_rate": 4.148610346608837e-05, - "loss": 0.3097, - "step": 4440 - }, - { - "epoch": 10.948275862068966, - "grad_norm": 1.0078125, - "learning_rate": 4.1253939294705004e-05, - "loss": 0.3028, - "step": 4445 - }, - { - "epoch": 10.960591133004925, - "grad_norm": 0.9453125, - "learning_rate": 4.1022257624748914e-05, - "loss": 0.3118, - "step": 4450 - }, - { - "epoch": 10.972906403940886, - "grad_norm": 0.9609375, - "learning_rate": 4.079106035910073e-05, - "loss": 0.3038, - "step": 4455 - }, - { - "epoch": 10.985221674876847, - "grad_norm": 0.99609375, - "learning_rate": 4.056034939666236e-05, - "loss": 0.297, - "step": 4460 - }, 
- { - "epoch": 10.997536945812808, - "grad_norm": 0.98046875, - "learning_rate": 4.0330126632341625e-05, - "loss": 0.2993, - "step": 4465 - }, - { - "epoch": 11.0, - "eval_loss": 5.202045440673828, - "eval_runtime": 2.0439, - "eval_samples_per_second": 4.893, - "eval_steps_per_second": 0.979, - "step": 4466 - }, - { - "epoch": 11.009852216748769, - "grad_norm": 0.76953125, - "learning_rate": 4.010039395703664e-05, - "loss": 0.2871, - "step": 4470 - }, - { - "epoch": 11.02216748768473, - "grad_norm": 0.921875, - "learning_rate": 3.987115325762012e-05, - "loss": 0.2841, - "step": 4475 - }, - { - "epoch": 11.03448275862069, - "grad_norm": 0.921875, - "learning_rate": 3.964240641692416e-05, - "loss": 0.2807, - "step": 4480 - }, - { - "epoch": 11.04679802955665, - "grad_norm": 0.94921875, - "learning_rate": 3.94141553137245e-05, - "loss": 0.2826, - "step": 4485 - }, - { - "epoch": 11.05911330049261, - "grad_norm": 0.890625, - "learning_rate": 3.918640182272535e-05, - "loss": 0.2809, - "step": 4490 - }, - { - "epoch": 11.071428571428571, - "grad_norm": 0.91796875, - "learning_rate": 3.89591478145437e-05, - "loss": 0.2852, - "step": 4495 - }, - { - "epoch": 11.083743842364532, - "grad_norm": 0.89453125, - "learning_rate": 3.873239515569429e-05, - "loss": 0.2906, - "step": 4500 - }, - { - "epoch": 11.096059113300493, - "grad_norm": 0.890625, - "learning_rate": 3.8506145708573916e-05, - "loss": 0.2764, - "step": 4505 - }, - { - "epoch": 11.108374384236454, - "grad_norm": 1.0078125, - "learning_rate": 3.828040133144657e-05, - "loss": 0.2854, - "step": 4510 - }, - { - "epoch": 11.120689655172415, - "grad_norm": 0.9296875, - "learning_rate": 3.80551638784277e-05, - "loss": 0.2767, - "step": 4515 - }, - { - "epoch": 11.133004926108374, - "grad_norm": 1.2734375, - "learning_rate": 3.783043519946936e-05, - "loss": 0.2836, - "step": 4520 - }, - { - "epoch": 11.145320197044335, - "grad_norm": 0.94921875, - "learning_rate": 3.760621714034476e-05, - "loss": 0.2811, - "step": 4525 - }, - { - "epoch": 11.157635467980295, - "grad_norm": 0.86328125, - "learning_rate": 3.738251154263333e-05, - "loss": 0.2803, - "step": 4530 - }, - { - "epoch": 11.169950738916256, - "grad_norm": 0.8671875, - "learning_rate": 3.7159320243705355e-05, - "loss": 0.2818, - "step": 4535 - }, - { - "epoch": 11.182266009852217, - "grad_norm": 0.94140625, - "learning_rate": 3.6936645076707146e-05, - "loss": 0.2864, - "step": 4540 - }, - { - "epoch": 11.194581280788178, - "grad_norm": 0.94140625, - "learning_rate": 3.671448787054571e-05, - "loss": 0.2815, - "step": 4545 - }, - { - "epoch": 11.206896551724139, - "grad_norm": 0.8984375, - "learning_rate": 3.649285044987397e-05, - "loss": 0.2779, - "step": 4550 - }, - { - "epoch": 11.219211822660098, - "grad_norm": 0.91015625, - "learning_rate": 3.627173463507565e-05, - "loss": 0.2857, - "step": 4555 - }, - { - "epoch": 11.231527093596059, - "grad_norm": 0.99609375, - "learning_rate": 3.605114224225028e-05, - "loss": 0.2811, - "step": 4560 - }, - { - "epoch": 11.24384236453202, - "grad_norm": 0.9375, - "learning_rate": 3.5831075083198464e-05, - "loss": 0.2804, - "step": 4565 - }, - { - "epoch": 11.25615763546798, - "grad_norm": 0.9609375, - "learning_rate": 3.561153496540673e-05, - "loss": 0.2825, - "step": 4570 - }, - { - "epoch": 11.268472906403941, - "grad_norm": 0.9296875, - "learning_rate": 3.5392523692033006e-05, - "loss": 0.2878, - "step": 4575 - }, - { - "epoch": 11.280788177339902, - "grad_norm": 0.9375, - "learning_rate": 3.51740430618915e-05, - "loss": 0.2851, - "step": 4580 - }, - 
{ - "epoch": 11.293103448275861, - "grad_norm": 0.8828125, - "learning_rate": 3.495609486943814e-05, - "loss": 0.2805, - "step": 4585 - }, - { - "epoch": 11.305418719211822, - "grad_norm": 0.921875, - "learning_rate": 3.473868090475574e-05, - "loss": 0.2836, - "step": 4590 - }, - { - "epoch": 11.317733990147783, - "grad_norm": 0.96484375, - "learning_rate": 3.4521802953539376e-05, - "loss": 0.2952, - "step": 4595 - }, - { - "epoch": 11.330049261083744, - "grad_norm": 0.98828125, - "learning_rate": 3.4305462797081525e-05, - "loss": 0.288, - "step": 4600 - }, - { - "epoch": 11.342364532019705, - "grad_norm": 0.9609375, - "learning_rate": 3.408966221225773e-05, - "loss": 0.2773, - "step": 4605 - }, - { - "epoch": 11.354679802955665, - "grad_norm": 0.89453125, - "learning_rate": 3.387440297151172e-05, - "loss": 0.2867, - "step": 4610 - }, - { - "epoch": 11.366995073891626, - "grad_norm": 0.91796875, - "learning_rate": 3.36596868428411e-05, - "loss": 0.2809, - "step": 4615 - }, - { - "epoch": 11.379310344827585, - "grad_norm": 0.96875, - "learning_rate": 3.3445515589782574e-05, - "loss": 0.2847, - "step": 4620 - }, - { - "epoch": 11.391625615763546, - "grad_norm": 0.99609375, - "learning_rate": 3.3231890971397694e-05, - "loss": 0.2794, - "step": 4625 - }, - { - "epoch": 11.403940886699507, - "grad_norm": 0.953125, - "learning_rate": 3.301881474225831e-05, - "loss": 0.283, - "step": 4630 - }, - { - "epoch": 11.416256157635468, - "grad_norm": 0.91796875, - "learning_rate": 3.2806288652432174e-05, - "loss": 0.2808, - "step": 4635 - }, - { - "epoch": 11.428571428571429, - "grad_norm": 0.953125, - "learning_rate": 3.259431444746846e-05, - "loss": 0.2821, - "step": 4640 - }, - { - "epoch": 11.44088669950739, - "grad_norm": 0.91015625, - "learning_rate": 3.238289386838364e-05, - "loss": 0.2811, - "step": 4645 - }, - { - "epoch": 11.45320197044335, - "grad_norm": 0.9375, - "learning_rate": 3.217202865164697e-05, - "loss": 0.2841, - "step": 4650 - }, - { - "epoch": 11.46551724137931, - "grad_norm": 0.91015625, - "learning_rate": 3.1961720529166436e-05, - "loss": 0.2791, - "step": 4655 - }, - { - "epoch": 11.47783251231527, - "grad_norm": 1.0, - "learning_rate": 3.17519712282743e-05, - "loss": 0.2847, - "step": 4660 - }, - { - "epoch": 11.490147783251231, - "grad_norm": 0.8671875, - "learning_rate": 3.154278247171314e-05, - "loss": 0.2852, - "step": 4665 - }, - { - "epoch": 11.502463054187192, - "grad_norm": 1.0, - "learning_rate": 3.133415597762148e-05, - "loss": 0.2871, - "step": 4670 - }, - { - "epoch": 11.514778325123153, - "grad_norm": 0.8515625, - "learning_rate": 3.112609345951989e-05, - "loss": 0.2828, - "step": 4675 - }, - { - "epoch": 11.527093596059114, - "grad_norm": 0.90234375, - "learning_rate": 3.09185966262968e-05, - "loss": 0.2815, - "step": 4680 - }, - { - "epoch": 11.539408866995075, - "grad_norm": 0.9140625, - "learning_rate": 3.071166718219439e-05, - "loss": 0.2875, - "step": 4685 - }, - { - "epoch": 11.551724137931034, - "grad_norm": 0.96484375, - "learning_rate": 3.05053068267948e-05, - "loss": 0.2818, - "step": 4690 - }, - { - "epoch": 11.564039408866995, - "grad_norm": 0.93359375, - "learning_rate": 3.0299517255005937e-05, - "loss": 0.2872, - "step": 4695 - }, - { - "epoch": 11.576354679802956, - "grad_norm": 0.91796875, - "learning_rate": 3.0094300157047793e-05, - "loss": 0.2768, - "step": 4700 - }, - { - "epoch": 11.588669950738916, - "grad_norm": 0.9140625, - "learning_rate": 2.9889657218438283e-05, - "loss": 0.28, - "step": 4705 - }, - { - "epoch": 11.600985221674877, - 
"grad_norm": 0.96875, - "learning_rate": 2.9685590119979688e-05, - "loss": 0.2793, - "step": 4710 - }, - { - "epoch": 11.613300492610838, - "grad_norm": 0.8984375, - "learning_rate": 2.948210053774465e-05, - "loss": 0.284, - "step": 4715 - }, - { - "epoch": 11.625615763546797, - "grad_norm": 0.93359375, - "learning_rate": 2.9279190143062552e-05, - "loss": 0.2934, - "step": 4720 - }, - { - "epoch": 11.637931034482758, - "grad_norm": 0.921875, - "learning_rate": 2.9076860602505564e-05, - "loss": 0.2825, - "step": 4725 - }, - { - "epoch": 11.650246305418719, - "grad_norm": 0.94921875, - "learning_rate": 2.8875113577875258e-05, - "loss": 0.2836, - "step": 4730 - }, - { - "epoch": 11.66256157635468, - "grad_norm": 0.9765625, - "learning_rate": 2.867395072618868e-05, - "loss": 0.29, - "step": 4735 - }, - { - "epoch": 11.67487684729064, - "grad_norm": 0.9296875, - "learning_rate": 2.8473373699664997e-05, - "loss": 0.2824, - "step": 4740 - }, - { - "epoch": 11.687192118226601, - "grad_norm": 0.9765625, - "learning_rate": 2.8273384145711624e-05, - "loss": 0.2794, - "step": 4745 - }, - { - "epoch": 11.699507389162562, - "grad_norm": 0.94921875, - "learning_rate": 2.8073983706911024e-05, - "loss": 0.285, - "step": 4750 - }, - { - "epoch": 11.711822660098521, - "grad_norm": 1.0546875, - "learning_rate": 2.7875174021007e-05, - "loss": 0.2843, - "step": 4755 - }, - { - "epoch": 11.724137931034482, - "grad_norm": 0.94921875, - "learning_rate": 2.7676956720891235e-05, - "loss": 0.2873, - "step": 4760 - }, - { - "epoch": 11.736453201970443, - "grad_norm": 0.921875, - "learning_rate": 2.747933343459007e-05, - "loss": 0.2841, - "step": 4765 - }, - { - "epoch": 11.748768472906404, - "grad_norm": 0.875, - "learning_rate": 2.728230578525086e-05, - "loss": 0.2801, - "step": 4770 - }, - { - "epoch": 11.761083743842365, - "grad_norm": 0.9375, - "learning_rate": 2.7085875391128955e-05, - "loss": 0.2792, - "step": 4775 - }, - { - "epoch": 11.773399014778326, - "grad_norm": 1.1015625, - "learning_rate": 2.6890043865574078e-05, - "loss": 0.2856, - "step": 4780 - }, - { - "epoch": 11.785714285714286, - "grad_norm": 1.0625, - "learning_rate": 2.669481281701739e-05, - "loss": 0.2899, - "step": 4785 - }, - { - "epoch": 11.798029556650246, - "grad_norm": 0.9453125, - "learning_rate": 2.6500183848957983e-05, - "loss": 0.2834, - "step": 4790 - }, - { - "epoch": 11.810344827586206, - "grad_norm": 0.8828125, - "learning_rate": 2.6306158559950023e-05, - "loss": 0.2855, - "step": 4795 - }, - { - "epoch": 11.822660098522167, - "grad_norm": 0.875, - "learning_rate": 2.6112738543589312e-05, - "loss": 0.2883, - "step": 4800 - }, - { - "epoch": 11.834975369458128, - "grad_norm": 0.93359375, - "learning_rate": 2.591992538850042e-05, - "loss": 0.2853, - "step": 4805 - }, - { - "epoch": 11.847290640394089, - "grad_norm": 0.890625, - "learning_rate": 2.572772067832351e-05, - "loss": 0.2817, - "step": 4810 - }, - { - "epoch": 11.85960591133005, - "grad_norm": 1.0234375, - "learning_rate": 2.553612599170143e-05, - "loss": 0.2847, - "step": 4815 - }, - { - "epoch": 11.87192118226601, - "grad_norm": 0.91796875, - "learning_rate": 2.5345142902266628e-05, - "loss": 0.2841, - "step": 4820 - }, - { - "epoch": 11.88423645320197, - "grad_norm": 0.88671875, - "learning_rate": 2.5154772978628405e-05, - "loss": 0.2884, - "step": 4825 - }, - { - "epoch": 11.89655172413793, - "grad_norm": 0.97265625, - "learning_rate": 2.496501778435977e-05, - "loss": 0.2876, - "step": 4830 - }, - { - "epoch": 11.908866995073891, - "grad_norm": 0.9609375, - 
"learning_rate": 2.477587887798488e-05, - "loss": 0.2867, - "step": 4835 - }, - { - "epoch": 11.921182266009852, - "grad_norm": 0.84375, - "learning_rate": 2.4587357812966095e-05, - "loss": 0.2807, - "step": 4840 - }, - { - "epoch": 11.933497536945813, - "grad_norm": 1.0390625, - "learning_rate": 2.4399456137691147e-05, - "loss": 0.2868, - "step": 4845 - }, - { - "epoch": 11.945812807881774, - "grad_norm": 0.984375, - "learning_rate": 2.42121753954606e-05, - "loss": 0.2953, - "step": 4850 - }, - { - "epoch": 11.958128078817733, - "grad_norm": 0.93359375, - "learning_rate": 2.4025517124475017e-05, - "loss": 0.2788, - "step": 4855 - }, - { - "epoch": 11.970443349753694, - "grad_norm": 0.91015625, - "learning_rate": 2.3839482857822458e-05, - "loss": 0.2893, - "step": 4860 - }, - { - "epoch": 11.982758620689655, - "grad_norm": 1.09375, - "learning_rate": 2.3654074123465752e-05, - "loss": 0.2919, - "step": 4865 - }, - { - "epoch": 11.995073891625616, - "grad_norm": 0.96875, - "learning_rate": 2.3469292444230096e-05, - "loss": 0.29, - "step": 4870 - }, - { - "epoch": 12.0, - "eval_loss": 5.407031536102295, - "eval_runtime": 2.0437, - "eval_samples_per_second": 4.893, - "eval_steps_per_second": 0.979, - "step": 4872 - }, - { - "epoch": 12.007389162561577, - "grad_norm": 0.8515625, - "learning_rate": 2.328513933779034e-05, - "loss": 0.2855, - "step": 4875 - }, - { - "epoch": 12.019704433497537, - "grad_norm": 0.89453125, - "learning_rate": 2.310161631665886e-05, - "loss": 0.2683, - "step": 4880 - }, - { - "epoch": 12.032019704433498, - "grad_norm": 0.8359375, - "learning_rate": 2.2918724888172714e-05, - "loss": 0.2686, - "step": 4885 - }, - { - "epoch": 12.044334975369457, - "grad_norm": 0.91796875, - "learning_rate": 2.2736466554481617e-05, - "loss": 0.278, - "step": 4890 - }, - { - "epoch": 12.056650246305418, - "grad_norm": 0.94921875, - "learning_rate": 2.255484281253537e-05, - "loss": 0.2731, - "step": 4895 - }, - { - "epoch": 12.068965517241379, - "grad_norm": 0.953125, - "learning_rate": 2.2373855154071732e-05, - "loss": 0.2786, - "step": 4900 - }, - { - "epoch": 12.08128078817734, - "grad_norm": 0.89453125, - "learning_rate": 2.2193505065604014e-05, - "loss": 0.2711, - "step": 4905 - }, - { - "epoch": 12.0935960591133, - "grad_norm": 0.8828125, - "learning_rate": 2.201379402840903e-05, - "loss": 0.2823, - "step": 4910 - }, - { - "epoch": 12.105911330049262, - "grad_norm": 0.8828125, - "learning_rate": 2.183472351851472e-05, - "loss": 0.2705, - "step": 4915 - }, - { - "epoch": 12.118226600985222, - "grad_norm": 0.8515625, - "learning_rate": 2.1656295006688353e-05, - "loss": 0.2721, - "step": 4920 - }, - { - "epoch": 12.130541871921181, - "grad_norm": 0.87109375, - "learning_rate": 2.1478509958424064e-05, - "loss": 0.2736, - "step": 4925 - }, - { - "epoch": 12.142857142857142, - "grad_norm": 0.9140625, - "learning_rate": 2.1301369833931117e-05, - "loss": 0.276, - "step": 4930 - }, - { - "epoch": 12.155172413793103, - "grad_norm": 0.9140625, - "learning_rate": 2.1124876088121692e-05, - "loss": 0.2752, - "step": 4935 - }, - { - "epoch": 12.167487684729064, - "grad_norm": 0.91015625, - "learning_rate": 2.0949030170599182e-05, - "loss": 0.2696, - "step": 4940 - }, - { - "epoch": 12.179802955665025, - "grad_norm": 0.8984375, - "learning_rate": 2.0773833525645992e-05, - "loss": 0.2715, - "step": 4945 - }, - { - "epoch": 12.192118226600986, - "grad_norm": 0.8671875, - "learning_rate": 2.0599287592211968e-05, - "loss": 0.2779, - "step": 4950 - }, - { - "epoch": 12.204433497536947, - "grad_norm": 
0.9609375, - "learning_rate": 2.0425393803902314e-05, - "loss": 0.2681, - "step": 4955 - }, - { - "epoch": 12.216748768472906, - "grad_norm": 0.97265625, - "learning_rate": 2.0252153588966037e-05, - "loss": 0.2812, - "step": 4960 - }, - { - "epoch": 12.229064039408867, - "grad_norm": 1.03125, - "learning_rate": 2.0079568370284128e-05, - "loss": 0.2777, - "step": 4965 - }, - { - "epoch": 12.241379310344827, - "grad_norm": 0.9453125, - "learning_rate": 1.990763956535777e-05, - "loss": 0.2717, - "step": 4970 - }, - { - "epoch": 12.253694581280788, - "grad_norm": 0.93359375, - "learning_rate": 1.9736368586296916e-05, - "loss": 0.2782, - "step": 4975 - }, - { - "epoch": 12.266009852216749, - "grad_norm": 0.8984375, - "learning_rate": 1.956575683980846e-05, - "loss": 0.2732, - "step": 4980 - }, - { - "epoch": 12.27832512315271, - "grad_norm": 0.875, - "learning_rate": 1.9395805727184912e-05, - "loss": 0.2661, - "step": 4985 - }, - { - "epoch": 12.290640394088669, - "grad_norm": 0.96484375, - "learning_rate": 1.9226516644292647e-05, - "loss": 0.2757, - "step": 4990 - }, - { - "epoch": 12.30295566502463, - "grad_norm": 0.8828125, - "learning_rate": 1.9057890981560677e-05, - "loss": 0.2753, - "step": 4995 - }, - { - "epoch": 12.31527093596059, - "grad_norm": 0.9140625, - "learning_rate": 1.888993012396899e-05, - "loss": 0.2701, - "step": 5000 - }, - { - "epoch": 12.327586206896552, - "grad_norm": 0.921875, - "learning_rate": 1.8722635451037497e-05, - "loss": 0.2794, - "step": 5005 - }, - { - "epoch": 12.339901477832512, - "grad_norm": 0.9765625, - "learning_rate": 1.85560083368143e-05, - "loss": 0.2824, - "step": 5010 - }, - { - "epoch": 12.352216748768473, - "grad_norm": 0.98046875, - "learning_rate": 1.8390050149864745e-05, - "loss": 0.275, - "step": 5015 - }, - { - "epoch": 12.364532019704434, - "grad_norm": 0.95703125, - "learning_rate": 1.8224762253259976e-05, - "loss": 0.2787, - "step": 5020 - }, - { - "epoch": 12.376847290640393, - "grad_norm": 0.95703125, - "learning_rate": 1.806014600456588e-05, - "loss": 0.2712, - "step": 5025 - }, - { - "epoch": 12.389162561576354, - "grad_norm": 0.88671875, - "learning_rate": 1.7896202755831804e-05, - "loss": 0.2765, - "step": 5030 - }, - { - "epoch": 12.401477832512315, - "grad_norm": 0.921875, - "learning_rate": 1.773293385357959e-05, - "loss": 0.275, - "step": 5035 - }, - { - "epoch": 12.413793103448276, - "grad_norm": 0.8828125, - "learning_rate": 1.757034063879235e-05, - "loss": 0.2676, - "step": 5040 - }, - { - "epoch": 12.426108374384237, - "grad_norm": 0.9375, - "learning_rate": 1.7408424446903626e-05, - "loss": 0.2728, - "step": 5045 - }, - { - "epoch": 12.438423645320198, - "grad_norm": 0.9453125, - "learning_rate": 1.7247186607786338e-05, - "loss": 0.2792, - "step": 5050 - }, - { - "epoch": 12.450738916256158, - "grad_norm": 0.88671875, - "learning_rate": 1.708662844574178e-05, - "loss": 0.2757, - "step": 5055 - }, - { - "epoch": 12.463054187192117, - "grad_norm": 0.9453125, - "learning_rate": 1.692675127948894e-05, - "loss": 0.276, - "step": 5060 - }, - { - "epoch": 12.475369458128078, - "grad_norm": 0.9140625, - "learning_rate": 1.676755642215343e-05, - "loss": 0.2757, - "step": 5065 - }, - { - "epoch": 12.48768472906404, - "grad_norm": 0.91015625, - "learning_rate": 1.6609045181256976e-05, - "loss": 0.2722, - "step": 5070 - }, - { - "epoch": 12.5, - "grad_norm": 0.89453125, - "learning_rate": 1.6451218858706374e-05, - "loss": 0.276, - "step": 5075 - }, - { - "epoch": 12.51231527093596, - "grad_norm": 0.9609375, - "learning_rate": 
1.629407875078305e-05, - "loss": 0.2782, - "step": 5080 - }, - { - "epoch": 12.524630541871922, - "grad_norm": 0.8828125, - "learning_rate": 1.61376261481323e-05, - "loss": 0.2738, - "step": 5085 - }, - { - "epoch": 12.536945812807883, - "grad_norm": 0.88671875, - "learning_rate": 1.5981862335752716e-05, - "loss": 0.2779, - "step": 5090 - }, - { - "epoch": 12.549261083743842, - "grad_norm": 0.8671875, - "learning_rate": 1.5826788592985553e-05, - "loss": 0.2686, - "step": 5095 - }, - { - "epoch": 12.561576354679802, - "grad_norm": 0.9296875, - "learning_rate": 1.5672406193504384e-05, - "loss": 0.2704, - "step": 5100 - }, - { - "epoch": 12.573891625615763, - "grad_norm": 0.84375, - "learning_rate": 1.5518716405304447e-05, - "loss": 0.2715, - "step": 5105 - }, - { - "epoch": 12.586206896551724, - "grad_norm": 0.90625, - "learning_rate": 1.5365720490692426e-05, - "loss": 0.2803, - "step": 5110 - }, - { - "epoch": 12.598522167487685, - "grad_norm": 0.93359375, - "learning_rate": 1.5213419706275878e-05, - "loss": 0.2766, - "step": 5115 - }, - { - "epoch": 12.610837438423646, - "grad_norm": 0.91796875, - "learning_rate": 1.5061815302953141e-05, - "loss": 0.2725, - "step": 5120 - }, - { - "epoch": 12.623152709359605, - "grad_norm": 0.97265625, - "learning_rate": 1.4910908525902811e-05, - "loss": 0.2694, - "step": 5125 - }, - { - "epoch": 12.635467980295566, - "grad_norm": 1.109375, - "learning_rate": 1.4760700614573731e-05, - "loss": 0.2727, - "step": 5130 - }, - { - "epoch": 12.647783251231527, - "grad_norm": 0.89453125, - "learning_rate": 1.461119280267471e-05, - "loss": 0.2772, - "step": 5135 - }, - { - "epoch": 12.660098522167488, - "grad_norm": 0.92578125, - "learning_rate": 1.4462386318164356e-05, - "loss": 0.2735, - "step": 5140 - }, - { - "epoch": 12.672413793103448, - "grad_norm": 0.95703125, - "learning_rate": 1.4314282383241096e-05, - "loss": 0.2735, - "step": 5145 - }, - { - "epoch": 12.68472906403941, - "grad_norm": 0.890625, - "learning_rate": 1.4166882214332999e-05, - "loss": 0.2767, - "step": 5150 - }, - { - "epoch": 12.69704433497537, - "grad_norm": 1.046875, - "learning_rate": 1.4020187022087971e-05, - "loss": 0.2824, - "step": 5155 - }, - { - "epoch": 12.709359605911331, - "grad_norm": 1.4375, - "learning_rate": 1.3874198011363582e-05, - "loss": 0.2778, - "step": 5160 - }, - { - "epoch": 12.72167487684729, - "grad_norm": 1.015625, - "learning_rate": 1.3728916381217394e-05, - "loss": 0.2768, - "step": 5165 - }, - { - "epoch": 12.733990147783251, - "grad_norm": 0.9296875, - "learning_rate": 1.3584343324896964e-05, - "loss": 0.2729, - "step": 5170 - }, - { - "epoch": 12.746305418719212, - "grad_norm": 0.90234375, - "learning_rate": 1.3440480029830127e-05, - "loss": 0.2775, - "step": 5175 - }, - { - "epoch": 12.758620689655173, - "grad_norm": 0.99609375, - "learning_rate": 1.3297327677615124e-05, - "loss": 0.2774, - "step": 5180 - }, - { - "epoch": 12.770935960591133, - "grad_norm": 0.875, - "learning_rate": 1.3154887444011087e-05, - "loss": 0.2798, - "step": 5185 - }, - { - "epoch": 12.783251231527094, - "grad_norm": 0.9296875, - "learning_rate": 1.301316049892818e-05, - "loss": 0.2758, - "step": 5190 - }, - { - "epoch": 12.795566502463053, - "grad_norm": 0.87890625, - "learning_rate": 1.2872148006418161e-05, - "loss": 0.2783, - "step": 5195 - }, - { - "epoch": 12.807881773399014, - "grad_norm": 0.953125, - "learning_rate": 1.2731851124664685e-05, - "loss": 0.2666, - "step": 5200 - }, - { - "epoch": 12.820197044334975, - "grad_norm": 0.92578125, - "learning_rate": 
1.2592271005973888e-05, - "loss": 0.2807, - "step": 5205 - }, - { - "epoch": 12.832512315270936, - "grad_norm": 0.94140625, - "learning_rate": 1.2453408796764876e-05, - "loss": 0.2776, - "step": 5210 - }, - { - "epoch": 12.844827586206897, - "grad_norm": 0.9296875, - "learning_rate": 1.2315265637560357e-05, - "loss": 0.2727, - "step": 5215 - }, - { - "epoch": 12.857142857142858, - "grad_norm": 0.8828125, - "learning_rate": 1.2177842662977135e-05, - "loss": 0.2788, - "step": 5220 - }, - { - "epoch": 12.869458128078819, - "grad_norm": 0.90234375, - "learning_rate": 1.2041141001717027e-05, - "loss": 0.2732, - "step": 5225 - }, - { - "epoch": 12.881773399014778, - "grad_norm": 0.9453125, - "learning_rate": 1.1905161776557327e-05, - "loss": 0.2767, - "step": 5230 - }, - { - "epoch": 12.894088669950738, - "grad_norm": 0.87890625, - "learning_rate": 1.1769906104341832e-05, - "loss": 0.2717, - "step": 5235 - }, - { - "epoch": 12.9064039408867, - "grad_norm": 0.8984375, - "learning_rate": 1.1635375095971435e-05, - "loss": 0.2775, - "step": 5240 - }, - { - "epoch": 12.91871921182266, - "grad_norm": 0.890625, - "learning_rate": 1.1501569856395223e-05, - "loss": 0.2795, - "step": 5245 - }, - { - "epoch": 12.931034482758621, - "grad_norm": 0.94921875, - "learning_rate": 1.136849148460125e-05, - "loss": 0.281, - "step": 5250 - }, - { - "epoch": 12.943349753694582, - "grad_norm": 0.95703125, - "learning_rate": 1.1236141073607542e-05, - "loss": 0.275, - "step": 5255 - }, - { - "epoch": 12.955665024630543, - "grad_norm": 0.90234375, - "learning_rate": 1.1104519710453176e-05, - "loss": 0.2769, - "step": 5260 - }, - { - "epoch": 12.967980295566502, - "grad_norm": 0.93359375, - "learning_rate": 1.0973628476189257e-05, - "loss": 0.2804, - "step": 5265 - }, - { - "epoch": 12.980295566502463, - "grad_norm": 0.91015625, - "learning_rate": 1.0843468445870142e-05, - "loss": 0.2693, - "step": 5270 - }, - { - "epoch": 12.992610837438423, - "grad_norm": 0.8984375, - "learning_rate": 1.0714040688544535e-05, - "loss": 0.2802, - "step": 5275 - }, - { - "epoch": 13.0, - "eval_loss": 5.508431434631348, - "eval_runtime": 2.0431, - "eval_samples_per_second": 4.895, - "eval_steps_per_second": 0.979, - "step": 5278 - }, - { - "epoch": 13.004926108374384, - "grad_norm": 0.8359375, - "learning_rate": 1.0585346267246743e-05, - "loss": 0.2778, - "step": 5280 - }, - { - "epoch": 13.017241379310345, - "grad_norm": 0.89453125, - "learning_rate": 1.045738623898791e-05, - "loss": 0.2741, - "step": 5285 - }, - { - "epoch": 13.029556650246306, - "grad_norm": 0.828125, - "learning_rate": 1.0330161654747395e-05, - "loss": 0.2757, - "step": 5290 - }, - { - "epoch": 13.041871921182265, - "grad_norm": 0.84375, - "learning_rate": 1.0203673559464089e-05, - "loss": 0.2747, - "step": 5295 - }, - { - "epoch": 13.054187192118226, - "grad_norm": 0.91015625, - "learning_rate": 1.0077922992027867e-05, - "loss": 0.2692, - "step": 5300 - }, - { - "epoch": 13.066502463054187, - "grad_norm": 0.921875, - "learning_rate": 9.952910985270969e-06, - "loss": 0.2673, - "step": 5305 - }, - { - "epoch": 13.078817733990148, - "grad_norm": 0.875, - "learning_rate": 9.82863856595968e-06, - "loss": 0.2692, - "step": 5310 - }, - { - "epoch": 13.091133004926109, - "grad_norm": 0.8828125, - "learning_rate": 9.70510675478572e-06, - "loss": 0.2695, - "step": 5315 - }, - { - "epoch": 13.10344827586207, - "grad_norm": 0.8828125, - "learning_rate": 9.582316566357996e-06, - "loss": 0.2742, - "step": 5320 - }, - { - "epoch": 13.11576354679803, - "grad_norm": 0.90234375, - 
"learning_rate": 9.460269009194167e-06, - "loss": 0.2727, - "step": 5325 - }, - { - "epoch": 13.12807881773399, - "grad_norm": 0.953125, - "learning_rate": 9.338965085712459e-06, - "loss": 0.2656, - "step": 5330 - }, - { - "epoch": 13.14039408866995, - "grad_norm": 0.94140625, - "learning_rate": 9.218405792223361e-06, - "loss": 0.2655, - "step": 5335 - }, - { - "epoch": 13.152709359605911, - "grad_norm": 0.88671875, - "learning_rate": 9.098592118921435e-06, - "loss": 0.2744, - "step": 5340 - }, - { - "epoch": 13.165024630541872, - "grad_norm": 0.9375, - "learning_rate": 8.979525049877258e-06, - "loss": 0.2752, - "step": 5345 - }, - { - "epoch": 13.177339901477833, - "grad_norm": 0.90234375, - "learning_rate": 8.861205563029228e-06, - "loss": 0.273, - "step": 5350 - }, - { - "epoch": 13.189655172413794, - "grad_norm": 0.94921875, - "learning_rate": 8.74363463017569e-06, - "loss": 0.2742, - "step": 5355 - }, - { - "epoch": 13.201970443349754, - "grad_norm": 0.93359375, - "learning_rate": 8.626813216966745e-06, - "loss": 0.2751, - "step": 5360 - }, - { - "epoch": 13.214285714285714, - "grad_norm": 0.89453125, - "learning_rate": 8.510742282896544e-06, - "loss": 0.2712, - "step": 5365 - }, - { - "epoch": 13.226600985221674, - "grad_norm": 0.84375, - "learning_rate": 8.395422781295192e-06, - "loss": 0.2797, - "step": 5370 - }, - { - "epoch": 13.238916256157635, - "grad_norm": 0.83984375, - "learning_rate": 8.28085565932113e-06, - "loss": 0.2744, - "step": 5375 - }, - { - "epoch": 13.251231527093596, - "grad_norm": 0.87109375, - "learning_rate": 8.167041857953162e-06, - "loss": 0.2678, - "step": 5380 - }, - { - "epoch": 13.263546798029557, - "grad_norm": 0.84765625, - "learning_rate": 8.053982311982867e-06, - "loss": 0.2727, - "step": 5385 - }, - { - "epoch": 13.275862068965518, - "grad_norm": 0.96484375, - "learning_rate": 7.94167795000682e-06, - "loss": 0.2699, - "step": 5390 - }, - { - "epoch": 13.288177339901479, - "grad_norm": 0.875, - "learning_rate": 7.830129694419065e-06, - "loss": 0.2722, - "step": 5395 - }, - { - "epoch": 13.300492610837438, - "grad_norm": 0.94140625, - "learning_rate": 7.719338461403435e-06, - "loss": 0.2666, - "step": 5400 - }, - { - "epoch": 13.312807881773399, - "grad_norm": 0.8671875, - "learning_rate": 7.609305160926128e-06, - "loss": 0.2712, - "step": 5405 - }, - { - "epoch": 13.32512315270936, - "grad_norm": 0.8671875, - "learning_rate": 7.500030696728133e-06, - "loss": 0.2724, - "step": 5410 - }, - { - "epoch": 13.33743842364532, - "grad_norm": 0.88671875, - "learning_rate": 7.3915159663179075e-06, - "loss": 0.2766, - "step": 5415 - }, - { - "epoch": 13.349753694581281, - "grad_norm": 0.87109375, - "learning_rate": 7.283761860963933e-06, - "loss": 0.2712, - "step": 5420 - }, - { - "epoch": 13.362068965517242, - "grad_norm": 0.984375, - "learning_rate": 7.176769265687389e-06, - "loss": 0.2697, - "step": 5425 - }, - { - "epoch": 13.374384236453203, - "grad_norm": 0.92578125, - "learning_rate": 7.070539059254977e-06, - "loss": 0.273, - "step": 5430 - }, - { - "epoch": 13.386699507389162, - "grad_norm": 0.87109375, - "learning_rate": 6.965072114171578e-06, - "loss": 0.2658, - "step": 5435 - }, - { - "epoch": 13.399014778325123, - "grad_norm": 0.93359375, - "learning_rate": 6.860369296673197e-06, - "loss": 0.2687, - "step": 5440 - }, - { - "epoch": 13.411330049261084, - "grad_norm": 0.92578125, - "learning_rate": 6.756431466719737e-06, - "loss": 0.2734, - "step": 5445 - }, - { - "epoch": 13.423645320197044, - "grad_norm": 0.9140625, - "learning_rate": 
6.653259477988083e-06, - "loss": 0.2736, - "step": 5450 - }, - { - "epoch": 13.435960591133005, - "grad_norm": 0.8671875, - "learning_rate": 6.5508541778649066e-06, - "loss": 0.2752, - "step": 5455 - }, - { - "epoch": 13.448275862068966, - "grad_norm": 0.9140625, - "learning_rate": 6.4492164074399065e-06, - "loss": 0.2758, - "step": 5460 - }, - { - "epoch": 13.460591133004925, - "grad_norm": 0.83984375, - "learning_rate": 6.348347001498711e-06, - "loss": 0.2713, - "step": 5465 - }, - { - "epoch": 13.472906403940886, - "grad_norm": 0.9375, - "learning_rate": 6.248246788516165e-06, - "loss": 0.2745, - "step": 5470 - }, - { - "epoch": 13.485221674876847, - "grad_norm": 0.8671875, - "learning_rate": 6.148916590649434e-06, - "loss": 0.2728, - "step": 5475 - }, - { - "epoch": 13.497536945812808, - "grad_norm": 0.87109375, - "learning_rate": 6.050357223731318e-06, - "loss": 0.2722, - "step": 5480 - }, - { - "epoch": 13.509852216748769, - "grad_norm": 0.875, - "learning_rate": 5.9525694972634715e-06, - "loss": 0.2679, - "step": 5485 - }, - { - "epoch": 13.52216748768473, - "grad_norm": 0.859375, - "learning_rate": 5.8555542144098865e-06, - "loss": 0.2699, - "step": 5490 - }, - { - "epoch": 13.53448275862069, - "grad_norm": 0.859375, - "learning_rate": 5.7593121719900835e-06, - "loss": 0.2798, - "step": 5495 - }, - { - "epoch": 13.54679802955665, - "grad_norm": 0.86328125, - "learning_rate": 5.663844160472865e-06, - "loss": 0.2699, - "step": 5500 - }, - { - "epoch": 13.55911330049261, - "grad_norm": 0.921875, - "learning_rate": 5.569150963969494e-06, - "loss": 0.2744, - "step": 5505 - }, - { - "epoch": 13.571428571428571, - "grad_norm": 0.83984375, - "learning_rate": 5.475233360227516e-06, - "loss": 0.2701, - "step": 5510 - }, - { - "epoch": 13.583743842364532, - "grad_norm": 0.9375, - "learning_rate": 5.382092120624216e-06, - "loss": 0.277, - "step": 5515 - }, - { - "epoch": 13.596059113300493, - "grad_norm": 0.85546875, - "learning_rate": 5.289728010160366e-06, - "loss": 0.2759, - "step": 5520 - }, - { - "epoch": 13.608374384236454, - "grad_norm": 0.875, - "learning_rate": 5.19814178745388e-06, - "loss": 0.2708, - "step": 5525 - }, - { - "epoch": 13.620689655172415, - "grad_norm": 0.89453125, - "learning_rate": 5.10733420473366e-06, - "loss": 0.2766, - "step": 5530 - }, - { - "epoch": 13.633004926108374, - "grad_norm": 0.86328125, - "learning_rate": 5.0173060078333225e-06, - "loss": 0.274, - "step": 5535 - }, - { - "epoch": 13.645320197044335, - "grad_norm": 0.87890625, - "learning_rate": 4.928057936185138e-06, - "loss": 0.2741, - "step": 5540 - }, - { - "epoch": 13.657635467980295, - "grad_norm": 0.85546875, - "learning_rate": 4.839590722813991e-06, - "loss": 0.277, - "step": 5545 - }, - { - "epoch": 13.669950738916256, - "grad_norm": 0.91796875, - "learning_rate": 4.7519050943312325e-06, - "loss": 0.2695, - "step": 5550 - }, - { - "epoch": 13.682266009852217, - "grad_norm": 0.9453125, - "learning_rate": 4.665001770928845e-06, - "loss": 0.2699, - "step": 5555 - }, - { - "epoch": 13.694581280788178, - "grad_norm": 0.91796875, - "learning_rate": 4.578881466373441e-06, - "loss": 0.2676, - "step": 5560 - }, - { - "epoch": 13.706896551724139, - "grad_norm": 0.87890625, - "learning_rate": 4.493544888000467e-06, - "loss": 0.2756, - "step": 5565 - }, - { - "epoch": 13.719211822660098, - "grad_norm": 0.88671875, - "learning_rate": 4.408992736708317e-06, - "loss": 0.2712, - "step": 5570 - }, - { - "epoch": 13.731527093596059, - "grad_norm": 0.90625, - "learning_rate": 4.3252257069526516e-06, - "loss": 
0.2772, - "step": 5575 - }, - { - "epoch": 13.74384236453202, - "grad_norm": 0.8671875, - "learning_rate": 4.242244486740643e-06, - "loss": 0.2729, - "step": 5580 - }, - { - "epoch": 13.75615763546798, - "grad_norm": 0.875, - "learning_rate": 4.160049757625362e-06, - "loss": 0.2736, - "step": 5585 - }, - { - "epoch": 13.768472906403941, - "grad_norm": 0.86328125, - "learning_rate": 4.078642194700111e-06, - "loss": 0.2728, - "step": 5590 - }, - { - "epoch": 13.780788177339902, - "grad_norm": 0.8984375, - "learning_rate": 3.99802246659301e-06, - "loss": 0.2699, - "step": 5595 - }, - { - "epoch": 13.793103448275861, - "grad_norm": 1.1171875, - "learning_rate": 3.918191235461333e-06, - "loss": 0.269, - "step": 5600 - }, - { - "epoch": 13.805418719211822, - "grad_norm": 0.9765625, - "learning_rate": 3.839149156986233e-06, - "loss": 0.2687, - "step": 5605 - }, - { - "epoch": 13.817733990147783, - "grad_norm": 0.86328125, - "learning_rate": 3.760896880367215e-06, - "loss": 0.274, - "step": 5610 - }, - { - "epoch": 13.830049261083744, - "grad_norm": 0.87109375, - "learning_rate": 3.683435048316941e-06, - "loss": 0.2695, - "step": 5615 - }, - { - "epoch": 13.842364532019705, - "grad_norm": 0.8515625, - "learning_rate": 3.6067642970558312e-06, - "loss": 0.2738, - "step": 5620 - }, - { - "epoch": 13.854679802955665, - "grad_norm": 0.8671875, - "learning_rate": 3.530885256306915e-06, - "loss": 0.2728, - "step": 5625 - }, - { - "epoch": 13.866995073891626, - "grad_norm": 0.9453125, - "learning_rate": 3.455798549290645e-06, - "loss": 0.268, - "step": 5630 - }, - { - "epoch": 13.879310344827585, - "grad_norm": 0.828125, - "learning_rate": 3.381504792719714e-06, - "loss": 0.2697, - "step": 5635 - }, - { - "epoch": 13.891625615763546, - "grad_norm": 0.88671875, - "learning_rate": 3.308004596794101e-06, - "loss": 0.2817, - "step": 5640 - }, - { - "epoch": 13.903940886699507, - "grad_norm": 0.92578125, - "learning_rate": 3.2352985651959657e-06, - "loss": 0.2706, - "step": 5645 - }, - { - "epoch": 13.916256157635468, - "grad_norm": 0.96484375, - "learning_rate": 3.1633872950847523e-06, - "loss": 0.2751, - "step": 5650 - }, - { - "epoch": 13.928571428571429, - "grad_norm": 0.9296875, - "learning_rate": 3.092271377092215e-06, - "loss": 0.2793, - "step": 5655 - }, - { - "epoch": 13.94088669950739, - "grad_norm": 0.91015625, - "learning_rate": 3.021951395317646e-06, - "loss": 0.2755, - "step": 5660 - }, - { - "epoch": 13.95320197044335, - "grad_norm": 0.9765625, - "learning_rate": 2.9524279273230428e-06, - "loss": 0.2787, - "step": 5665 - }, - { - "epoch": 13.96551724137931, - "grad_norm": 1.0546875, - "learning_rate": 2.8837015441283586e-06, - "loss": 0.2675, - "step": 5670 - }, - { - "epoch": 13.97783251231527, - "grad_norm": 0.86328125, - "learning_rate": 2.815772810206785e-06, - "loss": 0.2726, - "step": 5675 - }, - { - "epoch": 13.990147783251231, - "grad_norm": 0.890625, - "learning_rate": 2.7486422834802186e-06, - "loss": 0.2693, - "step": 5680 - }, - { - "epoch": 14.0, - "eval_loss": 5.538418292999268, - "eval_runtime": 2.0446, - "eval_samples_per_second": 4.891, - "eval_steps_per_second": 0.978, - "step": 5684 - }, - { - "epoch": 14.002463054187192, - "grad_norm": 0.98046875, - "learning_rate": 2.682310515314512e-06, - "loss": 0.2708, - "step": 5685 - }, - { - "epoch": 14.014778325123153, - "grad_norm": 0.84765625, - "learning_rate": 2.616778050515145e-06, - "loss": 0.2679, - "step": 5690 - }, - { - "epoch": 14.027093596059114, - "grad_norm": 0.90234375, - "learning_rate": 2.5520454273225582e-06, - 
"loss": 0.2706, - "step": 5695 - }, - { - "epoch": 14.039408866995075, - "grad_norm": 0.95703125, - "learning_rate": 2.488113177407869e-06, - "loss": 0.2706, - "step": 5700 - }, - { - "epoch": 14.051724137931034, - "grad_norm": 0.89453125, - "learning_rate": 2.4249818258684664e-06, - "loss": 0.2715, - "step": 5705 - }, - { - "epoch": 14.064039408866995, - "grad_norm": 0.9296875, - "learning_rate": 2.3626518912236327e-06, - "loss": 0.2691, - "step": 5710 - }, - { - "epoch": 14.076354679802956, - "grad_norm": 0.96875, - "learning_rate": 2.3011238854103947e-06, - "loss": 0.2757, - "step": 5715 - }, - { - "epoch": 14.088669950738916, - "grad_norm": 0.91796875, - "learning_rate": 2.240398313779235e-06, - "loss": 0.2728, - "step": 5720 - }, - { - "epoch": 14.100985221674877, - "grad_norm": 0.8828125, - "learning_rate": 2.180475675089988e-06, - "loss": 0.2701, - "step": 5725 - }, - { - "epoch": 14.113300492610838, - "grad_norm": 0.921875, - "learning_rate": 2.1213564615077065e-06, - "loss": 0.2639, - "step": 5730 - }, - { - "epoch": 14.125615763546797, - "grad_norm": 0.85546875, - "learning_rate": 2.0630411585986554e-06, - "loss": 0.2732, - "step": 5735 - }, - { - "epoch": 14.137931034482758, - "grad_norm": 0.9453125, - "learning_rate": 2.0055302453262924e-06, - "loss": 0.274, - "step": 5740 - }, - { - "epoch": 14.150246305418719, - "grad_norm": 0.90625, - "learning_rate": 1.9488241940473828e-06, - "loss": 0.2749, - "step": 5745 - }, - { - "epoch": 14.16256157635468, - "grad_norm": 0.875, - "learning_rate": 1.8929234705080346e-06, - "loss": 0.2654, - "step": 5750 - }, - { - "epoch": 14.17487684729064, - "grad_norm": 0.88671875, - "learning_rate": 1.8378285338399692e-06, - "loss": 0.2622, - "step": 5755 - }, - { - "epoch": 14.187192118226601, - "grad_norm": 0.90234375, - "learning_rate": 1.783539836556669e-06, - "loss": 0.2647, - "step": 5760 - }, - { - "epoch": 14.199507389162562, - "grad_norm": 0.86328125, - "learning_rate": 1.7300578245497245e-06, - "loss": 0.2727, - "step": 5765 - }, - { - "epoch": 14.211822660098521, - "grad_norm": 0.90234375, - "learning_rate": 1.6773829370851368e-06, - "loss": 0.266, - "step": 5770 - }, - { - "epoch": 14.224137931034482, - "grad_norm": 0.8828125, - "learning_rate": 1.6255156067997323e-06, - "loss": 0.2707, - "step": 5775 - }, - { - "epoch": 14.236453201970443, - "grad_norm": 0.94140625, - "learning_rate": 1.5744562596975432e-06, - "loss": 0.276, - "step": 5780 - }, - { - "epoch": 14.248768472906404, - "grad_norm": 0.93359375, - "learning_rate": 1.524205315146432e-06, - "loss": 0.2701, - "step": 5785 - }, - { - "epoch": 14.261083743842365, - "grad_norm": 0.96875, - "learning_rate": 1.474763185874517e-06, - "loss": 0.271, - "step": 5790 - }, - { - "epoch": 14.273399014778326, - "grad_norm": 0.96484375, - "learning_rate": 1.4261302779668862e-06, - "loss": 0.2757, - "step": 5795 - }, - { - "epoch": 14.285714285714286, - "grad_norm": 0.9296875, - "learning_rate": 1.378306990862177e-06, - "loss": 0.2794, - "step": 5800 - }, - { - "epoch": 14.298029556650246, - "grad_norm": 0.87890625, - "learning_rate": 1.3312937173493577e-06, - "loss": 0.2699, - "step": 5805 - }, - { - "epoch": 14.310344827586206, - "grad_norm": 0.98046875, - "learning_rate": 1.285090843564485e-06, - "loss": 0.2706, - "step": 5810 - }, - { - "epoch": 14.322660098522167, - "grad_norm": 1.046875, - "learning_rate": 1.2396987489874946e-06, - "loss": 0.2731, - "step": 5815 - }, - { - "epoch": 14.334975369458128, - "grad_norm": 0.9140625, - "learning_rate": 1.19511780643915e-06, - "loss": 0.2703, - 
"step": 5820 - }, - { - "epoch": 14.347290640394089, - "grad_norm": 0.8359375, - "learning_rate": 1.1513483820779214e-06, - "loss": 0.2602, - "step": 5825 - }, - { - "epoch": 14.35960591133005, - "grad_norm": 0.87109375, - "learning_rate": 1.108390835397044e-06, - "loss": 0.2748, - "step": 5830 - }, - { - "epoch": 14.37192118226601, - "grad_norm": 0.890625, - "learning_rate": 1.066245519221465e-06, - "loss": 0.2692, - "step": 5835 - }, - { - "epoch": 14.38423645320197, - "grad_norm": 0.87890625, - "learning_rate": 1.024912779705045e-06, - "loss": 0.2657, - "step": 5840 - }, - { - "epoch": 14.39655172413793, - "grad_norm": 0.890625, - "learning_rate": 9.843929563276733e-07, - "loss": 0.2782, - "step": 5845 - }, - { - "epoch": 14.408866995073891, - "grad_norm": 0.890625, - "learning_rate": 9.446863818924679e-07, - "loss": 0.2744, - "step": 5850 - }, - { - "epoch": 14.421182266009852, - "grad_norm": 0.86328125, - "learning_rate": 9.05793382523068e-07, - "loss": 0.2685, - "step": 5855 - }, - { - "epoch": 14.433497536945813, - "grad_norm": 0.87109375, - "learning_rate": 8.67714277660947e-07, - "loss": 0.2718, - "step": 5860 - }, - { - "epoch": 14.445812807881774, - "grad_norm": 0.85546875, - "learning_rate": 8.304493800627589e-07, - "loss": 0.2679, - "step": 5865 - }, - { - "epoch": 14.458128078817733, - "grad_norm": 0.87890625, - "learning_rate": 7.939989957978289e-07, - "loss": 0.2673, - "step": 5870 - }, - { - "epoch": 14.470443349753694, - "grad_norm": 0.93359375, - "learning_rate": 7.583634242455784e-07, - "loss": 0.2773, - "step": 5875 - }, - { - "epoch": 14.482758620689655, - "grad_norm": 0.86328125, - "learning_rate": 7.235429580931152e-07, - "loss": 0.2713, - "step": 5880 - }, - { - "epoch": 14.495073891625616, - "grad_norm": 0.9140625, - "learning_rate": 6.895378833328025e-07, - "loss": 0.2705, - "step": 5885 - }, - { - "epoch": 14.507389162561577, - "grad_norm": 0.828125, - "learning_rate": 6.563484792599161e-07, - "loss": 0.2725, - "step": 5890 - }, - { - "epoch": 14.519704433497537, - "grad_norm": 0.80859375, - "learning_rate": 6.239750184703464e-07, - "loss": 0.2645, - "step": 5895 - }, - { - "epoch": 14.532019704433498, - "grad_norm": 0.87109375, - "learning_rate": 5.924177668583552e-07, - "loss": 0.2703, - "step": 5900 - }, - { - "epoch": 14.544334975369457, - "grad_norm": 0.8984375, - "learning_rate": 5.616769836144231e-07, - "loss": 0.2758, - "step": 5905 - }, - { - "epoch": 14.556650246305418, - "grad_norm": 0.86328125, - "learning_rate": 5.317529212230721e-07, - "loss": 0.2701, - "step": 5910 - }, - { - "epoch": 14.568965517241379, - "grad_norm": 0.95703125, - "learning_rate": 5.026458254608457e-07, - "loss": 0.2704, - "step": 5915 - }, - { - "epoch": 14.58128078817734, - "grad_norm": 0.8671875, - "learning_rate": 4.7435593539423284e-07, - "loss": 0.2724, - "step": 5920 - }, - { - "epoch": 14.5935960591133, - "grad_norm": 0.921875, - "learning_rate": 4.4688348337774686e-07, - "loss": 0.2845, - "step": 5925 - }, - { - "epoch": 14.605911330049262, - "grad_norm": 0.80859375, - "learning_rate": 4.202286950520162e-07, - "loss": 0.272, - "step": 5930 - }, - { - "epoch": 14.618226600985222, - "grad_norm": 0.87109375, - "learning_rate": 3.943917893418858e-07, - "loss": 0.276, - "step": 5935 - }, - { - "epoch": 14.630541871921181, - "grad_norm": 0.9765625, - "learning_rate": 3.693729784546962e-07, - "loss": 0.2689, - "step": 5940 - }, - { - "epoch": 14.642857142857142, - "grad_norm": 0.86328125, - "learning_rate": 3.451724678784518e-07, - "loss": 0.2802, - "step": 5945 - }, - { - 
"epoch": 14.655172413793103, - "grad_norm": 0.87109375, - "learning_rate": 3.21790456380211e-07, - "loss": 0.2675, - "step": 5950 - }, - { - "epoch": 14.667487684729064, - "grad_norm": 0.88671875, - "learning_rate": 2.9922713600439854e-07, - "loss": 0.2742, - "step": 5955 - }, - { - "epoch": 14.679802955665025, - "grad_norm": 0.83203125, - "learning_rate": 2.7748269207125145e-07, - "loss": 0.281, - "step": 5960 - }, - { - "epoch": 14.692118226600986, - "grad_norm": 0.87109375, - "learning_rate": 2.565573031753199e-07, - "loss": 0.2728, - "step": 5965 - }, - { - "epoch": 14.704433497536947, - "grad_norm": 0.89453125, - "learning_rate": 2.3645114118395762e-07, - "loss": 0.2788, - "step": 5970 - }, - { - "epoch": 14.716748768472906, - "grad_norm": 0.91796875, - "learning_rate": 2.1716437123591172e-07, - "loss": 0.269, - "step": 5975 - }, - { - "epoch": 14.729064039408867, - "grad_norm": 0.89453125, - "learning_rate": 1.986971517400016e-07, - "loss": 0.2742, - "step": 5980 - }, - { - "epoch": 14.741379310344827, - "grad_norm": 0.921875, - "learning_rate": 1.8104963437381993e-07, - "loss": 0.2687, - "step": 5985 - }, - { - "epoch": 14.753694581280788, - "grad_norm": 0.9375, - "learning_rate": 1.6422196408241165e-07, - "loss": 0.2788, - "step": 5990 - }, - { - "epoch": 14.766009852216749, - "grad_norm": 0.9296875, - "learning_rate": 1.4821427907719677e-07, - "loss": 0.2691, - "step": 5995 - }, - { - "epoch": 14.77832512315271, - "grad_norm": 0.90234375, - "learning_rate": 1.3302671083474938e-07, - "loss": 0.2757, - "step": 6000 - }, - { - "epoch": 14.790640394088669, - "grad_norm": 0.9296875, - "learning_rate": 1.1865938409573174e-07, - "loss": 0.2764, - "step": 6005 - }, - { - "epoch": 14.80295566502463, - "grad_norm": 0.87109375, - "learning_rate": 1.0511241686389505e-07, - "loss": 0.2734, - "step": 6010 - }, - { - "epoch": 14.81527093596059, - "grad_norm": 0.84765625, - "learning_rate": 9.238592040512472e-08, - "loss": 0.2696, - "step": 6015 - }, - { - "epoch": 14.827586206896552, - "grad_norm": 0.8671875, - "learning_rate": 8.047999924645222e-08, - "loss": 0.2647, - "step": 6020 - }, - { - "epoch": 14.839901477832512, - "grad_norm": 0.90625, - "learning_rate": 6.939475117526683e-08, - "loss": 0.2754, - "step": 6025 - }, - { - "epoch": 14.852216748768473, - "grad_norm": 0.9140625, - "learning_rate": 5.913026723850523e-08, - "loss": 0.2683, - "step": 6030 - }, - { - "epoch": 14.864532019704434, - "grad_norm": 1.109375, - "learning_rate": 4.968663174187427e-08, - "loss": 0.2692, - "step": 6035 - }, - { - "epoch": 14.876847290640395, - "grad_norm": 0.87109375, - "learning_rate": 4.106392224915156e-08, - "loss": 0.2759, - "step": 6040 - }, - { - "epoch": 14.889162561576354, - "grad_norm": 0.96484375, - "learning_rate": 3.3262209581619297e-08, - "loss": 0.2735, - "step": 6045 - }, - { - "epoch": 14.901477832512315, - "grad_norm": 0.83203125, - "learning_rate": 2.6281557817386947e-08, - "loss": 0.2697, - "step": 6050 - }, - { - "epoch": 14.913793103448276, - "grad_norm": 0.828125, - "learning_rate": 2.012202429091392e-08, - "loss": 0.2738, - "step": 6055 - }, - { - "epoch": 14.926108374384237, - "grad_norm": 0.890625, - "learning_rate": 1.4783659592576548e-08, - "loss": 0.2695, - "step": 6060 - }, - { - "epoch": 14.938423645320198, - "grad_norm": 0.96875, - "learning_rate": 1.0266507568179595e-08, - "loss": 0.2824, - "step": 6065 - }, - { - "epoch": 14.950738916256158, - "grad_norm": 0.87109375, - "learning_rate": 6.570605318612089e-09, - "loss": 0.2728, - "step": 6070 - }, - { - "epoch": 
14.963054187192117, - "grad_norm": 0.91015625, - "learning_rate": 3.6959831996030704e-09, - "loss": 0.2733, - "step": 6075
-    },
-    {
-      "epoch": 14.975369458128078,
-      "grad_norm": 0.97265625,
-      "learning_rate": 1.6426648213885287e-09,
-      "loss": 0.2686,
-      "step": 6080
-    },
-    {
-      "epoch": 14.98768472906404,
-      "grad_norm": 0.93359375,
-      "learning_rate": 4.10667048589275e-10,
-      "loss": 0.2747,
-      "step": 6085
-    },
-    {
-      "epoch": 15.0,
-      "grad_norm": 0.875,
-      "learning_rate": 0.0,
-      "loss": 0.2715,
-      "step": 6090
-    },
-    {
-      "epoch": 15.0,
-      "eval_loss": 5.538125038146973,
-      "eval_runtime": 2.0459,
-      "eval_samples_per_second": 4.888,
-      "eval_steps_per_second": 0.978,
-      "step": 6090
+      "epoch": 9.954337899543379,
+      "eval_loss": 2.515676259994507,
+      "eval_runtime": 0.2343,
+      "eval_samples_per_second": 42.683,
+      "eval_steps_per_second": 4.268,
+      "step": 1090
     },
     {
-      "epoch": 15.0,
-      "step": 6090,
-      "total_flos": 4.655418812370256e+18,
-      "train_loss": 0.9284773617542436,
-      "train_runtime": 39545.4398,
-      "train_samples_per_second": 2.463,
-      "train_steps_per_second": 0.154
+      "epoch": 9.954337899543379,
+      "step": 1090,
+      "total_flos": 3.327732991202951e+18,
+      "train_loss": 2.136770288659892,
+      "train_runtime": 2636.8816,
+      "train_samples_per_second": 26.554,
+      "train_steps_per_second": 0.413
     }
   ],
   "logging_steps": 5,
-  "max_steps": 6090,
+  "max_steps": 1090,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 15,
+  "num_train_epochs": 10,
   "save_steps": 100,
-  "total_flos": 4.655418812370256e+18,
+  "total_flos": 3.327732991202951e+18,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null