{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2274, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013192612137203166, "grad_norm": 0.8693021722671535, "learning_rate": 8.771929824561404e-07, "loss": 1.3026, "step": 1 }, { "epoch": 0.002638522427440633, "grad_norm": 0.9161054498378574, "learning_rate": 1.7543859649122807e-06, "loss": 1.332, "step": 2 }, { "epoch": 0.00395778364116095, "grad_norm": 0.9021273684481022, "learning_rate": 2.631578947368421e-06, "loss": 1.3286, "step": 3 }, { "epoch": 0.005277044854881266, "grad_norm": 0.8787396269218507, "learning_rate": 3.5087719298245615e-06, "loss": 1.3101, "step": 4 }, { "epoch": 0.006596306068601583, "grad_norm": 0.866657352076399, "learning_rate": 4.3859649122807014e-06, "loss": 1.286, "step": 5 }, { "epoch": 0.0079155672823219, "grad_norm": 0.8962426278179947, "learning_rate": 5.263157894736842e-06, "loss": 1.328, "step": 6 }, { "epoch": 0.009234828496042216, "grad_norm": 0.8722433086351821, "learning_rate": 6.140350877192982e-06, "loss": 1.2915, "step": 7 }, { "epoch": 0.010554089709762533, "grad_norm": 0.9364305269301308, "learning_rate": 7.017543859649123e-06, "loss": 1.3499, "step": 8 }, { "epoch": 0.011873350923482849, "grad_norm": 0.8666494805251869, "learning_rate": 7.894736842105263e-06, "loss": 1.2788, "step": 9 }, { "epoch": 0.013192612137203167, "grad_norm": 0.8538974042693923, "learning_rate": 8.771929824561403e-06, "loss": 1.2815, "step": 10 }, { "epoch": 0.014511873350923483, "grad_norm": 0.8718962041821159, "learning_rate": 9.649122807017545e-06, "loss": 1.2874, "step": 11 }, { "epoch": 0.0158311345646438, "grad_norm": 0.813151185121104, "learning_rate": 1.0526315789473684e-05, "loss": 1.2738, "step": 12 }, { "epoch": 0.017150395778364115, "grad_norm": 0.770954156017625, "learning_rate": 1.1403508771929824e-05, "loss": 1.2396, "step": 13 }, { "epoch": 0.018469656992084433, "grad_norm": 0.7367955597273694, "learning_rate": 1.2280701754385964e-05, "loss": 1.2092, "step": 14 }, { "epoch": 0.01978891820580475, "grad_norm": 0.632787292771107, "learning_rate": 1.3157894736842106e-05, "loss": 1.1393, "step": 15 }, { "epoch": 0.021108179419525065, "grad_norm": 0.5789116552665657, "learning_rate": 1.4035087719298246e-05, "loss": 1.145, "step": 16 }, { "epoch": 0.022427440633245383, "grad_norm": 0.5402396844138461, "learning_rate": 1.4912280701754386e-05, "loss": 1.0668, "step": 17 }, { "epoch": 0.023746701846965697, "grad_norm": 0.5249041641872824, "learning_rate": 1.5789473684210526e-05, "loss": 1.0737, "step": 18 }, { "epoch": 0.025065963060686015, "grad_norm": 0.5285491394028654, "learning_rate": 1.6666666666666667e-05, "loss": 1.076, "step": 19 }, { "epoch": 0.026385224274406333, "grad_norm": 0.5284665752594401, "learning_rate": 1.7543859649122806e-05, "loss": 1.0178, "step": 20 }, { "epoch": 0.027704485488126648, "grad_norm": 0.5270647620865595, "learning_rate": 1.8421052631578947e-05, "loss": 0.9769, "step": 21 }, { "epoch": 0.029023746701846966, "grad_norm": 0.573450685063447, "learning_rate": 1.929824561403509e-05, "loss": 0.9573, "step": 22 }, { "epoch": 0.030343007915567283, "grad_norm": 0.5733322832727733, "learning_rate": 2.0175438596491227e-05, "loss": 0.9245, "step": 23 }, { "epoch": 0.0316622691292876, "grad_norm": 0.5546007532133637, "learning_rate": 2.105263157894737e-05, "loss": 0.8805, "step": 24 }, { "epoch": 0.032981530343007916, "grad_norm": 0.5826530883184633, "learning_rate": 2.1929824561403507e-05, "loss": 0.8489, "step": 25 }, { "epoch": 0.03430079155672823, "grad_norm": 0.5248573296548907, "learning_rate": 2.280701754385965e-05, "loss": 0.807, "step": 26 }, { "epoch": 0.03562005277044855, "grad_norm": 0.4985504682150672, "learning_rate": 2.368421052631579e-05, "loss": 0.79, "step": 27 }, { "epoch": 0.036939313984168866, "grad_norm": 0.5204225038152794, "learning_rate": 2.456140350877193e-05, "loss": 0.7468, "step": 28 }, { "epoch": 0.03825857519788918, "grad_norm": 0.49822260139092617, "learning_rate": 2.5438596491228074e-05, "loss": 0.6956, "step": 29 }, { "epoch": 0.0395778364116095, "grad_norm": 0.45728067341818385, "learning_rate": 2.6315789473684212e-05, "loss": 0.6917, "step": 30 }, { "epoch": 0.040897097625329816, "grad_norm": 0.4219215383754553, "learning_rate": 2.7192982456140354e-05, "loss": 0.6481, "step": 31 }, { "epoch": 0.04221635883905013, "grad_norm": 0.40942978043427214, "learning_rate": 2.8070175438596492e-05, "loss": 0.6307, "step": 32 }, { "epoch": 0.04353562005277045, "grad_norm": 0.3724077110177095, "learning_rate": 2.8947368421052634e-05, "loss": 0.5895, "step": 33 }, { "epoch": 0.044854881266490766, "grad_norm": 0.26999812791665917, "learning_rate": 2.9824561403508772e-05, "loss": 0.5769, "step": 34 }, { "epoch": 0.04617414248021108, "grad_norm": 0.258257084591898, "learning_rate": 3.0701754385964913e-05, "loss": 0.5778, "step": 35 }, { "epoch": 0.047493403693931395, "grad_norm": 0.23728961740660975, "learning_rate": 3.157894736842105e-05, "loss": 0.5436, "step": 36 }, { "epoch": 0.048812664907651716, "grad_norm": 0.2221611762414051, "learning_rate": 3.24561403508772e-05, "loss": 0.5379, "step": 37 }, { "epoch": 0.05013192612137203, "grad_norm": 0.1985037907886148, "learning_rate": 3.3333333333333335e-05, "loss": 0.5263, "step": 38 }, { "epoch": 0.051451187335092345, "grad_norm": 0.22416736446767715, "learning_rate": 3.421052631578947e-05, "loss": 0.5354, "step": 39 }, { "epoch": 0.052770448548812667, "grad_norm": 0.23228776338592944, "learning_rate": 3.508771929824561e-05, "loss": 0.511, "step": 40 }, { "epoch": 0.05408970976253298, "grad_norm": 0.18250285452129647, "learning_rate": 3.5964912280701756e-05, "loss": 0.5156, "step": 41 }, { "epoch": 0.055408970976253295, "grad_norm": 0.19902186758643073, "learning_rate": 3.6842105263157895e-05, "loss": 0.5009, "step": 42 }, { "epoch": 0.05672823218997362, "grad_norm": 0.19806503644234785, "learning_rate": 3.771929824561404e-05, "loss": 0.5104, "step": 43 }, { "epoch": 0.05804749340369393, "grad_norm": 0.19250401157462557, "learning_rate": 3.859649122807018e-05, "loss": 0.4914, "step": 44 }, { "epoch": 0.059366754617414245, "grad_norm": 0.1703627930093435, "learning_rate": 3.9473684210526316e-05, "loss": 0.5036, "step": 45 }, { "epoch": 0.06068601583113457, "grad_norm": 0.16963359671174075, "learning_rate": 4.0350877192982455e-05, "loss": 0.4971, "step": 46 }, { "epoch": 0.06200527704485488, "grad_norm": 0.17977958989923656, "learning_rate": 4.12280701754386e-05, "loss": 0.4961, "step": 47 }, { "epoch": 0.0633245382585752, "grad_norm": 0.15637021112719643, "learning_rate": 4.210526315789474e-05, "loss": 0.4776, "step": 48 }, { "epoch": 0.06464379947229551, "grad_norm": 0.14689840092670522, "learning_rate": 4.298245614035088e-05, "loss": 0.4649, "step": 49 }, { "epoch": 0.06596306068601583, "grad_norm": 0.14522616190095408, "learning_rate": 4.3859649122807014e-05, "loss": 0.4737, "step": 50 }, { "epoch": 0.06728232189973615, "grad_norm": 0.1409221770141239, "learning_rate": 4.473684210526316e-05, "loss": 0.4688, "step": 51 }, { "epoch": 0.06860158311345646, "grad_norm": 0.13506857851061382, "learning_rate": 4.56140350877193e-05, "loss": 0.4527, "step": 52 }, { "epoch": 0.06992084432717678, "grad_norm": 0.12539941288875148, "learning_rate": 4.649122807017544e-05, "loss": 0.4705, "step": 53 }, { "epoch": 0.0712401055408971, "grad_norm": 0.1221420244550201, "learning_rate": 4.736842105263158e-05, "loss": 0.4566, "step": 54 }, { "epoch": 0.07255936675461741, "grad_norm": 0.1201632054136647, "learning_rate": 4.824561403508772e-05, "loss": 0.4724, "step": 55 }, { "epoch": 0.07387862796833773, "grad_norm": 0.11779196244124046, "learning_rate": 4.912280701754386e-05, "loss": 0.463, "step": 56 }, { "epoch": 0.07519788918205805, "grad_norm": 0.10916434192414819, "learning_rate": 5e-05, "loss": 0.4475, "step": 57 }, { "epoch": 0.07651715039577836, "grad_norm": 0.1393777390089969, "learning_rate": 5.087719298245615e-05, "loss": 0.4476, "step": 58 }, { "epoch": 0.07783641160949868, "grad_norm": 0.11701591428530693, "learning_rate": 5.1754385964912286e-05, "loss": 0.4503, "step": 59 }, { "epoch": 0.079155672823219, "grad_norm": 0.12633431646265014, "learning_rate": 5.2631578947368424e-05, "loss": 0.4377, "step": 60 }, { "epoch": 0.08047493403693931, "grad_norm": 0.1179768885412869, "learning_rate": 5.350877192982456e-05, "loss": 0.4425, "step": 61 }, { "epoch": 0.08179419525065963, "grad_norm": 0.10457255410694592, "learning_rate": 5.438596491228071e-05, "loss": 0.4404, "step": 62 }, { "epoch": 0.08311345646437995, "grad_norm": 0.10983803417834777, "learning_rate": 5.526315789473685e-05, "loss": 0.4316, "step": 63 }, { "epoch": 0.08443271767810026, "grad_norm": 0.11025497048879628, "learning_rate": 5.6140350877192984e-05, "loss": 0.44, "step": 64 }, { "epoch": 0.08575197889182058, "grad_norm": 0.10553670577757882, "learning_rate": 5.701754385964912e-05, "loss": 0.4276, "step": 65 }, { "epoch": 0.0870712401055409, "grad_norm": 0.11003365371390775, "learning_rate": 5.789473684210527e-05, "loss": 0.437, "step": 66 }, { "epoch": 0.08839050131926121, "grad_norm": 0.11403897452245441, "learning_rate": 5.877192982456141e-05, "loss": 0.4359, "step": 67 }, { "epoch": 0.08970976253298153, "grad_norm": 0.11497898774858492, "learning_rate": 5.9649122807017544e-05, "loss": 0.441, "step": 68 }, { "epoch": 0.09102902374670185, "grad_norm": 0.110365062852618, "learning_rate": 6.052631578947369e-05, "loss": 0.435, "step": 69 }, { "epoch": 0.09234828496042216, "grad_norm": 0.11683673452796663, "learning_rate": 6.140350877192983e-05, "loss": 0.4156, "step": 70 }, { "epoch": 0.09366754617414248, "grad_norm": 0.1161146806529081, "learning_rate": 6.228070175438597e-05, "loss": 0.4182, "step": 71 }, { "epoch": 0.09498680738786279, "grad_norm": 0.11360683173942634, "learning_rate": 6.31578947368421e-05, "loss": 0.4215, "step": 72 }, { "epoch": 0.09630606860158311, "grad_norm": 0.10946899311729161, "learning_rate": 6.403508771929825e-05, "loss": 0.4093, "step": 73 }, { "epoch": 0.09762532981530343, "grad_norm": 0.11769738275480937, "learning_rate": 6.49122807017544e-05, "loss": 0.4243, "step": 74 }, { "epoch": 0.09894459102902374, "grad_norm": 0.1086000165561529, "learning_rate": 6.578947368421054e-05, "loss": 0.4147, "step": 75 }, { "epoch": 0.10026385224274406, "grad_norm": 0.1093530550420317, "learning_rate": 6.666666666666667e-05, "loss": 0.4156, "step": 76 }, { "epoch": 0.10158311345646438, "grad_norm": 0.11368181824057796, "learning_rate": 6.754385964912281e-05, "loss": 0.4166, "step": 77 }, { "epoch": 0.10290237467018469, "grad_norm": 0.1157611850953829, "learning_rate": 6.842105263157895e-05, "loss": 0.4149, "step": 78 }, { "epoch": 0.10422163588390501, "grad_norm": 0.11230989933095901, "learning_rate": 6.929824561403509e-05, "loss": 0.424, "step": 79 }, { "epoch": 0.10554089709762533, "grad_norm": 0.10986357726322057, "learning_rate": 7.017543859649122e-05, "loss": 0.4116, "step": 80 }, { "epoch": 0.10686015831134564, "grad_norm": 0.10707473801066615, "learning_rate": 7.105263157894737e-05, "loss": 0.4156, "step": 81 }, { "epoch": 0.10817941952506596, "grad_norm": 0.1146007785738499, "learning_rate": 7.192982456140351e-05, "loss": 0.4078, "step": 82 }, { "epoch": 0.10949868073878628, "grad_norm": 0.10863452738748855, "learning_rate": 7.280701754385966e-05, "loss": 0.4273, "step": 83 }, { "epoch": 0.11081794195250659, "grad_norm": 0.1146222808818652, "learning_rate": 7.368421052631579e-05, "loss": 0.4141, "step": 84 }, { "epoch": 0.11213720316622691, "grad_norm": 0.11250077142934803, "learning_rate": 7.456140350877193e-05, "loss": 0.415, "step": 85 }, { "epoch": 0.11345646437994723, "grad_norm": 0.12271872502798858, "learning_rate": 7.543859649122808e-05, "loss": 0.3968, "step": 86 }, { "epoch": 0.11477572559366754, "grad_norm": 0.11086015050702852, "learning_rate": 7.631578947368422e-05, "loss": 0.4104, "step": 87 }, { "epoch": 0.11609498680738786, "grad_norm": 0.12096044625059281, "learning_rate": 7.719298245614036e-05, "loss": 0.4156, "step": 88 }, { "epoch": 0.11741424802110818, "grad_norm": 0.12220685137799303, "learning_rate": 7.807017543859649e-05, "loss": 0.4192, "step": 89 }, { "epoch": 0.11873350923482849, "grad_norm": 0.11320510589373632, "learning_rate": 7.894736842105263e-05, "loss": 0.4108, "step": 90 }, { "epoch": 0.12005277044854881, "grad_norm": 0.12246719082571837, "learning_rate": 7.982456140350878e-05, "loss": 0.4144, "step": 91 }, { "epoch": 0.12137203166226913, "grad_norm": 0.1192043731817562, "learning_rate": 8.070175438596491e-05, "loss": 0.4001, "step": 92 }, { "epoch": 0.12269129287598944, "grad_norm": 0.11878137377412559, "learning_rate": 8.157894736842105e-05, "loss": 0.3899, "step": 93 }, { "epoch": 0.12401055408970976, "grad_norm": 0.12140866736953267, "learning_rate": 8.24561403508772e-05, "loss": 0.405, "step": 94 }, { "epoch": 0.12532981530343007, "grad_norm": 0.11959361814319593, "learning_rate": 8.333333333333334e-05, "loss": 0.3847, "step": 95 }, { "epoch": 0.1266490765171504, "grad_norm": 0.11770340026840373, "learning_rate": 8.421052631578948e-05, "loss": 0.3915, "step": 96 }, { "epoch": 0.1279683377308707, "grad_norm": 0.13793608006119712, "learning_rate": 8.508771929824562e-05, "loss": 0.3941, "step": 97 }, { "epoch": 0.12928759894459102, "grad_norm": 0.12846043670583573, "learning_rate": 8.596491228070177e-05, "loss": 0.3869, "step": 98 }, { "epoch": 0.13060686015831136, "grad_norm": 0.1264066325440092, "learning_rate": 8.68421052631579e-05, "loss": 0.3999, "step": 99 }, { "epoch": 0.13192612137203166, "grad_norm": 0.11952840258178161, "learning_rate": 8.771929824561403e-05, "loss": 0.392, "step": 100 }, { "epoch": 0.13324538258575197, "grad_norm": 0.13619689146717792, "learning_rate": 8.859649122807017e-05, "loss": 0.3874, "step": 101 }, { "epoch": 0.1345646437994723, "grad_norm": 0.12346611085708956, "learning_rate": 8.947368421052632e-05, "loss": 0.3747, "step": 102 }, { "epoch": 0.1358839050131926, "grad_norm": 0.13076223581049698, "learning_rate": 9.035087719298246e-05, "loss": 0.389, "step": 103 }, { "epoch": 0.13720316622691292, "grad_norm": 0.1276180446561196, "learning_rate": 9.12280701754386e-05, "loss": 0.3994, "step": 104 }, { "epoch": 0.13852242744063326, "grad_norm": 0.1271433670482555, "learning_rate": 9.210526315789474e-05, "loss": 0.3954, "step": 105 }, { "epoch": 0.13984168865435356, "grad_norm": 0.12864010028246214, "learning_rate": 9.298245614035089e-05, "loss": 0.3891, "step": 106 }, { "epoch": 0.14116094986807387, "grad_norm": 0.137893998957263, "learning_rate": 9.385964912280703e-05, "loss": 0.3848, "step": 107 }, { "epoch": 0.1424802110817942, "grad_norm": 0.1420061230542738, "learning_rate": 9.473684210526316e-05, "loss": 0.3939, "step": 108 }, { "epoch": 0.1437994722955145, "grad_norm": 0.128894012484459, "learning_rate": 9.56140350877193e-05, "loss": 0.3989, "step": 109 }, { "epoch": 0.14511873350923482, "grad_norm": 0.13300843067788046, "learning_rate": 9.649122807017544e-05, "loss": 0.3866, "step": 110 }, { "epoch": 0.14643799472295516, "grad_norm": 0.1285264013014554, "learning_rate": 9.736842105263158e-05, "loss": 0.3911, "step": 111 }, { "epoch": 0.14775725593667546, "grad_norm": 0.13046033249282984, "learning_rate": 9.824561403508771e-05, "loss": 0.3801, "step": 112 }, { "epoch": 0.14907651715039577, "grad_norm": 0.13409263879225755, "learning_rate": 9.912280701754386e-05, "loss": 0.388, "step": 113 }, { "epoch": 0.1503957783641161, "grad_norm": 0.13499711664043587, "learning_rate": 0.0001, "loss": 0.3956, "step": 114 }, { "epoch": 0.1517150395778364, "grad_norm": 0.13369217819479268, "learning_rate": 0.00010087719298245615, "loss": 0.3786, "step": 115 }, { "epoch": 0.15303430079155672, "grad_norm": 0.14272440003636813, "learning_rate": 0.0001017543859649123, "loss": 0.3833, "step": 116 }, { "epoch": 0.15435356200527706, "grad_norm": 0.13551547074871673, "learning_rate": 0.00010263157894736844, "loss": 0.3957, "step": 117 }, { "epoch": 0.15567282321899736, "grad_norm": 0.13089983299082494, "learning_rate": 0.00010350877192982457, "loss": 0.3831, "step": 118 }, { "epoch": 0.15699208443271767, "grad_norm": 0.1368087230642144, "learning_rate": 0.0001043859649122807, "loss": 0.3777, "step": 119 }, { "epoch": 0.158311345646438, "grad_norm": 0.13813873429818516, "learning_rate": 0.00010526315789473685, "loss": 0.3768, "step": 120 }, { "epoch": 0.15963060686015831, "grad_norm": 0.14059051282523097, "learning_rate": 0.00010614035087719298, "loss": 0.3963, "step": 121 }, { "epoch": 0.16094986807387862, "grad_norm": 0.14363343796194059, "learning_rate": 0.00010701754385964912, "loss": 0.3823, "step": 122 }, { "epoch": 0.16226912928759896, "grad_norm": 0.12747981291451596, "learning_rate": 0.00010789473684210527, "loss": 0.3846, "step": 123 }, { "epoch": 0.16358839050131926, "grad_norm": 0.13911648627974257, "learning_rate": 0.00010877192982456141, "loss": 0.3674, "step": 124 }, { "epoch": 0.16490765171503957, "grad_norm": 0.13218106889909387, "learning_rate": 0.00010964912280701756, "loss": 0.3942, "step": 125 }, { "epoch": 0.1662269129287599, "grad_norm": 0.14024910410953972, "learning_rate": 0.0001105263157894737, "loss": 0.3888, "step": 126 }, { "epoch": 0.16754617414248021, "grad_norm": 0.15042143560288362, "learning_rate": 0.00011140350877192982, "loss": 0.381, "step": 127 }, { "epoch": 0.16886543535620052, "grad_norm": 0.13240867282608515, "learning_rate": 0.00011228070175438597, "loss": 0.3814, "step": 128 }, { "epoch": 0.17018469656992086, "grad_norm": 0.1464818495240077, "learning_rate": 0.00011315789473684211, "loss": 0.3953, "step": 129 }, { "epoch": 0.17150395778364116, "grad_norm": 0.13304701179566775, "learning_rate": 0.00011403508771929824, "loss": 0.3756, "step": 130 }, { "epoch": 0.17282321899736147, "grad_norm": 0.12175135765793466, "learning_rate": 0.00011491228070175439, "loss": 0.3706, "step": 131 }, { "epoch": 0.1741424802110818, "grad_norm": 0.13709394445505466, "learning_rate": 0.00011578947368421053, "loss": 0.3648, "step": 132 }, { "epoch": 0.17546174142480211, "grad_norm": 0.13383886035214185, "learning_rate": 0.00011666666666666668, "loss": 0.3983, "step": 133 }, { "epoch": 0.17678100263852242, "grad_norm": 0.13240950957043196, "learning_rate": 0.00011754385964912282, "loss": 0.3818, "step": 134 }, { "epoch": 0.17810026385224276, "grad_norm": 0.1342852460020694, "learning_rate": 0.00011842105263157894, "loss": 0.372, "step": 135 }, { "epoch": 0.17941952506596306, "grad_norm": 0.1343275187702562, "learning_rate": 0.00011929824561403509, "loss": 0.3844, "step": 136 }, { "epoch": 0.18073878627968337, "grad_norm": 0.13001566807047218, "learning_rate": 0.00012017543859649123, "loss": 0.3732, "step": 137 }, { "epoch": 0.1820580474934037, "grad_norm": 0.13660745388397583, "learning_rate": 0.00012105263157894738, "loss": 0.3786, "step": 138 }, { "epoch": 0.18337730870712401, "grad_norm": 0.14201827519828158, "learning_rate": 0.00012192982456140352, "loss": 0.3789, "step": 139 }, { "epoch": 0.18469656992084432, "grad_norm": 0.13791649159558378, "learning_rate": 0.00012280701754385965, "loss": 0.3754, "step": 140 }, { "epoch": 0.18601583113456466, "grad_norm": 0.13627440608674038, "learning_rate": 0.0001236842105263158, "loss": 0.3735, "step": 141 }, { "epoch": 0.18733509234828497, "grad_norm": 0.13251875102416452, "learning_rate": 0.00012456140350877194, "loss": 0.3578, "step": 142 }, { "epoch": 0.18865435356200527, "grad_norm": 0.13607248757793755, "learning_rate": 0.00012543859649122806, "loss": 0.3719, "step": 143 }, { "epoch": 0.18997361477572558, "grad_norm": 0.1454941712671466, "learning_rate": 0.0001263157894736842, "loss": 0.3772, "step": 144 }, { "epoch": 0.19129287598944592, "grad_norm": 0.14588466362694655, "learning_rate": 0.00012719298245614035, "loss": 0.377, "step": 145 }, { "epoch": 0.19261213720316622, "grad_norm": 0.13552542518533645, "learning_rate": 0.0001280701754385965, "loss": 0.3744, "step": 146 }, { "epoch": 0.19393139841688653, "grad_norm": 0.13678293222612534, "learning_rate": 0.00012894736842105264, "loss": 0.3726, "step": 147 }, { "epoch": 0.19525065963060687, "grad_norm": 0.13617953258578738, "learning_rate": 0.0001298245614035088, "loss": 0.3751, "step": 148 }, { "epoch": 0.19656992084432717, "grad_norm": 0.1346541826051042, "learning_rate": 0.00013070175438596493, "loss": 0.3784, "step": 149 }, { "epoch": 0.19788918205804748, "grad_norm": 0.13795206555641748, "learning_rate": 0.00013157894736842108, "loss": 0.3747, "step": 150 }, { "epoch": 0.19920844327176782, "grad_norm": 0.13738752281688263, "learning_rate": 0.0001324561403508772, "loss": 0.3776, "step": 151 }, { "epoch": 0.20052770448548812, "grad_norm": 0.1371532337142982, "learning_rate": 0.00013333333333333334, "loss": 0.372, "step": 152 }, { "epoch": 0.20184696569920843, "grad_norm": 0.13701126224184146, "learning_rate": 0.00013421052631578948, "loss": 0.3677, "step": 153 }, { "epoch": 0.20316622691292877, "grad_norm": 0.1271016343126417, "learning_rate": 0.00013508771929824563, "loss": 0.3791, "step": 154 }, { "epoch": 0.20448548812664907, "grad_norm": 0.12842296064424177, "learning_rate": 0.00013596491228070177, "loss": 0.3594, "step": 155 }, { "epoch": 0.20580474934036938, "grad_norm": 0.1263532190221259, "learning_rate": 0.0001368421052631579, "loss": 0.3631, "step": 156 }, { "epoch": 0.20712401055408972, "grad_norm": 0.1255302035995146, "learning_rate": 0.00013771929824561404, "loss": 0.3661, "step": 157 }, { "epoch": 0.20844327176781002, "grad_norm": 0.13560415043312185, "learning_rate": 0.00013859649122807018, "loss": 0.3682, "step": 158 }, { "epoch": 0.20976253298153033, "grad_norm": 0.12968078109561212, "learning_rate": 0.0001394736842105263, "loss": 0.3792, "step": 159 }, { "epoch": 0.21108179419525067, "grad_norm": 0.12994688926520284, "learning_rate": 0.00014035087719298245, "loss": 0.3643, "step": 160 }, { "epoch": 0.21240105540897097, "grad_norm": 0.12652972783735045, "learning_rate": 0.0001412280701754386, "loss": 0.3688, "step": 161 }, { "epoch": 0.21372031662269128, "grad_norm": 0.12404701337331903, "learning_rate": 0.00014210526315789474, "loss": 0.3718, "step": 162 }, { "epoch": 0.21503957783641162, "grad_norm": 0.13342243336370554, "learning_rate": 0.00014298245614035088, "loss": 0.3639, "step": 163 }, { "epoch": 0.21635883905013192, "grad_norm": 0.1256136546426832, "learning_rate": 0.00014385964912280703, "loss": 0.3645, "step": 164 }, { "epoch": 0.21767810026385223, "grad_norm": 0.12660709433132336, "learning_rate": 0.00014473684210526317, "loss": 0.3702, "step": 165 }, { "epoch": 0.21899736147757257, "grad_norm": 0.13071138314728412, "learning_rate": 0.00014561403508771932, "loss": 0.364, "step": 166 }, { "epoch": 0.22031662269129287, "grad_norm": 0.13580970847701554, "learning_rate": 0.00014649122807017543, "loss": 0.3761, "step": 167 }, { "epoch": 0.22163588390501318, "grad_norm": 0.12504883545801415, "learning_rate": 0.00014736842105263158, "loss": 0.3567, "step": 168 }, { "epoch": 0.22295514511873352, "grad_norm": 0.13178547719683262, "learning_rate": 0.00014824561403508772, "loss": 0.357, "step": 169 }, { "epoch": 0.22427440633245382, "grad_norm": 0.11984524980403763, "learning_rate": 0.00014912280701754387, "loss": 0.3619, "step": 170 }, { "epoch": 0.22559366754617413, "grad_norm": 0.1425474180433099, "learning_rate": 0.00015000000000000001, "loss": 0.3763, "step": 171 }, { "epoch": 0.22691292875989447, "grad_norm": 0.125031485874505, "learning_rate": 0.00015087719298245616, "loss": 0.3747, "step": 172 }, { "epoch": 0.22823218997361477, "grad_norm": 0.12496030609852068, "learning_rate": 0.0001517543859649123, "loss": 0.3536, "step": 173 }, { "epoch": 0.22955145118733508, "grad_norm": 0.12910769783711282, "learning_rate": 0.00015263157894736845, "loss": 0.3683, "step": 174 }, { "epoch": 0.23087071240105542, "grad_norm": 0.1251418259947457, "learning_rate": 0.00015350877192982457, "loss": 0.3621, "step": 175 }, { "epoch": 0.23218997361477572, "grad_norm": 0.13062137118805284, "learning_rate": 0.0001543859649122807, "loss": 0.3588, "step": 176 }, { "epoch": 0.23350923482849603, "grad_norm": 0.12191036062799511, "learning_rate": 0.00015526315789473686, "loss": 0.3643, "step": 177 }, { "epoch": 0.23482849604221637, "grad_norm": 0.11395474334208981, "learning_rate": 0.00015614035087719297, "loss": 0.359, "step": 178 }, { "epoch": 0.23614775725593667, "grad_norm": 0.12971526339547748, "learning_rate": 0.00015701754385964912, "loss": 0.3523, "step": 179 }, { "epoch": 0.23746701846965698, "grad_norm": 0.11844900920990659, "learning_rate": 0.00015789473684210527, "loss": 0.3581, "step": 180 }, { "epoch": 0.23878627968337732, "grad_norm": 0.11374806214943965, "learning_rate": 0.0001587719298245614, "loss": 0.3569, "step": 181 }, { "epoch": 0.24010554089709762, "grad_norm": 0.1347265589491595, "learning_rate": 0.00015964912280701756, "loss": 0.3715, "step": 182 }, { "epoch": 0.24142480211081793, "grad_norm": 0.1220978441048545, "learning_rate": 0.0001605263157894737, "loss": 0.357, "step": 183 }, { "epoch": 0.24274406332453827, "grad_norm": 0.11849666303790922, "learning_rate": 0.00016140350877192982, "loss": 0.3582, "step": 184 }, { "epoch": 0.24406332453825857, "grad_norm": 0.12157912076397208, "learning_rate": 0.00016228070175438596, "loss": 0.3755, "step": 185 }, { "epoch": 0.24538258575197888, "grad_norm": 0.11466647012795492, "learning_rate": 0.0001631578947368421, "loss": 0.356, "step": 186 }, { "epoch": 0.24670184696569922, "grad_norm": 0.11668208330133854, "learning_rate": 0.00016403508771929825, "loss": 0.3688, "step": 187 }, { "epoch": 0.24802110817941952, "grad_norm": 0.11461241334217744, "learning_rate": 0.0001649122807017544, "loss": 0.3626, "step": 188 }, { "epoch": 0.24934036939313983, "grad_norm": 0.1158190868224385, "learning_rate": 0.00016578947368421054, "loss": 0.3443, "step": 189 }, { "epoch": 0.25065963060686014, "grad_norm": 0.11608340269980369, "learning_rate": 0.0001666666666666667, "loss": 0.365, "step": 190 }, { "epoch": 0.2519788918205805, "grad_norm": 0.11481148293660791, "learning_rate": 0.00016754385964912283, "loss": 0.3575, "step": 191 }, { "epoch": 0.2532981530343008, "grad_norm": 0.10650265902910266, "learning_rate": 0.00016842105263157895, "loss": 0.3468, "step": 192 }, { "epoch": 0.2546174142480211, "grad_norm": 0.10849801440862118, "learning_rate": 0.0001692982456140351, "loss": 0.3393, "step": 193 }, { "epoch": 0.2559366754617414, "grad_norm": 0.12469137157268732, "learning_rate": 0.00017017543859649124, "loss": 0.3486, "step": 194 }, { "epoch": 0.25725593667546176, "grad_norm": 0.11648961999521303, "learning_rate": 0.00017105263157894739, "loss": 0.3541, "step": 195 }, { "epoch": 0.25857519788918204, "grad_norm": 0.11667156706398848, "learning_rate": 0.00017192982456140353, "loss": 0.352, "step": 196 }, { "epoch": 0.2598944591029024, "grad_norm": 0.1206884336376331, "learning_rate": 0.00017280701754385965, "loss": 0.36, "step": 197 }, { "epoch": 0.2612137203166227, "grad_norm": 0.10974973597220489, "learning_rate": 0.0001736842105263158, "loss": 0.3546, "step": 198 }, { "epoch": 0.262532981530343, "grad_norm": 0.11947967951703731, "learning_rate": 0.00017456140350877194, "loss": 0.3572, "step": 199 }, { "epoch": 0.2638522427440633, "grad_norm": 0.12048092951608809, "learning_rate": 0.00017543859649122806, "loss": 0.3546, "step": 200 }, { "epoch": 0.26517150395778366, "grad_norm": 0.12152884597415653, "learning_rate": 0.0001763157894736842, "loss": 0.361, "step": 201 }, { "epoch": 0.26649076517150394, "grad_norm": 0.11332459315301077, "learning_rate": 0.00017719298245614035, "loss": 0.3587, "step": 202 }, { "epoch": 0.2678100263852243, "grad_norm": 0.11723799412212824, "learning_rate": 0.0001780701754385965, "loss": 0.3698, "step": 203 }, { "epoch": 0.2691292875989446, "grad_norm": 0.12637637019672104, "learning_rate": 0.00017894736842105264, "loss": 0.3643, "step": 204 }, { "epoch": 0.2704485488126649, "grad_norm": 0.1075880869004632, "learning_rate": 0.00017982456140350878, "loss": 0.3663, "step": 205 }, { "epoch": 0.2717678100263852, "grad_norm": 0.11474941393876723, "learning_rate": 0.00018070175438596493, "loss": 0.3549, "step": 206 }, { "epoch": 0.27308707124010556, "grad_norm": 0.11652136318189446, "learning_rate": 0.00018157894736842107, "loss": 0.3624, "step": 207 }, { "epoch": 0.27440633245382584, "grad_norm": 0.10865694572873343, "learning_rate": 0.0001824561403508772, "loss": 0.3567, "step": 208 }, { "epoch": 0.2757255936675462, "grad_norm": 0.10948715520484822, "learning_rate": 0.00018333333333333334, "loss": 0.3679, "step": 209 }, { "epoch": 0.2770448548812665, "grad_norm": 0.11758583785508754, "learning_rate": 0.00018421052631578948, "loss": 0.3546, "step": 210 }, { "epoch": 0.2783641160949868, "grad_norm": 0.11064866353749926, "learning_rate": 0.00018508771929824563, "loss": 0.3503, "step": 211 }, { "epoch": 0.2796833773087071, "grad_norm": 0.10745060442971201, "learning_rate": 0.00018596491228070177, "loss": 0.3551, "step": 212 }, { "epoch": 0.28100263852242746, "grad_norm": 0.11485682226566826, "learning_rate": 0.00018684210526315792, "loss": 0.3516, "step": 213 }, { "epoch": 0.28232189973614774, "grad_norm": 0.10973905118156897, "learning_rate": 0.00018771929824561406, "loss": 0.3487, "step": 214 }, { "epoch": 0.2836411609498681, "grad_norm": 0.1111817265300109, "learning_rate": 0.0001885964912280702, "loss": 0.3607, "step": 215 }, { "epoch": 0.2849604221635884, "grad_norm": 0.1126839554086772, "learning_rate": 0.00018947368421052632, "loss": 0.3602, "step": 216 }, { "epoch": 0.2862796833773087, "grad_norm": 0.10676125246145508, "learning_rate": 0.00019035087719298247, "loss": 0.3468, "step": 217 }, { "epoch": 0.287598944591029, "grad_norm": 0.10201173463572814, "learning_rate": 0.0001912280701754386, "loss": 0.3481, "step": 218 }, { "epoch": 0.28891820580474936, "grad_norm": 0.10858922171603559, "learning_rate": 0.00019210526315789473, "loss": 0.3438, "step": 219 }, { "epoch": 0.29023746701846964, "grad_norm": 0.11474279688188205, "learning_rate": 0.00019298245614035088, "loss": 0.3644, "step": 220 }, { "epoch": 0.29155672823219, "grad_norm": 0.1066747280826231, "learning_rate": 0.00019385964912280702, "loss": 0.3549, "step": 221 }, { "epoch": 0.2928759894459103, "grad_norm": 0.11062836431865326, "learning_rate": 0.00019473684210526317, "loss": 0.3561, "step": 222 }, { "epoch": 0.2941952506596306, "grad_norm": 0.10706481357692105, "learning_rate": 0.0001956140350877193, "loss": 0.3532, "step": 223 }, { "epoch": 0.2955145118733509, "grad_norm": 0.10318221714085421, "learning_rate": 0.00019649122807017543, "loss": 0.3532, "step": 224 }, { "epoch": 0.29683377308707126, "grad_norm": 0.10916308630430939, "learning_rate": 0.00019736842105263157, "loss": 0.3375, "step": 225 }, { "epoch": 0.29815303430079154, "grad_norm": 0.1161721317342853, "learning_rate": 0.00019824561403508772, "loss": 0.3544, "step": 226 }, { "epoch": 0.2994722955145119, "grad_norm": 0.11184036834186738, "learning_rate": 0.00019912280701754386, "loss": 0.3548, "step": 227 }, { "epoch": 0.3007915567282322, "grad_norm": 0.10169252693039589, "learning_rate": 0.0002, "loss": 0.3539, "step": 228 }, { "epoch": 0.3021108179419525, "grad_norm": 0.11791385245456801, "learning_rate": 0.00019999988211503861, "loss": 0.3644, "step": 229 }, { "epoch": 0.3034300791556728, "grad_norm": 0.10332724823318692, "learning_rate": 0.00019999952846043234, "loss": 0.3587, "step": 230 }, { "epoch": 0.30474934036939316, "grad_norm": 0.10751888782235781, "learning_rate": 0.00019999893903701498, "loss": 0.3436, "step": 231 }, { "epoch": 0.30606860158311344, "grad_norm": 0.10714004529553176, "learning_rate": 0.00019999811384617625, "loss": 0.3495, "step": 232 }, { "epoch": 0.3073878627968338, "grad_norm": 0.11130645223862311, "learning_rate": 0.00019999705288986172, "loss": 0.3548, "step": 233 }, { "epoch": 0.3087071240105541, "grad_norm": 0.1076585940746542, "learning_rate": 0.00019999575617057276, "loss": 0.3623, "step": 234 }, { "epoch": 0.3100263852242744, "grad_norm": 0.10142092714373126, "learning_rate": 0.00019999422369136667, "loss": 0.3558, "step": 235 }, { "epoch": 0.3113456464379947, "grad_norm": 0.10859919240767507, "learning_rate": 0.00019999245545585656, "loss": 0.3604, "step": 236 }, { "epoch": 0.31266490765171506, "grad_norm": 0.10984665283205867, "learning_rate": 0.0001999904514682114, "loss": 0.3627, "step": 237 }, { "epoch": 0.31398416886543534, "grad_norm": 0.10122701504565076, "learning_rate": 0.000199988211733156, "loss": 0.3588, "step": 238 }, { "epoch": 0.3153034300791557, "grad_norm": 0.10679503300903319, "learning_rate": 0.000199985736255971, "loss": 0.3642, "step": 239 }, { "epoch": 0.316622691292876, "grad_norm": 0.10832436248252611, "learning_rate": 0.00019998302504249278, "loss": 0.36, "step": 240 }, { "epoch": 0.3179419525065963, "grad_norm": 0.11336851174954674, "learning_rate": 0.0001999800780991136, "loss": 0.3501, "step": 241 }, { "epoch": 0.31926121372031663, "grad_norm": 0.11110436622207917, "learning_rate": 0.00019997689543278145, "loss": 0.3533, "step": 242 }, { "epoch": 0.32058047493403696, "grad_norm": 0.11194688233184132, "learning_rate": 0.00019997347705100015, "loss": 0.3514, "step": 243 }, { "epoch": 0.32189973614775724, "grad_norm": 0.10416153545340495, "learning_rate": 0.00019996982296182915, "loss": 0.344, "step": 244 }, { "epoch": 0.3232189973614776, "grad_norm": 0.11083169959049553, "learning_rate": 0.0001999659331738837, "loss": 0.3624, "step": 245 }, { "epoch": 0.3245382585751979, "grad_norm": 0.10902409714533157, "learning_rate": 0.0001999618076963348, "loss": 0.3359, "step": 246 }, { "epoch": 0.3258575197889182, "grad_norm": 0.102048496465394, "learning_rate": 0.00019995744653890905, "loss": 0.3412, "step": 247 }, { "epoch": 0.32717678100263853, "grad_norm": 0.10141346061476565, "learning_rate": 0.00019995284971188873, "loss": 0.3466, "step": 248 }, { "epoch": 0.32849604221635886, "grad_norm": 0.10665009771260972, "learning_rate": 0.00019994801722611182, "loss": 0.3484, "step": 249 }, { "epoch": 0.32981530343007914, "grad_norm": 0.10286027354195602, "learning_rate": 0.0001999429490929718, "loss": 0.3671, "step": 250 }, { "epoch": 0.3311345646437995, "grad_norm": 0.1029461319514982, "learning_rate": 0.0001999376453244179, "loss": 0.3616, "step": 251 }, { "epoch": 0.3324538258575198, "grad_norm": 0.1062385221407233, "learning_rate": 0.00019993210593295473, "loss": 0.3439, "step": 252 }, { "epoch": 0.3337730870712401, "grad_norm": 0.10592940227627486, "learning_rate": 0.00019992633093164256, "loss": 0.341, "step": 253 }, { "epoch": 0.33509234828496043, "grad_norm": 0.10641345021523324, "learning_rate": 0.00019992032033409708, "loss": 0.3633, "step": 254 }, { "epoch": 0.33641160949868076, "grad_norm": 0.10370837857168665, "learning_rate": 0.00019991407415448947, "loss": 0.3537, "step": 255 }, { "epoch": 0.33773087071240104, "grad_norm": 0.09892250701584497, "learning_rate": 0.00019990759240754637, "loss": 0.3402, "step": 256 }, { "epoch": 0.3390501319261214, "grad_norm": 0.09980435045491794, "learning_rate": 0.00019990087510854973, "loss": 0.3359, "step": 257 }, { "epoch": 0.3403693931398417, "grad_norm": 0.0980359751848968, "learning_rate": 0.000199893922273337, "loss": 0.3413, "step": 258 }, { "epoch": 0.341688654353562, "grad_norm": 0.09929954125458344, "learning_rate": 0.0001998867339183008, "loss": 0.3449, "step": 259 }, { "epoch": 0.34300791556728233, "grad_norm": 0.09844574538368285, "learning_rate": 0.00019987931006038915, "loss": 0.3434, "step": 260 }, { "epoch": 0.34432717678100266, "grad_norm": 0.1013844456266077, "learning_rate": 0.00019987165071710527, "loss": 0.3491, "step": 261 }, { "epoch": 0.34564643799472294, "grad_norm": 0.09501476592518826, "learning_rate": 0.0001998637559065076, "loss": 0.3406, "step": 262 }, { "epoch": 0.3469656992084433, "grad_norm": 0.09868630877254, "learning_rate": 0.00019985562564720972, "loss": 0.3467, "step": 263 }, { "epoch": 0.3482849604221636, "grad_norm": 0.10144754149367415, "learning_rate": 0.00019984725995838033, "loss": 0.3543, "step": 264 }, { "epoch": 0.3496042216358839, "grad_norm": 0.0995974376644274, "learning_rate": 0.00019983865885974316, "loss": 0.3515, "step": 265 }, { "epoch": 0.35092348284960423, "grad_norm": 0.09448344439308938, "learning_rate": 0.00019982982237157711, "loss": 0.3465, "step": 266 }, { "epoch": 0.35224274406332456, "grad_norm": 0.09867701794487137, "learning_rate": 0.00019982075051471588, "loss": 0.3512, "step": 267 }, { "epoch": 0.35356200527704484, "grad_norm": 0.09750567479183042, "learning_rate": 0.00019981144331054825, "loss": 0.3488, "step": 268 }, { "epoch": 0.3548812664907652, "grad_norm": 0.09253867498209571, "learning_rate": 0.00019980190078101772, "loss": 0.35, "step": 269 }, { "epoch": 0.3562005277044855, "grad_norm": 0.09661211624163728, "learning_rate": 0.0001997921229486228, "loss": 0.3441, "step": 270 }, { "epoch": 0.3575197889182058, "grad_norm": 0.10097076067419916, "learning_rate": 0.00019978210983641662, "loss": 0.3488, "step": 271 }, { "epoch": 0.35883905013192613, "grad_norm": 0.09504942936608565, "learning_rate": 0.00019977186146800707, "loss": 0.3448, "step": 272 }, { "epoch": 0.36015831134564646, "grad_norm": 0.09289871346858902, "learning_rate": 0.0001997613778675568, "loss": 0.3433, "step": 273 }, { "epoch": 0.36147757255936674, "grad_norm": 0.09466279483778023, "learning_rate": 0.0001997506590597829, "loss": 0.3512, "step": 274 }, { "epoch": 0.3627968337730871, "grad_norm": 0.09132630431701041, "learning_rate": 0.00019973970506995716, "loss": 0.3495, "step": 275 }, { "epoch": 0.3641160949868074, "grad_norm": 0.09420392944682034, "learning_rate": 0.00019972851592390574, "loss": 0.3532, "step": 276 }, { "epoch": 0.3654353562005277, "grad_norm": 0.09704012537681957, "learning_rate": 0.0001997170916480093, "loss": 0.3423, "step": 277 }, { "epoch": 0.36675461741424803, "grad_norm": 0.09305191848689394, "learning_rate": 0.00019970543226920288, "loss": 0.3406, "step": 278 }, { "epoch": 0.36807387862796836, "grad_norm": 0.10229707685334494, "learning_rate": 0.00019969353781497574, "loss": 0.3472, "step": 279 }, { "epoch": 0.36939313984168864, "grad_norm": 0.09790466203430251, "learning_rate": 0.00019968140831337148, "loss": 0.3453, "step": 280 }, { "epoch": 0.370712401055409, "grad_norm": 0.09109971218140869, "learning_rate": 0.00019966904379298774, "loss": 0.3381, "step": 281 }, { "epoch": 0.3720316622691293, "grad_norm": 0.09677616291492164, "learning_rate": 0.00019965644428297642, "loss": 0.3555, "step": 282 }, { "epoch": 0.3733509234828496, "grad_norm": 0.09699762927012082, "learning_rate": 0.0001996436098130433, "loss": 0.3585, "step": 283 }, { "epoch": 0.37467018469656993, "grad_norm": 0.0946496434974004, "learning_rate": 0.00019963054041344827, "loss": 0.3483, "step": 284 }, { "epoch": 0.3759894459102902, "grad_norm": 0.09111239863257745, "learning_rate": 0.000199617236115005, "loss": 0.3447, "step": 285 }, { "epoch": 0.37730870712401055, "grad_norm": 0.09627388793272387, "learning_rate": 0.00019960369694908104, "loss": 0.3547, "step": 286 }, { "epoch": 0.3786279683377309, "grad_norm": 0.09197373138532838, "learning_rate": 0.00019958992294759765, "loss": 0.3469, "step": 287 }, { "epoch": 0.37994722955145116, "grad_norm": 0.09299207703266144, "learning_rate": 0.00019957591414302984, "loss": 0.3443, "step": 288 }, { "epoch": 0.3812664907651715, "grad_norm": 0.0916282442877691, "learning_rate": 0.00019956167056840607, "loss": 0.3486, "step": 289 }, { "epoch": 0.38258575197889183, "grad_norm": 0.10641683669535149, "learning_rate": 0.00019954719225730847, "loss": 0.3472, "step": 290 }, { "epoch": 0.3839050131926121, "grad_norm": 0.09085875781750939, "learning_rate": 0.00019953247924387252, "loss": 0.3523, "step": 291 }, { "epoch": 0.38522427440633245, "grad_norm": 0.09272871783946804, "learning_rate": 0.0001995175315627871, "loss": 0.3378, "step": 292 }, { "epoch": 0.3865435356200528, "grad_norm": 0.096939985866646, "learning_rate": 0.0001995023492492943, "loss": 0.3325, "step": 293 }, { "epoch": 0.38786279683377306, "grad_norm": 0.09419500732658995, "learning_rate": 0.00019948693233918952, "loss": 0.3503, "step": 294 }, { "epoch": 0.3891820580474934, "grad_norm": 0.09342198614467694, "learning_rate": 0.00019947128086882115, "loss": 0.3487, "step": 295 }, { "epoch": 0.39050131926121373, "grad_norm": 0.0896041920572187, "learning_rate": 0.00019945539487509063, "loss": 0.3409, "step": 296 }, { "epoch": 0.391820580474934, "grad_norm": 0.09525318945760115, "learning_rate": 0.00019943927439545242, "loss": 0.3378, "step": 297 }, { "epoch": 0.39313984168865435, "grad_norm": 0.09189154950515221, "learning_rate": 0.0001994229194679137, "loss": 0.3484, "step": 298 }, { "epoch": 0.3944591029023747, "grad_norm": 0.09455060067422674, "learning_rate": 0.0001994063301310345, "loss": 0.3497, "step": 299 }, { "epoch": 0.39577836411609496, "grad_norm": 0.09098195966262516, "learning_rate": 0.00019938950642392746, "loss": 0.3471, "step": 300 }, { "epoch": 0.3970976253298153, "grad_norm": 0.09641140805034262, "learning_rate": 0.00019937244838625788, "loss": 0.3423, "step": 301 }, { "epoch": 0.39841688654353563, "grad_norm": 0.0950210422809279, "learning_rate": 0.0001993551560582434, "loss": 0.3435, "step": 302 }, { "epoch": 0.3997361477572559, "grad_norm": 0.09489265519038878, "learning_rate": 0.0001993376294806542, "loss": 0.3366, "step": 303 }, { "epoch": 0.40105540897097625, "grad_norm": 0.09243810957282327, "learning_rate": 0.00019931986869481266, "loss": 0.3509, "step": 304 }, { "epoch": 0.4023746701846966, "grad_norm": 0.09190536016059143, "learning_rate": 0.00019930187374259337, "loss": 0.3336, "step": 305 }, { "epoch": 0.40369393139841686, "grad_norm": 0.09621129733942799, "learning_rate": 0.000199283644666423, "loss": 0.3446, "step": 306 }, { "epoch": 0.4050131926121372, "grad_norm": 0.09897518928574285, "learning_rate": 0.00019926518150928022, "loss": 0.3457, "step": 307 }, { "epoch": 0.40633245382585753, "grad_norm": 0.09448581651013746, "learning_rate": 0.00019924648431469564, "loss": 0.345, "step": 308 }, { "epoch": 0.4076517150395778, "grad_norm": 0.09964280598562016, "learning_rate": 0.00019922755312675158, "loss": 0.353, "step": 309 }, { "epoch": 0.40897097625329815, "grad_norm": 0.0964583821372594, "learning_rate": 0.00019920838799008213, "loss": 0.3401, "step": 310 }, { "epoch": 0.4102902374670185, "grad_norm": 0.09057622054755676, "learning_rate": 0.00019918898894987286, "loss": 0.3439, "step": 311 }, { "epoch": 0.41160949868073876, "grad_norm": 0.09502153533015795, "learning_rate": 0.00019916935605186092, "loss": 0.3473, "step": 312 }, { "epoch": 0.4129287598944591, "grad_norm": 0.09893983820259689, "learning_rate": 0.00019914948934233477, "loss": 0.3435, "step": 313 }, { "epoch": 0.41424802110817943, "grad_norm": 0.0944131549599587, "learning_rate": 0.00019912938886813413, "loss": 0.3403, "step": 314 }, { "epoch": 0.4155672823218997, "grad_norm": 0.0891194715719245, "learning_rate": 0.00019910905467664987, "loss": 0.3428, "step": 315 }, { "epoch": 0.41688654353562005, "grad_norm": 0.10356661093106957, "learning_rate": 0.00019908848681582391, "loss": 0.3415, "step": 316 }, { "epoch": 0.4182058047493404, "grad_norm": 0.09657693690289443, "learning_rate": 0.00019906768533414906, "loss": 0.3388, "step": 317 }, { "epoch": 0.41952506596306066, "grad_norm": 0.09306275610810413, "learning_rate": 0.00019904665028066894, "loss": 0.3342, "step": 318 }, { "epoch": 0.420844327176781, "grad_norm": 0.09202467707884077, "learning_rate": 0.00019902538170497795, "loss": 0.3302, "step": 319 }, { "epoch": 0.42216358839050133, "grad_norm": 0.09127055348159674, "learning_rate": 0.00019900387965722093, "loss": 0.337, "step": 320 }, { "epoch": 0.4234828496042216, "grad_norm": 0.09399646611483357, "learning_rate": 0.0001989821441880933, "loss": 0.3388, "step": 321 }, { "epoch": 0.42480211081794195, "grad_norm": 0.09712976329321023, "learning_rate": 0.00019896017534884068, "loss": 0.3487, "step": 322 }, { "epoch": 0.4261213720316623, "grad_norm": 0.08479836496688575, "learning_rate": 0.00019893797319125902, "loss": 0.3315, "step": 323 }, { "epoch": 0.42744063324538256, "grad_norm": 0.09454409052366376, "learning_rate": 0.0001989155377676944, "loss": 0.3394, "step": 324 }, { "epoch": 0.4287598944591029, "grad_norm": 0.09303434980656193, "learning_rate": 0.00019889286913104265, "loss": 0.3273, "step": 325 }, { "epoch": 0.43007915567282323, "grad_norm": 0.08887606695949546, "learning_rate": 0.00019886996733474975, "loss": 0.3409, "step": 326 }, { "epoch": 0.4313984168865435, "grad_norm": 0.09063606727221352, "learning_rate": 0.00019884683243281116, "loss": 0.3458, "step": 327 }, { "epoch": 0.43271767810026385, "grad_norm": 0.08894131559397522, "learning_rate": 0.00019882346447977204, "loss": 0.3366, "step": 328 }, { "epoch": 0.4340369393139842, "grad_norm": 0.09227269223835412, "learning_rate": 0.000198799863530727, "loss": 0.3424, "step": 329 }, { "epoch": 0.43535620052770446, "grad_norm": 0.09325799198382986, "learning_rate": 0.00019877602964131995, "loss": 0.3298, "step": 330 }, { "epoch": 0.4366754617414248, "grad_norm": 0.09001427251426992, "learning_rate": 0.0001987519628677441, "loss": 0.3396, "step": 331 }, { "epoch": 0.43799472295514513, "grad_norm": 0.09456231327430153, "learning_rate": 0.0001987276632667416, "loss": 0.3483, "step": 332 }, { "epoch": 0.4393139841688654, "grad_norm": 0.09143113491673216, "learning_rate": 0.00019870313089560365, "loss": 0.3577, "step": 333 }, { "epoch": 0.44063324538258575, "grad_norm": 0.0892795011802246, "learning_rate": 0.00019867836581217017, "loss": 0.3331, "step": 334 }, { "epoch": 0.4419525065963061, "grad_norm": 0.09121495696528145, "learning_rate": 0.00019865336807482975, "loss": 0.3468, "step": 335 }, { "epoch": 0.44327176781002636, "grad_norm": 0.08979603716457088, "learning_rate": 0.0001986281377425196, "loss": 0.3382, "step": 336 }, { "epoch": 0.4445910290237467, "grad_norm": 0.08486954835597034, "learning_rate": 0.0001986026748747252, "loss": 0.3348, "step": 337 }, { "epoch": 0.44591029023746703, "grad_norm": 0.09090313061985178, "learning_rate": 0.00019857697953148037, "loss": 0.3456, "step": 338 }, { "epoch": 0.4472295514511873, "grad_norm": 0.08838544290030771, "learning_rate": 0.00019855105177336702, "loss": 0.3309, "step": 339 }, { "epoch": 0.44854881266490765, "grad_norm": 0.08434714396158095, "learning_rate": 0.00019852489166151497, "loss": 0.3343, "step": 340 }, { "epoch": 0.449868073878628, "grad_norm": 0.08695922668812102, "learning_rate": 0.00019849849925760186, "loss": 0.3339, "step": 341 }, { "epoch": 0.45118733509234826, "grad_norm": 0.09342864000238602, "learning_rate": 0.0001984718746238531, "loss": 0.345, "step": 342 }, { "epoch": 0.4525065963060686, "grad_norm": 0.08845805722742146, "learning_rate": 0.00019844501782304155, "loss": 0.3378, "step": 343 }, { "epoch": 0.45382585751978893, "grad_norm": 0.09066179445916431, "learning_rate": 0.00019841792891848745, "loss": 0.3356, "step": 344 }, { "epoch": 0.4551451187335092, "grad_norm": 0.08825292083764819, "learning_rate": 0.00019839060797405833, "loss": 0.3286, "step": 345 }, { "epoch": 0.45646437994722955, "grad_norm": 0.0872633948532585, "learning_rate": 0.00019836305505416875, "loss": 0.3324, "step": 346 }, { "epoch": 0.4577836411609499, "grad_norm": 0.08838532362959937, "learning_rate": 0.00019833527022378018, "loss": 0.3396, "step": 347 }, { "epoch": 0.45910290237467016, "grad_norm": 0.08647352445747489, "learning_rate": 0.00019830725354840089, "loss": 0.342, "step": 348 }, { "epoch": 0.4604221635883905, "grad_norm": 0.08520979043052515, "learning_rate": 0.00019827900509408581, "loss": 0.3338, "step": 349 }, { "epoch": 0.46174142480211083, "grad_norm": 0.0856998435280094, "learning_rate": 0.00019825052492743628, "loss": 0.3338, "step": 350 }, { "epoch": 0.4630606860158311, "grad_norm": 0.0875080870279277, "learning_rate": 0.00019822181311559994, "loss": 0.3408, "step": 351 }, { "epoch": 0.46437994722955145, "grad_norm": 0.08412251753363091, "learning_rate": 0.00019819286972627066, "loss": 0.3369, "step": 352 }, { "epoch": 0.4656992084432718, "grad_norm": 0.08546423461124858, "learning_rate": 0.00019816369482768823, "loss": 0.3431, "step": 353 }, { "epoch": 0.46701846965699206, "grad_norm": 0.08859693227868694, "learning_rate": 0.00019813428848863826, "loss": 0.3509, "step": 354 }, { "epoch": 0.4683377308707124, "grad_norm": 0.08500434092310157, "learning_rate": 0.0001981046507784521, "loss": 0.3335, "step": 355 }, { "epoch": 0.46965699208443273, "grad_norm": 0.08775025973089794, "learning_rate": 0.0001980747817670065, "loss": 0.3396, "step": 356 }, { "epoch": 0.470976253298153, "grad_norm": 0.08664082818601633, "learning_rate": 0.00019804468152472362, "loss": 0.3531, "step": 357 }, { "epoch": 0.47229551451187335, "grad_norm": 0.08382354019684697, "learning_rate": 0.00019801435012257082, "loss": 0.3342, "step": 358 }, { "epoch": 0.4736147757255937, "grad_norm": 0.08493792423355828, "learning_rate": 0.00019798378763206037, "loss": 0.3435, "step": 359 }, { "epoch": 0.47493403693931396, "grad_norm": 0.08468911126955714, "learning_rate": 0.00019795299412524945, "loss": 0.3438, "step": 360 }, { "epoch": 0.4762532981530343, "grad_norm": 0.08210015277280021, "learning_rate": 0.0001979219696747399, "loss": 0.3299, "step": 361 }, { "epoch": 0.47757255936675463, "grad_norm": 0.08641166252194632, "learning_rate": 0.00019789071435367804, "loss": 0.3402, "step": 362 }, { "epoch": 0.4788918205804749, "grad_norm": 0.08560974221470605, "learning_rate": 0.00019785922823575448, "loss": 0.332, "step": 363 }, { "epoch": 0.48021108179419525, "grad_norm": 0.08605239814000158, "learning_rate": 0.00019782751139520407, "loss": 0.3404, "step": 364 }, { "epoch": 0.4815303430079156, "grad_norm": 0.09398927904501168, "learning_rate": 0.00019779556390680557, "loss": 0.3436, "step": 365 }, { "epoch": 0.48284960422163586, "grad_norm": 0.08830444282916391, "learning_rate": 0.00019776338584588153, "loss": 0.3449, "step": 366 }, { "epoch": 0.4841688654353562, "grad_norm": 0.08608013907915739, "learning_rate": 0.00019773097728829813, "loss": 0.3282, "step": 367 }, { "epoch": 0.48548812664907653, "grad_norm": 0.0838048262094816, "learning_rate": 0.00019769833831046501, "loss": 0.3348, "step": 368 }, { "epoch": 0.4868073878627968, "grad_norm": 0.08026589094862331, "learning_rate": 0.00019766546898933508, "loss": 0.3197, "step": 369 }, { "epoch": 0.48812664907651715, "grad_norm": 0.0835258961854144, "learning_rate": 0.0001976323694024043, "loss": 0.3344, "step": 370 }, { "epoch": 0.4894459102902375, "grad_norm": 0.08770720207262736, "learning_rate": 0.00019759903962771156, "loss": 0.3349, "step": 371 }, { "epoch": 0.49076517150395776, "grad_norm": 0.0864346406296061, "learning_rate": 0.0001975654797438384, "loss": 0.3394, "step": 372 }, { "epoch": 0.4920844327176781, "grad_norm": 0.08630538574342314, "learning_rate": 0.00019753168982990894, "loss": 0.3358, "step": 373 }, { "epoch": 0.49340369393139843, "grad_norm": 0.08592209058749774, "learning_rate": 0.0001974976699655897, "loss": 0.3348, "step": 374 }, { "epoch": 0.4947229551451187, "grad_norm": 0.0850726848948466, "learning_rate": 0.0001974634202310892, "loss": 0.3393, "step": 375 }, { "epoch": 0.49604221635883905, "grad_norm": 0.08586143582484707, "learning_rate": 0.00019742894070715807, "loss": 0.3384, "step": 376 }, { "epoch": 0.4973614775725594, "grad_norm": 0.08593194869672516, "learning_rate": 0.00019739423147508865, "loss": 0.3414, "step": 377 }, { "epoch": 0.49868073878627966, "grad_norm": 0.08776341723921044, "learning_rate": 0.00019735929261671485, "loss": 0.3378, "step": 378 }, { "epoch": 0.5, "grad_norm": 0.08441448432777303, "learning_rate": 0.00019732412421441197, "loss": 0.3293, "step": 379 }, { "epoch": 0.5013192612137203, "grad_norm": 0.08601612689376634, "learning_rate": 0.00019728872635109662, "loss": 0.3337, "step": 380 }, { "epoch": 0.5026385224274407, "grad_norm": 0.08270024777396094, "learning_rate": 0.00019725309911022617, "loss": 0.3329, "step": 381 }, { "epoch": 0.503957783641161, "grad_norm": 0.0901751191027156, "learning_rate": 0.00019721724257579907, "loss": 0.3386, "step": 382 }, { "epoch": 0.5052770448548812, "grad_norm": 0.08754976876063808, "learning_rate": 0.00019718115683235417, "loss": 0.3357, "step": 383 }, { "epoch": 0.5065963060686016, "grad_norm": 0.09084727634322448, "learning_rate": 0.00019714484196497084, "loss": 0.3289, "step": 384 }, { "epoch": 0.5079155672823219, "grad_norm": 0.08684766826464946, "learning_rate": 0.00019710829805926857, "loss": 0.3384, "step": 385 }, { "epoch": 0.5092348284960422, "grad_norm": 0.0850838319104451, "learning_rate": 0.00019707152520140694, "loss": 0.3225, "step": 386 }, { "epoch": 0.5105540897097626, "grad_norm": 0.08623576940953968, "learning_rate": 0.00019703452347808527, "loss": 0.3395, "step": 387 }, { "epoch": 0.5118733509234829, "grad_norm": 0.09088620699240582, "learning_rate": 0.0001969972929765425, "loss": 0.3515, "step": 388 }, { "epoch": 0.5131926121372031, "grad_norm": 0.08849172284328792, "learning_rate": 0.00019695983378455694, "loss": 0.3388, "step": 389 }, { "epoch": 0.5145118733509235, "grad_norm": 0.08251653155651849, "learning_rate": 0.0001969221459904461, "loss": 0.3336, "step": 390 }, { "epoch": 0.5158311345646438, "grad_norm": 0.08837806120181799, "learning_rate": 0.0001968842296830665, "loss": 0.3346, "step": 391 }, { "epoch": 0.5171503957783641, "grad_norm": 0.08990710590860274, "learning_rate": 0.00019684608495181333, "loss": 0.3416, "step": 392 }, { "epoch": 0.5184696569920845, "grad_norm": 0.09275341919173749, "learning_rate": 0.00019680771188662044, "loss": 0.3379, "step": 393 }, { "epoch": 0.5197889182058048, "grad_norm": 0.08542257287221022, "learning_rate": 0.00019676911057795997, "loss": 0.3356, "step": 394 }, { "epoch": 0.521108179419525, "grad_norm": 0.08674062348767167, "learning_rate": 0.00019673028111684215, "loss": 0.3297, "step": 395 }, { "epoch": 0.5224274406332454, "grad_norm": 0.08350988653871545, "learning_rate": 0.00019669122359481525, "loss": 0.3327, "step": 396 }, { "epoch": 0.5237467018469657, "grad_norm": 0.08589161913703508, "learning_rate": 0.00019665193810396509, "loss": 0.3296, "step": 397 }, { "epoch": 0.525065963060686, "grad_norm": 0.08428935342869327, "learning_rate": 0.00019661242473691508, "loss": 0.3346, "step": 398 }, { "epoch": 0.5263852242744064, "grad_norm": 0.08447568249031306, "learning_rate": 0.00019657268358682584, "loss": 0.3374, "step": 399 }, { "epoch": 0.5277044854881267, "grad_norm": 0.08425997148944009, "learning_rate": 0.00019653271474739503, "loss": 0.3324, "step": 400 }, { "epoch": 0.5290237467018469, "grad_norm": 0.08160050515886696, "learning_rate": 0.0001964925183128572, "loss": 0.335, "step": 401 }, { "epoch": 0.5303430079155673, "grad_norm": 0.08463924633890439, "learning_rate": 0.0001964520943779834, "loss": 0.3457, "step": 402 }, { "epoch": 0.5316622691292876, "grad_norm": 0.08269276100564432, "learning_rate": 0.0001964114430380812, "loss": 0.3262, "step": 403 }, { "epoch": 0.5329815303430079, "grad_norm": 0.08758207313621288, "learning_rate": 0.0001963705643889941, "loss": 0.3511, "step": 404 }, { "epoch": 0.5343007915567283, "grad_norm": 0.08229461765616651, "learning_rate": 0.00019632945852710173, "loss": 0.3465, "step": 405 }, { "epoch": 0.5356200527704486, "grad_norm": 0.08507994934768198, "learning_rate": 0.00019628812554931937, "loss": 0.332, "step": 406 }, { "epoch": 0.5369393139841688, "grad_norm": 0.08277530522973486, "learning_rate": 0.00019624656555309775, "loss": 0.3349, "step": 407 }, { "epoch": 0.5382585751978892, "grad_norm": 0.08018534393810597, "learning_rate": 0.00019620477863642276, "loss": 0.3391, "step": 408 }, { "epoch": 0.5395778364116095, "grad_norm": 0.08395885467401967, "learning_rate": 0.0001961627648978155, "loss": 0.3298, "step": 409 }, { "epoch": 0.5408970976253298, "grad_norm": 0.08697942039729992, "learning_rate": 0.00019612052443633161, "loss": 0.3368, "step": 410 }, { "epoch": 0.5422163588390502, "grad_norm": 0.08104514293453446, "learning_rate": 0.0001960780573515615, "loss": 0.3335, "step": 411 }, { "epoch": 0.5435356200527705, "grad_norm": 0.08589114948205105, "learning_rate": 0.00019603536374362973, "loss": 0.329, "step": 412 }, { "epoch": 0.5448548812664907, "grad_norm": 0.08708007336182826, "learning_rate": 0.00019599244371319493, "loss": 0.3361, "step": 413 }, { "epoch": 0.5461741424802111, "grad_norm": 0.08582873291032579, "learning_rate": 0.00019594929736144976, "loss": 0.3394, "step": 414 }, { "epoch": 0.5474934036939314, "grad_norm": 0.08309051102164515, "learning_rate": 0.00019590592479012023, "loss": 0.3427, "step": 415 }, { "epoch": 0.5488126649076517, "grad_norm": 0.08429939383152982, "learning_rate": 0.00019586232610146583, "loss": 0.331, "step": 416 }, { "epoch": 0.5501319261213721, "grad_norm": 0.0838069507477774, "learning_rate": 0.0001958185013982792, "loss": 0.3295, "step": 417 }, { "epoch": 0.5514511873350924, "grad_norm": 0.08487070375018464, "learning_rate": 0.00019577445078388582, "loss": 0.3288, "step": 418 }, { "epoch": 0.5527704485488126, "grad_norm": 0.08220058961001427, "learning_rate": 0.0001957301743621437, "loss": 0.3437, "step": 419 }, { "epoch": 0.554089709762533, "grad_norm": 0.08568411231603992, "learning_rate": 0.00019568567223744339, "loss": 0.3378, "step": 420 }, { "epoch": 0.5554089709762533, "grad_norm": 0.08956338446602499, "learning_rate": 0.0001956409445147075, "loss": 0.3453, "step": 421 }, { "epoch": 0.5567282321899736, "grad_norm": 0.08468678915812283, "learning_rate": 0.0001955959912993906, "loss": 0.337, "step": 422 }, { "epoch": 0.558047493403694, "grad_norm": 0.0858349470812717, "learning_rate": 0.00019555081269747877, "loss": 0.3521, "step": 423 }, { "epoch": 0.5593667546174143, "grad_norm": 0.0862795786108168, "learning_rate": 0.0001955054088154896, "loss": 0.3277, "step": 424 }, { "epoch": 0.5606860158311345, "grad_norm": 0.0829643749335341, "learning_rate": 0.00019545977976047175, "loss": 0.3214, "step": 425 }, { "epoch": 0.5620052770448549, "grad_norm": 0.07781985945037312, "learning_rate": 0.00019541392564000488, "loss": 0.3188, "step": 426 }, { "epoch": 0.5633245382585752, "grad_norm": 0.08411823284596769, "learning_rate": 0.00019536784656219917, "loss": 0.3286, "step": 427 }, { "epoch": 0.5646437994722955, "grad_norm": 0.08214093329608539, "learning_rate": 0.00019532154263569526, "loss": 0.3302, "step": 428 }, { "epoch": 0.5659630606860159, "grad_norm": 0.08193007449142446, "learning_rate": 0.00019527501396966382, "loss": 0.3443, "step": 429 }, { "epoch": 0.5672823218997362, "grad_norm": 0.08334324467276248, "learning_rate": 0.00019522826067380552, "loss": 0.3144, "step": 430 }, { "epoch": 0.5686015831134564, "grad_norm": 0.08117460221457512, "learning_rate": 0.0001951812828583505, "loss": 0.336, "step": 431 }, { "epoch": 0.5699208443271768, "grad_norm": 0.08278079461663734, "learning_rate": 0.00019513408063405837, "loss": 0.3253, "step": 432 }, { "epoch": 0.5712401055408971, "grad_norm": 0.08654152946172328, "learning_rate": 0.00019508665411221778, "loss": 0.3409, "step": 433 }, { "epoch": 0.5725593667546174, "grad_norm": 0.08850182024261356, "learning_rate": 0.00019503900340464618, "loss": 0.3311, "step": 434 }, { "epoch": 0.5738786279683378, "grad_norm": 0.08154689899973278, "learning_rate": 0.0001949911286236896, "loss": 0.3394, "step": 435 }, { "epoch": 0.575197889182058, "grad_norm": 0.08149752931722429, "learning_rate": 0.0001949430298822224, "loss": 0.3348, "step": 436 }, { "epoch": 0.5765171503957783, "grad_norm": 0.08364797272382658, "learning_rate": 0.00019489470729364692, "loss": 0.3388, "step": 437 }, { "epoch": 0.5778364116094987, "grad_norm": 0.08319368115493087, "learning_rate": 0.0001948461609718933, "loss": 0.3232, "step": 438 }, { "epoch": 0.579155672823219, "grad_norm": 0.08446131963250951, "learning_rate": 0.00019479739103141918, "loss": 0.3219, "step": 439 }, { "epoch": 0.5804749340369393, "grad_norm": 0.08573128028032231, "learning_rate": 0.0001947483975872094, "loss": 0.3353, "step": 440 }, { "epoch": 0.5817941952506597, "grad_norm": 0.08897306019998495, "learning_rate": 0.00019469918075477573, "loss": 0.3324, "step": 441 }, { "epoch": 0.58311345646438, "grad_norm": 0.08489782223632836, "learning_rate": 0.00019464974065015674, "loss": 0.335, "step": 442 }, { "epoch": 0.5844327176781002, "grad_norm": 0.0869549905585714, "learning_rate": 0.00019460007738991727, "loss": 0.3309, "step": 443 }, { "epoch": 0.5857519788918206, "grad_norm": 0.08649123606166434, "learning_rate": 0.00019455019109114834, "loss": 0.338, "step": 444 }, { "epoch": 0.5870712401055409, "grad_norm": 0.08656638533042739, "learning_rate": 0.00019450008187146684, "loss": 0.3374, "step": 445 }, { "epoch": 0.5883905013192612, "grad_norm": 0.08254541651097673, "learning_rate": 0.00019444974984901529, "loss": 0.3382, "step": 446 }, { "epoch": 0.5897097625329816, "grad_norm": 0.08179931184369654, "learning_rate": 0.00019439919514246143, "loss": 0.3342, "step": 447 }, { "epoch": 0.5910290237467019, "grad_norm": 0.0817551310580504, "learning_rate": 0.00019434841787099803, "loss": 0.335, "step": 448 }, { "epoch": 0.5923482849604221, "grad_norm": 0.08276590983117806, "learning_rate": 0.00019429741815434265, "loss": 0.3348, "step": 449 }, { "epoch": 0.5936675461741425, "grad_norm": 0.07884239738785236, "learning_rate": 0.00019424619611273727, "loss": 0.3252, "step": 450 }, { "epoch": 0.5949868073878628, "grad_norm": 0.08265608656867947, "learning_rate": 0.00019419475186694806, "loss": 0.3238, "step": 451 }, { "epoch": 0.5963060686015831, "grad_norm": 0.0847924216051823, "learning_rate": 0.0001941430855382651, "loss": 0.3317, "step": 452 }, { "epoch": 0.5976253298153035, "grad_norm": 0.07867232119133195, "learning_rate": 0.00019409119724850203, "loss": 0.34, "step": 453 }, { "epoch": 0.5989445910290238, "grad_norm": 0.07700217630383197, "learning_rate": 0.00019403908711999583, "loss": 0.3285, "step": 454 }, { "epoch": 0.600263852242744, "grad_norm": 0.08855657535577448, "learning_rate": 0.0001939867552756065, "loss": 0.3316, "step": 455 }, { "epoch": 0.6015831134564644, "grad_norm": 0.0826266027849057, "learning_rate": 0.00019393420183871682, "loss": 0.3345, "step": 456 }, { "epoch": 0.6029023746701847, "grad_norm": 0.08182517743250414, "learning_rate": 0.00019388142693323198, "loss": 0.3318, "step": 457 }, { "epoch": 0.604221635883905, "grad_norm": 0.08392174010991946, "learning_rate": 0.00019382843068357932, "loss": 0.3243, "step": 458 }, { "epoch": 0.6055408970976254, "grad_norm": 0.08370860246831847, "learning_rate": 0.00019377521321470805, "loss": 0.335, "step": 459 }, { "epoch": 0.6068601583113457, "grad_norm": 0.07983723685014193, "learning_rate": 0.00019372177465208897, "loss": 0.3283, "step": 460 }, { "epoch": 0.6081794195250659, "grad_norm": 0.08493061177126981, "learning_rate": 0.00019366811512171417, "loss": 0.3499, "step": 461 }, { "epoch": 0.6094986807387863, "grad_norm": 0.08300217816771671, "learning_rate": 0.0001936142347500966, "loss": 0.3398, "step": 462 }, { "epoch": 0.6108179419525066, "grad_norm": 0.08087224972467177, "learning_rate": 0.00019356013366427007, "loss": 0.3415, "step": 463 }, { "epoch": 0.6121372031662269, "grad_norm": 0.07916512929517935, "learning_rate": 0.00019350581199178857, "loss": 0.327, "step": 464 }, { "epoch": 0.6134564643799473, "grad_norm": 0.08260909055594477, "learning_rate": 0.00019345126986072635, "loss": 0.3376, "step": 465 }, { "epoch": 0.6147757255936676, "grad_norm": 0.08218411224817182, "learning_rate": 0.0001933965073996773, "loss": 0.3351, "step": 466 }, { "epoch": 0.6160949868073878, "grad_norm": 0.0833583533727767, "learning_rate": 0.00019334152473775485, "loss": 0.3434, "step": 467 }, { "epoch": 0.6174142480211082, "grad_norm": 0.0807204504474185, "learning_rate": 0.00019328632200459156, "loss": 0.3271, "step": 468 }, { "epoch": 0.6187335092348285, "grad_norm": 0.08387607481115048, "learning_rate": 0.0001932308993303389, "loss": 0.3279, "step": 469 }, { "epoch": 0.6200527704485488, "grad_norm": 0.08198000591021921, "learning_rate": 0.00019317525684566685, "loss": 0.337, "step": 470 }, { "epoch": 0.6213720316622692, "grad_norm": 0.07929341892119149, "learning_rate": 0.00019311939468176368, "loss": 0.3284, "step": 471 }, { "epoch": 0.6226912928759895, "grad_norm": 0.0809780485981929, "learning_rate": 0.0001930633129703355, "loss": 0.3383, "step": 472 }, { "epoch": 0.6240105540897097, "grad_norm": 0.07996980400550173, "learning_rate": 0.0001930070118436062, "loss": 0.3212, "step": 473 }, { "epoch": 0.6253298153034301, "grad_norm": 0.07901732753740348, "learning_rate": 0.00019295049143431685, "loss": 0.3324, "step": 474 }, { "epoch": 0.6266490765171504, "grad_norm": 0.07836517717785385, "learning_rate": 0.00019289375187572563, "loss": 0.3242, "step": 475 }, { "epoch": 0.6279683377308707, "grad_norm": 0.08435396522507235, "learning_rate": 0.00019283679330160726, "loss": 0.3222, "step": 476 }, { "epoch": 0.6292875989445911, "grad_norm": 0.07863940240710424, "learning_rate": 0.00019277961584625303, "loss": 0.3173, "step": 477 }, { "epoch": 0.6306068601583114, "grad_norm": 0.08335204630832893, "learning_rate": 0.0001927222196444701, "loss": 0.3436, "step": 478 }, { "epoch": 0.6319261213720316, "grad_norm": 0.08148641305786883, "learning_rate": 0.00019266460483158152, "loss": 0.327, "step": 479 }, { "epoch": 0.633245382585752, "grad_norm": 0.08253030313333945, "learning_rate": 0.00019260677154342564, "loss": 0.3387, "step": 480 }, { "epoch": 0.6345646437994723, "grad_norm": 0.08208012606481756, "learning_rate": 0.00019254871991635598, "loss": 0.3222, "step": 481 }, { "epoch": 0.6358839050131926, "grad_norm": 0.08239407010136815, "learning_rate": 0.00019249045008724077, "loss": 0.326, "step": 482 }, { "epoch": 0.637203166226913, "grad_norm": 0.07773760909524878, "learning_rate": 0.00019243196219346283, "loss": 0.3319, "step": 483 }, { "epoch": 0.6385224274406333, "grad_norm": 0.08061049752133144, "learning_rate": 0.00019237325637291896, "loss": 0.3215, "step": 484 }, { "epoch": 0.6398416886543535, "grad_norm": 0.08481635543092217, "learning_rate": 0.00019231433276401986, "loss": 0.3205, "step": 485 }, { "epoch": 0.6411609498680739, "grad_norm": 0.08352516609780243, "learning_rate": 0.00019225519150568965, "loss": 0.3374, "step": 486 }, { "epoch": 0.6424802110817942, "grad_norm": 0.08123118610594346, "learning_rate": 0.00019219583273736562, "loss": 0.3235, "step": 487 }, { "epoch": 0.6437994722955145, "grad_norm": 0.07946963190130013, "learning_rate": 0.00019213625659899791, "loss": 0.3198, "step": 488 }, { "epoch": 0.6451187335092349, "grad_norm": 0.08149334496438398, "learning_rate": 0.00019207646323104915, "loss": 0.3218, "step": 489 }, { "epoch": 0.6464379947229552, "grad_norm": 0.07915316205337136, "learning_rate": 0.0001920164527744941, "loss": 0.3179, "step": 490 }, { "epoch": 0.6477572559366754, "grad_norm": 0.07773007987231065, "learning_rate": 0.00019195622537081935, "loss": 0.3127, "step": 491 }, { "epoch": 0.6490765171503958, "grad_norm": 0.0769970419634341, "learning_rate": 0.00019189578116202307, "loss": 0.3272, "step": 492 }, { "epoch": 0.6503957783641161, "grad_norm": 0.07857163254429707, "learning_rate": 0.00019183512029061445, "loss": 0.336, "step": 493 }, { "epoch": 0.6517150395778364, "grad_norm": 0.08096140536575598, "learning_rate": 0.00019177424289961361, "loss": 0.3321, "step": 494 }, { "epoch": 0.6530343007915568, "grad_norm": 0.08174881922963369, "learning_rate": 0.00019171314913255113, "loss": 0.3183, "step": 495 }, { "epoch": 0.6543535620052771, "grad_norm": 0.07835709653390874, "learning_rate": 0.00019165183913346775, "loss": 0.3262, "step": 496 }, { "epoch": 0.6556728232189973, "grad_norm": 0.08442235363075887, "learning_rate": 0.00019159031304691397, "loss": 0.3184, "step": 497 }, { "epoch": 0.6569920844327177, "grad_norm": 0.07931961697032817, "learning_rate": 0.00019152857101794978, "loss": 0.3329, "step": 498 }, { "epoch": 0.658311345646438, "grad_norm": 0.07810154915470591, "learning_rate": 0.0001914666131921444, "loss": 0.3316, "step": 499 }, { "epoch": 0.6596306068601583, "grad_norm": 0.07738452118225553, "learning_rate": 0.0001914044397155757, "loss": 0.3356, "step": 500 }, { "epoch": 0.6609498680738787, "grad_norm": 0.07974302296929887, "learning_rate": 0.00019134205073483002, "loss": 0.3355, "step": 501 }, { "epoch": 0.662269129287599, "grad_norm": 0.08030499273528907, "learning_rate": 0.00019127944639700184, "loss": 0.3276, "step": 502 }, { "epoch": 0.6635883905013192, "grad_norm": 0.08112652588454172, "learning_rate": 0.00019121662684969335, "loss": 0.3245, "step": 503 }, { "epoch": 0.6649076517150396, "grad_norm": 0.07733053910916073, "learning_rate": 0.00019115359224101416, "loss": 0.337, "step": 504 }, { "epoch": 0.6662269129287599, "grad_norm": 0.08121982364944508, "learning_rate": 0.00019109034271958087, "loss": 0.3266, "step": 505 }, { "epoch": 0.6675461741424802, "grad_norm": 0.08162197554661528, "learning_rate": 0.00019102687843451687, "loss": 0.332, "step": 506 }, { "epoch": 0.6688654353562006, "grad_norm": 0.07318874692776918, "learning_rate": 0.00019096319953545185, "loss": 0.3018, "step": 507 }, { "epoch": 0.6701846965699209, "grad_norm": 0.08185537584375252, "learning_rate": 0.0001908993061725215, "loss": 0.3291, "step": 508 }, { "epoch": 0.6715039577836411, "grad_norm": 0.08103873559796818, "learning_rate": 0.00019083519849636714, "loss": 0.3371, "step": 509 }, { "epoch": 0.6728232189973615, "grad_norm": 0.07678603621979317, "learning_rate": 0.00019077087665813545, "loss": 0.3178, "step": 510 }, { "epoch": 0.6741424802110818, "grad_norm": 0.07764870516326369, "learning_rate": 0.0001907063408094779, "loss": 0.3233, "step": 511 }, { "epoch": 0.6754617414248021, "grad_norm": 0.08109312324415686, "learning_rate": 0.0001906415911025506, "loss": 0.3325, "step": 512 }, { "epoch": 0.6767810026385225, "grad_norm": 0.07884926842046147, "learning_rate": 0.00019057662769001395, "loss": 0.338, "step": 513 }, { "epoch": 0.6781002638522428, "grad_norm": 0.07781952621577197, "learning_rate": 0.00019051145072503215, "loss": 0.3278, "step": 514 }, { "epoch": 0.679419525065963, "grad_norm": 0.07759907747198086, "learning_rate": 0.0001904460603612728, "loss": 0.3317, "step": 515 }, { "epoch": 0.6807387862796834, "grad_norm": 0.07835788002434717, "learning_rate": 0.00019038045675290674, "loss": 0.3207, "step": 516 }, { "epoch": 0.6820580474934037, "grad_norm": 0.07962390214529308, "learning_rate": 0.0001903146400546076, "loss": 0.3328, "step": 517 }, { "epoch": 0.683377308707124, "grad_norm": 0.07731736065899096, "learning_rate": 0.0001902486104215513, "loss": 0.3293, "step": 518 }, { "epoch": 0.6846965699208444, "grad_norm": 0.07769795362281134, "learning_rate": 0.00019018236800941586, "loss": 0.3204, "step": 519 }, { "epoch": 0.6860158311345647, "grad_norm": 0.07677185088765882, "learning_rate": 0.00019011591297438097, "loss": 0.3187, "step": 520 }, { "epoch": 0.6873350923482849, "grad_norm": 0.0792528405767544, "learning_rate": 0.0001900492454731276, "loss": 0.3157, "step": 521 }, { "epoch": 0.6886543535620053, "grad_norm": 0.08143758846780581, "learning_rate": 0.00018998236566283774, "loss": 0.329, "step": 522 }, { "epoch": 0.6899736147757256, "grad_norm": 0.07990956948078035, "learning_rate": 0.00018991527370119376, "loss": 0.3258, "step": 523 }, { "epoch": 0.6912928759894459, "grad_norm": 0.08038489800313037, "learning_rate": 0.00018984796974637843, "loss": 0.3218, "step": 524 }, { "epoch": 0.6926121372031663, "grad_norm": 0.07788866305500225, "learning_rate": 0.00018978045395707418, "loss": 0.3222, "step": 525 }, { "epoch": 0.6939313984168866, "grad_norm": 0.08155915298406283, "learning_rate": 0.00018971272649246292, "loss": 0.3403, "step": 526 }, { "epoch": 0.6952506596306068, "grad_norm": 0.0791546495889291, "learning_rate": 0.00018964478751222564, "loss": 0.3374, "step": 527 }, { "epoch": 0.6965699208443272, "grad_norm": 0.07571967631189738, "learning_rate": 0.00018957663717654208, "loss": 0.319, "step": 528 }, { "epoch": 0.6978891820580475, "grad_norm": 0.07672943966694856, "learning_rate": 0.00018950827564609014, "loss": 0.3323, "step": 529 }, { "epoch": 0.6992084432717678, "grad_norm": 0.07459550499054882, "learning_rate": 0.00018943970308204583, "loss": 0.3166, "step": 530 }, { "epoch": 0.7005277044854882, "grad_norm": 0.07660793334220212, "learning_rate": 0.00018937091964608263, "loss": 0.3364, "step": 531 }, { "epoch": 0.7018469656992085, "grad_norm": 0.07901728060597539, "learning_rate": 0.00018930192550037112, "loss": 0.3256, "step": 532 }, { "epoch": 0.7031662269129287, "grad_norm": 0.07635754702306853, "learning_rate": 0.0001892327208075788, "loss": 0.3315, "step": 533 }, { "epoch": 0.7044854881266491, "grad_norm": 0.07969766029214384, "learning_rate": 0.00018916330573086953, "loss": 0.3289, "step": 534 }, { "epoch": 0.7058047493403694, "grad_norm": 0.07723783215866646, "learning_rate": 0.00018909368043390314, "loss": 0.3224, "step": 535 }, { "epoch": 0.7071240105540897, "grad_norm": 0.07548530343476885, "learning_rate": 0.00018902384508083517, "loss": 0.3169, "step": 536 }, { "epoch": 0.7084432717678101, "grad_norm": 0.07804886441878957, "learning_rate": 0.00018895379983631635, "loss": 0.3293, "step": 537 }, { "epoch": 0.7097625329815304, "grad_norm": 0.07762770005388542, "learning_rate": 0.00018888354486549237, "loss": 0.3087, "step": 538 }, { "epoch": 0.7110817941952506, "grad_norm": 0.07840986066065359, "learning_rate": 0.00018881308033400323, "loss": 0.3178, "step": 539 }, { "epoch": 0.712401055408971, "grad_norm": 0.07726893355845603, "learning_rate": 0.00018874240640798316, "loss": 0.3217, "step": 540 }, { "epoch": 0.7137203166226913, "grad_norm": 0.07795197874084298, "learning_rate": 0.00018867152325406, "loss": 0.326, "step": 541 }, { "epoch": 0.7150395778364116, "grad_norm": 0.08056951238385303, "learning_rate": 0.00018860043103935487, "loss": 0.3135, "step": 542 }, { "epoch": 0.716358839050132, "grad_norm": 0.07601366222023322, "learning_rate": 0.0001885291299314819, "loss": 0.3112, "step": 543 }, { "epoch": 0.7176781002638523, "grad_norm": 0.08201752768769388, "learning_rate": 0.00018845762009854763, "loss": 0.3278, "step": 544 }, { "epoch": 0.7189973614775725, "grad_norm": 0.07666118785398875, "learning_rate": 0.0001883859017091507, "loss": 0.3212, "step": 545 }, { "epoch": 0.7203166226912929, "grad_norm": 0.07600945062250915, "learning_rate": 0.00018831397493238158, "loss": 0.3267, "step": 546 }, { "epoch": 0.7216358839050132, "grad_norm": 0.07774654898421471, "learning_rate": 0.00018824183993782192, "loss": 0.3339, "step": 547 }, { "epoch": 0.7229551451187335, "grad_norm": 0.07677314844537811, "learning_rate": 0.00018816949689554434, "loss": 0.3167, "step": 548 }, { "epoch": 0.7242744063324539, "grad_norm": 0.0771558504533911, "learning_rate": 0.00018809694597611201, "loss": 0.3344, "step": 549 }, { "epoch": 0.7255936675461742, "grad_norm": 0.07656144679047976, "learning_rate": 0.00018802418735057815, "loss": 0.3245, "step": 550 }, { "epoch": 0.7269129287598944, "grad_norm": 0.07811463536734536, "learning_rate": 0.0001879512211904857, "loss": 0.326, "step": 551 }, { "epoch": 0.7282321899736148, "grad_norm": 0.07860140913790992, "learning_rate": 0.00018787804766786693, "loss": 0.3249, "step": 552 }, { "epoch": 0.7295514511873351, "grad_norm": 0.0758405254108781, "learning_rate": 0.000187804666955243, "loss": 0.3186, "step": 553 }, { "epoch": 0.7308707124010554, "grad_norm": 0.07877370533080455, "learning_rate": 0.00018773107922562357, "loss": 0.3171, "step": 554 }, { "epoch": 0.7321899736147758, "grad_norm": 0.07474655558685508, "learning_rate": 0.00018765728465250644, "loss": 0.3291, "step": 555 }, { "epoch": 0.7335092348284961, "grad_norm": 0.07618041482377434, "learning_rate": 0.00018758328340987688, "loss": 0.3257, "step": 556 }, { "epoch": 0.7348284960422163, "grad_norm": 0.07780590437669352, "learning_rate": 0.00018750907567220763, "loss": 0.3285, "step": 557 }, { "epoch": 0.7361477572559367, "grad_norm": 0.08125015855386801, "learning_rate": 0.00018743466161445823, "loss": 0.333, "step": 558 }, { "epoch": 0.737467018469657, "grad_norm": 0.07800226754706742, "learning_rate": 0.0001873600414120746, "loss": 0.3152, "step": 559 }, { "epoch": 0.7387862796833773, "grad_norm": 0.0780320799071381, "learning_rate": 0.0001872852152409888, "loss": 0.3156, "step": 560 }, { "epoch": 0.7401055408970977, "grad_norm": 0.07903448975609118, "learning_rate": 0.00018721018327761842, "loss": 0.3257, "step": 561 }, { "epoch": 0.741424802110818, "grad_norm": 0.07704440000004223, "learning_rate": 0.0001871349456988662, "loss": 0.3178, "step": 562 }, { "epoch": 0.7427440633245382, "grad_norm": 0.08283205482131041, "learning_rate": 0.0001870595026821198, "loss": 0.3303, "step": 563 }, { "epoch": 0.7440633245382586, "grad_norm": 0.07992746962965212, "learning_rate": 0.0001869838544052511, "loss": 0.3185, "step": 564 }, { "epoch": 0.7453825857519789, "grad_norm": 0.08235036854344133, "learning_rate": 0.00018690800104661603, "loss": 0.3259, "step": 565 }, { "epoch": 0.7467018469656992, "grad_norm": 0.0809699648298397, "learning_rate": 0.00018683194278505395, "loss": 0.3274, "step": 566 }, { "epoch": 0.7480211081794196, "grad_norm": 0.07620486395199721, "learning_rate": 0.00018675567979988743, "loss": 0.3242, "step": 567 }, { "epoch": 0.7493403693931399, "grad_norm": 0.0817283286043494, "learning_rate": 0.00018667921227092158, "loss": 0.3221, "step": 568 }, { "epoch": 0.7506596306068601, "grad_norm": 0.07961943137223382, "learning_rate": 0.00018660254037844388, "loss": 0.3198, "step": 569 }, { "epoch": 0.7519788918205804, "grad_norm": 0.08218411100217514, "learning_rate": 0.00018652566430322356, "loss": 0.3231, "step": 570 }, { "epoch": 0.7532981530343008, "grad_norm": 0.07444749736398726, "learning_rate": 0.00018644858422651133, "loss": 0.3197, "step": 571 }, { "epoch": 0.7546174142480211, "grad_norm": 0.07752950110507445, "learning_rate": 0.0001863713003300388, "loss": 0.3256, "step": 572 }, { "epoch": 0.7559366754617414, "grad_norm": 0.08149590411559456, "learning_rate": 0.00018629381279601813, "loss": 0.3333, "step": 573 }, { "epoch": 0.7572559366754618, "grad_norm": 0.07512590965011065, "learning_rate": 0.00018621612180714164, "loss": 0.316, "step": 574 }, { "epoch": 0.758575197889182, "grad_norm": 0.07524275993252107, "learning_rate": 0.00018613822754658132, "loss": 0.3242, "step": 575 }, { "epoch": 0.7598944591029023, "grad_norm": 0.07825038364874115, "learning_rate": 0.00018606013019798837, "loss": 0.319, "step": 576 }, { "epoch": 0.7612137203166227, "grad_norm": 0.07809667760411727, "learning_rate": 0.00018598182994549294, "loss": 0.3272, "step": 577 }, { "epoch": 0.762532981530343, "grad_norm": 0.0802834087151825, "learning_rate": 0.0001859033269737034, "loss": 0.3194, "step": 578 }, { "epoch": 0.7638522427440633, "grad_norm": 0.0852199043117729, "learning_rate": 0.00018582462146770614, "loss": 0.3415, "step": 579 }, { "epoch": 0.7651715039577837, "grad_norm": 0.08254004552106368, "learning_rate": 0.0001857457136130651, "loss": 0.3359, "step": 580 }, { "epoch": 0.7664907651715039, "grad_norm": 0.07779760312194849, "learning_rate": 0.00018566660359582127, "loss": 0.3208, "step": 581 }, { "epoch": 0.7678100263852242, "grad_norm": 0.07856141505616279, "learning_rate": 0.00018558729160249229, "loss": 0.3266, "step": 582 }, { "epoch": 0.7691292875989446, "grad_norm": 0.08163748702813539, "learning_rate": 0.00018550777782007193, "loss": 0.3371, "step": 583 }, { "epoch": 0.7704485488126649, "grad_norm": 0.07566557334466158, "learning_rate": 0.0001854280624360298, "loss": 0.3188, "step": 584 }, { "epoch": 0.7717678100263852, "grad_norm": 0.07513678351581865, "learning_rate": 0.00018534814563831082, "loss": 0.328, "step": 585 }, { "epoch": 0.7730870712401056, "grad_norm": 0.0760493027085604, "learning_rate": 0.00018526802761533479, "loss": 0.3271, "step": 586 }, { "epoch": 0.7744063324538258, "grad_norm": 0.08135378231315221, "learning_rate": 0.0001851877085559958, "loss": 0.336, "step": 587 }, { "epoch": 0.7757255936675461, "grad_norm": 0.07581370519174971, "learning_rate": 0.0001851071886496621, "loss": 0.3281, "step": 588 }, { "epoch": 0.7770448548812665, "grad_norm": 0.0816317952228403, "learning_rate": 0.00018502646808617548, "loss": 0.3316, "step": 589 }, { "epoch": 0.7783641160949868, "grad_norm": 0.0760869720572262, "learning_rate": 0.00018494554705585065, "loss": 0.3214, "step": 590 }, { "epoch": 0.7796833773087071, "grad_norm": 0.07880791361806208, "learning_rate": 0.00018486442574947511, "loss": 0.3359, "step": 591 }, { "epoch": 0.7810026385224275, "grad_norm": 0.07906384240395835, "learning_rate": 0.00018478310435830845, "loss": 0.3229, "step": 592 }, { "epoch": 0.7823218997361477, "grad_norm": 0.07900548401936976, "learning_rate": 0.00018470158307408213, "loss": 0.3341, "step": 593 }, { "epoch": 0.783641160949868, "grad_norm": 0.0760912322113177, "learning_rate": 0.00018461986208899878, "loss": 0.3346, "step": 594 }, { "epoch": 0.7849604221635884, "grad_norm": 0.07766841887919511, "learning_rate": 0.00018453794159573186, "loss": 0.3331, "step": 595 }, { "epoch": 0.7862796833773087, "grad_norm": 0.07339955909799124, "learning_rate": 0.0001844558217874253, "loss": 0.3104, "step": 596 }, { "epoch": 0.787598944591029, "grad_norm": 0.0752052006481722, "learning_rate": 0.00018437350285769295, "loss": 0.3255, "step": 597 }, { "epoch": 0.7889182058047494, "grad_norm": 0.07669706895122967, "learning_rate": 0.00018429098500061798, "loss": 0.3205, "step": 598 }, { "epoch": 0.7902374670184696, "grad_norm": 0.0778958794353999, "learning_rate": 0.00018420826841075277, "loss": 0.3212, "step": 599 }, { "epoch": 0.7915567282321899, "grad_norm": 0.07774737575184812, "learning_rate": 0.00018412535328311814, "loss": 0.3278, "step": 600 }, { "epoch": 0.7928759894459103, "grad_norm": 0.07735745011528168, "learning_rate": 0.000184042239813203, "loss": 0.3268, "step": 601 }, { "epoch": 0.7941952506596306, "grad_norm": 0.07587149129365167, "learning_rate": 0.00018395892819696389, "loss": 0.3183, "step": 602 }, { "epoch": 0.7955145118733509, "grad_norm": 0.0773478981052736, "learning_rate": 0.0001838754186308246, "loss": 0.3252, "step": 603 }, { "epoch": 0.7968337730870713, "grad_norm": 0.07584854663174286, "learning_rate": 0.00018379171131167557, "loss": 0.3209, "step": 604 }, { "epoch": 0.7981530343007915, "grad_norm": 0.07490819095575325, "learning_rate": 0.00018370780643687344, "loss": 0.3164, "step": 605 }, { "epoch": 0.7994722955145118, "grad_norm": 0.07932299262224758, "learning_rate": 0.00018362370420424068, "loss": 0.3372, "step": 606 }, { "epoch": 0.8007915567282322, "grad_norm": 0.07904948398916983, "learning_rate": 0.00018353940481206506, "loss": 0.3284, "step": 607 }, { "epoch": 0.8021108179419525, "grad_norm": 0.0771126078553985, "learning_rate": 0.00018345490845909923, "loss": 0.3275, "step": 608 }, { "epoch": 0.8034300791556728, "grad_norm": 0.07878982794762655, "learning_rate": 0.00018337021534456014, "loss": 0.325, "step": 609 }, { "epoch": 0.8047493403693932, "grad_norm": 0.07770354364396097, "learning_rate": 0.00018328532566812866, "loss": 0.3138, "step": 610 }, { "epoch": 0.8060686015831134, "grad_norm": 0.07796705476198237, "learning_rate": 0.00018320023962994914, "loss": 0.3268, "step": 611 }, { "epoch": 0.8073878627968337, "grad_norm": 0.07641123188092923, "learning_rate": 0.00018311495743062887, "loss": 0.3317, "step": 612 }, { "epoch": 0.8087071240105541, "grad_norm": 0.07936044919590553, "learning_rate": 0.00018302947927123766, "loss": 0.3311, "step": 613 }, { "epoch": 0.8100263852242744, "grad_norm": 0.07855031323130221, "learning_rate": 0.0001829438053533072, "loss": 0.3208, "step": 614 }, { "epoch": 0.8113456464379947, "grad_norm": 0.07593023138630253, "learning_rate": 0.00018285793587883092, "loss": 0.3178, "step": 615 }, { "epoch": 0.8126649076517151, "grad_norm": 0.07768353877792239, "learning_rate": 0.0001827718710502632, "loss": 0.3327, "step": 616 }, { "epoch": 0.8139841688654353, "grad_norm": 0.07565924686103344, "learning_rate": 0.00018268561107051892, "loss": 0.3227, "step": 617 }, { "epoch": 0.8153034300791556, "grad_norm": 0.07778663063369524, "learning_rate": 0.0001825991561429733, "loss": 0.325, "step": 618 }, { "epoch": 0.816622691292876, "grad_norm": 0.07695217374282812, "learning_rate": 0.000182512506471461, "loss": 0.3361, "step": 619 }, { "epoch": 0.8179419525065963, "grad_norm": 0.07482131127989258, "learning_rate": 0.0001824256622602759, "loss": 0.3201, "step": 620 }, { "epoch": 0.8192612137203166, "grad_norm": 0.07492459308283635, "learning_rate": 0.00018233862371417047, "loss": 0.3309, "step": 621 }, { "epoch": 0.820580474934037, "grad_norm": 0.07606552508327255, "learning_rate": 0.00018225139103835548, "loss": 0.324, "step": 622 }, { "epoch": 0.8218997361477572, "grad_norm": 0.07595243399647385, "learning_rate": 0.00018216396443849933, "loss": 0.3427, "step": 623 }, { "epoch": 0.8232189973614775, "grad_norm": 0.07908807135547007, "learning_rate": 0.00018207634412072764, "loss": 0.3265, "step": 624 }, { "epoch": 0.8245382585751979, "grad_norm": 0.07744970571678761, "learning_rate": 0.0001819885302916228, "loss": 0.3186, "step": 625 }, { "epoch": 0.8258575197889182, "grad_norm": 0.07574481012139717, "learning_rate": 0.00018190052315822332, "loss": 0.3347, "step": 626 }, { "epoch": 0.8271767810026385, "grad_norm": 0.07870062843685088, "learning_rate": 0.00018181232292802365, "loss": 0.3217, "step": 627 }, { "epoch": 0.8284960422163589, "grad_norm": 0.07750914192409034, "learning_rate": 0.00018172392980897337, "loss": 0.3329, "step": 628 }, { "epoch": 0.8298153034300791, "grad_norm": 0.07630760821353051, "learning_rate": 0.00018163534400947683, "loss": 0.3288, "step": 629 }, { "epoch": 0.8311345646437994, "grad_norm": 0.07698700799215849, "learning_rate": 0.00018154656573839275, "loss": 0.3229, "step": 630 }, { "epoch": 0.8324538258575198, "grad_norm": 0.07378013702759977, "learning_rate": 0.00018145759520503358, "loss": 0.3203, "step": 631 }, { "epoch": 0.8337730870712401, "grad_norm": 0.07430539084075606, "learning_rate": 0.00018136843261916508, "loss": 0.3244, "step": 632 }, { "epoch": 0.8350923482849604, "grad_norm": 0.07829109265979835, "learning_rate": 0.0001812790781910058, "loss": 0.32, "step": 633 }, { "epoch": 0.8364116094986808, "grad_norm": 0.07750597295818812, "learning_rate": 0.00018118953213122656, "loss": 0.3272, "step": 634 }, { "epoch": 0.837730870712401, "grad_norm": 0.07438713478577097, "learning_rate": 0.00018109979465095013, "loss": 0.3155, "step": 635 }, { "epoch": 0.8390501319261213, "grad_norm": 0.07396619698765629, "learning_rate": 0.00018100986596175046, "loss": 0.3231, "step": 636 }, { "epoch": 0.8403693931398417, "grad_norm": 0.07854845579779261, "learning_rate": 0.0001809197462756523, "loss": 0.3163, "step": 637 }, { "epoch": 0.841688654353562, "grad_norm": 0.07534294514976372, "learning_rate": 0.00018082943580513083, "loss": 0.3211, "step": 638 }, { "epoch": 0.8430079155672823, "grad_norm": 0.07673313463746073, "learning_rate": 0.00018073893476311097, "loss": 0.3282, "step": 639 }, { "epoch": 0.8443271767810027, "grad_norm": 0.07745647047420233, "learning_rate": 0.0001806482433629669, "loss": 0.3211, "step": 640 }, { "epoch": 0.8456464379947229, "grad_norm": 0.07754452461919048, "learning_rate": 0.00018055736181852176, "loss": 0.3274, "step": 641 }, { "epoch": 0.8469656992084432, "grad_norm": 0.07689093267394997, "learning_rate": 0.0001804662903440468, "loss": 0.3201, "step": 642 }, { "epoch": 0.8482849604221636, "grad_norm": 0.07676533750604121, "learning_rate": 0.00018037502915426123, "loss": 0.3183, "step": 643 }, { "epoch": 0.8496042216358839, "grad_norm": 0.07300727204166407, "learning_rate": 0.00018028357846433144, "loss": 0.3159, "step": 644 }, { "epoch": 0.8509234828496042, "grad_norm": 0.07655101194721985, "learning_rate": 0.0001801919384898707, "loss": 0.3207, "step": 645 }, { "epoch": 0.8522427440633246, "grad_norm": 0.07547470115368125, "learning_rate": 0.00018010010944693848, "loss": 0.332, "step": 646 }, { "epoch": 0.8535620052770448, "grad_norm": 0.07084627983728624, "learning_rate": 0.00018000809155204004, "loss": 0.3146, "step": 647 }, { "epoch": 0.8548812664907651, "grad_norm": 0.0792679171165376, "learning_rate": 0.0001799158850221259, "loss": 0.3282, "step": 648 }, { "epoch": 0.8562005277044855, "grad_norm": 0.07615220052526256, "learning_rate": 0.00017982349007459133, "loss": 0.3317, "step": 649 }, { "epoch": 0.8575197889182058, "grad_norm": 0.07546779909892315, "learning_rate": 0.00017973090692727583, "loss": 0.3239, "step": 650 }, { "epoch": 0.8588390501319261, "grad_norm": 0.07390596025676681, "learning_rate": 0.0001796381357984626, "loss": 0.318, "step": 651 }, { "epoch": 0.8601583113456465, "grad_norm": 0.07944895028442554, "learning_rate": 0.0001795451769068781, "loss": 0.3204, "step": 652 }, { "epoch": 0.8614775725593667, "grad_norm": 0.0808213559739916, "learning_rate": 0.0001794520304716914, "loss": 0.3279, "step": 653 }, { "epoch": 0.862796833773087, "grad_norm": 0.07453327366673215, "learning_rate": 0.00017935869671251378, "loss": 0.3109, "step": 654 }, { "epoch": 0.8641160949868074, "grad_norm": 0.07799257239519991, "learning_rate": 0.00017926517584939815, "loss": 0.3208, "step": 655 }, { "epoch": 0.8654353562005277, "grad_norm": 0.0764248152440674, "learning_rate": 0.00017917146810283863, "loss": 0.3269, "step": 656 }, { "epoch": 0.866754617414248, "grad_norm": 0.07750216705682134, "learning_rate": 0.00017907757369376985, "loss": 0.3262, "step": 657 }, { "epoch": 0.8680738786279684, "grad_norm": 0.07764499624069539, "learning_rate": 0.00017898349284356665, "loss": 0.3302, "step": 658 }, { "epoch": 0.8693931398416886, "grad_norm": 0.07583421456158562, "learning_rate": 0.00017888922577404332, "loss": 0.3244, "step": 659 }, { "epoch": 0.8707124010554089, "grad_norm": 0.07821088187996458, "learning_rate": 0.00017879477270745328, "loss": 0.3233, "step": 660 }, { "epoch": 0.8720316622691293, "grad_norm": 0.07581020226494725, "learning_rate": 0.00017870013386648846, "loss": 0.3272, "step": 661 }, { "epoch": 0.8733509234828496, "grad_norm": 0.07532243010872236, "learning_rate": 0.00017860530947427875, "loss": 0.3259, "step": 662 }, { "epoch": 0.8746701846965699, "grad_norm": 0.07927607157329124, "learning_rate": 0.00017851029975439158, "loss": 0.3155, "step": 663 }, { "epoch": 0.8759894459102903, "grad_norm": 0.07565144143646169, "learning_rate": 0.00017841510493083127, "loss": 0.3149, "step": 664 }, { "epoch": 0.8773087071240105, "grad_norm": 0.077896038519202, "learning_rate": 0.00017831972522803862, "loss": 0.3298, "step": 665 }, { "epoch": 0.8786279683377308, "grad_norm": 0.07716667205344427, "learning_rate": 0.00017822416087089025, "loss": 0.3288, "step": 666 }, { "epoch": 0.8799472295514512, "grad_norm": 0.0763876805343907, "learning_rate": 0.00017812841208469815, "loss": 0.318, "step": 667 }, { "epoch": 0.8812664907651715, "grad_norm": 0.07705370747907783, "learning_rate": 0.0001780324790952092, "loss": 0.3199, "step": 668 }, { "epoch": 0.8825857519788918, "grad_norm": 0.07528853662110586, "learning_rate": 0.00017793636212860449, "loss": 0.3229, "step": 669 }, { "epoch": 0.8839050131926122, "grad_norm": 0.07446178176015415, "learning_rate": 0.000177840061411499, "loss": 0.3166, "step": 670 }, { "epoch": 0.8852242744063324, "grad_norm": 0.07048006364320443, "learning_rate": 0.00017774357717094077, "loss": 0.3248, "step": 671 }, { "epoch": 0.8865435356200527, "grad_norm": 0.07250177667079756, "learning_rate": 0.00017764690963441066, "loss": 0.3318, "step": 672 }, { "epoch": 0.8878627968337731, "grad_norm": 0.07296791847857127, "learning_rate": 0.00017755005902982165, "loss": 0.3347, "step": 673 }, { "epoch": 0.8891820580474934, "grad_norm": 0.07401671242122355, "learning_rate": 0.00017745302558551832, "loss": 0.3193, "step": 674 }, { "epoch": 0.8905013192612137, "grad_norm": 0.07410864066903573, "learning_rate": 0.00017735580953027636, "loss": 0.3164, "step": 675 }, { "epoch": 0.8918205804749341, "grad_norm": 0.0739436940162759, "learning_rate": 0.00017725841109330197, "loss": 0.3247, "step": 676 }, { "epoch": 0.8931398416886543, "grad_norm": 0.07652837959139312, "learning_rate": 0.00017716083050423138, "loss": 0.3202, "step": 677 }, { "epoch": 0.8944591029023746, "grad_norm": 0.07456377573463747, "learning_rate": 0.00017706306799313026, "loss": 0.3183, "step": 678 }, { "epoch": 0.895778364116095, "grad_norm": 0.07599764463374072, "learning_rate": 0.00017696512379049325, "loss": 0.3294, "step": 679 }, { "epoch": 0.8970976253298153, "grad_norm": 0.07530147891920357, "learning_rate": 0.00017686699812724326, "loss": 0.3204, "step": 680 }, { "epoch": 0.8984168865435356, "grad_norm": 0.07146252342095924, "learning_rate": 0.00017676869123473113, "loss": 0.3109, "step": 681 }, { "epoch": 0.899736147757256, "grad_norm": 0.07328062757523274, "learning_rate": 0.00017667020334473493, "loss": 0.3067, "step": 682 }, { "epoch": 0.9010554089709762, "grad_norm": 0.0759890390825855, "learning_rate": 0.00017657153468945947, "loss": 0.3214, "step": 683 }, { "epoch": 0.9023746701846965, "grad_norm": 0.07691888066914436, "learning_rate": 0.00017647268550153583, "loss": 0.3213, "step": 684 }, { "epoch": 0.9036939313984169, "grad_norm": 0.07474826479972835, "learning_rate": 0.00017637365601402057, "loss": 0.32, "step": 685 }, { "epoch": 0.9050131926121372, "grad_norm": 0.0797555380722128, "learning_rate": 0.00017627444646039545, "loss": 0.3249, "step": 686 }, { "epoch": 0.9063324538258575, "grad_norm": 0.0753030263280917, "learning_rate": 0.00017617505707456682, "loss": 0.3137, "step": 687 }, { "epoch": 0.9076517150395779, "grad_norm": 0.07227621838752503, "learning_rate": 0.00017607548809086494, "loss": 0.3145, "step": 688 }, { "epoch": 0.9089709762532981, "grad_norm": 0.07000125960043199, "learning_rate": 0.00017597573974404348, "loss": 0.316, "step": 689 }, { "epoch": 0.9102902374670184, "grad_norm": 0.07450924616419469, "learning_rate": 0.0001758758122692791, "loss": 0.3217, "step": 690 }, { "epoch": 0.9116094986807388, "grad_norm": 0.07345478584235057, "learning_rate": 0.0001757757059021707, "loss": 0.3317, "step": 691 }, { "epoch": 0.9129287598944591, "grad_norm": 0.07180435506339888, "learning_rate": 0.00017567542087873895, "loss": 0.3049, "step": 692 }, { "epoch": 0.9142480211081794, "grad_norm": 0.07687645736170094, "learning_rate": 0.00017557495743542585, "loss": 0.329, "step": 693 }, { "epoch": 0.9155672823218998, "grad_norm": 0.07559918389749212, "learning_rate": 0.0001754743158090939, "loss": 0.3259, "step": 694 }, { "epoch": 0.91688654353562, "grad_norm": 0.07813929435092415, "learning_rate": 0.00017537349623702585, "loss": 0.3369, "step": 695 }, { "epoch": 0.9182058047493403, "grad_norm": 0.07307181908744907, "learning_rate": 0.0001752724989569239, "loss": 0.3055, "step": 696 }, { "epoch": 0.9195250659630607, "grad_norm": 0.07341808527081209, "learning_rate": 0.00017517132420690924, "loss": 0.3306, "step": 697 }, { "epoch": 0.920844327176781, "grad_norm": 0.07782186905777579, "learning_rate": 0.00017506997222552158, "loss": 0.3212, "step": 698 }, { "epoch": 0.9221635883905013, "grad_norm": 0.07692345408335559, "learning_rate": 0.00017496844325171827, "loss": 0.3276, "step": 699 }, { "epoch": 0.9234828496042217, "grad_norm": 0.07587193011949236, "learning_rate": 0.00017486673752487424, "loss": 0.329, "step": 700 }, { "epoch": 0.924802110817942, "grad_norm": 0.0757353706239726, "learning_rate": 0.00017476485528478093, "loss": 0.32, "step": 701 }, { "epoch": 0.9261213720316622, "grad_norm": 0.07728038555891979, "learning_rate": 0.000174662796771646, "loss": 0.3245, "step": 702 }, { "epoch": 0.9274406332453826, "grad_norm": 0.07542465542763911, "learning_rate": 0.00017456056222609276, "loss": 0.3317, "step": 703 }, { "epoch": 0.9287598944591029, "grad_norm": 0.07205179921972722, "learning_rate": 0.00017445815188915948, "loss": 0.315, "step": 704 }, { "epoch": 0.9300791556728232, "grad_norm": 0.07400647606276098, "learning_rate": 0.00017435556600229902, "loss": 0.314, "step": 705 }, { "epoch": 0.9313984168865436, "grad_norm": 0.07444557911415577, "learning_rate": 0.00017425280480737798, "loss": 0.3161, "step": 706 }, { "epoch": 0.9327176781002638, "grad_norm": 0.07716836180221044, "learning_rate": 0.00017414986854667636, "loss": 0.3262, "step": 707 }, { "epoch": 0.9340369393139841, "grad_norm": 0.07567084420717658, "learning_rate": 0.00017404675746288687, "loss": 0.3204, "step": 708 }, { "epoch": 0.9353562005277045, "grad_norm": 0.07680904155359644, "learning_rate": 0.00017394347179911448, "loss": 0.3192, "step": 709 }, { "epoch": 0.9366754617414248, "grad_norm": 0.07346909027869813, "learning_rate": 0.0001738400117988757, "loss": 0.3114, "step": 710 }, { "epoch": 0.9379947229551451, "grad_norm": 0.07671037351280624, "learning_rate": 0.0001737363777060981, "loss": 0.3236, "step": 711 }, { "epoch": 0.9393139841688655, "grad_norm": 0.07447384646940915, "learning_rate": 0.00017363256976511972, "loss": 0.3225, "step": 712 }, { "epoch": 0.9406332453825857, "grad_norm": 0.07613462531120221, "learning_rate": 0.00017352858822068844, "loss": 0.3201, "step": 713 }, { "epoch": 0.941952506596306, "grad_norm": 0.07593658785546577, "learning_rate": 0.00017342443331796147, "loss": 0.326, "step": 714 }, { "epoch": 0.9432717678100264, "grad_norm": 0.07517934455736985, "learning_rate": 0.00017332010530250473, "loss": 0.3269, "step": 715 }, { "epoch": 0.9445910290237467, "grad_norm": 0.07677364181870303, "learning_rate": 0.00017321560442029233, "loss": 0.3166, "step": 716 }, { "epoch": 0.945910290237467, "grad_norm": 0.07803905274564243, "learning_rate": 0.00017311093091770588, "loss": 0.3196, "step": 717 }, { "epoch": 0.9472295514511874, "grad_norm": 0.07688970309035748, "learning_rate": 0.00017300608504153405, "loss": 0.3284, "step": 718 }, { "epoch": 0.9485488126649076, "grad_norm": 0.07172710290575555, "learning_rate": 0.00017290106703897186, "loss": 0.3178, "step": 719 }, { "epoch": 0.9498680738786279, "grad_norm": 0.07653402436393283, "learning_rate": 0.00017279587715762022, "loss": 0.3185, "step": 720 }, { "epoch": 0.9511873350923483, "grad_norm": 0.07622257672853588, "learning_rate": 0.00017269051564548516, "loss": 0.324, "step": 721 }, { "epoch": 0.9525065963060686, "grad_norm": 0.07328922039679088, "learning_rate": 0.00017258498275097748, "loss": 0.3205, "step": 722 }, { "epoch": 0.9538258575197889, "grad_norm": 0.07100173749959074, "learning_rate": 0.000172479278722912, "loss": 0.3086, "step": 723 }, { "epoch": 0.9551451187335093, "grad_norm": 0.07814947792486318, "learning_rate": 0.00017237340381050703, "loss": 0.3224, "step": 724 }, { "epoch": 0.9564643799472295, "grad_norm": 0.07380282013859701, "learning_rate": 0.00017226735826338373, "loss": 0.3041, "step": 725 }, { "epoch": 0.9577836411609498, "grad_norm": 0.07826087492821324, "learning_rate": 0.00017216114233156566, "loss": 0.3209, "step": 726 }, { "epoch": 0.9591029023746702, "grad_norm": 0.07761381862114954, "learning_rate": 0.00017205475626547802, "loss": 0.3302, "step": 727 }, { "epoch": 0.9604221635883905, "grad_norm": 0.08022201637157055, "learning_rate": 0.00017194820031594715, "loss": 0.3305, "step": 728 }, { "epoch": 0.9617414248021108, "grad_norm": 0.07756471078365025, "learning_rate": 0.00017184147473419992, "loss": 0.323, "step": 729 }, { "epoch": 0.9630606860158312, "grad_norm": 0.07735912268866554, "learning_rate": 0.00017173457977186316, "loss": 0.3167, "step": 730 }, { "epoch": 0.9643799472295514, "grad_norm": 0.07911902737355421, "learning_rate": 0.00017162751568096306, "loss": 0.3295, "step": 731 }, { "epoch": 0.9656992084432717, "grad_norm": 0.07937296614437254, "learning_rate": 0.00017152028271392452, "loss": 0.3348, "step": 732 }, { "epoch": 0.9670184696569921, "grad_norm": 0.07503065676747361, "learning_rate": 0.00017141288112357064, "loss": 0.3162, "step": 733 }, { "epoch": 0.9683377308707124, "grad_norm": 0.07215750557035114, "learning_rate": 0.00017130531116312203, "loss": 0.3113, "step": 734 }, { "epoch": 0.9696569920844327, "grad_norm": 0.07361635619123895, "learning_rate": 0.00017119757308619639, "loss": 0.3188, "step": 735 }, { "epoch": 0.9709762532981531, "grad_norm": 0.07474675186480553, "learning_rate": 0.00017108966714680758, "loss": 0.3169, "step": 736 }, { "epoch": 0.9722955145118733, "grad_norm": 0.07270731228595637, "learning_rate": 0.00017098159359936544, "loss": 0.3164, "step": 737 }, { "epoch": 0.9736147757255936, "grad_norm": 0.0738512853814849, "learning_rate": 0.00017087335269867483, "loss": 0.319, "step": 738 }, { "epoch": 0.974934036939314, "grad_norm": 0.07802736815513399, "learning_rate": 0.0001707649446999353, "loss": 0.3407, "step": 739 }, { "epoch": 0.9762532981530343, "grad_norm": 0.07557607346449689, "learning_rate": 0.00017065636985874027, "loss": 0.3196, "step": 740 }, { "epoch": 0.9775725593667546, "grad_norm": 0.07322617896656035, "learning_rate": 0.00017054762843107658, "loss": 0.3138, "step": 741 }, { "epoch": 0.978891820580475, "grad_norm": 0.07309351538575959, "learning_rate": 0.00017043872067332375, "loss": 0.3265, "step": 742 }, { "epoch": 0.9802110817941952, "grad_norm": 0.07183859357211266, "learning_rate": 0.00017032964684225358, "loss": 0.3122, "step": 743 }, { "epoch": 0.9815303430079155, "grad_norm": 0.07185639603245424, "learning_rate": 0.00017022040719502933, "loss": 0.3197, "step": 744 }, { "epoch": 0.9828496042216359, "grad_norm": 0.0718422536684087, "learning_rate": 0.0001701110019892053, "loss": 0.313, "step": 745 }, { "epoch": 0.9841688654353562, "grad_norm": 0.07168074993523973, "learning_rate": 0.0001700014314827259, "loss": 0.3113, "step": 746 }, { "epoch": 0.9854881266490765, "grad_norm": 0.07284224199574281, "learning_rate": 0.0001698916959339256, "loss": 0.3227, "step": 747 }, { "epoch": 0.9868073878627969, "grad_norm": 0.07369662021563723, "learning_rate": 0.00016978179560152773, "loss": 0.3261, "step": 748 }, { "epoch": 0.9881266490765171, "grad_norm": 0.0715372194018445, "learning_rate": 0.00016967173074464422, "loss": 0.3243, "step": 749 }, { "epoch": 0.9894459102902374, "grad_norm": 0.07458058533622318, "learning_rate": 0.0001695615016227749, "loss": 0.312, "step": 750 }, { "epoch": 0.9907651715039578, "grad_norm": 0.072237386192741, "learning_rate": 0.00016945110849580694, "loss": 0.3307, "step": 751 }, { "epoch": 0.9920844327176781, "grad_norm": 0.07381518249161714, "learning_rate": 0.00016934055162401405, "loss": 0.3345, "step": 752 }, { "epoch": 0.9934036939313984, "grad_norm": 0.07392197375633458, "learning_rate": 0.00016922983126805614, "loss": 0.3301, "step": 753 }, { "epoch": 0.9947229551451188, "grad_norm": 0.07307555461524463, "learning_rate": 0.00016911894768897848, "loss": 0.315, "step": 754 }, { "epoch": 0.996042216358839, "grad_norm": 0.07042407975104488, "learning_rate": 0.00016900790114821122, "loss": 0.3121, "step": 755 }, { "epoch": 0.9973614775725593, "grad_norm": 0.07202267839792592, "learning_rate": 0.00016889669190756868, "loss": 0.3129, "step": 756 }, { "epoch": 0.9986807387862797, "grad_norm": 0.07587254674479163, "learning_rate": 0.0001687853202292488, "loss": 0.3256, "step": 757 }, { "epoch": 1.0, "grad_norm": 0.07024022864934218, "learning_rate": 0.00016867378637583252, "loss": 0.323, "step": 758 }, { "epoch": 1.0, "eval_loss": 0.3201766312122345, "eval_runtime": 166.886, "eval_samples_per_second": 30.59, "eval_steps_per_second": 0.959, "step": 758 }, { "epoch": 1.0013192612137203, "grad_norm": 0.07247184683375647, "learning_rate": 0.0001685620906102831, "loss": 0.3133, "step": 759 }, { "epoch": 1.0026385224274406, "grad_norm": 0.07045035766128951, "learning_rate": 0.00016845023319594557, "loss": 0.301, "step": 760 }, { "epoch": 1.003957783641161, "grad_norm": 0.0719145519547812, "learning_rate": 0.0001683382143965461, "loss": 0.3057, "step": 761 }, { "epoch": 1.0052770448548813, "grad_norm": 0.07306763120348726, "learning_rate": 0.00016822603447619127, "loss": 0.3064, "step": 762 }, { "epoch": 1.0065963060686016, "grad_norm": 0.07525843546272429, "learning_rate": 0.00016811369369936765, "loss": 0.3123, "step": 763 }, { "epoch": 1.007915567282322, "grad_norm": 0.07439840311417834, "learning_rate": 0.00016800119233094095, "loss": 0.2908, "step": 764 }, { "epoch": 1.0092348284960422, "grad_norm": 0.0764729747044926, "learning_rate": 0.00016788853063615556, "loss": 0.3086, "step": 765 }, { "epoch": 1.0105540897097625, "grad_norm": 0.07815025489532014, "learning_rate": 0.0001677757088806339, "loss": 0.3168, "step": 766 }, { "epoch": 1.0118733509234827, "grad_norm": 0.07590771522843393, "learning_rate": 0.00016766272733037576, "loss": 0.2928, "step": 767 }, { "epoch": 1.0131926121372032, "grad_norm": 0.07805168650207316, "learning_rate": 0.00016754958625175758, "loss": 0.3064, "step": 768 }, { "epoch": 1.0145118733509235, "grad_norm": 0.07578980790396111, "learning_rate": 0.00016743628591153205, "loss": 0.3187, "step": 769 }, { "epoch": 1.0158311345646438, "grad_norm": 0.07620076771032804, "learning_rate": 0.00016732282657682732, "loss": 0.3053, "step": 770 }, { "epoch": 1.017150395778364, "grad_norm": 0.07430696317338739, "learning_rate": 0.0001672092085151463, "loss": 0.3124, "step": 771 }, { "epoch": 1.0184696569920844, "grad_norm": 0.07667850560492671, "learning_rate": 0.00016709543199436625, "loss": 0.3171, "step": 772 }, { "epoch": 1.0197889182058049, "grad_norm": 0.07917477937990572, "learning_rate": 0.00016698149728273796, "loss": 0.3084, "step": 773 }, { "epoch": 1.0211081794195251, "grad_norm": 0.07770934997894881, "learning_rate": 0.00016686740464888521, "loss": 0.3168, "step": 774 }, { "epoch": 1.0224274406332454, "grad_norm": 0.07518028487500698, "learning_rate": 0.0001667531543618042, "loss": 0.3079, "step": 775 }, { "epoch": 1.0237467018469657, "grad_norm": 0.07512661695031991, "learning_rate": 0.00016663874669086264, "loss": 0.3104, "step": 776 }, { "epoch": 1.025065963060686, "grad_norm": 0.07485236710736398, "learning_rate": 0.00016652418190579943, "loss": 0.3007, "step": 777 }, { "epoch": 1.0263852242744063, "grad_norm": 0.07649165800302302, "learning_rate": 0.00016640946027672392, "loss": 0.3101, "step": 778 }, { "epoch": 1.0277044854881265, "grad_norm": 0.07421925420571847, "learning_rate": 0.00016629458207411516, "loss": 0.3009, "step": 779 }, { "epoch": 1.029023746701847, "grad_norm": 0.0742955541778007, "learning_rate": 0.00016617954756882144, "loss": 0.3007, "step": 780 }, { "epoch": 1.0303430079155673, "grad_norm": 0.07431238566157004, "learning_rate": 0.00016606435703205946, "loss": 0.303, "step": 781 }, { "epoch": 1.0316622691292876, "grad_norm": 0.07742111603050637, "learning_rate": 0.00016594901073541395, "loss": 0.3032, "step": 782 }, { "epoch": 1.0329815303430079, "grad_norm": 0.0775544922880255, "learning_rate": 0.00016583350895083666, "loss": 0.3189, "step": 783 }, { "epoch": 1.0343007915567282, "grad_norm": 0.07523196682276424, "learning_rate": 0.0001657178519506462, "loss": 0.3002, "step": 784 }, { "epoch": 1.0356200527704487, "grad_norm": 0.07378610598431484, "learning_rate": 0.0001656020400075269, "loss": 0.3056, "step": 785 }, { "epoch": 1.036939313984169, "grad_norm": 0.07830972659616253, "learning_rate": 0.00016548607339452853, "loss": 0.3135, "step": 786 }, { "epoch": 1.0382585751978892, "grad_norm": 0.07936848587204688, "learning_rate": 0.00016536995238506546, "loss": 0.314, "step": 787 }, { "epoch": 1.0395778364116095, "grad_norm": 0.07509874182037382, "learning_rate": 0.00016525367725291607, "loss": 0.307, "step": 788 }, { "epoch": 1.0408970976253298, "grad_norm": 0.07514231271036195, "learning_rate": 0.00016513724827222227, "loss": 0.3109, "step": 789 }, { "epoch": 1.04221635883905, "grad_norm": 0.07911735545590289, "learning_rate": 0.00016502066571748842, "loss": 0.3138, "step": 790 }, { "epoch": 1.0435356200527703, "grad_norm": 0.07600940297554772, "learning_rate": 0.00016490392986358122, "loss": 0.3179, "step": 791 }, { "epoch": 1.0448548812664908, "grad_norm": 0.07663193005224447, "learning_rate": 0.0001647870409857287, "loss": 0.3115, "step": 792 }, { "epoch": 1.0461741424802111, "grad_norm": 0.0768016885072219, "learning_rate": 0.00016466999935951964, "loss": 0.3066, "step": 793 }, { "epoch": 1.0474934036939314, "grad_norm": 0.07649425515056713, "learning_rate": 0.000164552805260903, "loss": 0.3105, "step": 794 }, { "epoch": 1.0488126649076517, "grad_norm": 0.07931151668188231, "learning_rate": 0.00016443545896618723, "loss": 0.3152, "step": 795 }, { "epoch": 1.050131926121372, "grad_norm": 0.07806556574162371, "learning_rate": 0.0001643179607520396, "loss": 0.319, "step": 796 }, { "epoch": 1.0514511873350922, "grad_norm": 0.07524274420186328, "learning_rate": 0.00016420031089548555, "loss": 0.3025, "step": 797 }, { "epoch": 1.0527704485488127, "grad_norm": 0.07770873493115828, "learning_rate": 0.00016408250967390805, "loss": 0.2986, "step": 798 }, { "epoch": 1.054089709762533, "grad_norm": 0.07532640613427305, "learning_rate": 0.00016396455736504697, "loss": 0.3104, "step": 799 }, { "epoch": 1.0554089709762533, "grad_norm": 0.07522115512606149, "learning_rate": 0.00016384645424699835, "loss": 0.3046, "step": 800 }, { "epoch": 1.0567282321899736, "grad_norm": 0.07510408676718197, "learning_rate": 0.00016372820059821388, "loss": 0.3049, "step": 801 }, { "epoch": 1.0580474934036939, "grad_norm": 0.07853165989405149, "learning_rate": 0.0001636097966975, "loss": 0.319, "step": 802 }, { "epoch": 1.0593667546174141, "grad_norm": 0.07789997584150327, "learning_rate": 0.00016349124282401755, "loss": 0.3033, "step": 803 }, { "epoch": 1.0606860158311346, "grad_norm": 0.0806285605864734, "learning_rate": 0.0001633725392572809, "loss": 0.3066, "step": 804 }, { "epoch": 1.062005277044855, "grad_norm": 0.07932956470163255, "learning_rate": 0.0001632536862771574, "loss": 0.3039, "step": 805 }, { "epoch": 1.0633245382585752, "grad_norm": 0.07822273889482871, "learning_rate": 0.00016313468416386654, "loss": 0.3002, "step": 806 }, { "epoch": 1.0646437994722955, "grad_norm": 0.07806436086583189, "learning_rate": 0.0001630155331979796, "loss": 0.314, "step": 807 }, { "epoch": 1.0659630606860158, "grad_norm": 0.07555894753892779, "learning_rate": 0.00016289623366041864, "loss": 0.2986, "step": 808 }, { "epoch": 1.0672823218997363, "grad_norm": 0.0766099595321606, "learning_rate": 0.00016277678583245616, "loss": 0.3125, "step": 809 }, { "epoch": 1.0686015831134565, "grad_norm": 0.07319171363015817, "learning_rate": 0.00016265718999571415, "loss": 0.3112, "step": 810 }, { "epoch": 1.0699208443271768, "grad_norm": 0.07475885750836501, "learning_rate": 0.00016253744643216368, "loss": 0.3135, "step": 811 }, { "epoch": 1.071240105540897, "grad_norm": 0.07562404523025171, "learning_rate": 0.00016241755542412403, "loss": 0.3048, "step": 812 }, { "epoch": 1.0725593667546174, "grad_norm": 0.0749708448890196, "learning_rate": 0.00016229751725426212, "loss": 0.3151, "step": 813 }, { "epoch": 1.0738786279683377, "grad_norm": 0.07469564346766237, "learning_rate": 0.00016217733220559187, "loss": 0.3076, "step": 814 }, { "epoch": 1.075197889182058, "grad_norm": 0.07485039273983625, "learning_rate": 0.00016205700056147349, "loss": 0.2988, "step": 815 }, { "epoch": 1.0765171503957784, "grad_norm": 0.07767921617859706, "learning_rate": 0.00016193652260561279, "loss": 0.3078, "step": 816 }, { "epoch": 1.0778364116094987, "grad_norm": 0.07854522664862777, "learning_rate": 0.00016181589862206052, "loss": 0.3136, "step": 817 }, { "epoch": 1.079155672823219, "grad_norm": 0.07634405522074265, "learning_rate": 0.0001616951288952118, "loss": 0.3128, "step": 818 }, { "epoch": 1.0804749340369393, "grad_norm": 0.07737723041366226, "learning_rate": 0.0001615742137098053, "loss": 0.3215, "step": 819 }, { "epoch": 1.0817941952506596, "grad_norm": 0.07956821409832669, "learning_rate": 0.0001614531533509227, "loss": 0.3162, "step": 820 }, { "epoch": 1.08311345646438, "grad_norm": 0.07394500394411092, "learning_rate": 0.00016133194810398783, "loss": 0.3013, "step": 821 }, { "epoch": 1.0844327176781003, "grad_norm": 0.07493911727671666, "learning_rate": 0.0001612105982547663, "loss": 0.2996, "step": 822 }, { "epoch": 1.0857519788918206, "grad_norm": 0.0801122725900157, "learning_rate": 0.0001610891040893645, "loss": 0.3074, "step": 823 }, { "epoch": 1.087071240105541, "grad_norm": 0.07422823732094003, "learning_rate": 0.0001609674658942291, "loss": 0.3, "step": 824 }, { "epoch": 1.0883905013192612, "grad_norm": 0.07427347219126827, "learning_rate": 0.00016084568395614648, "loss": 0.3099, "step": 825 }, { "epoch": 1.0897097625329815, "grad_norm": 0.07797876445870587, "learning_rate": 0.00016072375856224173, "loss": 0.3181, "step": 826 }, { "epoch": 1.0910290237467017, "grad_norm": 0.0758909764908752, "learning_rate": 0.0001606016899999783, "loss": 0.3053, "step": 827 }, { "epoch": 1.0923482849604222, "grad_norm": 0.07696207910915019, "learning_rate": 0.00016047947855715714, "loss": 0.3082, "step": 828 }, { "epoch": 1.0936675461741425, "grad_norm": 0.07688684105517972, "learning_rate": 0.00016035712452191608, "loss": 0.3182, "step": 829 }, { "epoch": 1.0949868073878628, "grad_norm": 0.07626116771332213, "learning_rate": 0.00016023462818272907, "loss": 0.3116, "step": 830 }, { "epoch": 1.096306068601583, "grad_norm": 0.07577326597753918, "learning_rate": 0.00016011198982840576, "loss": 0.3017, "step": 831 }, { "epoch": 1.0976253298153034, "grad_norm": 0.0746994558669387, "learning_rate": 0.0001599892097480904, "loss": 0.3019, "step": 832 }, { "epoch": 1.0989445910290236, "grad_norm": 0.07594226992167535, "learning_rate": 0.0001598662882312615, "loss": 0.3191, "step": 833 }, { "epoch": 1.1002638522427441, "grad_norm": 0.07694568934151555, "learning_rate": 0.00015974322556773108, "loss": 0.3097, "step": 834 }, { "epoch": 1.1015831134564644, "grad_norm": 0.0771447723365426, "learning_rate": 0.00015962002204764384, "loss": 0.3229, "step": 835 }, { "epoch": 1.1029023746701847, "grad_norm": 0.07632597300899378, "learning_rate": 0.00015949667796147664, "loss": 0.3157, "step": 836 }, { "epoch": 1.104221635883905, "grad_norm": 0.07722567710737664, "learning_rate": 0.00015937319360003773, "loss": 0.314, "step": 837 }, { "epoch": 1.1055408970976253, "grad_norm": 0.07631683625522165, "learning_rate": 0.00015924956925446614, "loss": 0.3198, "step": 838 }, { "epoch": 1.1068601583113455, "grad_norm": 0.07496882219315053, "learning_rate": 0.0001591258052162308, "loss": 0.3046, "step": 839 }, { "epoch": 1.108179419525066, "grad_norm": 0.07259930703434052, "learning_rate": 0.00015900190177713016, "loss": 0.297, "step": 840 }, { "epoch": 1.1094986807387863, "grad_norm": 0.07667345145790022, "learning_rate": 0.00015887785922929126, "loss": 0.307, "step": 841 }, { "epoch": 1.1108179419525066, "grad_norm": 0.07473853932575529, "learning_rate": 0.00015875367786516908, "loss": 0.3115, "step": 842 }, { "epoch": 1.1121372031662269, "grad_norm": 0.07542391375500186, "learning_rate": 0.00015862935797754594, "loss": 0.3004, "step": 843 }, { "epoch": 1.1134564643799472, "grad_norm": 0.07730397039970402, "learning_rate": 0.00015850489985953076, "loss": 0.3075, "step": 844 }, { "epoch": 1.1147757255936674, "grad_norm": 0.07907335058892856, "learning_rate": 0.00015838030380455837, "loss": 0.3113, "step": 845 }, { "epoch": 1.116094986807388, "grad_norm": 0.07826686001437348, "learning_rate": 0.00015825557010638871, "loss": 0.3217, "step": 846 }, { "epoch": 1.1174142480211082, "grad_norm": 0.07641370370851834, "learning_rate": 0.00015813069905910642, "loss": 0.3099, "step": 847 }, { "epoch": 1.1187335092348285, "grad_norm": 0.07709561671228453, "learning_rate": 0.00015800569095711982, "loss": 0.313, "step": 848 }, { "epoch": 1.1200527704485488, "grad_norm": 0.0756865596369234, "learning_rate": 0.00015788054609516044, "loss": 0.3115, "step": 849 }, { "epoch": 1.121372031662269, "grad_norm": 0.07407900314425696, "learning_rate": 0.0001577552647682822, "loss": 0.3054, "step": 850 }, { "epoch": 1.1226912928759893, "grad_norm": 0.0766971323217953, "learning_rate": 0.00015762984727186078, "loss": 0.3088, "step": 851 }, { "epoch": 1.1240105540897098, "grad_norm": 0.0794271913698087, "learning_rate": 0.00015750429390159294, "loss": 0.3035, "step": 852 }, { "epoch": 1.1253298153034301, "grad_norm": 0.07681553455067884, "learning_rate": 0.00015737860495349575, "loss": 0.3092, "step": 853 }, { "epoch": 1.1266490765171504, "grad_norm": 0.07616562684908654, "learning_rate": 0.00015725278072390597, "loss": 0.3004, "step": 854 }, { "epoch": 1.1279683377308707, "grad_norm": 0.07819556234483988, "learning_rate": 0.00015712682150947923, "loss": 0.3123, "step": 855 }, { "epoch": 1.129287598944591, "grad_norm": 0.07640221318951228, "learning_rate": 0.00015700072760718955, "loss": 0.311, "step": 856 }, { "epoch": 1.1306068601583115, "grad_norm": 0.07622725307949371, "learning_rate": 0.00015687449931432837, "loss": 0.3021, "step": 857 }, { "epoch": 1.1319261213720317, "grad_norm": 0.07310307378230405, "learning_rate": 0.00015674813692850408, "loss": 0.3056, "step": 858 }, { "epoch": 1.133245382585752, "grad_norm": 0.07614047923753756, "learning_rate": 0.00015662164074764113, "loss": 0.3061, "step": 859 }, { "epoch": 1.1345646437994723, "grad_norm": 0.0739655849358646, "learning_rate": 0.00015649501106997953, "loss": 0.3063, "step": 860 }, { "epoch": 1.1358839050131926, "grad_norm": 0.07763860243514285, "learning_rate": 0.0001563682481940739, "loss": 0.3024, "step": 861 }, { "epoch": 1.1372031662269129, "grad_norm": 0.0756222101266522, "learning_rate": 0.00015624135241879304, "loss": 0.3044, "step": 862 }, { "epoch": 1.1385224274406331, "grad_norm": 0.07778105451979428, "learning_rate": 0.00015611432404331898, "loss": 0.3108, "step": 863 }, { "epoch": 1.1398416886543536, "grad_norm": 0.0754264637054096, "learning_rate": 0.00015598716336714645, "loss": 0.3114, "step": 864 }, { "epoch": 1.141160949868074, "grad_norm": 0.07631216428246822, "learning_rate": 0.00015585987069008204, "loss": 0.3143, "step": 865 }, { "epoch": 1.1424802110817942, "grad_norm": 0.07833489518605949, "learning_rate": 0.00015573244631224365, "loss": 0.3142, "step": 866 }, { "epoch": 1.1437994722955145, "grad_norm": 0.07945270092912188, "learning_rate": 0.0001556048905340596, "loss": 0.3199, "step": 867 }, { "epoch": 1.1451187335092348, "grad_norm": 0.0797126881047072, "learning_rate": 0.00015547720365626807, "loss": 0.3078, "step": 868 }, { "epoch": 1.1464379947229553, "grad_norm": 0.0773303937865925, "learning_rate": 0.00015534938597991626, "loss": 0.3115, "step": 869 }, { "epoch": 1.1477572559366755, "grad_norm": 0.07530799616444198, "learning_rate": 0.0001552214378063599, "loss": 0.3093, "step": 870 }, { "epoch": 1.1490765171503958, "grad_norm": 0.07378848160769735, "learning_rate": 0.00015509335943726224, "loss": 0.3039, "step": 871 }, { "epoch": 1.150395778364116, "grad_norm": 0.07728479922487838, "learning_rate": 0.00015496515117459353, "loss": 0.3057, "step": 872 }, { "epoch": 1.1517150395778364, "grad_norm": 0.07554136036947544, "learning_rate": 0.00015483681332063035, "loss": 0.3104, "step": 873 }, { "epoch": 1.1530343007915567, "grad_norm": 0.07260912671635868, "learning_rate": 0.00015470834617795472, "loss": 0.3081, "step": 874 }, { "epoch": 1.154353562005277, "grad_norm": 0.07362539670290015, "learning_rate": 0.00015457975004945355, "loss": 0.3099, "step": 875 }, { "epoch": 1.1556728232189974, "grad_norm": 0.07721321001319487, "learning_rate": 0.0001544510252383178, "loss": 0.3127, "step": 876 }, { "epoch": 1.1569920844327177, "grad_norm": 0.07556915253264165, "learning_rate": 0.0001543221720480419, "loss": 0.3091, "step": 877 }, { "epoch": 1.158311345646438, "grad_norm": 0.07417225894671747, "learning_rate": 0.00015419319078242288, "loss": 0.3026, "step": 878 }, { "epoch": 1.1596306068601583, "grad_norm": 0.07204467106147029, "learning_rate": 0.00015406408174555976, "loss": 0.2998, "step": 879 }, { "epoch": 1.1609498680738786, "grad_norm": 0.07557405798105532, "learning_rate": 0.00015393484524185288, "loss": 0.3123, "step": 880 }, { "epoch": 1.162269129287599, "grad_norm": 0.07458756872374336, "learning_rate": 0.00015380548157600297, "loss": 0.3089, "step": 881 }, { "epoch": 1.1635883905013193, "grad_norm": 0.07592862991499262, "learning_rate": 0.0001536759910530107, "loss": 0.3168, "step": 882 }, { "epoch": 1.1649076517150396, "grad_norm": 0.07394519200290883, "learning_rate": 0.00015354637397817578, "loss": 0.3049, "step": 883 }, { "epoch": 1.16622691292876, "grad_norm": 0.07450781402945626, "learning_rate": 0.0001534166306570962, "loss": 0.2955, "step": 884 }, { "epoch": 1.1675461741424802, "grad_norm": 0.07888645417271047, "learning_rate": 0.0001532867613956678, "loss": 0.316, "step": 885 }, { "epoch": 1.1688654353562005, "grad_norm": 0.07334045394499303, "learning_rate": 0.00015315676650008318, "loss": 0.2954, "step": 886 }, { "epoch": 1.1701846965699207, "grad_norm": 0.07714898413450567, "learning_rate": 0.00015302664627683124, "loss": 0.318, "step": 887 }, { "epoch": 1.1715039577836412, "grad_norm": 0.07518531713631191, "learning_rate": 0.00015289640103269625, "loss": 0.3043, "step": 888 }, { "epoch": 1.1728232189973615, "grad_norm": 0.07590779806936655, "learning_rate": 0.00015276603107475739, "loss": 0.3083, "step": 889 }, { "epoch": 1.1741424802110818, "grad_norm": 0.07872532251260045, "learning_rate": 0.0001526355367103878, "loss": 0.3105, "step": 890 }, { "epoch": 1.175461741424802, "grad_norm": 0.07166066292827625, "learning_rate": 0.00015250491824725398, "loss": 0.2997, "step": 891 }, { "epoch": 1.1767810026385224, "grad_norm": 0.07463207764764149, "learning_rate": 0.00015237417599331488, "loss": 0.3059, "step": 892 }, { "epoch": 1.1781002638522429, "grad_norm": 0.07596505578143041, "learning_rate": 0.0001522433102568215, "loss": 0.3057, "step": 893 }, { "epoch": 1.1794195250659631, "grad_norm": 0.07490401916233322, "learning_rate": 0.00015211232134631586, "loss": 0.2994, "step": 894 }, { "epoch": 1.1807387862796834, "grad_norm": 0.07508538316468957, "learning_rate": 0.0001519812095706304, "loss": 0.3122, "step": 895 }, { "epoch": 1.1820580474934037, "grad_norm": 0.07483723602400037, "learning_rate": 0.00015184997523888725, "loss": 0.3116, "step": 896 }, { "epoch": 1.183377308707124, "grad_norm": 0.07621555666374144, "learning_rate": 0.0001517186186604975, "loss": 0.3109, "step": 897 }, { "epoch": 1.1846965699208443, "grad_norm": 0.0758862769698191, "learning_rate": 0.00015158714014516043, "loss": 0.3069, "step": 898 }, { "epoch": 1.1860158311345645, "grad_norm": 0.07293434207783527, "learning_rate": 0.0001514555400028629, "loss": 0.2991, "step": 899 }, { "epoch": 1.187335092348285, "grad_norm": 0.07577451683937544, "learning_rate": 0.0001513238185438784, "loss": 0.306, "step": 900 }, { "epoch": 1.1886543535620053, "grad_norm": 0.07315634015232383, "learning_rate": 0.00015119197607876657, "loss": 0.2972, "step": 901 }, { "epoch": 1.1899736147757256, "grad_norm": 0.07636765083737276, "learning_rate": 0.00015106001291837222, "loss": 0.3212, "step": 902 }, { "epoch": 1.1912928759894459, "grad_norm": 0.07515386176224755, "learning_rate": 0.00015092792937382483, "loss": 0.3066, "step": 903 }, { "epoch": 1.1926121372031662, "grad_norm": 0.07567668614704424, "learning_rate": 0.0001507957257565377, "loss": 0.3019, "step": 904 }, { "epoch": 1.1939313984168864, "grad_norm": 0.07774750749995792, "learning_rate": 0.0001506634023782071, "loss": 0.3093, "step": 905 }, { "epoch": 1.195250659630607, "grad_norm": 0.0773755138330432, "learning_rate": 0.00015053095955081184, "loss": 0.3106, "step": 906 }, { "epoch": 1.1965699208443272, "grad_norm": 0.07517154608956984, "learning_rate": 0.00015039839758661228, "loss": 0.308, "step": 907 }, { "epoch": 1.1978891820580475, "grad_norm": 0.07245191662971324, "learning_rate": 0.0001502657167981496, "loss": 0.3062, "step": 908 }, { "epoch": 1.1992084432717678, "grad_norm": 0.07191565685226466, "learning_rate": 0.00015013291749824527, "loss": 0.3005, "step": 909 }, { "epoch": 1.200527704485488, "grad_norm": 0.07615656357109149, "learning_rate": 0.00015000000000000001, "loss": 0.3029, "step": 910 }, { "epoch": 1.2018469656992083, "grad_norm": 0.07724643857733882, "learning_rate": 0.00014986696461679336, "loss": 0.3114, "step": 911 }, { "epoch": 1.2031662269129288, "grad_norm": 0.07339796727473648, "learning_rate": 0.00014973381166228272, "loss": 0.3106, "step": 912 }, { "epoch": 1.2044854881266491, "grad_norm": 0.07204444678044727, "learning_rate": 0.00014960054145040275, "loss": 0.2907, "step": 913 }, { "epoch": 1.2058047493403694, "grad_norm": 0.07483490856816973, "learning_rate": 0.00014946715429536443, "loss": 0.2984, "step": 914 }, { "epoch": 1.2071240105540897, "grad_norm": 0.07489638516212033, "learning_rate": 0.0001493336505116546, "loss": 0.3002, "step": 915 }, { "epoch": 1.20844327176781, "grad_norm": 0.07842043188631999, "learning_rate": 0.0001492000304140351, "loss": 0.3045, "step": 916 }, { "epoch": 1.2097625329815302, "grad_norm": 0.07631816181761492, "learning_rate": 0.00014906629431754185, "loss": 0.2993, "step": 917 }, { "epoch": 1.2110817941952507, "grad_norm": 0.07339830790139129, "learning_rate": 0.00014893244253748436, "loss": 0.2955, "step": 918 }, { "epoch": 1.212401055408971, "grad_norm": 0.07614380716197945, "learning_rate": 0.00014879847538944486, "loss": 0.3135, "step": 919 }, { "epoch": 1.2137203166226913, "grad_norm": 0.07536401631320024, "learning_rate": 0.00014866439318927762, "loss": 0.3103, "step": 920 }, { "epoch": 1.2150395778364116, "grad_norm": 0.07478041686953008, "learning_rate": 0.00014853019625310813, "loss": 0.3107, "step": 921 }, { "epoch": 1.2163588390501319, "grad_norm": 0.0760992177091002, "learning_rate": 0.0001483958848973324, "loss": 0.3166, "step": 922 }, { "epoch": 1.2176781002638521, "grad_norm": 0.07343504909198667, "learning_rate": 0.00014826145943861615, "loss": 0.2933, "step": 923 }, { "epoch": 1.2189973614775726, "grad_norm": 0.07698834337318551, "learning_rate": 0.00014812692019389425, "loss": 0.3087, "step": 924 }, { "epoch": 1.220316622691293, "grad_norm": 0.07702980735012524, "learning_rate": 0.00014799226748036978, "loss": 0.3033, "step": 925 }, { "epoch": 1.2216358839050132, "grad_norm": 0.07271714849073063, "learning_rate": 0.0001478575016155133, "loss": 0.2939, "step": 926 }, { "epoch": 1.2229551451187335, "grad_norm": 0.07378750410089299, "learning_rate": 0.00014772262291706223, "loss": 0.3031, "step": 927 }, { "epoch": 1.2242744063324538, "grad_norm": 0.07725483504044527, "learning_rate": 0.0001475876317030199, "loss": 0.3035, "step": 928 }, { "epoch": 1.225593667546174, "grad_norm": 0.07391045742843623, "learning_rate": 0.0001474525282916551, "loss": 0.3099, "step": 929 }, { "epoch": 1.2269129287598945, "grad_norm": 0.0735914953258023, "learning_rate": 0.0001473173130015009, "loss": 0.2971, "step": 930 }, { "epoch": 1.2282321899736148, "grad_norm": 0.07308725002253354, "learning_rate": 0.00014718198615135442, "loss": 0.3116, "step": 931 }, { "epoch": 1.229551451187335, "grad_norm": 0.07078828565181608, "learning_rate": 0.0001470465480602756, "loss": 0.2985, "step": 932 }, { "epoch": 1.2308707124010554, "grad_norm": 0.07477236282143987, "learning_rate": 0.00014691099904758667, "loss": 0.3085, "step": 933 }, { "epoch": 1.2321899736147757, "grad_norm": 0.07680234446625109, "learning_rate": 0.00014677533943287157, "loss": 0.3063, "step": 934 }, { "epoch": 1.233509234828496, "grad_norm": 0.07763810385259635, "learning_rate": 0.00014663956953597475, "loss": 0.3165, "step": 935 }, { "epoch": 1.2348284960422165, "grad_norm": 0.07608694072232146, "learning_rate": 0.00014650368967700084, "loss": 0.3079, "step": 936 }, { "epoch": 1.2361477572559367, "grad_norm": 0.07491901325213222, "learning_rate": 0.00014636770017631371, "loss": 0.3091, "step": 937 }, { "epoch": 1.237467018469657, "grad_norm": 0.07615164268550638, "learning_rate": 0.00014623160135453567, "loss": 0.3027, "step": 938 }, { "epoch": 1.2387862796833773, "grad_norm": 0.07436829343444645, "learning_rate": 0.00014609539353254678, "loss": 0.3035, "step": 939 }, { "epoch": 1.2401055408970976, "grad_norm": 0.07751456520877707, "learning_rate": 0.0001459590770314841, "loss": 0.3104, "step": 940 }, { "epoch": 1.2414248021108178, "grad_norm": 0.07408714330709366, "learning_rate": 0.00014582265217274104, "loss": 0.2989, "step": 941 }, { "epoch": 1.2427440633245384, "grad_norm": 0.07334490843445905, "learning_rate": 0.0001456861192779663, "loss": 0.2986, "step": 942 }, { "epoch": 1.2440633245382586, "grad_norm": 0.0758466850475825, "learning_rate": 0.0001455494786690634, "loss": 0.3102, "step": 943 }, { "epoch": 1.245382585751979, "grad_norm": 0.07522623596434648, "learning_rate": 0.0001454127306681898, "loss": 0.3148, "step": 944 }, { "epoch": 1.2467018469656992, "grad_norm": 0.07404276387714855, "learning_rate": 0.00014527587559775616, "loss": 0.296, "step": 945 }, { "epoch": 1.2480211081794195, "grad_norm": 0.07342172989721586, "learning_rate": 0.0001451389137804256, "loss": 0.3021, "step": 946 }, { "epoch": 1.2493403693931397, "grad_norm": 0.07487000324812171, "learning_rate": 0.00014500184553911284, "loss": 0.2996, "step": 947 }, { "epoch": 1.2506596306068603, "grad_norm": 0.07558713991695598, "learning_rate": 0.00014486467119698357, "loss": 0.3104, "step": 948 }, { "epoch": 1.2519788918205805, "grad_norm": 0.07463900683586573, "learning_rate": 0.0001447273910774537, "loss": 0.3137, "step": 949 }, { "epoch": 1.2532981530343008, "grad_norm": 0.0738628959536641, "learning_rate": 0.00014459000550418836, "loss": 0.2995, "step": 950 }, { "epoch": 1.254617414248021, "grad_norm": 0.07413738777040238, "learning_rate": 0.00014445251480110145, "loss": 0.3103, "step": 951 }, { "epoch": 1.2559366754617414, "grad_norm": 0.07525780379736238, "learning_rate": 0.00014431491929235474, "loss": 0.3038, "step": 952 }, { "epoch": 1.2572559366754619, "grad_norm": 0.07674209325711587, "learning_rate": 0.000144177219302357, "loss": 0.3114, "step": 953 }, { "epoch": 1.258575197889182, "grad_norm": 0.07520652308086352, "learning_rate": 0.00014403941515576344, "loss": 0.3089, "step": 954 }, { "epoch": 1.2598944591029024, "grad_norm": 0.07780085776115885, "learning_rate": 0.00014390150717747476, "loss": 0.3241, "step": 955 }, { "epoch": 1.2612137203166227, "grad_norm": 0.07596856438363417, "learning_rate": 0.00014376349569263647, "loss": 0.3085, "step": 956 }, { "epoch": 1.262532981530343, "grad_norm": 0.07719847280117932, "learning_rate": 0.00014362538102663817, "loss": 0.3087, "step": 957 }, { "epoch": 1.2638522427440633, "grad_norm": 0.0752462995521335, "learning_rate": 0.00014348716350511272, "loss": 0.3124, "step": 958 }, { "epoch": 1.2651715039577835, "grad_norm": 0.07503659892090066, "learning_rate": 0.0001433488434539354, "loss": 0.3072, "step": 959 }, { "epoch": 1.266490765171504, "grad_norm": 0.07416655716071678, "learning_rate": 0.00014321042119922337, "loss": 0.3051, "step": 960 }, { "epoch": 1.2678100263852243, "grad_norm": 0.07331205215818216, "learning_rate": 0.00014307189706733463, "loss": 0.2951, "step": 961 }, { "epoch": 1.2691292875989446, "grad_norm": 0.0755527927985988, "learning_rate": 0.00014293327138486741, "loss": 0.2985, "step": 962 }, { "epoch": 1.270448548812665, "grad_norm": 0.07538998906620151, "learning_rate": 0.00014279454447865936, "loss": 0.2986, "step": 963 }, { "epoch": 1.2717678100263852, "grad_norm": 0.08198827086535773, "learning_rate": 0.00014265571667578688, "loss": 0.3065, "step": 964 }, { "epoch": 1.2730870712401057, "grad_norm": 0.07485361987916725, "learning_rate": 0.00014251678830356408, "loss": 0.3015, "step": 965 }, { "epoch": 1.2744063324538257, "grad_norm": 0.07813362421028269, "learning_rate": 0.00014237775968954232, "loss": 0.3113, "step": 966 }, { "epoch": 1.2757255936675462, "grad_norm": 0.07676991674980586, "learning_rate": 0.00014223863116150928, "loss": 0.3028, "step": 967 }, { "epoch": 1.2770448548812665, "grad_norm": 0.0769554629846056, "learning_rate": 0.0001420994030474881, "loss": 0.3155, "step": 968 }, { "epoch": 1.2783641160949868, "grad_norm": 0.07220060528035936, "learning_rate": 0.0001419600756757369, "loss": 0.2962, "step": 969 }, { "epoch": 1.279683377308707, "grad_norm": 0.07652046071236204, "learning_rate": 0.00014182064937474763, "loss": 0.3001, "step": 970 }, { "epoch": 1.2810026385224274, "grad_norm": 0.07584160241181666, "learning_rate": 0.00014168112447324565, "loss": 0.3076, "step": 971 }, { "epoch": 1.2823218997361479, "grad_norm": 0.07449907125960294, "learning_rate": 0.00014154150130018866, "loss": 0.3104, "step": 972 }, { "epoch": 1.2836411609498681, "grad_norm": 0.07781585370886261, "learning_rate": 0.0001414017801847661, "loss": 0.3154, "step": 973 }, { "epoch": 1.2849604221635884, "grad_norm": 0.07434009422603818, "learning_rate": 0.00014126196145639838, "loss": 0.3104, "step": 974 }, { "epoch": 1.2862796833773087, "grad_norm": 0.07732894015882766, "learning_rate": 0.00014112204544473598, "loss": 0.3152, "step": 975 }, { "epoch": 1.287598944591029, "grad_norm": 0.07508859345016931, "learning_rate": 0.00014098203247965875, "loss": 0.2896, "step": 976 }, { "epoch": 1.2889182058047495, "grad_norm": 0.07633605711935243, "learning_rate": 0.0001408419228912752, "loss": 0.3008, "step": 977 }, { "epoch": 1.2902374670184695, "grad_norm": 0.07391699848595623, "learning_rate": 0.0001407017170099216, "loss": 0.2999, "step": 978 }, { "epoch": 1.29155672823219, "grad_norm": 0.0771856128183666, "learning_rate": 0.00014056141516616117, "loss": 0.2932, "step": 979 }, { "epoch": 1.2928759894459103, "grad_norm": 0.07510012596102593, "learning_rate": 0.00014042101769078355, "loss": 0.3119, "step": 980 }, { "epoch": 1.2941952506596306, "grad_norm": 0.07517475718850301, "learning_rate": 0.0001402805249148037, "loss": 0.3117, "step": 981 }, { "epoch": 1.2955145118733509, "grad_norm": 0.07509636995573182, "learning_rate": 0.00014013993716946137, "loss": 0.3054, "step": 982 }, { "epoch": 1.2968337730870712, "grad_norm": 0.07277096546813039, "learning_rate": 0.00013999925478622017, "loss": 0.3082, "step": 983 }, { "epoch": 1.2981530343007917, "grad_norm": 0.07496901824684878, "learning_rate": 0.0001398584780967668, "loss": 0.3104, "step": 984 }, { "epoch": 1.299472295514512, "grad_norm": 0.07472005565262688, "learning_rate": 0.0001397176074330104, "loss": 0.3149, "step": 985 }, { "epoch": 1.3007915567282322, "grad_norm": 0.07395786561555447, "learning_rate": 0.0001395766431270816, "loss": 0.3009, "step": 986 }, { "epoch": 1.3021108179419525, "grad_norm": 0.0738523458606648, "learning_rate": 0.00013943558551133186, "loss": 0.3013, "step": 987 }, { "epoch": 1.3034300791556728, "grad_norm": 0.07576470061645109, "learning_rate": 0.00013929443491833262, "loss": 0.2976, "step": 988 }, { "epoch": 1.3047493403693933, "grad_norm": 0.07507451956260033, "learning_rate": 0.00013915319168087447, "loss": 0.3071, "step": 989 }, { "epoch": 1.3060686015831133, "grad_norm": 0.07316196412564342, "learning_rate": 0.00013901185613196654, "loss": 0.3053, "step": 990 }, { "epoch": 1.3073878627968338, "grad_norm": 0.07396049631730392, "learning_rate": 0.00013887042860483552, "loss": 0.3133, "step": 991 }, { "epoch": 1.3087071240105541, "grad_norm": 0.07455025595994069, "learning_rate": 0.00013872890943292498, "loss": 0.303, "step": 992 }, { "epoch": 1.3100263852242744, "grad_norm": 0.07380808525849787, "learning_rate": 0.00013858729894989456, "loss": 0.3067, "step": 993 }, { "epoch": 1.3113456464379947, "grad_norm": 0.07621797447354348, "learning_rate": 0.00013844559748961918, "loss": 0.3151, "step": 994 }, { "epoch": 1.312664907651715, "grad_norm": 0.07643643576619834, "learning_rate": 0.0001383038053861883, "loss": 0.3265, "step": 995 }, { "epoch": 1.3139841688654355, "grad_norm": 0.07240360696561128, "learning_rate": 0.00013816192297390502, "loss": 0.2853, "step": 996 }, { "epoch": 1.3153034300791557, "grad_norm": 0.07472851468647417, "learning_rate": 0.0001380199505872854, "loss": 0.3022, "step": 997 }, { "epoch": 1.316622691292876, "grad_norm": 0.07697985792667163, "learning_rate": 0.0001378778885610576, "loss": 0.3211, "step": 998 }, { "epoch": 1.3179419525065963, "grad_norm": 0.07490883866864846, "learning_rate": 0.00013773573723016122, "loss": 0.3023, "step": 999 }, { "epoch": 1.3192612137203166, "grad_norm": 0.07147757606977836, "learning_rate": 0.00013759349692974628, "loss": 0.2964, "step": 1000 }, { "epoch": 1.320580474934037, "grad_norm": 0.07203215824298867, "learning_rate": 0.00013745116799517265, "loss": 0.2987, "step": 1001 }, { "epoch": 1.3218997361477571, "grad_norm": 0.07060141924351426, "learning_rate": 0.00013730875076200914, "loss": 0.2886, "step": 1002 }, { "epoch": 1.3232189973614776, "grad_norm": 0.07162212307749757, "learning_rate": 0.00013716624556603274, "loss": 0.293, "step": 1003 }, { "epoch": 1.324538258575198, "grad_norm": 0.07347889671028349, "learning_rate": 0.0001370236527432279, "loss": 0.2981, "step": 1004 }, { "epoch": 1.3258575197889182, "grad_norm": 0.07576920213721451, "learning_rate": 0.00013688097262978555, "loss": 0.3104, "step": 1005 }, { "epoch": 1.3271767810026385, "grad_norm": 0.07433760474607189, "learning_rate": 0.0001367382055621025, "loss": 0.3111, "step": 1006 }, { "epoch": 1.3284960422163588, "grad_norm": 0.07237067599875494, "learning_rate": 0.00013659535187678055, "loss": 0.2862, "step": 1007 }, { "epoch": 1.3298153034300793, "grad_norm": 0.0780391013667887, "learning_rate": 0.0001364524119106257, "loss": 0.3152, "step": 1008 }, { "epoch": 1.3311345646437995, "grad_norm": 0.07283292034579314, "learning_rate": 0.00013630938600064747, "loss": 0.2904, "step": 1009 }, { "epoch": 1.3324538258575198, "grad_norm": 0.0733629071744914, "learning_rate": 0.0001361662744840579, "loss": 0.2966, "step": 1010 }, { "epoch": 1.33377308707124, "grad_norm": 0.07163390228595125, "learning_rate": 0.00013602307769827084, "loss": 0.2952, "step": 1011 }, { "epoch": 1.3350923482849604, "grad_norm": 0.0723678758928173, "learning_rate": 0.00013587979598090133, "loss": 0.3096, "step": 1012 }, { "epoch": 1.3364116094986809, "grad_norm": 0.07374414465160141, "learning_rate": 0.00013573642966976452, "loss": 0.3039, "step": 1013 }, { "epoch": 1.337730870712401, "grad_norm": 0.07313532985680832, "learning_rate": 0.00013559297910287508, "loss": 0.3094, "step": 1014 }, { "epoch": 1.3390501319261214, "grad_norm": 0.07476015775487865, "learning_rate": 0.00013544944461844625, "loss": 0.3075, "step": 1015 }, { "epoch": 1.3403693931398417, "grad_norm": 0.07393627504931968, "learning_rate": 0.00013530582655488926, "loss": 0.3037, "step": 1016 }, { "epoch": 1.341688654353562, "grad_norm": 0.07447612410782062, "learning_rate": 0.00013516212525081222, "loss": 0.2955, "step": 1017 }, { "epoch": 1.3430079155672823, "grad_norm": 0.07434293968756626, "learning_rate": 0.00013501834104501963, "loss": 0.2882, "step": 1018 }, { "epoch": 1.3443271767810026, "grad_norm": 0.07817844663857237, "learning_rate": 0.00013487447427651137, "loss": 0.319, "step": 1019 }, { "epoch": 1.345646437994723, "grad_norm": 0.07666841132324655, "learning_rate": 0.00013473052528448201, "loss": 0.3113, "step": 1020 }, { "epoch": 1.3469656992084433, "grad_norm": 0.07668607267536197, "learning_rate": 0.00013458649440832005, "loss": 0.3077, "step": 1021 }, { "epoch": 1.3482849604221636, "grad_norm": 0.07492991662441542, "learning_rate": 0.0001344423819876069, "loss": 0.3038, "step": 1022 }, { "epoch": 1.349604221635884, "grad_norm": 0.07515295591742276, "learning_rate": 0.0001342981883621163, "loss": 0.3136, "step": 1023 }, { "epoch": 1.3509234828496042, "grad_norm": 0.0747768890635019, "learning_rate": 0.0001341539138718135, "loss": 0.309, "step": 1024 }, { "epoch": 1.3522427440633247, "grad_norm": 0.07282366195900576, "learning_rate": 0.00013400955885685433, "loss": 0.3022, "step": 1025 }, { "epoch": 1.3535620052770447, "grad_norm": 0.07287777580096907, "learning_rate": 0.0001338651236575845, "loss": 0.3029, "step": 1026 }, { "epoch": 1.3548812664907652, "grad_norm": 0.07531454663134504, "learning_rate": 0.00013372060861453874, "loss": 0.3111, "step": 1027 }, { "epoch": 1.3562005277044855, "grad_norm": 0.07677381449447031, "learning_rate": 0.00013357601406844007, "loss": 0.3199, "step": 1028 }, { "epoch": 1.3575197889182058, "grad_norm": 0.07223384199532076, "learning_rate": 0.00013343134036019895, "loss": 0.2991, "step": 1029 }, { "epoch": 1.358839050131926, "grad_norm": 0.0699103935535974, "learning_rate": 0.0001332865878309125, "loss": 0.2947, "step": 1030 }, { "epoch": 1.3601583113456464, "grad_norm": 0.07349240860675077, "learning_rate": 0.0001331417568218636, "loss": 0.2984, "step": 1031 }, { "epoch": 1.3614775725593669, "grad_norm": 0.0742693249792302, "learning_rate": 0.0001329968476745202, "loss": 0.294, "step": 1032 }, { "epoch": 1.3627968337730871, "grad_norm": 0.07237720589260886, "learning_rate": 0.00013285186073053455, "loss": 0.2992, "step": 1033 }, { "epoch": 1.3641160949868074, "grad_norm": 0.07399518788197722, "learning_rate": 0.00013270679633174218, "loss": 0.314, "step": 1034 }, { "epoch": 1.3654353562005277, "grad_norm": 0.07458025469024901, "learning_rate": 0.00013256165482016137, "loss": 0.3064, "step": 1035 }, { "epoch": 1.366754617414248, "grad_norm": 0.07547340226189682, "learning_rate": 0.00013241643653799212, "loss": 0.3063, "step": 1036 }, { "epoch": 1.3680738786279685, "grad_norm": 0.07556191389517768, "learning_rate": 0.00013227114182761543, "loss": 0.3071, "step": 1037 }, { "epoch": 1.3693931398416885, "grad_norm": 0.07205407599989692, "learning_rate": 0.00013212577103159258, "loss": 0.2994, "step": 1038 }, { "epoch": 1.370712401055409, "grad_norm": 0.07697557767329637, "learning_rate": 0.00013198032449266418, "loss": 0.3094, "step": 1039 }, { "epoch": 1.3720316622691293, "grad_norm": 0.07150092557237109, "learning_rate": 0.0001318348025537494, "loss": 0.3025, "step": 1040 }, { "epoch": 1.3733509234828496, "grad_norm": 0.07168793568684126, "learning_rate": 0.00013168920555794525, "loss": 0.302, "step": 1041 }, { "epoch": 1.3746701846965699, "grad_norm": 0.07416513758942954, "learning_rate": 0.00013154353384852558, "loss": 0.3011, "step": 1042 }, { "epoch": 1.3759894459102902, "grad_norm": 0.07496892381913171, "learning_rate": 0.0001313977877689405, "loss": 0.3009, "step": 1043 }, { "epoch": 1.3773087071240107, "grad_norm": 0.07663224245550665, "learning_rate": 0.00013125196766281544, "loss": 0.3167, "step": 1044 }, { "epoch": 1.378627968337731, "grad_norm": 0.07610303950581411, "learning_rate": 0.00013110607387395033, "loss": 0.3107, "step": 1045 }, { "epoch": 1.3799472295514512, "grad_norm": 0.07651019612168752, "learning_rate": 0.00013096010674631888, "loss": 0.3187, "step": 1046 }, { "epoch": 1.3812664907651715, "grad_norm": 0.07701688459882833, "learning_rate": 0.00013081406662406763, "loss": 0.3115, "step": 1047 }, { "epoch": 1.3825857519788918, "grad_norm": 0.07523413968444899, "learning_rate": 0.00013066795385151532, "loss": 0.3024, "step": 1048 }, { "epoch": 1.383905013192612, "grad_norm": 0.07290551819419376, "learning_rate": 0.0001305217687731519, "loss": 0.3001, "step": 1049 }, { "epoch": 1.3852242744063323, "grad_norm": 0.07295802869109426, "learning_rate": 0.00013037551173363774, "loss": 0.2924, "step": 1050 }, { "epoch": 1.3865435356200528, "grad_norm": 0.07458024009240735, "learning_rate": 0.00013022918307780304, "loss": 0.3092, "step": 1051 }, { "epoch": 1.3878627968337731, "grad_norm": 0.073892647122203, "learning_rate": 0.00013008278315064674, "loss": 0.3031, "step": 1052 }, { "epoch": 1.3891820580474934, "grad_norm": 0.07411815773135678, "learning_rate": 0.00012993631229733582, "loss": 0.3041, "step": 1053 }, { "epoch": 1.3905013192612137, "grad_norm": 0.07497857993025513, "learning_rate": 0.0001297897708632045, "loss": 0.3124, "step": 1054 }, { "epoch": 1.391820580474934, "grad_norm": 0.07206992513813239, "learning_rate": 0.0001296431591937534, "loss": 0.3094, "step": 1055 }, { "epoch": 1.3931398416886545, "grad_norm": 0.07542573668373072, "learning_rate": 0.0001294964776346488, "loss": 0.2971, "step": 1056 }, { "epoch": 1.3944591029023747, "grad_norm": 0.07628648599717684, "learning_rate": 0.0001293497265317216, "loss": 0.3161, "step": 1057 }, { "epoch": 1.395778364116095, "grad_norm": 0.07523984002084788, "learning_rate": 0.00012920290623096682, "loss": 0.3085, "step": 1058 }, { "epoch": 1.3970976253298153, "grad_norm": 0.07149097825781824, "learning_rate": 0.00012905601707854255, "loss": 0.3048, "step": 1059 }, { "epoch": 1.3984168865435356, "grad_norm": 0.07508545093472142, "learning_rate": 0.00012890905942076927, "loss": 0.2934, "step": 1060 }, { "epoch": 1.3997361477572559, "grad_norm": 0.07364030529525083, "learning_rate": 0.00012876203360412888, "loss": 0.3006, "step": 1061 }, { "epoch": 1.4010554089709761, "grad_norm": 0.07256222405892282, "learning_rate": 0.0001286149399752641, "loss": 0.3028, "step": 1062 }, { "epoch": 1.4023746701846966, "grad_norm": 0.07329083009242973, "learning_rate": 0.0001284677788809774, "loss": 0.3082, "step": 1063 }, { "epoch": 1.403693931398417, "grad_norm": 0.07527721762781496, "learning_rate": 0.00012832055066823038, "loss": 0.3114, "step": 1064 }, { "epoch": 1.4050131926121372, "grad_norm": 0.074206531608657, "learning_rate": 0.00012817325568414297, "loss": 0.3054, "step": 1065 }, { "epoch": 1.4063324538258575, "grad_norm": 0.07846406391816182, "learning_rate": 0.00012802589427599235, "loss": 0.3102, "step": 1066 }, { "epoch": 1.4076517150395778, "grad_norm": 0.07506450290054201, "learning_rate": 0.00012787846679121242, "loss": 0.3065, "step": 1067 }, { "epoch": 1.4089709762532983, "grad_norm": 0.07020472438101781, "learning_rate": 0.00012773097357739288, "loss": 0.2956, "step": 1068 }, { "epoch": 1.4102902374670185, "grad_norm": 0.0715172776896225, "learning_rate": 0.00012758341498227835, "loss": 0.2967, "step": 1069 }, { "epoch": 1.4116094986807388, "grad_norm": 0.07228576771756227, "learning_rate": 0.0001274357913537676, "loss": 0.2983, "step": 1070 }, { "epoch": 1.412928759894459, "grad_norm": 0.0718401747896641, "learning_rate": 0.0001272881030399127, "loss": 0.297, "step": 1071 }, { "epoch": 1.4142480211081794, "grad_norm": 0.07258610203487502, "learning_rate": 0.0001271403503889184, "loss": 0.2988, "step": 1072 }, { "epoch": 1.4155672823218997, "grad_norm": 0.07275700396285821, "learning_rate": 0.0001269925337491409, "loss": 0.3077, "step": 1073 }, { "epoch": 1.41688654353562, "grad_norm": 0.07717509948562201, "learning_rate": 0.00012684465346908742, "loss": 0.2959, "step": 1074 }, { "epoch": 1.4182058047493404, "grad_norm": 0.07794419668048032, "learning_rate": 0.00012669670989741517, "loss": 0.3113, "step": 1075 }, { "epoch": 1.4195250659630607, "grad_norm": 0.07221937072385526, "learning_rate": 0.0001265487033829306, "loss": 0.3, "step": 1076 }, { "epoch": 1.420844327176781, "grad_norm": 0.07405386800569146, "learning_rate": 0.00012640063427458856, "loss": 0.3098, "step": 1077 }, { "epoch": 1.4221635883905013, "grad_norm": 0.07557782542761532, "learning_rate": 0.0001262525029214915, "loss": 0.3082, "step": 1078 }, { "epoch": 1.4234828496042216, "grad_norm": 0.07560691179123828, "learning_rate": 0.00012610430967288853, "loss": 0.3075, "step": 1079 }, { "epoch": 1.424802110817942, "grad_norm": 0.07665305802268893, "learning_rate": 0.00012595605487817482, "loss": 0.3195, "step": 1080 }, { "epoch": 1.4261213720316623, "grad_norm": 0.07505581307472523, "learning_rate": 0.00012580773888689055, "loss": 0.3004, "step": 1081 }, { "epoch": 1.4274406332453826, "grad_norm": 0.0727439261389258, "learning_rate": 0.00012565936204872018, "loss": 0.3009, "step": 1082 }, { "epoch": 1.428759894459103, "grad_norm": 0.07605645583643747, "learning_rate": 0.00012551092471349177, "loss": 0.3039, "step": 1083 }, { "epoch": 1.4300791556728232, "grad_norm": 0.07656565167320539, "learning_rate": 0.00012536242723117585, "loss": 0.305, "step": 1084 }, { "epoch": 1.4313984168865435, "grad_norm": 0.07194459517915269, "learning_rate": 0.00012521386995188483, "loss": 0.3052, "step": 1085 }, { "epoch": 1.4327176781002637, "grad_norm": 0.07426332065205554, "learning_rate": 0.00012506525322587207, "loss": 0.3025, "step": 1086 }, { "epoch": 1.4340369393139842, "grad_norm": 0.07529934494511434, "learning_rate": 0.00012491657740353114, "loss": 0.3072, "step": 1087 }, { "epoch": 1.4353562005277045, "grad_norm": 0.07306477594182653, "learning_rate": 0.00012476784283539487, "loss": 0.3003, "step": 1088 }, { "epoch": 1.4366754617414248, "grad_norm": 0.07165760803969894, "learning_rate": 0.00012461904987213468, "loss": 0.2911, "step": 1089 }, { "epoch": 1.437994722955145, "grad_norm": 0.07358633888776554, "learning_rate": 0.0001244701988645596, "loss": 0.2995, "step": 1090 }, { "epoch": 1.4393139841688654, "grad_norm": 0.07546492542991107, "learning_rate": 0.00012432129016361557, "loss": 0.2972, "step": 1091 }, { "epoch": 1.4406332453825859, "grad_norm": 0.07758300569780292, "learning_rate": 0.00012417232412038448, "loss": 0.3166, "step": 1092 }, { "epoch": 1.4419525065963061, "grad_norm": 0.07319586330032395, "learning_rate": 0.00012402330108608346, "loss": 0.3032, "step": 1093 }, { "epoch": 1.4432717678100264, "grad_norm": 0.07367263616342869, "learning_rate": 0.00012387422141206403, "loss": 0.3061, "step": 1094 }, { "epoch": 1.4445910290237467, "grad_norm": 0.07538260996596084, "learning_rate": 0.0001237250854498112, "loss": 0.3102, "step": 1095 }, { "epoch": 1.445910290237467, "grad_norm": 0.07933393637358932, "learning_rate": 0.00012357589355094275, "loss": 0.313, "step": 1096 }, { "epoch": 1.4472295514511873, "grad_norm": 0.0729976461520818, "learning_rate": 0.00012342664606720822, "loss": 0.3011, "step": 1097 }, { "epoch": 1.4485488126649075, "grad_norm": 0.07625787048840113, "learning_rate": 0.00012327734335048837, "loss": 0.3145, "step": 1098 }, { "epoch": 1.449868073878628, "grad_norm": 0.074398148145085, "learning_rate": 0.00012312798575279406, "loss": 0.2976, "step": 1099 }, { "epoch": 1.4511873350923483, "grad_norm": 0.07598593524027682, "learning_rate": 0.0001229785736262656, "loss": 0.3048, "step": 1100 }, { "epoch": 1.4525065963060686, "grad_norm": 0.07307365753437382, "learning_rate": 0.0001228291073231718, "loss": 0.307, "step": 1101 }, { "epoch": 1.4538258575197889, "grad_norm": 0.07527612542576476, "learning_rate": 0.00012267958719590935, "loss": 0.2998, "step": 1102 }, { "epoch": 1.4551451187335092, "grad_norm": 0.07162007175085688, "learning_rate": 0.00012253001359700165, "loss": 0.2983, "step": 1103 }, { "epoch": 1.4564643799472297, "grad_norm": 0.08194252748306062, "learning_rate": 0.0001223803868790983, "loss": 0.3147, "step": 1104 }, { "epoch": 1.45778364116095, "grad_norm": 0.07352321154327078, "learning_rate": 0.00012223070739497403, "loss": 0.303, "step": 1105 }, { "epoch": 1.4591029023746702, "grad_norm": 0.07175102497795863, "learning_rate": 0.00012208097549752813, "loss": 0.2939, "step": 1106 }, { "epoch": 1.4604221635883905, "grad_norm": 0.07435913913931984, "learning_rate": 0.00012193119153978332, "loss": 0.2994, "step": 1107 }, { "epoch": 1.4617414248021108, "grad_norm": 0.07582094186915156, "learning_rate": 0.00012178135587488515, "loss": 0.3107, "step": 1108 }, { "epoch": 1.463060686015831, "grad_norm": 0.07529050993597415, "learning_rate": 0.00012163146885610107, "loss": 0.3098, "step": 1109 }, { "epoch": 1.4643799472295513, "grad_norm": 0.07420601454201191, "learning_rate": 0.00012148153083681954, "loss": 0.3065, "step": 1110 }, { "epoch": 1.4656992084432718, "grad_norm": 0.07664962508501641, "learning_rate": 0.00012133154217054936, "loss": 0.3085, "step": 1111 }, { "epoch": 1.4670184696569921, "grad_norm": 0.07557899210174025, "learning_rate": 0.00012118150321091866, "loss": 0.304, "step": 1112 }, { "epoch": 1.4683377308707124, "grad_norm": 0.07799654867425963, "learning_rate": 0.0001210314143116742, "loss": 0.3089, "step": 1113 }, { "epoch": 1.4696569920844327, "grad_norm": 0.0770658159960671, "learning_rate": 0.00012088127582668045, "loss": 0.3025, "step": 1114 }, { "epoch": 1.470976253298153, "grad_norm": 0.07657029040224259, "learning_rate": 0.0001207310881099188, "loss": 0.3198, "step": 1115 }, { "epoch": 1.4722955145118735, "grad_norm": 0.07421863532373697, "learning_rate": 0.00012058085151548668, "loss": 0.3181, "step": 1116 }, { "epoch": 1.4736147757255937, "grad_norm": 0.07468052457090772, "learning_rate": 0.00012043056639759687, "loss": 0.3063, "step": 1117 }, { "epoch": 1.474934036939314, "grad_norm": 0.07648423491944498, "learning_rate": 0.00012028023311057641, "loss": 0.3149, "step": 1118 }, { "epoch": 1.4762532981530343, "grad_norm": 0.07335756703346823, "learning_rate": 0.00012012985200886602, "loss": 0.3035, "step": 1119 }, { "epoch": 1.4775725593667546, "grad_norm": 0.07363209027276707, "learning_rate": 0.00011997942344701906, "loss": 0.3123, "step": 1120 }, { "epoch": 1.4788918205804749, "grad_norm": 0.07281378816202612, "learning_rate": 0.0001198289477797009, "loss": 0.3034, "step": 1121 }, { "epoch": 1.4802110817941951, "grad_norm": 0.07088303089033027, "learning_rate": 0.00011967842536168785, "loss": 0.2901, "step": 1122 }, { "epoch": 1.4815303430079156, "grad_norm": 0.07524492579877382, "learning_rate": 0.00011952785654786651, "loss": 0.303, "step": 1123 }, { "epoch": 1.482849604221636, "grad_norm": 0.07442240375325328, "learning_rate": 0.00011937724169323286, "loss": 0.3092, "step": 1124 }, { "epoch": 1.4841688654353562, "grad_norm": 0.07291712547773904, "learning_rate": 0.00011922658115289141, "loss": 0.2938, "step": 1125 }, { "epoch": 1.4854881266490765, "grad_norm": 0.07354114423255759, "learning_rate": 0.00011907587528205444, "loss": 0.3031, "step": 1126 }, { "epoch": 1.4868073878627968, "grad_norm": 0.07725943247456882, "learning_rate": 0.00011892512443604102, "loss": 0.3059, "step": 1127 }, { "epoch": 1.4881266490765173, "grad_norm": 0.07410572851248295, "learning_rate": 0.00011877432897027637, "loss": 0.3062, "step": 1128 }, { "epoch": 1.4894459102902375, "grad_norm": 0.0747719190893773, "learning_rate": 0.00011862348924029074, "loss": 0.3016, "step": 1129 }, { "epoch": 1.4907651715039578, "grad_norm": 0.0760977819650773, "learning_rate": 0.00011847260560171896, "loss": 0.2961, "step": 1130 }, { "epoch": 1.492084432717678, "grad_norm": 0.07783068432808546, "learning_rate": 0.00011832167841029918, "loss": 0.3068, "step": 1131 }, { "epoch": 1.4934036939313984, "grad_norm": 0.07103986216589828, "learning_rate": 0.00011817070802187236, "loss": 0.2912, "step": 1132 }, { "epoch": 1.4947229551451187, "grad_norm": 0.07338343821493125, "learning_rate": 0.00011801969479238124, "loss": 0.294, "step": 1133 }, { "epoch": 1.496042216358839, "grad_norm": 0.07606833360193153, "learning_rate": 0.00011786863907786965, "loss": 0.3055, "step": 1134 }, { "epoch": 1.4973614775725594, "grad_norm": 0.07184449908575252, "learning_rate": 0.00011771754123448149, "loss": 0.3011, "step": 1135 }, { "epoch": 1.4986807387862797, "grad_norm": 0.074510399035344, "learning_rate": 0.00011756640161846002, "loss": 0.3049, "step": 1136 }, { "epoch": 1.5, "grad_norm": 0.07455538947210576, "learning_rate": 0.00011741522058614705, "loss": 0.307, "step": 1137 }, { "epoch": 1.5013192612137203, "grad_norm": 0.07645932887256915, "learning_rate": 0.0001172639984939819, "loss": 0.3082, "step": 1138 }, { "epoch": 1.5026385224274406, "grad_norm": 0.07282102121942846, "learning_rate": 0.00011711273569850087, "loss": 0.3015, "step": 1139 }, { "epoch": 1.503957783641161, "grad_norm": 0.07248724351737003, "learning_rate": 0.00011696143255633607, "loss": 0.3052, "step": 1140 }, { "epoch": 1.5052770448548811, "grad_norm": 0.07561445068566079, "learning_rate": 0.00011681008942421483, "loss": 0.3089, "step": 1141 }, { "epoch": 1.5065963060686016, "grad_norm": 0.07492786789842969, "learning_rate": 0.00011665870665895873, "loss": 0.3017, "step": 1142 }, { "epoch": 1.507915567282322, "grad_norm": 0.0744804988940395, "learning_rate": 0.0001165072846174828, "loss": 0.3046, "step": 1143 }, { "epoch": 1.5092348284960422, "grad_norm": 0.07638596571989263, "learning_rate": 0.00011635582365679467, "loss": 0.3018, "step": 1144 }, { "epoch": 1.5105540897097627, "grad_norm": 0.07444705299536425, "learning_rate": 0.00011620432413399371, "loss": 0.308, "step": 1145 }, { "epoch": 1.5118733509234827, "grad_norm": 0.07306003620839345, "learning_rate": 0.00011605278640627028, "loss": 0.3025, "step": 1146 }, { "epoch": 1.5131926121372032, "grad_norm": 0.07084865447089043, "learning_rate": 0.00011590121083090472, "loss": 0.2956, "step": 1147 }, { "epoch": 1.5145118733509235, "grad_norm": 0.07190053399987648, "learning_rate": 0.00011574959776526665, "loss": 0.3049, "step": 1148 }, { "epoch": 1.5158311345646438, "grad_norm": 0.07256206234631044, "learning_rate": 0.0001155979475668141, "loss": 0.301, "step": 1149 }, { "epoch": 1.517150395778364, "grad_norm": 0.0727935583794776, "learning_rate": 0.0001154462605930926, "loss": 0.3018, "step": 1150 }, { "epoch": 1.5184696569920844, "grad_norm": 0.07309567471470145, "learning_rate": 0.0001152945372017344, "loss": 0.295, "step": 1151 }, { "epoch": 1.5197889182058049, "grad_norm": 0.07656253522626154, "learning_rate": 0.00011514277775045768, "loss": 0.3105, "step": 1152 }, { "epoch": 1.521108179419525, "grad_norm": 0.07325994384259894, "learning_rate": 0.00011499098259706553, "loss": 0.2995, "step": 1153 }, { "epoch": 1.5224274406332454, "grad_norm": 0.07700839307716062, "learning_rate": 0.00011483915209944529, "loss": 0.3138, "step": 1154 }, { "epoch": 1.5237467018469657, "grad_norm": 0.07208397268521954, "learning_rate": 0.0001146872866155676, "loss": 0.2984, "step": 1155 }, { "epoch": 1.525065963060686, "grad_norm": 0.07505013420199366, "learning_rate": 0.00011453538650348559, "loss": 0.2954, "step": 1156 }, { "epoch": 1.5263852242744065, "grad_norm": 0.07387180694915102, "learning_rate": 0.00011438345212133406, "loss": 0.3023, "step": 1157 }, { "epoch": 1.5277044854881265, "grad_norm": 0.07293893447164769, "learning_rate": 0.00011423148382732853, "loss": 0.3038, "step": 1158 }, { "epoch": 1.529023746701847, "grad_norm": 0.07017901724657796, "learning_rate": 0.00011407948197976457, "loss": 0.2881, "step": 1159 }, { "epoch": 1.5303430079155673, "grad_norm": 0.07450635702046587, "learning_rate": 0.00011392744693701682, "loss": 0.2996, "step": 1160 }, { "epoch": 1.5316622691292876, "grad_norm": 0.07803330150077618, "learning_rate": 0.0001137753790575382, "loss": 0.321, "step": 1161 }, { "epoch": 1.5329815303430079, "grad_norm": 0.07322860069067531, "learning_rate": 0.000113623278699859, "loss": 0.3037, "step": 1162 }, { "epoch": 1.5343007915567282, "grad_norm": 0.07538386116650381, "learning_rate": 0.00011347114622258612, "loss": 0.2947, "step": 1163 }, { "epoch": 1.5356200527704487, "grad_norm": 0.07411792066801237, "learning_rate": 0.00011331898198440219, "loss": 0.2971, "step": 1164 }, { "epoch": 1.5369393139841687, "grad_norm": 0.07217191030270441, "learning_rate": 0.0001131667863440647, "loss": 0.3028, "step": 1165 }, { "epoch": 1.5382585751978892, "grad_norm": 0.07244234322327635, "learning_rate": 0.00011301455966040524, "loss": 0.2947, "step": 1166 }, { "epoch": 1.5395778364116095, "grad_norm": 0.0736096423149514, "learning_rate": 0.0001128623022923285, "loss": 0.3073, "step": 1167 }, { "epoch": 1.5408970976253298, "grad_norm": 0.07144909443893686, "learning_rate": 0.00011271001459881154, "loss": 0.2918, "step": 1168 }, { "epoch": 1.5422163588390503, "grad_norm": 0.07300999737343969, "learning_rate": 0.00011255769693890302, "loss": 0.2961, "step": 1169 }, { "epoch": 1.5435356200527703, "grad_norm": 0.06950485939274594, "learning_rate": 0.0001124053496717221, "loss": 0.2892, "step": 1170 }, { "epoch": 1.5448548812664908, "grad_norm": 0.07185749909892862, "learning_rate": 0.00011225297315645784, "loss": 0.2991, "step": 1171 }, { "epoch": 1.5461741424802111, "grad_norm": 0.07488325457185734, "learning_rate": 0.00011210056775236823, "loss": 0.2982, "step": 1172 }, { "epoch": 1.5474934036939314, "grad_norm": 0.07414414940250175, "learning_rate": 0.00011194813381877937, "loss": 0.3045, "step": 1173 }, { "epoch": 1.5488126649076517, "grad_norm": 0.07006748697825316, "learning_rate": 0.00011179567171508463, "loss": 0.2985, "step": 1174 }, { "epoch": 1.550131926121372, "grad_norm": 0.07346142847682055, "learning_rate": 0.00011164318180074377, "loss": 0.3132, "step": 1175 }, { "epoch": 1.5514511873350925, "grad_norm": 0.06912647460227159, "learning_rate": 0.00011149066443528218, "loss": 0.2945, "step": 1176 }, { "epoch": 1.5527704485488125, "grad_norm": 0.07527790781881226, "learning_rate": 0.00011133811997828991, "loss": 0.3017, "step": 1177 }, { "epoch": 1.554089709762533, "grad_norm": 0.0742784307901058, "learning_rate": 0.00011118554878942093, "loss": 0.3005, "step": 1178 }, { "epoch": 1.5554089709762533, "grad_norm": 0.07280090972299245, "learning_rate": 0.00011103295122839221, "loss": 0.3026, "step": 1179 }, { "epoch": 1.5567282321899736, "grad_norm": 0.07352539040404946, "learning_rate": 0.00011088032765498291, "loss": 0.2997, "step": 1180 }, { "epoch": 1.558047493403694, "grad_norm": 0.07629747221132757, "learning_rate": 0.00011072767842903346, "loss": 0.309, "step": 1181 }, { "epoch": 1.5593667546174141, "grad_norm": 0.07202143970462649, "learning_rate": 0.00011057500391044489, "loss": 0.2993, "step": 1182 }, { "epoch": 1.5606860158311346, "grad_norm": 0.0766136789796535, "learning_rate": 0.00011042230445917777, "loss": 0.3065, "step": 1183 }, { "epoch": 1.562005277044855, "grad_norm": 0.07276352097492844, "learning_rate": 0.00011026958043525144, "loss": 0.3034, "step": 1184 }, { "epoch": 1.5633245382585752, "grad_norm": 0.07258337081849642, "learning_rate": 0.00011011683219874323, "loss": 0.3057, "step": 1185 }, { "epoch": 1.5646437994722955, "grad_norm": 0.07359918799761811, "learning_rate": 0.00010996406010978757, "loss": 0.3015, "step": 1186 }, { "epoch": 1.5659630606860158, "grad_norm": 0.07261470585636269, "learning_rate": 0.0001098112645285751, "loss": 0.3052, "step": 1187 }, { "epoch": 1.5672823218997363, "grad_norm": 0.07173274817240935, "learning_rate": 0.00010965844581535178, "loss": 0.2902, "step": 1188 }, { "epoch": 1.5686015831134563, "grad_norm": 0.0725924708882381, "learning_rate": 0.00010950560433041826, "loss": 0.3068, "step": 1189 }, { "epoch": 1.5699208443271768, "grad_norm": 0.07192261927161327, "learning_rate": 0.00010935274043412876, "loss": 0.3097, "step": 1190 }, { "epoch": 1.571240105540897, "grad_norm": 0.07243958807773104, "learning_rate": 0.00010919985448689031, "loss": 0.2933, "step": 1191 }, { "epoch": 1.5725593667546174, "grad_norm": 0.07382263070287906, "learning_rate": 0.00010904694684916208, "loss": 0.3133, "step": 1192 }, { "epoch": 1.5738786279683379, "grad_norm": 0.07317543523711359, "learning_rate": 0.00010889401788145423, "loss": 0.304, "step": 1193 }, { "epoch": 1.575197889182058, "grad_norm": 0.07384129612108159, "learning_rate": 0.00010874106794432728, "loss": 0.2955, "step": 1194 }, { "epoch": 1.5765171503957784, "grad_norm": 0.07189883522290734, "learning_rate": 0.00010858809739839118, "loss": 0.3018, "step": 1195 }, { "epoch": 1.5778364116094987, "grad_norm": 0.07321937680540481, "learning_rate": 0.00010843510660430447, "loss": 0.3149, "step": 1196 }, { "epoch": 1.579155672823219, "grad_norm": 0.07390188761914665, "learning_rate": 0.00010828209592277346, "loss": 0.292, "step": 1197 }, { "epoch": 1.5804749340369393, "grad_norm": 0.07382082249865383, "learning_rate": 0.00010812906571455128, "loss": 0.3094, "step": 1198 }, { "epoch": 1.5817941952506596, "grad_norm": 0.07353357099630324, "learning_rate": 0.00010797601634043713, "loss": 0.3119, "step": 1199 }, { "epoch": 1.58311345646438, "grad_norm": 0.0740953897561366, "learning_rate": 0.0001078229481612754, "loss": 0.3111, "step": 1200 }, { "epoch": 1.5844327176781001, "grad_norm": 0.07207612050942012, "learning_rate": 0.00010766986153795484, "loss": 0.3058, "step": 1201 }, { "epoch": 1.5857519788918206, "grad_norm": 0.07295966649257808, "learning_rate": 0.00010751675683140765, "loss": 0.3106, "step": 1202 }, { "epoch": 1.587071240105541, "grad_norm": 0.07161186904795262, "learning_rate": 0.00010736363440260869, "loss": 0.3036, "step": 1203 }, { "epoch": 1.5883905013192612, "grad_norm": 0.07237578263181016, "learning_rate": 0.00010721049461257456, "loss": 0.3057, "step": 1204 }, { "epoch": 1.5897097625329817, "grad_norm": 0.07495690444553119, "learning_rate": 0.00010705733782236285, "loss": 0.301, "step": 1205 }, { "epoch": 1.5910290237467017, "grad_norm": 0.07211502548266428, "learning_rate": 0.00010690416439307122, "loss": 0.3105, "step": 1206 }, { "epoch": 1.5923482849604222, "grad_norm": 0.0717090573628712, "learning_rate": 0.00010675097468583652, "loss": 0.2935, "step": 1207 }, { "epoch": 1.5936675461741425, "grad_norm": 0.07220256255693909, "learning_rate": 0.00010659776906183403, "loss": 0.2966, "step": 1208 }, { "epoch": 1.5949868073878628, "grad_norm": 0.07450699698270018, "learning_rate": 0.0001064445478822765, "loss": 0.3074, "step": 1209 }, { "epoch": 1.596306068601583, "grad_norm": 0.07240922565339371, "learning_rate": 0.00010629131150841343, "loss": 0.2897, "step": 1210 }, { "epoch": 1.5976253298153034, "grad_norm": 0.0739073611408613, "learning_rate": 0.00010613806030153004, "loss": 0.3012, "step": 1211 }, { "epoch": 1.5989445910290239, "grad_norm": 0.07009704311388419, "learning_rate": 0.00010598479462294663, "loss": 0.293, "step": 1212 }, { "epoch": 1.600263852242744, "grad_norm": 0.07232544044906013, "learning_rate": 0.00010583151483401754, "loss": 0.3022, "step": 1213 }, { "epoch": 1.6015831134564644, "grad_norm": 0.07569861170284063, "learning_rate": 0.00010567822129613043, "loss": 0.301, "step": 1214 }, { "epoch": 1.6029023746701847, "grad_norm": 0.0729782070589659, "learning_rate": 0.00010552491437070537, "loss": 0.302, "step": 1215 }, { "epoch": 1.604221635883905, "grad_norm": 0.07422638552959335, "learning_rate": 0.00010537159441919395, "loss": 0.2974, "step": 1216 }, { "epoch": 1.6055408970976255, "grad_norm": 0.07281399371036801, "learning_rate": 0.00010521826180307855, "loss": 0.3062, "step": 1217 }, { "epoch": 1.6068601583113455, "grad_norm": 0.07466537942460129, "learning_rate": 0.00010506491688387127, "loss": 0.3125, "step": 1218 }, { "epoch": 1.608179419525066, "grad_norm": 0.07562910184051136, "learning_rate": 0.0001049115600231134, "loss": 0.3127, "step": 1219 }, { "epoch": 1.6094986807387863, "grad_norm": 0.07262957331771358, "learning_rate": 0.00010475819158237425, "loss": 0.2979, "step": 1220 }, { "epoch": 1.6108179419525066, "grad_norm": 0.07400792017294598, "learning_rate": 0.00010460481192325045, "loss": 0.2961, "step": 1221 }, { "epoch": 1.6121372031662269, "grad_norm": 0.07130283798083814, "learning_rate": 0.00010445142140736515, "loss": 0.2999, "step": 1222 }, { "epoch": 1.6134564643799472, "grad_norm": 0.0740140815479883, "learning_rate": 0.00010429802039636705, "loss": 0.3, "step": 1223 }, { "epoch": 1.6147757255936677, "grad_norm": 0.07406016636793009, "learning_rate": 0.00010414460925192957, "loss": 0.3102, "step": 1224 }, { "epoch": 1.6160949868073877, "grad_norm": 0.07309010486861041, "learning_rate": 0.00010399118833575006, "loss": 0.3094, "step": 1225 }, { "epoch": 1.6174142480211082, "grad_norm": 0.07292980774069703, "learning_rate": 0.00010383775800954886, "loss": 0.2992, "step": 1226 }, { "epoch": 1.6187335092348285, "grad_norm": 0.07290086042348772, "learning_rate": 0.0001036843186350686, "loss": 0.2992, "step": 1227 }, { "epoch": 1.6200527704485488, "grad_norm": 0.07361963968224203, "learning_rate": 0.00010353087057407314, "loss": 0.2991, "step": 1228 }, { "epoch": 1.6213720316622693, "grad_norm": 0.07283273195380698, "learning_rate": 0.00010337741418834684, "loss": 0.2953, "step": 1229 }, { "epoch": 1.6226912928759893, "grad_norm": 0.0754531992072576, "learning_rate": 0.00010322394983969368, "loss": 0.3003, "step": 1230 }, { "epoch": 1.6240105540897098, "grad_norm": 0.06881799844981254, "learning_rate": 0.00010307047788993651, "loss": 0.2935, "step": 1231 }, { "epoch": 1.6253298153034301, "grad_norm": 0.0711156752630215, "learning_rate": 0.000102916998700916, "loss": 0.2896, "step": 1232 }, { "epoch": 1.6266490765171504, "grad_norm": 0.07209902024913686, "learning_rate": 0.00010276351263448989, "loss": 0.2967, "step": 1233 }, { "epoch": 1.6279683377308707, "grad_norm": 0.07235371244194405, "learning_rate": 0.00010261002005253218, "loss": 0.2974, "step": 1234 }, { "epoch": 1.629287598944591, "grad_norm": 0.072208948772814, "learning_rate": 0.00010245652131693219, "loss": 0.2933, "step": 1235 }, { "epoch": 1.6306068601583115, "grad_norm": 0.07387118069847427, "learning_rate": 0.0001023030167895938, "loss": 0.3117, "step": 1236 }, { "epoch": 1.6319261213720315, "grad_norm": 0.07609632112097228, "learning_rate": 0.00010214950683243452, "loss": 0.3034, "step": 1237 }, { "epoch": 1.633245382585752, "grad_norm": 0.07394004913641744, "learning_rate": 0.00010199599180738462, "loss": 0.3058, "step": 1238 }, { "epoch": 1.6345646437994723, "grad_norm": 0.07203283629657062, "learning_rate": 0.00010184247207638636, "loss": 0.2914, "step": 1239 }, { "epoch": 1.6358839050131926, "grad_norm": 0.07194745329648045, "learning_rate": 0.0001016889480013931, "loss": 0.2945, "step": 1240 }, { "epoch": 1.637203166226913, "grad_norm": 0.07484721742571852, "learning_rate": 0.00010153541994436849, "loss": 0.3086, "step": 1241 }, { "epoch": 1.6385224274406331, "grad_norm": 0.07485566672178946, "learning_rate": 0.00010138188826728543, "loss": 0.2987, "step": 1242 }, { "epoch": 1.6398416886543536, "grad_norm": 0.07338859705902533, "learning_rate": 0.00010122835333212548, "loss": 0.3058, "step": 1243 }, { "epoch": 1.641160949868074, "grad_norm": 0.07117389249590132, "learning_rate": 0.00010107481550087783, "loss": 0.3029, "step": 1244 }, { "epoch": 1.6424802110817942, "grad_norm": 0.07174587637890605, "learning_rate": 0.0001009212751355385, "loss": 0.3013, "step": 1245 }, { "epoch": 1.6437994722955145, "grad_norm": 0.07178099357781863, "learning_rate": 0.0001007677325981095, "loss": 0.2982, "step": 1246 }, { "epoch": 1.6451187335092348, "grad_norm": 0.0733944975534985, "learning_rate": 0.00010061418825059791, "loss": 0.2975, "step": 1247 }, { "epoch": 1.6464379947229553, "grad_norm": 0.07389297394270401, "learning_rate": 0.00010046064245501518, "loss": 0.3048, "step": 1248 }, { "epoch": 1.6477572559366753, "grad_norm": 0.07444130513860057, "learning_rate": 0.0001003070955733761, "loss": 0.2989, "step": 1249 }, { "epoch": 1.6490765171503958, "grad_norm": 0.07210440617025703, "learning_rate": 0.00010015354796769802, "loss": 0.3045, "step": 1250 }, { "epoch": 1.650395778364116, "grad_norm": 0.0720524129228272, "learning_rate": 0.0001, "loss": 0.3025, "step": 1251 }, { "epoch": 1.6517150395778364, "grad_norm": 0.073074044848758, "learning_rate": 9.9846452032302e-05, "loss": 0.3011, "step": 1252 }, { "epoch": 1.6530343007915569, "grad_norm": 0.07223364118079115, "learning_rate": 9.969290442662392e-05, "loss": 0.2936, "step": 1253 }, { "epoch": 1.654353562005277, "grad_norm": 0.07659328364020009, "learning_rate": 9.953935754498484e-05, "loss": 0.3113, "step": 1254 }, { "epoch": 1.6556728232189974, "grad_norm": 0.0759715103516261, "learning_rate": 9.938581174940211e-05, "loss": 0.313, "step": 1255 }, { "epoch": 1.6569920844327177, "grad_norm": 0.07113660653358184, "learning_rate": 9.923226740189053e-05, "loss": 0.2944, "step": 1256 }, { "epoch": 1.658311345646438, "grad_norm": 0.071376198432372, "learning_rate": 9.907872486446152e-05, "loss": 0.2989, "step": 1257 }, { "epoch": 1.6596306068601583, "grad_norm": 0.07170263772524575, "learning_rate": 9.892518449912219e-05, "loss": 0.2966, "step": 1258 }, { "epoch": 1.6609498680738786, "grad_norm": 0.07422294437037795, "learning_rate": 9.877164666787454e-05, "loss": 0.2984, "step": 1259 }, { "epoch": 1.662269129287599, "grad_norm": 0.07458256324615646, "learning_rate": 9.861811173271459e-05, "loss": 0.3074, "step": 1260 }, { "epoch": 1.6635883905013191, "grad_norm": 0.06896984627260254, "learning_rate": 9.846458005563154e-05, "loss": 0.2871, "step": 1261 }, { "epoch": 1.6649076517150396, "grad_norm": 0.0720563589507692, "learning_rate": 9.83110519986069e-05, "loss": 0.3, "step": 1262 }, { "epoch": 1.66622691292876, "grad_norm": 0.07529860844081325, "learning_rate": 9.815752792361368e-05, "loss": 0.3201, "step": 1263 }, { "epoch": 1.6675461741424802, "grad_norm": 0.07197766137478458, "learning_rate": 9.800400819261541e-05, "loss": 0.2974, "step": 1264 }, { "epoch": 1.6688654353562007, "grad_norm": 0.07727695396674339, "learning_rate": 9.785049316756552e-05, "loss": 0.2984, "step": 1265 }, { "epoch": 1.6701846965699207, "grad_norm": 0.0729188835608575, "learning_rate": 9.769698321040622e-05, "loss": 0.308, "step": 1266 }, { "epoch": 1.6715039577836412, "grad_norm": 0.07386011889981198, "learning_rate": 9.754347868306783e-05, "loss": 0.3107, "step": 1267 }, { "epoch": 1.6728232189973615, "grad_norm": 0.07191472336438554, "learning_rate": 9.738997994746786e-05, "loss": 0.307, "step": 1268 }, { "epoch": 1.6741424802110818, "grad_norm": 0.07473763593590306, "learning_rate": 9.723648736551015e-05, "loss": 0.303, "step": 1269 }, { "epoch": 1.675461741424802, "grad_norm": 0.07372279624147093, "learning_rate": 9.708300129908403e-05, "loss": 0.3068, "step": 1270 }, { "epoch": 1.6767810026385224, "grad_norm": 0.07452824169085999, "learning_rate": 9.692952211006351e-05, "loss": 0.3165, "step": 1271 }, { "epoch": 1.6781002638522429, "grad_norm": 0.07283380289338114, "learning_rate": 9.677605016030632e-05, "loss": 0.2956, "step": 1272 }, { "epoch": 1.679419525065963, "grad_norm": 0.07118029212959787, "learning_rate": 9.662258581165319e-05, "loss": 0.2942, "step": 1273 }, { "epoch": 1.6807387862796834, "grad_norm": 0.07407446915048656, "learning_rate": 9.646912942592689e-05, "loss": 0.307, "step": 1274 }, { "epoch": 1.6820580474934037, "grad_norm": 0.07274235374327014, "learning_rate": 9.631568136493142e-05, "loss": 0.2999, "step": 1275 }, { "epoch": 1.683377308707124, "grad_norm": 0.07211465201662444, "learning_rate": 9.616224199045115e-05, "loss": 0.3021, "step": 1276 }, { "epoch": 1.6846965699208445, "grad_norm": 0.0717201669126976, "learning_rate": 9.600881166424998e-05, "loss": 0.2972, "step": 1277 }, { "epoch": 1.6860158311345645, "grad_norm": 0.07296246309847129, "learning_rate": 9.585539074807047e-05, "loss": 0.2932, "step": 1278 }, { "epoch": 1.687335092348285, "grad_norm": 0.07304573305459049, "learning_rate": 9.570197960363298e-05, "loss": 0.2947, "step": 1279 }, { "epoch": 1.6886543535620053, "grad_norm": 0.07446814822383499, "learning_rate": 9.554857859263486e-05, "loss": 0.3062, "step": 1280 }, { "epoch": 1.6899736147757256, "grad_norm": 0.07109546520025324, "learning_rate": 9.539518807674957e-05, "loss": 0.2891, "step": 1281 }, { "epoch": 1.6912928759894459, "grad_norm": 0.07486052838307834, "learning_rate": 9.524180841762577e-05, "loss": 0.3039, "step": 1282 }, { "epoch": 1.6926121372031662, "grad_norm": 0.07589831574107357, "learning_rate": 9.508843997688662e-05, "loss": 0.3104, "step": 1283 }, { "epoch": 1.6939313984168867, "grad_norm": 0.07215651158262124, "learning_rate": 9.493508311612874e-05, "loss": 0.2949, "step": 1284 }, { "epoch": 1.6952506596306067, "grad_norm": 0.0712626489408401, "learning_rate": 9.478173819692149e-05, "loss": 0.2994, "step": 1285 }, { "epoch": 1.6965699208443272, "grad_norm": 0.07196626264893126, "learning_rate": 9.462840558080606e-05, "loss": 0.291, "step": 1286 }, { "epoch": 1.6978891820580475, "grad_norm": 0.07269170181437803, "learning_rate": 9.447508562929465e-05, "loss": 0.2996, "step": 1287 }, { "epoch": 1.6992084432717678, "grad_norm": 0.07433133229135581, "learning_rate": 9.432177870386958e-05, "loss": 0.303, "step": 1288 }, { "epoch": 1.7005277044854883, "grad_norm": 0.07179279257304105, "learning_rate": 9.416848516598249e-05, "loss": 0.295, "step": 1289 }, { "epoch": 1.7018469656992083, "grad_norm": 0.06986187422520702, "learning_rate": 9.401520537705339e-05, "loss": 0.2971, "step": 1290 }, { "epoch": 1.7031662269129288, "grad_norm": 0.07415679397097151, "learning_rate": 9.386193969846999e-05, "loss": 0.2926, "step": 1291 }, { "epoch": 1.7044854881266491, "grad_norm": 0.07283178151596852, "learning_rate": 9.370868849158661e-05, "loss": 0.3092, "step": 1292 }, { "epoch": 1.7058047493403694, "grad_norm": 0.07235266199215151, "learning_rate": 9.35554521177235e-05, "loss": 0.2967, "step": 1293 }, { "epoch": 1.7071240105540897, "grad_norm": 0.07166315824829765, "learning_rate": 9.3402230938166e-05, "loss": 0.2937, "step": 1294 }, { "epoch": 1.70844327176781, "grad_norm": 0.07353230319915043, "learning_rate": 9.324902531416349e-05, "loss": 0.2981, "step": 1295 }, { "epoch": 1.7097625329815305, "grad_norm": 0.07436089727879845, "learning_rate": 9.30958356069288e-05, "loss": 0.2985, "step": 1296 }, { "epoch": 1.7110817941952505, "grad_norm": 0.0723086817854819, "learning_rate": 9.294266217763716e-05, "loss": 0.2887, "step": 1297 }, { "epoch": 1.712401055408971, "grad_norm": 0.07119921038658288, "learning_rate": 9.278950538742547e-05, "loss": 0.3075, "step": 1298 }, { "epoch": 1.7137203166226913, "grad_norm": 0.07272600640681783, "learning_rate": 9.263636559739132e-05, "loss": 0.3056, "step": 1299 }, { "epoch": 1.7150395778364116, "grad_norm": 0.06996160255360119, "learning_rate": 9.248324316859237e-05, "loss": 0.2934, "step": 1300 }, { "epoch": 1.716358839050132, "grad_norm": 0.0704114660466686, "learning_rate": 9.233013846204518e-05, "loss": 0.2865, "step": 1301 }, { "epoch": 1.7176781002638521, "grad_norm": 0.07223443306709594, "learning_rate": 9.217705183872462e-05, "loss": 0.3064, "step": 1302 }, { "epoch": 1.7189973614775726, "grad_norm": 0.07449544064110772, "learning_rate": 9.202398365956291e-05, "loss": 0.3115, "step": 1303 }, { "epoch": 1.720316622691293, "grad_norm": 0.0755234789962857, "learning_rate": 9.187093428544876e-05, "loss": 0.2952, "step": 1304 }, { "epoch": 1.7216358839050132, "grad_norm": 0.07065304053008248, "learning_rate": 9.171790407722656e-05, "loss": 0.2948, "step": 1305 }, { "epoch": 1.7229551451187335, "grad_norm": 0.06929630711862488, "learning_rate": 9.156489339569554e-05, "loss": 0.2889, "step": 1306 }, { "epoch": 1.7242744063324538, "grad_norm": 0.07121462130946202, "learning_rate": 9.141190260160885e-05, "loss": 0.2899, "step": 1307 }, { "epoch": 1.7255936675461743, "grad_norm": 0.0749707940425762, "learning_rate": 9.125893205567273e-05, "loss": 0.3014, "step": 1308 }, { "epoch": 1.7269129287598943, "grad_norm": 0.07359878035248245, "learning_rate": 9.11059821185458e-05, "loss": 0.2956, "step": 1309 }, { "epoch": 1.7282321899736148, "grad_norm": 0.07240785715236389, "learning_rate": 9.095305315083795e-05, "loss": 0.3043, "step": 1310 }, { "epoch": 1.729551451187335, "grad_norm": 0.07171691299463546, "learning_rate": 9.08001455131097e-05, "loss": 0.2944, "step": 1311 }, { "epoch": 1.7308707124010554, "grad_norm": 0.07267758230149278, "learning_rate": 9.064725956587128e-05, "loss": 0.302, "step": 1312 }, { "epoch": 1.732189973614776, "grad_norm": 0.0720850016975907, "learning_rate": 9.049439566958175e-05, "loss": 0.2973, "step": 1313 }, { "epoch": 1.733509234828496, "grad_norm": 0.07368756087548865, "learning_rate": 9.034155418464823e-05, "loss": 0.3072, "step": 1314 }, { "epoch": 1.7348284960422165, "grad_norm": 0.07381934291488089, "learning_rate": 9.018873547142494e-05, "loss": 0.2983, "step": 1315 }, { "epoch": 1.7361477572559367, "grad_norm": 0.07129113383273239, "learning_rate": 9.003593989021244e-05, "loss": 0.302, "step": 1316 }, { "epoch": 1.737467018469657, "grad_norm": 0.07051318327262807, "learning_rate": 8.98831678012568e-05, "loss": 0.301, "step": 1317 }, { "epoch": 1.7387862796833773, "grad_norm": 0.07189010296318508, "learning_rate": 8.973041956474861e-05, "loss": 0.3041, "step": 1318 }, { "epoch": 1.7401055408970976, "grad_norm": 0.07355699193699618, "learning_rate": 8.95776955408223e-05, "loss": 0.2973, "step": 1319 }, { "epoch": 1.741424802110818, "grad_norm": 0.07383834368816158, "learning_rate": 8.942499608955516e-05, "loss": 0.3001, "step": 1320 }, { "epoch": 1.7427440633245381, "grad_norm": 0.07074009137866137, "learning_rate": 8.927232157096656e-05, "loss": 0.2945, "step": 1321 }, { "epoch": 1.7440633245382586, "grad_norm": 0.0733359723142765, "learning_rate": 8.911967234501713e-05, "loss": 0.2904, "step": 1322 }, { "epoch": 1.745382585751979, "grad_norm": 0.07188282005258972, "learning_rate": 8.896704877160782e-05, "loss": 0.2936, "step": 1323 }, { "epoch": 1.7467018469656992, "grad_norm": 0.07595066053484852, "learning_rate": 8.881445121057909e-05, "loss": 0.2967, "step": 1324 }, { "epoch": 1.7480211081794197, "grad_norm": 0.07343535021319497, "learning_rate": 8.86618800217101e-05, "loss": 0.2933, "step": 1325 }, { "epoch": 1.7493403693931397, "grad_norm": 0.07233577070059728, "learning_rate": 8.850933556471785e-05, "loss": 0.2955, "step": 1326 }, { "epoch": 1.7506596306068603, "grad_norm": 0.07295789009178617, "learning_rate": 8.835681819925626e-05, "loss": 0.296, "step": 1327 }, { "epoch": 1.7519788918205803, "grad_norm": 0.07474958746117713, "learning_rate": 8.820432828491542e-05, "loss": 0.2928, "step": 1328 }, { "epoch": 1.7532981530343008, "grad_norm": 0.07238442209087488, "learning_rate": 8.805186618122068e-05, "loss": 0.2983, "step": 1329 }, { "epoch": 1.754617414248021, "grad_norm": 0.07100502107284067, "learning_rate": 8.789943224763182e-05, "loss": 0.2937, "step": 1330 }, { "epoch": 1.7559366754617414, "grad_norm": 0.07263061007812549, "learning_rate": 8.77470268435422e-05, "loss": 0.2949, "step": 1331 }, { "epoch": 1.7572559366754619, "grad_norm": 0.07295721340633157, "learning_rate": 8.759465032827794e-05, "loss": 0.3017, "step": 1332 }, { "epoch": 1.758575197889182, "grad_norm": 0.06943937971174761, "learning_rate": 8.7442303061097e-05, "loss": 0.2852, "step": 1333 }, { "epoch": 1.7598944591029024, "grad_norm": 0.07271975341680349, "learning_rate": 8.728998540118847e-05, "loss": 0.2952, "step": 1334 }, { "epoch": 1.7612137203166227, "grad_norm": 0.07022722437848619, "learning_rate": 8.713769770767155e-05, "loss": 0.2878, "step": 1335 }, { "epoch": 1.762532981530343, "grad_norm": 0.07147393627503353, "learning_rate": 8.69854403395948e-05, "loss": 0.2992, "step": 1336 }, { "epoch": 1.7638522427440633, "grad_norm": 0.07289460849092237, "learning_rate": 8.683321365593532e-05, "loss": 0.3018, "step": 1337 }, { "epoch": 1.7651715039577835, "grad_norm": 0.07240022946655487, "learning_rate": 8.668101801559786e-05, "loss": 0.2985, "step": 1338 }, { "epoch": 1.766490765171504, "grad_norm": 0.0716691420251464, "learning_rate": 8.652885377741393e-05, "loss": 0.2935, "step": 1339 }, { "epoch": 1.767810026385224, "grad_norm": 0.07207159696812064, "learning_rate": 8.637672130014105e-05, "loss": 0.2905, "step": 1340 }, { "epoch": 1.7691292875989446, "grad_norm": 0.07191048120051187, "learning_rate": 8.622462094246184e-05, "loss": 0.2982, "step": 1341 }, { "epoch": 1.770448548812665, "grad_norm": 0.07214192126051991, "learning_rate": 8.607255306298319e-05, "loss": 0.2986, "step": 1342 }, { "epoch": 1.7717678100263852, "grad_norm": 0.07236961974307601, "learning_rate": 8.592051802023545e-05, "loss": 0.2992, "step": 1343 }, { "epoch": 1.7730870712401057, "grad_norm": 0.07207860119788628, "learning_rate": 8.57685161726715e-05, "loss": 0.3065, "step": 1344 }, { "epoch": 1.7744063324538257, "grad_norm": 0.07481909278822367, "learning_rate": 8.5616547878666e-05, "loss": 0.3063, "step": 1345 }, { "epoch": 1.7757255936675462, "grad_norm": 0.07285020743732017, "learning_rate": 8.546461349651445e-05, "loss": 0.2965, "step": 1346 }, { "epoch": 1.7770448548812665, "grad_norm": 0.07187139745182088, "learning_rate": 8.531271338443245e-05, "loss": 0.3009, "step": 1347 }, { "epoch": 1.7783641160949868, "grad_norm": 0.0741642664180546, "learning_rate": 8.516084790055476e-05, "loss": 0.302, "step": 1348 }, { "epoch": 1.779683377308707, "grad_norm": 0.07035210530126447, "learning_rate": 8.50090174029345e-05, "loss": 0.3001, "step": 1349 }, { "epoch": 1.7810026385224274, "grad_norm": 0.07338850952280973, "learning_rate": 8.485722224954237e-05, "loss": 0.3006, "step": 1350 }, { "epoch": 1.7823218997361479, "grad_norm": 0.0723028476909122, "learning_rate": 8.470546279826561e-05, "loss": 0.2912, "step": 1351 }, { "epoch": 1.783641160949868, "grad_norm": 0.07420208386911355, "learning_rate": 8.455373940690745e-05, "loss": 0.2985, "step": 1352 }, { "epoch": 1.7849604221635884, "grad_norm": 0.07392189807445411, "learning_rate": 8.440205243318595e-05, "loss": 0.2983, "step": 1353 }, { "epoch": 1.7862796833773087, "grad_norm": 0.07377471881670135, "learning_rate": 8.42504022347334e-05, "loss": 0.3074, "step": 1354 }, { "epoch": 1.787598944591029, "grad_norm": 0.07389089113022648, "learning_rate": 8.409878916909533e-05, "loss": 0.2935, "step": 1355 }, { "epoch": 1.7889182058047495, "grad_norm": 0.0728932681462815, "learning_rate": 8.394721359372977e-05, "loss": 0.306, "step": 1356 }, { "epoch": 1.7902374670184695, "grad_norm": 0.07294320084613275, "learning_rate": 8.379567586600632e-05, "loss": 0.3021, "step": 1357 }, { "epoch": 1.79155672823219, "grad_norm": 0.07338181561841968, "learning_rate": 8.364417634320538e-05, "loss": 0.3036, "step": 1358 }, { "epoch": 1.7928759894459103, "grad_norm": 0.0748952772322969, "learning_rate": 8.349271538251723e-05, "loss": 0.2973, "step": 1359 }, { "epoch": 1.7941952506596306, "grad_norm": 0.07200650451719053, "learning_rate": 8.33412933410413e-05, "loss": 0.2997, "step": 1360 }, { "epoch": 1.7955145118733509, "grad_norm": 0.07154628579335476, "learning_rate": 8.31899105757852e-05, "loss": 0.2934, "step": 1361 }, { "epoch": 1.7968337730870712, "grad_norm": 0.07408839741305837, "learning_rate": 8.303856744366396e-05, "loss": 0.305, "step": 1362 }, { "epoch": 1.7981530343007917, "grad_norm": 0.07354241074976191, "learning_rate": 8.288726430149917e-05, "loss": 0.2933, "step": 1363 }, { "epoch": 1.7994722955145117, "grad_norm": 0.07247768853895742, "learning_rate": 8.273600150601812e-05, "loss": 0.2985, "step": 1364 }, { "epoch": 1.8007915567282322, "grad_norm": 0.07474953123276755, "learning_rate": 8.2584779413853e-05, "loss": 0.3034, "step": 1365 }, { "epoch": 1.8021108179419525, "grad_norm": 0.07534316778822768, "learning_rate": 8.243359838154001e-05, "loss": 0.2939, "step": 1366 }, { "epoch": 1.8034300791556728, "grad_norm": 0.07280727348383001, "learning_rate": 8.228245876551857e-05, "loss": 0.3137, "step": 1367 }, { "epoch": 1.8047493403693933, "grad_norm": 0.07417663026998472, "learning_rate": 8.213136092213039e-05, "loss": 0.305, "step": 1368 }, { "epoch": 1.8060686015831133, "grad_norm": 0.07196645590783142, "learning_rate": 8.198030520761878e-05, "loss": 0.2941, "step": 1369 }, { "epoch": 1.8073878627968338, "grad_norm": 0.07271132579688099, "learning_rate": 8.182929197812769e-05, "loss": 0.3001, "step": 1370 }, { "epoch": 1.8087071240105541, "grad_norm": 0.07093958342110948, "learning_rate": 8.167832158970087e-05, "loss": 0.2993, "step": 1371 }, { "epoch": 1.8100263852242744, "grad_norm": 0.07284714007484283, "learning_rate": 8.15273943982811e-05, "loss": 0.2986, "step": 1372 }, { "epoch": 1.8113456464379947, "grad_norm": 0.07117187848013781, "learning_rate": 8.13765107597093e-05, "loss": 0.2798, "step": 1373 }, { "epoch": 1.812664907651715, "grad_norm": 0.06985870726681415, "learning_rate": 8.12256710297237e-05, "loss": 0.2889, "step": 1374 }, { "epoch": 1.8139841688654355, "grad_norm": 0.0725130211018302, "learning_rate": 8.107487556395901e-05, "loss": 0.2969, "step": 1375 }, { "epoch": 1.8153034300791555, "grad_norm": 0.07400551979555604, "learning_rate": 8.092412471794559e-05, "loss": 0.2973, "step": 1376 }, { "epoch": 1.816622691292876, "grad_norm": 0.07454774141795695, "learning_rate": 8.077341884710862e-05, "loss": 0.3081, "step": 1377 }, { "epoch": 1.8179419525065963, "grad_norm": 0.07066614268169025, "learning_rate": 8.06227583067672e-05, "loss": 0.2894, "step": 1378 }, { "epoch": 1.8192612137203166, "grad_norm": 0.07286914712697798, "learning_rate": 8.047214345213352e-05, "loss": 0.3058, "step": 1379 }, { "epoch": 1.820580474934037, "grad_norm": 0.07343564490460046, "learning_rate": 8.032157463831216e-05, "loss": 0.2936, "step": 1380 }, { "epoch": 1.8218997361477571, "grad_norm": 0.07242739061210822, "learning_rate": 8.01710522202991e-05, "loss": 0.3069, "step": 1381 }, { "epoch": 1.8232189973614776, "grad_norm": 0.07219119069486515, "learning_rate": 8.002057655298092e-05, "loss": 0.3037, "step": 1382 }, { "epoch": 1.824538258575198, "grad_norm": 0.0732537080437745, "learning_rate": 7.987014799113397e-05, "loss": 0.2928, "step": 1383 }, { "epoch": 1.8258575197889182, "grad_norm": 0.0701322220255456, "learning_rate": 7.971976688942359e-05, "loss": 0.292, "step": 1384 }, { "epoch": 1.8271767810026385, "grad_norm": 0.07407313823605201, "learning_rate": 7.956943360240314e-05, "loss": 0.3006, "step": 1385 }, { "epoch": 1.8284960422163588, "grad_norm": 0.07183063707692676, "learning_rate": 7.941914848451332e-05, "loss": 0.3005, "step": 1386 }, { "epoch": 1.8298153034300793, "grad_norm": 0.07479215326203949, "learning_rate": 7.926891189008123e-05, "loss": 0.2982, "step": 1387 }, { "epoch": 1.8311345646437993, "grad_norm": 0.07118994060902, "learning_rate": 7.911872417331957e-05, "loss": 0.2997, "step": 1388 }, { "epoch": 1.8324538258575198, "grad_norm": 0.07281228441262129, "learning_rate": 7.896858568832581e-05, "loss": 0.3018, "step": 1389 }, { "epoch": 1.83377308707124, "grad_norm": 0.07326196859113188, "learning_rate": 7.881849678908132e-05, "loss": 0.294, "step": 1390 }, { "epoch": 1.8350923482849604, "grad_norm": 0.07389049591779592, "learning_rate": 7.866845782945063e-05, "loss": 0.3129, "step": 1391 }, { "epoch": 1.8364116094986809, "grad_norm": 0.07201128452037596, "learning_rate": 7.851846916318046e-05, "loss": 0.3003, "step": 1392 }, { "epoch": 1.837730870712401, "grad_norm": 0.0751582408360859, "learning_rate": 7.836853114389894e-05, "loss": 0.3088, "step": 1393 }, { "epoch": 1.8390501319261214, "grad_norm": 0.06893641058993802, "learning_rate": 7.821864412511485e-05, "loss": 0.2878, "step": 1394 }, { "epoch": 1.8403693931398417, "grad_norm": 0.07338480970991253, "learning_rate": 7.806880846021669e-05, "loss": 0.309, "step": 1395 }, { "epoch": 1.841688654353562, "grad_norm": 0.07285902896314837, "learning_rate": 7.79190245024719e-05, "loss": 0.295, "step": 1396 }, { "epoch": 1.8430079155672823, "grad_norm": 0.07211994040805902, "learning_rate": 7.776929260502596e-05, "loss": 0.3002, "step": 1397 }, { "epoch": 1.8443271767810026, "grad_norm": 0.06955054452101624, "learning_rate": 7.761961312090174e-05, "loss": 0.2897, "step": 1398 }, { "epoch": 1.845646437994723, "grad_norm": 0.07065619570668409, "learning_rate": 7.746998640299836e-05, "loss": 0.2858, "step": 1399 }, { "epoch": 1.8469656992084431, "grad_norm": 0.07200342305344638, "learning_rate": 7.732041280409066e-05, "loss": 0.3021, "step": 1400 }, { "epoch": 1.8482849604221636, "grad_norm": 0.07153795939958889, "learning_rate": 7.717089267682818e-05, "loss": 0.2901, "step": 1401 }, { "epoch": 1.849604221635884, "grad_norm": 0.07321038616259445, "learning_rate": 7.702142637373442e-05, "loss": 0.3048, "step": 1402 }, { "epoch": 1.8509234828496042, "grad_norm": 0.07159233193140363, "learning_rate": 7.687201424720596e-05, "loss": 0.2988, "step": 1403 }, { "epoch": 1.8522427440633247, "grad_norm": 0.07233296867197198, "learning_rate": 7.672265664951165e-05, "loss": 0.2989, "step": 1404 }, { "epoch": 1.8535620052770447, "grad_norm": 0.07288905489394285, "learning_rate": 7.65733539327918e-05, "loss": 0.2881, "step": 1405 }, { "epoch": 1.8548812664907652, "grad_norm": 0.0729367134786086, "learning_rate": 7.642410644905726e-05, "loss": 0.2991, "step": 1406 }, { "epoch": 1.8562005277044855, "grad_norm": 0.0738354994563742, "learning_rate": 7.627491455018878e-05, "loss": 0.3018, "step": 1407 }, { "epoch": 1.8575197889182058, "grad_norm": 0.07323358857615325, "learning_rate": 7.612577858793595e-05, "loss": 0.3103, "step": 1408 }, { "epoch": 1.858839050131926, "grad_norm": 0.07160726735232838, "learning_rate": 7.597669891391652e-05, "loss": 0.3015, "step": 1409 }, { "epoch": 1.8601583113456464, "grad_norm": 0.07331182712246288, "learning_rate": 7.582767587961552e-05, "loss": 0.2917, "step": 1410 }, { "epoch": 1.8614775725593669, "grad_norm": 0.07209183797294927, "learning_rate": 7.567870983638443e-05, "loss": 0.3081, "step": 1411 }, { "epoch": 1.862796833773087, "grad_norm": 0.069885526955265, "learning_rate": 7.552980113544039e-05, "loss": 0.285, "step": 1412 }, { "epoch": 1.8641160949868074, "grad_norm": 0.07294811633790899, "learning_rate": 7.538095012786534e-05, "loss": 0.3047, "step": 1413 }, { "epoch": 1.8654353562005277, "grad_norm": 0.07632907438014232, "learning_rate": 7.523215716460514e-05, "loss": 0.3171, "step": 1414 }, { "epoch": 1.866754617414248, "grad_norm": 0.07163271633432247, "learning_rate": 7.508342259646887e-05, "loss": 0.3009, "step": 1415 }, { "epoch": 1.8680738786279685, "grad_norm": 0.07212390417166643, "learning_rate": 7.493474677412794e-05, "loss": 0.2907, "step": 1416 }, { "epoch": 1.8693931398416885, "grad_norm": 0.07462741092370367, "learning_rate": 7.478613004811519e-05, "loss": 0.2965, "step": 1417 }, { "epoch": 1.870712401055409, "grad_norm": 0.07518461374665197, "learning_rate": 7.463757276882415e-05, "loss": 0.3037, "step": 1418 }, { "epoch": 1.8720316622691293, "grad_norm": 0.07313846674769052, "learning_rate": 7.448907528650823e-05, "loss": 0.3044, "step": 1419 }, { "epoch": 1.8733509234828496, "grad_norm": 0.07405298896702835, "learning_rate": 7.43406379512798e-05, "loss": 0.2892, "step": 1420 }, { "epoch": 1.8746701846965699, "grad_norm": 0.07306831193831126, "learning_rate": 7.419226111310948e-05, "loss": 0.2985, "step": 1421 }, { "epoch": 1.8759894459102902, "grad_norm": 0.07694366585156719, "learning_rate": 7.40439451218252e-05, "loss": 0.311, "step": 1422 }, { "epoch": 1.8773087071240107, "grad_norm": 0.0726398060977391, "learning_rate": 7.389569032711146e-05, "loss": 0.3072, "step": 1423 }, { "epoch": 1.8786279683377307, "grad_norm": 0.0706711162495314, "learning_rate": 7.374749707850849e-05, "loss": 0.302, "step": 1424 }, { "epoch": 1.8799472295514512, "grad_norm": 0.07230579525771244, "learning_rate": 7.359936572541142e-05, "loss": 0.3022, "step": 1425 }, { "epoch": 1.8812664907651715, "grad_norm": 0.06989135459928073, "learning_rate": 7.345129661706939e-05, "loss": 0.296, "step": 1426 }, { "epoch": 1.8825857519788918, "grad_norm": 0.0734432892881329, "learning_rate": 7.330329010258483e-05, "loss": 0.2883, "step": 1427 }, { "epoch": 1.8839050131926123, "grad_norm": 0.07446450470647699, "learning_rate": 7.31553465309126e-05, "loss": 0.3056, "step": 1428 }, { "epoch": 1.8852242744063323, "grad_norm": 0.07409242987848859, "learning_rate": 7.300746625085912e-05, "loss": 0.3053, "step": 1429 }, { "epoch": 1.8865435356200528, "grad_norm": 0.07490733127522335, "learning_rate": 7.285964961108163e-05, "loss": 0.2957, "step": 1430 }, { "epoch": 1.8878627968337731, "grad_norm": 0.07222421512802478, "learning_rate": 7.271189696008729e-05, "loss": 0.3032, "step": 1431 }, { "epoch": 1.8891820580474934, "grad_norm": 0.07340916364303582, "learning_rate": 7.256420864623242e-05, "loss": 0.3064, "step": 1432 }, { "epoch": 1.8905013192612137, "grad_norm": 0.07307793326811854, "learning_rate": 7.241658501772166e-05, "loss": 0.2912, "step": 1433 }, { "epoch": 1.891820580474934, "grad_norm": 0.07607274356852967, "learning_rate": 7.226902642260711e-05, "loss": 0.2982, "step": 1434 }, { "epoch": 1.8931398416886545, "grad_norm": 0.07188270583034498, "learning_rate": 7.212153320878756e-05, "loss": 0.2964, "step": 1435 }, { "epoch": 1.8944591029023745, "grad_norm": 0.07202619309489428, "learning_rate": 7.197410572400765e-05, "loss": 0.2972, "step": 1436 }, { "epoch": 1.895778364116095, "grad_norm": 0.07230413447128566, "learning_rate": 7.182674431585704e-05, "loss": 0.2971, "step": 1437 }, { "epoch": 1.8970976253298153, "grad_norm": 0.07305670475936302, "learning_rate": 7.16794493317696e-05, "loss": 0.2945, "step": 1438 }, { "epoch": 1.8984168865435356, "grad_norm": 0.072015541412122, "learning_rate": 7.153222111902262e-05, "loss": 0.2923, "step": 1439 }, { "epoch": 1.899736147757256, "grad_norm": 0.07250186810865775, "learning_rate": 7.138506002473591e-05, "loss": 0.2975, "step": 1440 }, { "epoch": 1.9010554089709761, "grad_norm": 0.07220180886685396, "learning_rate": 7.12379663958711e-05, "loss": 0.3078, "step": 1441 }, { "epoch": 1.9023746701846966, "grad_norm": 0.07289208214728461, "learning_rate": 7.109094057923074e-05, "loss": 0.3116, "step": 1442 }, { "epoch": 1.903693931398417, "grad_norm": 0.07423008809673007, "learning_rate": 7.094398292145746e-05, "loss": 0.2965, "step": 1443 }, { "epoch": 1.9050131926121372, "grad_norm": 0.07301219792727907, "learning_rate": 7.079709376903321e-05, "loss": 0.2964, "step": 1444 }, { "epoch": 1.9063324538258575, "grad_norm": 0.0709929051440465, "learning_rate": 7.065027346827843e-05, "loss": 0.2998, "step": 1445 }, { "epoch": 1.9076517150395778, "grad_norm": 0.07291952494714533, "learning_rate": 7.050352236535125e-05, "loss": 0.2884, "step": 1446 }, { "epoch": 1.9089709762532983, "grad_norm": 0.07387213699305538, "learning_rate": 7.035684080624661e-05, "loss": 0.2983, "step": 1447 }, { "epoch": 1.9102902374670183, "grad_norm": 0.07301964587833668, "learning_rate": 7.021022913679554e-05, "loss": 0.2976, "step": 1448 }, { "epoch": 1.9116094986807388, "grad_norm": 0.07376538842862965, "learning_rate": 7.006368770266421e-05, "loss": 0.2995, "step": 1449 }, { "epoch": 1.912928759894459, "grad_norm": 0.07331476603227353, "learning_rate": 6.991721684935328e-05, "loss": 0.3002, "step": 1450 }, { "epoch": 1.9142480211081794, "grad_norm": 0.07439152965940933, "learning_rate": 6.977081692219698e-05, "loss": 0.3083, "step": 1451 }, { "epoch": 1.9155672823218999, "grad_norm": 0.07564989533269512, "learning_rate": 6.962448826636227e-05, "loss": 0.3069, "step": 1452 }, { "epoch": 1.91688654353562, "grad_norm": 0.07141401078197938, "learning_rate": 6.947823122684816e-05, "loss": 0.2912, "step": 1453 }, { "epoch": 1.9182058047493404, "grad_norm": 0.07121573561850901, "learning_rate": 6.933204614848471e-05, "loss": 0.2975, "step": 1454 }, { "epoch": 1.9195250659630607, "grad_norm": 0.07007461884984728, "learning_rate": 6.918593337593238e-05, "loss": 0.2979, "step": 1455 }, { "epoch": 1.920844327176781, "grad_norm": 0.07146991728879727, "learning_rate": 6.903989325368115e-05, "loss": 0.2904, "step": 1456 }, { "epoch": 1.9221635883905013, "grad_norm": 0.07042402871229377, "learning_rate": 6.88939261260497e-05, "loss": 0.2976, "step": 1457 }, { "epoch": 1.9234828496042216, "grad_norm": 0.06910008445599873, "learning_rate": 6.874803233718459e-05, "loss": 0.2893, "step": 1458 }, { "epoch": 1.924802110817942, "grad_norm": 0.07320501400849788, "learning_rate": 6.860221223105953e-05, "loss": 0.304, "step": 1459 }, { "epoch": 1.9261213720316621, "grad_norm": 0.07507551840436415, "learning_rate": 6.845646615147445e-05, "loss": 0.2988, "step": 1460 }, { "epoch": 1.9274406332453826, "grad_norm": 0.07468624732882859, "learning_rate": 6.83107944420548e-05, "loss": 0.3088, "step": 1461 }, { "epoch": 1.928759894459103, "grad_norm": 0.07175530987755845, "learning_rate": 6.81651974462506e-05, "loss": 0.2892, "step": 1462 }, { "epoch": 1.9300791556728232, "grad_norm": 0.07426216472492295, "learning_rate": 6.801967550733583e-05, "loss": 0.2958, "step": 1463 }, { "epoch": 1.9313984168865437, "grad_norm": 0.07481031757367486, "learning_rate": 6.787422896840743e-05, "loss": 0.307, "step": 1464 }, { "epoch": 1.9327176781002637, "grad_norm": 0.07030521887651615, "learning_rate": 6.77288581723846e-05, "loss": 0.2863, "step": 1465 }, { "epoch": 1.9340369393139842, "grad_norm": 0.07202610309146185, "learning_rate": 6.758356346200792e-05, "loss": 0.2875, "step": 1466 }, { "epoch": 1.9353562005277045, "grad_norm": 0.07629669385790117, "learning_rate": 6.743834517983865e-05, "loss": 0.3006, "step": 1467 }, { "epoch": 1.9366754617414248, "grad_norm": 0.07445565225175428, "learning_rate": 6.729320366825784e-05, "loss": 0.3044, "step": 1468 }, { "epoch": 1.937994722955145, "grad_norm": 0.07266565791811716, "learning_rate": 6.714813926946548e-05, "loss": 0.3046, "step": 1469 }, { "epoch": 1.9393139841688654, "grad_norm": 0.07221191481477221, "learning_rate": 6.700315232547981e-05, "loss": 0.2925, "step": 1470 }, { "epoch": 1.9406332453825859, "grad_norm": 0.06965751922923034, "learning_rate": 6.685824317813643e-05, "loss": 0.2927, "step": 1471 }, { "epoch": 1.941952506596306, "grad_norm": 0.0710919955648715, "learning_rate": 6.671341216908753e-05, "loss": 0.2997, "step": 1472 }, { "epoch": 1.9432717678100264, "grad_norm": 0.07226075185641738, "learning_rate": 6.656865963980105e-05, "loss": 0.3, "step": 1473 }, { "epoch": 1.9445910290237467, "grad_norm": 0.07037933810082755, "learning_rate": 6.642398593155996e-05, "loss": 0.2926, "step": 1474 }, { "epoch": 1.945910290237467, "grad_norm": 0.07224584960447299, "learning_rate": 6.627939138546129e-05, "loss": 0.2934, "step": 1475 }, { "epoch": 1.9472295514511875, "grad_norm": 0.07396659946143416, "learning_rate": 6.613487634241553e-05, "loss": 0.3069, "step": 1476 }, { "epoch": 1.9485488126649075, "grad_norm": 0.07279921508457786, "learning_rate": 6.599044114314569e-05, "loss": 0.2921, "step": 1477 }, { "epoch": 1.949868073878628, "grad_norm": 0.0718563726875386, "learning_rate": 6.58460861281865e-05, "loss": 0.2929, "step": 1478 }, { "epoch": 1.9511873350923483, "grad_norm": 0.07087336810308997, "learning_rate": 6.57018116378837e-05, "loss": 0.2971, "step": 1479 }, { "epoch": 1.9525065963060686, "grad_norm": 0.0720807852609401, "learning_rate": 6.555761801239313e-05, "loss": 0.2897, "step": 1480 }, { "epoch": 1.9538258575197889, "grad_norm": 0.07484630185336333, "learning_rate": 6.541350559167996e-05, "loss": 0.2996, "step": 1481 }, { "epoch": 1.9551451187335092, "grad_norm": 0.07232543907869944, "learning_rate": 6.526947471551798e-05, "loss": 0.3009, "step": 1482 }, { "epoch": 1.9564643799472297, "grad_norm": 0.07213844475575634, "learning_rate": 6.512552572348865e-05, "loss": 0.2977, "step": 1483 }, { "epoch": 1.9577836411609497, "grad_norm": 0.0734812331648321, "learning_rate": 6.498165895498038e-05, "loss": 0.2995, "step": 1484 }, { "epoch": 1.9591029023746702, "grad_norm": 0.07241458418395308, "learning_rate": 6.483787474918779e-05, "loss": 0.3141, "step": 1485 }, { "epoch": 1.9604221635883905, "grad_norm": 0.06986864914925695, "learning_rate": 6.469417344511076e-05, "loss": 0.2814, "step": 1486 }, { "epoch": 1.9617414248021108, "grad_norm": 0.07198567580893725, "learning_rate": 6.455055538155375e-05, "loss": 0.298, "step": 1487 }, { "epoch": 1.9630606860158313, "grad_norm": 0.07279944493436355, "learning_rate": 6.440702089712494e-05, "loss": 0.2843, "step": 1488 }, { "epoch": 1.9643799472295513, "grad_norm": 0.0713749316785241, "learning_rate": 6.426357033023549e-05, "loss": 0.3079, "step": 1489 }, { "epoch": 1.9656992084432718, "grad_norm": 0.07575114920700389, "learning_rate": 6.41202040190987e-05, "loss": 0.3137, "step": 1490 }, { "epoch": 1.9670184696569921, "grad_norm": 0.07207616634611753, "learning_rate": 6.397692230172918e-05, "loss": 0.2863, "step": 1491 }, { "epoch": 1.9683377308707124, "grad_norm": 0.07210080821715738, "learning_rate": 6.383372551594213e-05, "loss": 0.3017, "step": 1492 }, { "epoch": 1.9696569920844327, "grad_norm": 0.07281358660990626, "learning_rate": 6.369061399935255e-05, "loss": 0.3016, "step": 1493 }, { "epoch": 1.970976253298153, "grad_norm": 0.07215664542917345, "learning_rate": 6.35475880893743e-05, "loss": 0.3027, "step": 1494 }, { "epoch": 1.9722955145118735, "grad_norm": 0.0732561392660457, "learning_rate": 6.340464812321947e-05, "loss": 0.2888, "step": 1495 }, { "epoch": 1.9736147757255935, "grad_norm": 0.07242738106123589, "learning_rate": 6.326179443789752e-05, "loss": 0.2952, "step": 1496 }, { "epoch": 1.974934036939314, "grad_norm": 0.0720068506035546, "learning_rate": 6.311902737021447e-05, "loss": 0.2978, "step": 1497 }, { "epoch": 1.9762532981530343, "grad_norm": 0.07267069351075497, "learning_rate": 6.297634725677213e-05, "loss": 0.2993, "step": 1498 }, { "epoch": 1.9775725593667546, "grad_norm": 0.06973230205001282, "learning_rate": 6.283375443396726e-05, "loss": 0.2811, "step": 1499 }, { "epoch": 1.978891820580475, "grad_norm": 0.0730598871024424, "learning_rate": 6.26912492379909e-05, "loss": 0.2936, "step": 1500 }, { "epoch": 1.9802110817941951, "grad_norm": 0.0706662140250482, "learning_rate": 6.254883200482738e-05, "loss": 0.2849, "step": 1501 }, { "epoch": 1.9815303430079156, "grad_norm": 0.07291749294564975, "learning_rate": 6.240650307025373e-05, "loss": 0.2929, "step": 1502 }, { "epoch": 1.982849604221636, "grad_norm": 0.0731615687575588, "learning_rate": 6.22642627698388e-05, "loss": 0.3117, "step": 1503 }, { "epoch": 1.9841688654353562, "grad_norm": 0.06997740203514675, "learning_rate": 6.21221114389424e-05, "loss": 0.29, "step": 1504 }, { "epoch": 1.9854881266490765, "grad_norm": 0.07087050916215204, "learning_rate": 6.198004941271463e-05, "loss": 0.2994, "step": 1505 }, { "epoch": 1.9868073878627968, "grad_norm": 0.07214282194960821, "learning_rate": 6.183807702609502e-05, "loss": 0.2981, "step": 1506 }, { "epoch": 1.9881266490765173, "grad_norm": 0.07126707833206826, "learning_rate": 6.169619461381173e-05, "loss": 0.2887, "step": 1507 }, { "epoch": 1.9894459102902373, "grad_norm": 0.07627421909457675, "learning_rate": 6.155440251038083e-05, "loss": 0.3105, "step": 1508 }, { "epoch": 1.9907651715039578, "grad_norm": 0.07400391981355883, "learning_rate": 6.141270105010546e-05, "loss": 0.2994, "step": 1509 }, { "epoch": 1.992084432717678, "grad_norm": 0.07269305863072892, "learning_rate": 6.127109056707504e-05, "loss": 0.2953, "step": 1510 }, { "epoch": 1.9934036939313984, "grad_norm": 0.07382670467951749, "learning_rate": 6.11295713951645e-05, "loss": 0.3019, "step": 1511 }, { "epoch": 1.9947229551451189, "grad_norm": 0.07270032852782807, "learning_rate": 6.098814386803347e-05, "loss": 0.3018, "step": 1512 }, { "epoch": 1.996042216358839, "grad_norm": 0.07470991691222403, "learning_rate": 6.084680831912555e-05, "loss": 0.3111, "step": 1513 }, { "epoch": 1.9973614775725594, "grad_norm": 0.07192717798492458, "learning_rate": 6.07055650816674e-05, "loss": 0.2941, "step": 1514 }, { "epoch": 1.9986807387862797, "grad_norm": 0.07319009459420293, "learning_rate": 6.0564414488668165e-05, "loss": 0.291, "step": 1515 }, { "epoch": 2.0, "grad_norm": 0.07183619496651188, "learning_rate": 6.0423356872918424e-05, "loss": 0.2892, "step": 1516 }, { "epoch": 2.0, "eval_loss": 0.30579373240470886, "eval_runtime": 158.6773, "eval_samples_per_second": 32.172, "eval_steps_per_second": 1.008, "step": 1516 }, { "epoch": 2.0013192612137205, "grad_norm": 0.07050103777662692, "learning_rate": 6.028239256698964e-05, "loss": 0.2954, "step": 1517 }, { "epoch": 2.0026385224274406, "grad_norm": 0.07058621434947987, "learning_rate": 6.0141521903233235e-05, "loss": 0.278, "step": 1518 }, { "epoch": 2.003957783641161, "grad_norm": 0.06990419500168711, "learning_rate": 6.0000745213779873e-05, "loss": 0.2804, "step": 1519 }, { "epoch": 2.005277044854881, "grad_norm": 0.07089219999273959, "learning_rate": 5.986006283053866e-05, "loss": 0.2749, "step": 1520 }, { "epoch": 2.0065963060686016, "grad_norm": 0.07231818393155644, "learning_rate": 5.971947508519631e-05, "loss": 0.2789, "step": 1521 }, { "epoch": 2.007915567282322, "grad_norm": 0.07102385409447504, "learning_rate": 5.957898230921648e-05, "loss": 0.2678, "step": 1522 }, { "epoch": 2.009234828496042, "grad_norm": 0.07278898732956027, "learning_rate": 5.943858483383884e-05, "loss": 0.2816, "step": 1523 }, { "epoch": 2.0105540897097627, "grad_norm": 0.07754435559438543, "learning_rate": 5.929828299007845e-05, "loss": 0.2838, "step": 1524 }, { "epoch": 2.0118733509234827, "grad_norm": 0.0745058998115337, "learning_rate": 5.915807710872482e-05, "loss": 0.2759, "step": 1525 }, { "epoch": 2.0131926121372032, "grad_norm": 0.07278878341845896, "learning_rate": 5.901796752034128e-05, "loss": 0.2661, "step": 1526 }, { "epoch": 2.0145118733509233, "grad_norm": 0.07672918382752263, "learning_rate": 5.8877954555264034e-05, "loss": 0.2759, "step": 1527 }, { "epoch": 2.015831134564644, "grad_norm": 0.07633967902894165, "learning_rate": 5.8738038543601645e-05, "loss": 0.2826, "step": 1528 }, { "epoch": 2.0171503957783643, "grad_norm": 0.07796141634227628, "learning_rate": 5.859821981523391e-05, "loss": 0.2888, "step": 1529 }, { "epoch": 2.0184696569920844, "grad_norm": 0.07545488019892498, "learning_rate": 5.845849869981137e-05, "loss": 0.2875, "step": 1530 }, { "epoch": 2.019788918205805, "grad_norm": 0.07735578864681526, "learning_rate": 5.831887552675437e-05, "loss": 0.2846, "step": 1531 }, { "epoch": 2.021108179419525, "grad_norm": 0.0759479107528539, "learning_rate": 5.817935062525236e-05, "loss": 0.2845, "step": 1532 }, { "epoch": 2.0224274406332454, "grad_norm": 0.07554434796609166, "learning_rate": 5.803992432426313e-05, "loss": 0.2739, "step": 1533 }, { "epoch": 2.0237467018469655, "grad_norm": 0.0740304919748249, "learning_rate": 5.79005969525119e-05, "loss": 0.2778, "step": 1534 }, { "epoch": 2.025065963060686, "grad_norm": 0.07464027786408788, "learning_rate": 5.776136883849077e-05, "loss": 0.2684, "step": 1535 }, { "epoch": 2.0263852242744065, "grad_norm": 0.07403205581447105, "learning_rate": 5.762224031045769e-05, "loss": 0.2862, "step": 1536 }, { "epoch": 2.0277044854881265, "grad_norm": 0.07512509181894698, "learning_rate": 5.748321169643596e-05, "loss": 0.2797, "step": 1537 }, { "epoch": 2.029023746701847, "grad_norm": 0.0727781727430666, "learning_rate": 5.7344283324213156e-05, "loss": 0.2768, "step": 1538 }, { "epoch": 2.030343007915567, "grad_norm": 0.07505297733499403, "learning_rate": 5.7205455521340664e-05, "loss": 0.2787, "step": 1539 }, { "epoch": 2.0316622691292876, "grad_norm": 0.07347937499479017, "learning_rate": 5.706672861513262e-05, "loss": 0.2705, "step": 1540 }, { "epoch": 2.032981530343008, "grad_norm": 0.07292363282204666, "learning_rate": 5.692810293266538e-05, "loss": 0.2714, "step": 1541 }, { "epoch": 2.034300791556728, "grad_norm": 0.07878875352970884, "learning_rate": 5.6789578800776657e-05, "loss": 0.2894, "step": 1542 }, { "epoch": 2.0356200527704487, "grad_norm": 0.07601714419244184, "learning_rate": 5.665115654606459e-05, "loss": 0.2749, "step": 1543 }, { "epoch": 2.0369393139841687, "grad_norm": 0.07619820195883009, "learning_rate": 5.6512836494887325e-05, "loss": 0.2893, "step": 1544 }, { "epoch": 2.038258575197889, "grad_norm": 0.0758115346044772, "learning_rate": 5.637461897336185e-05, "loss": 0.2712, "step": 1545 }, { "epoch": 2.0395778364116097, "grad_norm": 0.07848988126547639, "learning_rate": 5.623650430736358e-05, "loss": 0.2883, "step": 1546 }, { "epoch": 2.04089709762533, "grad_norm": 0.07794275671233637, "learning_rate": 5.6098492822525285e-05, "loss": 0.2818, "step": 1547 }, { "epoch": 2.0422163588390503, "grad_norm": 0.07863560558453325, "learning_rate": 5.596058484423656e-05, "loss": 0.2828, "step": 1548 }, { "epoch": 2.0435356200527703, "grad_norm": 0.08042271609382062, "learning_rate": 5.5822780697643016e-05, "loss": 0.2917, "step": 1549 }, { "epoch": 2.044854881266491, "grad_norm": 0.07525755071691671, "learning_rate": 5.5685080707645265e-05, "loss": 0.2816, "step": 1550 }, { "epoch": 2.046174142480211, "grad_norm": 0.07617796838466234, "learning_rate": 5.554748519889858e-05, "loss": 0.2706, "step": 1551 }, { "epoch": 2.0474934036939314, "grad_norm": 0.0780285633698544, "learning_rate": 5.540999449581168e-05, "loss": 0.2787, "step": 1552 }, { "epoch": 2.048812664907652, "grad_norm": 0.07353927210568995, "learning_rate": 5.5272608922546373e-05, "loss": 0.2744, "step": 1553 }, { "epoch": 2.050131926121372, "grad_norm": 0.07702067913452867, "learning_rate": 5.513532880301645e-05, "loss": 0.2864, "step": 1554 }, { "epoch": 2.0514511873350925, "grad_norm": 0.07571524372866757, "learning_rate": 5.499815446088721e-05, "loss": 0.2803, "step": 1555 }, { "epoch": 2.0527704485488125, "grad_norm": 0.07663890107810534, "learning_rate": 5.4861086219574444e-05, "loss": 0.2796, "step": 1556 }, { "epoch": 2.054089709762533, "grad_norm": 0.07786067501284716, "learning_rate": 5.4724124402243837e-05, "loss": 0.2864, "step": 1557 }, { "epoch": 2.055408970976253, "grad_norm": 0.07467004583909588, "learning_rate": 5.458726933181022e-05, "loss": 0.2763, "step": 1558 }, { "epoch": 2.0567282321899736, "grad_norm": 0.07803885171819147, "learning_rate": 5.44505213309366e-05, "loss": 0.2833, "step": 1559 }, { "epoch": 2.058047493403694, "grad_norm": 0.07603803390910477, "learning_rate": 5.431388072203373e-05, "loss": 0.268, "step": 1560 }, { "epoch": 2.059366754617414, "grad_norm": 0.07380974794517471, "learning_rate": 5.417734782725896e-05, "loss": 0.2714, "step": 1561 }, { "epoch": 2.0606860158311346, "grad_norm": 0.07742246049491394, "learning_rate": 5.40409229685159e-05, "loss": 0.2894, "step": 1562 }, { "epoch": 2.0620052770448547, "grad_norm": 0.07786244705684475, "learning_rate": 5.3904606467453254e-05, "loss": 0.276, "step": 1563 }, { "epoch": 2.063324538258575, "grad_norm": 0.08090805658284124, "learning_rate": 5.376839864546438e-05, "loss": 0.2872, "step": 1564 }, { "epoch": 2.0646437994722957, "grad_norm": 0.07889232797617855, "learning_rate": 5.3632299823686295e-05, "loss": 0.2782, "step": 1565 }, { "epoch": 2.0659630606860158, "grad_norm": 0.07613182888539831, "learning_rate": 5.3496310322999134e-05, "loss": 0.289, "step": 1566 }, { "epoch": 2.0672823218997363, "grad_norm": 0.07859833842827041, "learning_rate": 5.3360430464025255e-05, "loss": 0.2829, "step": 1567 }, { "epoch": 2.0686015831134563, "grad_norm": 0.07959635187240913, "learning_rate": 5.3224660567128446e-05, "loss": 0.2817, "step": 1568 }, { "epoch": 2.069920844327177, "grad_norm": 0.07594034900517539, "learning_rate": 5.3089000952413346e-05, "loss": 0.2779, "step": 1569 }, { "epoch": 2.0712401055408973, "grad_norm": 0.07501145441498341, "learning_rate": 5.2953451939724454e-05, "loss": 0.258, "step": 1570 }, { "epoch": 2.0725593667546174, "grad_norm": 0.07617760674979711, "learning_rate": 5.281801384864564e-05, "loss": 0.2784, "step": 1571 }, { "epoch": 2.073878627968338, "grad_norm": 0.07604925076285504, "learning_rate": 5.268268699849912e-05, "loss": 0.272, "step": 1572 }, { "epoch": 2.075197889182058, "grad_norm": 0.07923999819437091, "learning_rate": 5.2547471708344975e-05, "loss": 0.2868, "step": 1573 }, { "epoch": 2.0765171503957784, "grad_norm": 0.07687922895089541, "learning_rate": 5.241236829698013e-05, "loss": 0.2865, "step": 1574 }, { "epoch": 2.0778364116094985, "grad_norm": 0.07869117770600417, "learning_rate": 5.2277377082937806e-05, "loss": 0.2825, "step": 1575 }, { "epoch": 2.079155672823219, "grad_norm": 0.07720800185209774, "learning_rate": 5.2142498384486726e-05, "loss": 0.2763, "step": 1576 }, { "epoch": 2.0804749340369395, "grad_norm": 0.07761023011074251, "learning_rate": 5.2007732519630245e-05, "loss": 0.2793, "step": 1577 }, { "epoch": 2.0817941952506596, "grad_norm": 0.07893484676717293, "learning_rate": 5.1873079806105785e-05, "loss": 0.2852, "step": 1578 }, { "epoch": 2.08311345646438, "grad_norm": 0.07941864554093903, "learning_rate": 5.173854056138389e-05, "loss": 0.2789, "step": 1579 }, { "epoch": 2.0844327176781, "grad_norm": 0.07606556027816859, "learning_rate": 5.160411510266768e-05, "loss": 0.279, "step": 1580 }, { "epoch": 2.0857519788918206, "grad_norm": 0.07794999001737717, "learning_rate": 5.146980374689192e-05, "loss": 0.2873, "step": 1581 }, { "epoch": 2.0870712401055407, "grad_norm": 0.07875749033919781, "learning_rate": 5.133560681072243e-05, "loss": 0.2855, "step": 1582 }, { "epoch": 2.088390501319261, "grad_norm": 0.07760804821154595, "learning_rate": 5.1201524610555165e-05, "loss": 0.2813, "step": 1583 }, { "epoch": 2.0897097625329817, "grad_norm": 0.07622442989507791, "learning_rate": 5.106755746251565e-05, "loss": 0.2699, "step": 1584 }, { "epoch": 2.0910290237467017, "grad_norm": 0.07829175213747275, "learning_rate": 5.0933705682458176e-05, "loss": 0.286, "step": 1585 }, { "epoch": 2.0923482849604222, "grad_norm": 0.07880850016323304, "learning_rate": 5.0799969585964914e-05, "loss": 0.2811, "step": 1586 }, { "epoch": 2.0936675461741423, "grad_norm": 0.07712351288717788, "learning_rate": 5.066634948834541e-05, "loss": 0.2861, "step": 1587 }, { "epoch": 2.094986807387863, "grad_norm": 0.07838299780496037, "learning_rate": 5.053284570463559e-05, "loss": 0.2879, "step": 1588 }, { "epoch": 2.0963060686015833, "grad_norm": 0.07726711372308724, "learning_rate": 5.0399458549597324e-05, "loss": 0.281, "step": 1589 }, { "epoch": 2.0976253298153034, "grad_norm": 0.07541209963972585, "learning_rate": 5.02661883377173e-05, "loss": 0.2838, "step": 1590 }, { "epoch": 2.098944591029024, "grad_norm": 0.079088645046273, "learning_rate": 5.013303538320665e-05, "loss": 0.2742, "step": 1591 }, { "epoch": 2.100263852242744, "grad_norm": 0.07474042475040342, "learning_rate": 5.000000000000002e-05, "loss": 0.2743, "step": 1592 }, { "epoch": 2.1015831134564644, "grad_norm": 0.074605317559238, "learning_rate": 4.986708250175476e-05, "loss": 0.2706, "step": 1593 }, { "epoch": 2.1029023746701845, "grad_norm": 0.0779506731188671, "learning_rate": 4.973428320185043e-05, "loss": 0.2855, "step": 1594 }, { "epoch": 2.104221635883905, "grad_norm": 0.07687299427549837, "learning_rate": 4.960160241338775e-05, "loss": 0.2694, "step": 1595 }, { "epoch": 2.1055408970976255, "grad_norm": 0.07927419710148012, "learning_rate": 4.9469040449188185e-05, "loss": 0.2798, "step": 1596 }, { "epoch": 2.1068601583113455, "grad_norm": 0.07730208453419181, "learning_rate": 4.9336597621792924e-05, "loss": 0.2735, "step": 1597 }, { "epoch": 2.108179419525066, "grad_norm": 0.07753868626857364, "learning_rate": 4.920427424346239e-05, "loss": 0.272, "step": 1598 }, { "epoch": 2.109498680738786, "grad_norm": 0.07638129250023187, "learning_rate": 4.9072070626175203e-05, "loss": 0.2833, "step": 1599 }, { "epoch": 2.1108179419525066, "grad_norm": 0.07938142856073933, "learning_rate": 4.89399870816278e-05, "loss": 0.2799, "step": 1600 }, { "epoch": 2.112137203166227, "grad_norm": 0.08141611343028621, "learning_rate": 4.8808023921233495e-05, "loss": 0.2785, "step": 1601 }, { "epoch": 2.113456464379947, "grad_norm": 0.08040965075024938, "learning_rate": 4.867618145612162e-05, "loss": 0.2922, "step": 1602 }, { "epoch": 2.1147757255936677, "grad_norm": 0.07871311460030936, "learning_rate": 4.854445999713715e-05, "loss": 0.2712, "step": 1603 }, { "epoch": 2.1160949868073877, "grad_norm": 0.07637278867792395, "learning_rate": 4.841285985483959e-05, "loss": 0.2856, "step": 1604 }, { "epoch": 2.1174142480211082, "grad_norm": 0.07487549784522321, "learning_rate": 4.8281381339502565e-05, "loss": 0.2779, "step": 1605 }, { "epoch": 2.1187335092348283, "grad_norm": 0.07862733649102978, "learning_rate": 4.81500247611128e-05, "loss": 0.2829, "step": 1606 }, { "epoch": 2.120052770448549, "grad_norm": 0.07714565313276452, "learning_rate": 4.8018790429369676e-05, "loss": 0.2794, "step": 1607 }, { "epoch": 2.1213720316622693, "grad_norm": 0.07637462153014393, "learning_rate": 4.7887678653684184e-05, "loss": 0.2716, "step": 1608 }, { "epoch": 2.1226912928759893, "grad_norm": 0.07535513940969046, "learning_rate": 4.7756689743178515e-05, "loss": 0.2802, "step": 1609 }, { "epoch": 2.12401055408971, "grad_norm": 0.07822807892779175, "learning_rate": 4.7625824006685136e-05, "loss": 0.2827, "step": 1610 }, { "epoch": 2.12532981530343, "grad_norm": 0.07447306460359324, "learning_rate": 4.749508175274605e-05, "loss": 0.2768, "step": 1611 }, { "epoch": 2.1266490765171504, "grad_norm": 0.07713604908901549, "learning_rate": 4.7364463289612215e-05, "loss": 0.2859, "step": 1612 }, { "epoch": 2.127968337730871, "grad_norm": 0.07627768917328373, "learning_rate": 4.723396892524261e-05, "loss": 0.2779, "step": 1613 }, { "epoch": 2.129287598944591, "grad_norm": 0.07874419913178575, "learning_rate": 4.710359896730379e-05, "loss": 0.2719, "step": 1614 }, { "epoch": 2.1306068601583115, "grad_norm": 0.07741675101547556, "learning_rate": 4.697335372316881e-05, "loss": 0.2732, "step": 1615 }, { "epoch": 2.1319261213720315, "grad_norm": 0.0772205510082391, "learning_rate": 4.684323349991686e-05, "loss": 0.2793, "step": 1616 }, { "epoch": 2.133245382585752, "grad_norm": 0.07818694073937388, "learning_rate": 4.671323860433222e-05, "loss": 0.2766, "step": 1617 }, { "epoch": 2.1345646437994725, "grad_norm": 0.07898914048473536, "learning_rate": 4.6583369342903806e-05, "loss": 0.2877, "step": 1618 }, { "epoch": 2.1358839050131926, "grad_norm": 0.07786077956593064, "learning_rate": 4.645362602182428e-05, "loss": 0.2747, "step": 1619 }, { "epoch": 2.137203166226913, "grad_norm": 0.07762553792259096, "learning_rate": 4.6324008946989314e-05, "loss": 0.2814, "step": 1620 }, { "epoch": 2.138522427440633, "grad_norm": 0.07802449130353682, "learning_rate": 4.619451842399707e-05, "loss": 0.2689, "step": 1621 }, { "epoch": 2.1398416886543536, "grad_norm": 0.07765207033350463, "learning_rate": 4.6065154758147154e-05, "loss": 0.2824, "step": 1622 }, { "epoch": 2.1411609498680737, "grad_norm": 0.07912036181077796, "learning_rate": 4.593591825444028e-05, "loss": 0.2844, "step": 1623 }, { "epoch": 2.142480211081794, "grad_norm": 0.07795668614196342, "learning_rate": 4.5806809217577165e-05, "loss": 0.2814, "step": 1624 }, { "epoch": 2.1437994722955147, "grad_norm": 0.07644957953798454, "learning_rate": 4.567782795195816e-05, "loss": 0.2742, "step": 1625 }, { "epoch": 2.1451187335092348, "grad_norm": 0.07936515653856664, "learning_rate": 4.554897476168223e-05, "loss": 0.2864, "step": 1626 }, { "epoch": 2.1464379947229553, "grad_norm": 0.0814024677243052, "learning_rate": 4.542024995054647e-05, "loss": 0.2769, "step": 1627 }, { "epoch": 2.1477572559366753, "grad_norm": 0.07775442147462296, "learning_rate": 4.529165382204531e-05, "loss": 0.277, "step": 1628 }, { "epoch": 2.149076517150396, "grad_norm": 0.07882098840328912, "learning_rate": 4.516318667936967e-05, "loss": 0.2814, "step": 1629 }, { "epoch": 2.150395778364116, "grad_norm": 0.07882348082172963, "learning_rate": 4.5034848825406505e-05, "loss": 0.2769, "step": 1630 }, { "epoch": 2.1517150395778364, "grad_norm": 0.07798975708634262, "learning_rate": 4.49066405627378e-05, "loss": 0.277, "step": 1631 }, { "epoch": 2.153034300791557, "grad_norm": 0.07958846171492341, "learning_rate": 4.477856219364015e-05, "loss": 0.2715, "step": 1632 }, { "epoch": 2.154353562005277, "grad_norm": 0.07813972190285003, "learning_rate": 4.465061402008375e-05, "loss": 0.278, "step": 1633 }, { "epoch": 2.1556728232189974, "grad_norm": 0.07671075187281945, "learning_rate": 4.4522796343731956e-05, "loss": 0.2772, "step": 1634 }, { "epoch": 2.1569920844327175, "grad_norm": 0.08085610206620415, "learning_rate": 4.43951094659404e-05, "loss": 0.2929, "step": 1635 }, { "epoch": 2.158311345646438, "grad_norm": 0.07701826693149255, "learning_rate": 4.426755368775637e-05, "loss": 0.2902, "step": 1636 }, { "epoch": 2.1596306068601585, "grad_norm": 0.07640700882038313, "learning_rate": 4.414012930991795e-05, "loss": 0.2724, "step": 1637 }, { "epoch": 2.1609498680738786, "grad_norm": 0.0807333610045424, "learning_rate": 4.401283663285355e-05, "loss": 0.2809, "step": 1638 }, { "epoch": 2.162269129287599, "grad_norm": 0.07618185021865422, "learning_rate": 4.388567595668103e-05, "loss": 0.2828, "step": 1639 }, { "epoch": 2.163588390501319, "grad_norm": 0.07884807794217702, "learning_rate": 4.375864758120696e-05, "loss": 0.2763, "step": 1640 }, { "epoch": 2.1649076517150396, "grad_norm": 0.07946106665523008, "learning_rate": 4.363175180592611e-05, "loss": 0.2776, "step": 1641 }, { "epoch": 2.16622691292876, "grad_norm": 0.07776293271607525, "learning_rate": 4.3504988930020483e-05, "loss": 0.2727, "step": 1642 }, { "epoch": 2.16754617414248, "grad_norm": 0.07841293295364189, "learning_rate": 4.337835925235888e-05, "loss": 0.2827, "step": 1643 }, { "epoch": 2.1688654353562007, "grad_norm": 0.07914327289085935, "learning_rate": 4.325186307149593e-05, "loss": 0.2891, "step": 1644 }, { "epoch": 2.1701846965699207, "grad_norm": 0.07799310461784408, "learning_rate": 4.312550068567165e-05, "loss": 0.2818, "step": 1645 }, { "epoch": 2.1715039577836412, "grad_norm": 0.08074266310648874, "learning_rate": 4.2999272392810455e-05, "loss": 0.2816, "step": 1646 }, { "epoch": 2.1728232189973613, "grad_norm": 0.07798819362969231, "learning_rate": 4.287317849052075e-05, "loss": 0.2789, "step": 1647 }, { "epoch": 2.174142480211082, "grad_norm": 0.07882570434860331, "learning_rate": 4.2747219276094064e-05, "loss": 0.2841, "step": 1648 }, { "epoch": 2.1754617414248023, "grad_norm": 0.07696569533949392, "learning_rate": 4.2621395046504255e-05, "loss": 0.2822, "step": 1649 }, { "epoch": 2.1767810026385224, "grad_norm": 0.07906950633508111, "learning_rate": 4.2495706098407085e-05, "loss": 0.2861, "step": 1650 }, { "epoch": 2.178100263852243, "grad_norm": 0.07865225725214663, "learning_rate": 4.2370152728139234e-05, "loss": 0.2757, "step": 1651 }, { "epoch": 2.179419525065963, "grad_norm": 0.07863917473106233, "learning_rate": 4.224473523171784e-05, "loss": 0.2821, "step": 1652 }, { "epoch": 2.1807387862796834, "grad_norm": 0.07903405153374614, "learning_rate": 4.2119453904839565e-05, "loss": 0.2889, "step": 1653 }, { "epoch": 2.1820580474934035, "grad_norm": 0.07940155937024614, "learning_rate": 4.19943090428802e-05, "loss": 0.2857, "step": 1654 }, { "epoch": 2.183377308707124, "grad_norm": 0.07892601782327527, "learning_rate": 4.186930094089357e-05, "loss": 0.2761, "step": 1655 }, { "epoch": 2.1846965699208445, "grad_norm": 0.07861701097655062, "learning_rate": 4.174442989361126e-05, "loss": 0.2878, "step": 1656 }, { "epoch": 2.1860158311345645, "grad_norm": 0.07787116358873082, "learning_rate": 4.161969619544165e-05, "loss": 0.2765, "step": 1657 }, { "epoch": 2.187335092348285, "grad_norm": 0.07990864512520574, "learning_rate": 4.149510014046922e-05, "loss": 0.282, "step": 1658 }, { "epoch": 2.188654353562005, "grad_norm": 0.07837087114372814, "learning_rate": 4.137064202245407e-05, "loss": 0.277, "step": 1659 }, { "epoch": 2.1899736147757256, "grad_norm": 0.07681755971718188, "learning_rate": 4.124632213483093e-05, "loss": 0.2812, "step": 1660 }, { "epoch": 2.191292875989446, "grad_norm": 0.07759139634132747, "learning_rate": 4.1122140770708773e-05, "loss": 0.2737, "step": 1661 }, { "epoch": 2.192612137203166, "grad_norm": 0.08049096276565267, "learning_rate": 4.099809822286984e-05, "loss": 0.2888, "step": 1662 }, { "epoch": 2.1939313984168867, "grad_norm": 0.07665798977479932, "learning_rate": 4.087419478376923e-05, "loss": 0.2793, "step": 1663 }, { "epoch": 2.1952506596306067, "grad_norm": 0.07691717343104092, "learning_rate": 4.075043074553389e-05, "loss": 0.2707, "step": 1664 }, { "epoch": 2.1965699208443272, "grad_norm": 0.07728130466566847, "learning_rate": 4.062680639996225e-05, "loss": 0.2782, "step": 1665 }, { "epoch": 2.1978891820580473, "grad_norm": 0.07928157611081746, "learning_rate": 4.050332203852336e-05, "loss": 0.2791, "step": 1666 }, { "epoch": 2.199208443271768, "grad_norm": 0.08007112966458786, "learning_rate": 4.0379977952356155e-05, "loss": 0.2839, "step": 1667 }, { "epoch": 2.2005277044854883, "grad_norm": 0.07687421741909392, "learning_rate": 4.025677443226894e-05, "loss": 0.2774, "step": 1668 }, { "epoch": 2.2018469656992083, "grad_norm": 0.07919682750631991, "learning_rate": 4.013371176873849e-05, "loss": 0.2761, "step": 1669 }, { "epoch": 2.203166226912929, "grad_norm": 0.07885174898590902, "learning_rate": 4.0010790251909624e-05, "loss": 0.2806, "step": 1670 }, { "epoch": 2.204485488126649, "grad_norm": 0.07930477436827331, "learning_rate": 3.988801017159425e-05, "loss": 0.2734, "step": 1671 }, { "epoch": 2.2058047493403694, "grad_norm": 0.07846823012103353, "learning_rate": 3.9765371817270925e-05, "loss": 0.2817, "step": 1672 }, { "epoch": 2.20712401055409, "grad_norm": 0.07738804690561635, "learning_rate": 3.964287547808394e-05, "loss": 0.2711, "step": 1673 }, { "epoch": 2.20844327176781, "grad_norm": 0.0803569937083681, "learning_rate": 3.952052144284285e-05, "loss": 0.2806, "step": 1674 }, { "epoch": 2.2097625329815305, "grad_norm": 0.07789285040467335, "learning_rate": 3.939831000002171e-05, "loss": 0.2718, "step": 1675 }, { "epoch": 2.2110817941952505, "grad_norm": 0.07916346155255566, "learning_rate": 3.927624143775826e-05, "loss": 0.2774, "step": 1676 }, { "epoch": 2.212401055408971, "grad_norm": 0.07782137255877045, "learning_rate": 3.915431604385355e-05, "loss": 0.2762, "step": 1677 }, { "epoch": 2.213720316622691, "grad_norm": 0.07925693652802089, "learning_rate": 3.903253410577088e-05, "loss": 0.2895, "step": 1678 }, { "epoch": 2.2150395778364116, "grad_norm": 0.07952356683080197, "learning_rate": 3.891089591063553e-05, "loss": 0.2814, "step": 1679 }, { "epoch": 2.216358839050132, "grad_norm": 0.07655300537908291, "learning_rate": 3.878940174523371e-05, "loss": 0.2809, "step": 1680 }, { "epoch": 2.217678100263852, "grad_norm": 0.07630071187328175, "learning_rate": 3.866805189601215e-05, "loss": 0.2678, "step": 1681 }, { "epoch": 2.2189973614775726, "grad_norm": 0.07718919357297997, "learning_rate": 3.8546846649077316e-05, "loss": 0.2806, "step": 1682 }, { "epoch": 2.2203166226912927, "grad_norm": 0.0768183421954644, "learning_rate": 3.8425786290194676e-05, "loss": 0.2821, "step": 1683 }, { "epoch": 2.221635883905013, "grad_norm": 0.07833965174961521, "learning_rate": 3.830487110478821e-05, "loss": 0.2868, "step": 1684 }, { "epoch": 2.2229551451187337, "grad_norm": 0.0781737951061863, "learning_rate": 3.8184101377939476e-05, "loss": 0.2699, "step": 1685 }, { "epoch": 2.2242744063324538, "grad_norm": 0.07962443683267113, "learning_rate": 3.806347739438724e-05, "loss": 0.2819, "step": 1686 }, { "epoch": 2.2255936675461743, "grad_norm": 0.08087724199255865, "learning_rate": 3.7942999438526504e-05, "loss": 0.2754, "step": 1687 }, { "epoch": 2.2269129287598943, "grad_norm": 0.08160684301067518, "learning_rate": 3.782266779440814e-05, "loss": 0.2776, "step": 1688 }, { "epoch": 2.228232189973615, "grad_norm": 0.07861487488574741, "learning_rate": 3.7702482745737874e-05, "loss": 0.2804, "step": 1689 }, { "epoch": 2.229551451187335, "grad_norm": 0.07949260267233423, "learning_rate": 3.7582444575875964e-05, "loss": 0.2801, "step": 1690 }, { "epoch": 2.2308707124010554, "grad_norm": 0.07964111308751806, "learning_rate": 3.746255356783632e-05, "loss": 0.2916, "step": 1691 }, { "epoch": 2.232189973614776, "grad_norm": 0.08191837074755494, "learning_rate": 3.7342810004285836e-05, "loss": 0.2716, "step": 1692 }, { "epoch": 2.233509234828496, "grad_norm": 0.081763967970594, "learning_rate": 3.722321416754386e-05, "loss": 0.2835, "step": 1693 }, { "epoch": 2.2348284960422165, "grad_norm": 0.07800003138265231, "learning_rate": 3.710376633958136e-05, "loss": 0.2723, "step": 1694 }, { "epoch": 2.2361477572559365, "grad_norm": 0.07966917189454707, "learning_rate": 3.6984466802020436e-05, "loss": 0.2857, "step": 1695 }, { "epoch": 2.237467018469657, "grad_norm": 0.07815673961252671, "learning_rate": 3.6865315836133465e-05, "loss": 0.2762, "step": 1696 }, { "epoch": 2.2387862796833775, "grad_norm": 0.07854562719496545, "learning_rate": 3.674631372284265e-05, "loss": 0.2723, "step": 1697 }, { "epoch": 2.2401055408970976, "grad_norm": 0.07882054293457676, "learning_rate": 3.66274607427191e-05, "loss": 0.2685, "step": 1698 }, { "epoch": 2.241424802110818, "grad_norm": 0.0821033710658035, "learning_rate": 3.650875717598245e-05, "loss": 0.2883, "step": 1699 }, { "epoch": 2.242744063324538, "grad_norm": 0.08177475324067357, "learning_rate": 3.6390203302500034e-05, "loss": 0.2834, "step": 1700 }, { "epoch": 2.2440633245382586, "grad_norm": 0.08223375887070909, "learning_rate": 3.627179940178615e-05, "loss": 0.286, "step": 1701 }, { "epoch": 2.2453825857519787, "grad_norm": 0.08144825600593171, "learning_rate": 3.615354575300166e-05, "loss": 0.2839, "step": 1702 }, { "epoch": 2.246701846965699, "grad_norm": 0.07930257518418132, "learning_rate": 3.603544263495303e-05, "loss": 0.2819, "step": 1703 }, { "epoch": 2.2480211081794197, "grad_norm": 0.07678624132630978, "learning_rate": 3.591749032609197e-05, "loss": 0.2714, "step": 1704 }, { "epoch": 2.2493403693931397, "grad_norm": 0.07710581293560288, "learning_rate": 3.5799689104514466e-05, "loss": 0.2808, "step": 1705 }, { "epoch": 2.2506596306068603, "grad_norm": 0.07666574431875667, "learning_rate": 3.568203924796043e-05, "loss": 0.2738, "step": 1706 }, { "epoch": 2.2519788918205803, "grad_norm": 0.07872622088585673, "learning_rate": 3.556454103381278e-05, "loss": 0.2919, "step": 1707 }, { "epoch": 2.253298153034301, "grad_norm": 0.08107082600155194, "learning_rate": 3.5447194739097e-05, "loss": 0.2931, "step": 1708 }, { "epoch": 2.2546174142480213, "grad_norm": 0.07894819148432543, "learning_rate": 3.53300006404804e-05, "loss": 0.2794, "step": 1709 }, { "epoch": 2.2559366754617414, "grad_norm": 0.07904192100699857, "learning_rate": 3.521295901427132e-05, "loss": 0.281, "step": 1710 }, { "epoch": 2.257255936675462, "grad_norm": 0.07938491134001077, "learning_rate": 3.50960701364188e-05, "loss": 0.2767, "step": 1711 }, { "epoch": 2.258575197889182, "grad_norm": 0.07930606491958049, "learning_rate": 3.49793342825116e-05, "loss": 0.2739, "step": 1712 }, { "epoch": 2.2598944591029024, "grad_norm": 0.07710706932271942, "learning_rate": 3.4862751727777797e-05, "loss": 0.283, "step": 1713 }, { "epoch": 2.261213720316623, "grad_norm": 0.07802033587695294, "learning_rate": 3.474632274708394e-05, "loss": 0.2842, "step": 1714 }, { "epoch": 2.262532981530343, "grad_norm": 0.07809053860891314, "learning_rate": 3.463004761493459e-05, "loss": 0.2707, "step": 1715 }, { "epoch": 2.2638522427440635, "grad_norm": 0.0802455300725423, "learning_rate": 3.45139266054715e-05, "loss": 0.2776, "step": 1716 }, { "epoch": 2.2651715039577835, "grad_norm": 0.08032254326707743, "learning_rate": 3.439795999247309e-05, "loss": 0.2802, "step": 1717 }, { "epoch": 2.266490765171504, "grad_norm": 0.07701398292936538, "learning_rate": 3.4282148049353824e-05, "loss": 0.2915, "step": 1718 }, { "epoch": 2.267810026385224, "grad_norm": 0.07640113131392486, "learning_rate": 3.416649104916333e-05, "loss": 0.2873, "step": 1719 }, { "epoch": 2.2691292875989446, "grad_norm": 0.08051339314871339, "learning_rate": 3.4050989264586096e-05, "loss": 0.2824, "step": 1720 }, { "epoch": 2.2704485488126647, "grad_norm": 0.07717677973374147, "learning_rate": 3.3935642967940554e-05, "loss": 0.2742, "step": 1721 }, { "epoch": 2.271767810026385, "grad_norm": 0.07627803568453873, "learning_rate": 3.3820452431178606e-05, "loss": 0.2793, "step": 1722 }, { "epoch": 2.2730870712401057, "grad_norm": 0.08047407418473598, "learning_rate": 3.3705417925884854e-05, "loss": 0.2857, "step": 1723 }, { "epoch": 2.2744063324538257, "grad_norm": 0.0776588125241044, "learning_rate": 3.3590539723276083e-05, "loss": 0.2819, "step": 1724 }, { "epoch": 2.2757255936675462, "grad_norm": 0.07795669482727091, "learning_rate": 3.3475818094200585e-05, "loss": 0.2844, "step": 1725 }, { "epoch": 2.2770448548812663, "grad_norm": 0.07946618151054231, "learning_rate": 3.336125330913737e-05, "loss": 0.2945, "step": 1726 }, { "epoch": 2.278364116094987, "grad_norm": 0.07969656476693081, "learning_rate": 3.3246845638195834e-05, "loss": 0.2802, "step": 1727 }, { "epoch": 2.2796833773087073, "grad_norm": 0.07716508974850514, "learning_rate": 3.313259535111478e-05, "loss": 0.2756, "step": 1728 }, { "epoch": 2.2810026385224274, "grad_norm": 0.07847376070554767, "learning_rate": 3.301850271726208e-05, "loss": 0.2829, "step": 1729 }, { "epoch": 2.282321899736148, "grad_norm": 0.08037063103519841, "learning_rate": 3.290456800563378e-05, "loss": 0.2765, "step": 1730 }, { "epoch": 2.283641160949868, "grad_norm": 0.0760704537789316, "learning_rate": 3.279079148485375e-05, "loss": 0.2705, "step": 1731 }, { "epoch": 2.2849604221635884, "grad_norm": 0.08217241615703291, "learning_rate": 3.267717342317271e-05, "loss": 0.2833, "step": 1732 }, { "epoch": 2.286279683377309, "grad_norm": 0.077702904871439, "learning_rate": 3.2563714088467936e-05, "loss": 0.2804, "step": 1733 }, { "epoch": 2.287598944591029, "grad_norm": 0.07706921867195793, "learning_rate": 3.2450413748242437e-05, "loss": 0.2775, "step": 1734 }, { "epoch": 2.2889182058047495, "grad_norm": 0.07608595178776131, "learning_rate": 3.233727266962425e-05, "loss": 0.2734, "step": 1735 }, { "epoch": 2.2902374670184695, "grad_norm": 0.08075195294721567, "learning_rate": 3.222429111936611e-05, "loss": 0.288, "step": 1736 }, { "epoch": 2.29155672823219, "grad_norm": 0.08056858824729886, "learning_rate": 3.211146936384445e-05, "loss": 0.2851, "step": 1737 }, { "epoch": 2.2928759894459105, "grad_norm": 0.08002648973896172, "learning_rate": 3.1998807669059096e-05, "loss": 0.283, "step": 1738 }, { "epoch": 2.2941952506596306, "grad_norm": 0.08037232234434347, "learning_rate": 3.1886306300632386e-05, "loss": 0.285, "step": 1739 }, { "epoch": 2.295514511873351, "grad_norm": 0.08243228519529584, "learning_rate": 3.1773965523808754e-05, "loss": 0.2868, "step": 1740 }, { "epoch": 2.296833773087071, "grad_norm": 0.08011523923042128, "learning_rate": 3.166178560345392e-05, "loss": 0.2838, "step": 1741 }, { "epoch": 2.2981530343007917, "grad_norm": 0.07859520831875624, "learning_rate": 3.1549766804054415e-05, "loss": 0.2802, "step": 1742 }, { "epoch": 2.2994722955145117, "grad_norm": 0.07874646858870786, "learning_rate": 3.1437909389716915e-05, "loss": 0.2814, "step": 1743 }, { "epoch": 2.300791556728232, "grad_norm": 0.0819646397416861, "learning_rate": 3.132621362416749e-05, "loss": 0.2778, "step": 1744 }, { "epoch": 2.3021108179419523, "grad_norm": 0.07954595118263666, "learning_rate": 3.1214679770751234e-05, "loss": 0.2835, "step": 1745 }, { "epoch": 2.3034300791556728, "grad_norm": 0.08266720385707631, "learning_rate": 3.110330809243134e-05, "loss": 0.2809, "step": 1746 }, { "epoch": 2.3047493403693933, "grad_norm": 0.07992637324488817, "learning_rate": 3.099209885178882e-05, "loss": 0.2765, "step": 1747 }, { "epoch": 2.3060686015831133, "grad_norm": 0.08126498283298668, "learning_rate": 3.088105231102153e-05, "loss": 0.2809, "step": 1748 }, { "epoch": 2.307387862796834, "grad_norm": 0.08085368966650008, "learning_rate": 3.0770168731943895e-05, "loss": 0.2721, "step": 1749 }, { "epoch": 2.308707124010554, "grad_norm": 0.07952936192285692, "learning_rate": 3.065944837598596e-05, "loss": 0.2773, "step": 1750 }, { "epoch": 2.3100263852242744, "grad_norm": 0.07827846750309747, "learning_rate": 3.054889150419308e-05, "loss": 0.2772, "step": 1751 }, { "epoch": 2.311345646437995, "grad_norm": 0.07580120832583083, "learning_rate": 3.043849837722511e-05, "loss": 0.2743, "step": 1752 }, { "epoch": 2.312664907651715, "grad_norm": 0.08090141396793714, "learning_rate": 3.032826925535579e-05, "loss": 0.2773, "step": 1753 }, { "epoch": 2.3139841688654355, "grad_norm": 0.07774148476684722, "learning_rate": 3.0218204398472304e-05, "loss": 0.2692, "step": 1754 }, { "epoch": 2.3153034300791555, "grad_norm": 0.07952130245210719, "learning_rate": 3.010830406607441e-05, "loss": 0.2774, "step": 1755 }, { "epoch": 2.316622691292876, "grad_norm": 0.079860002200501, "learning_rate": 2.9998568517274107e-05, "loss": 0.2856, "step": 1756 }, { "epoch": 2.3179419525065965, "grad_norm": 0.08019626895474018, "learning_rate": 2.9888998010794743e-05, "loss": 0.2844, "step": 1757 }, { "epoch": 2.3192612137203166, "grad_norm": 0.0793960572536843, "learning_rate": 2.977959280497068e-05, "loss": 0.2807, "step": 1758 }, { "epoch": 2.320580474934037, "grad_norm": 0.07824768047569547, "learning_rate": 2.9670353157746423e-05, "loss": 0.2732, "step": 1759 }, { "epoch": 2.321899736147757, "grad_norm": 0.08147659131404078, "learning_rate": 2.956127932667625e-05, "loss": 0.2848, "step": 1760 }, { "epoch": 2.3232189973614776, "grad_norm": 0.07849693551938491, "learning_rate": 2.9452371568923455e-05, "loss": 0.2837, "step": 1761 }, { "epoch": 2.324538258575198, "grad_norm": 0.0778983244812835, "learning_rate": 2.9343630141259736e-05, "loss": 0.2791, "step": 1762 }, { "epoch": 2.325857519788918, "grad_norm": 0.07792136812206918, "learning_rate": 2.923505530006472e-05, "loss": 0.2737, "step": 1763 }, { "epoch": 2.3271767810026387, "grad_norm": 0.08040842011177347, "learning_rate": 2.9126647301325173e-05, "loss": 0.2779, "step": 1764 }, { "epoch": 2.3284960422163588, "grad_norm": 0.07942190414363236, "learning_rate": 2.90184064006346e-05, "loss": 0.2839, "step": 1765 }, { "epoch": 2.3298153034300793, "grad_norm": 0.08028971257363772, "learning_rate": 2.8910332853192446e-05, "loss": 0.2846, "step": 1766 }, { "epoch": 2.3311345646437993, "grad_norm": 0.07789396036285491, "learning_rate": 2.8802426913803638e-05, "loss": 0.2655, "step": 1767 }, { "epoch": 2.33245382585752, "grad_norm": 0.08040922316012308, "learning_rate": 2.869468883687798e-05, "loss": 0.2957, "step": 1768 }, { "epoch": 2.33377308707124, "grad_norm": 0.07916066349880715, "learning_rate": 2.8587118876429377e-05, "loss": 0.2871, "step": 1769 }, { "epoch": 2.3350923482849604, "grad_norm": 0.0785899308219063, "learning_rate": 2.8479717286075502e-05, "loss": 0.2758, "step": 1770 }, { "epoch": 2.336411609498681, "grad_norm": 0.07657321773713853, "learning_rate": 2.837248431903695e-05, "loss": 0.2817, "step": 1771 }, { "epoch": 2.337730870712401, "grad_norm": 0.07689552301081462, "learning_rate": 2.8265420228136852e-05, "loss": 0.2723, "step": 1772 }, { "epoch": 2.3390501319261214, "grad_norm": 0.07903091626854208, "learning_rate": 2.8158525265800094e-05, "loss": 0.283, "step": 1773 }, { "epoch": 2.3403693931398415, "grad_norm": 0.07763450643041706, "learning_rate": 2.8051799684052883e-05, "loss": 0.2774, "step": 1774 }, { "epoch": 2.341688654353562, "grad_norm": 0.07686125542269036, "learning_rate": 2.7945243734521997e-05, "loss": 0.2809, "step": 1775 }, { "epoch": 2.3430079155672825, "grad_norm": 0.08010973079293744, "learning_rate": 2.7838857668434327e-05, "loss": 0.2813, "step": 1776 }, { "epoch": 2.3443271767810026, "grad_norm": 0.07781534655903478, "learning_rate": 2.773264173661627e-05, "loss": 0.2703, "step": 1777 }, { "epoch": 2.345646437994723, "grad_norm": 0.07914044546936705, "learning_rate": 2.7626596189492983e-05, "loss": 0.2762, "step": 1778 }, { "epoch": 2.346965699208443, "grad_norm": 0.0794544619916553, "learning_rate": 2.7520721277088024e-05, "loss": 0.2817, "step": 1779 }, { "epoch": 2.3482849604221636, "grad_norm": 0.07882416427560729, "learning_rate": 2.7415017249022524e-05, "loss": 0.278, "step": 1780 }, { "epoch": 2.349604221635884, "grad_norm": 0.07884346722661617, "learning_rate": 2.730948435451487e-05, "loss": 0.2757, "step": 1781 }, { "epoch": 2.350923482849604, "grad_norm": 0.0774446534099522, "learning_rate": 2.72041228423798e-05, "loss": 0.2782, "step": 1782 }, { "epoch": 2.3522427440633247, "grad_norm": 0.07842003150393896, "learning_rate": 2.7098932961028156e-05, "loss": 0.2755, "step": 1783 }, { "epoch": 2.3535620052770447, "grad_norm": 0.07850952901336253, "learning_rate": 2.699391495846596e-05, "loss": 0.2805, "step": 1784 }, { "epoch": 2.3548812664907652, "grad_norm": 0.07662985129103472, "learning_rate": 2.6889069082294114e-05, "loss": 0.2706, "step": 1785 }, { "epoch": 2.3562005277044857, "grad_norm": 0.07715023375723411, "learning_rate": 2.67843955797077e-05, "loss": 0.2773, "step": 1786 }, { "epoch": 2.357519788918206, "grad_norm": 0.08025784603140318, "learning_rate": 2.6679894697495266e-05, "loss": 0.2736, "step": 1787 }, { "epoch": 2.3588390501319263, "grad_norm": 0.07898743625434314, "learning_rate": 2.6575566682038556e-05, "loss": 0.2813, "step": 1788 }, { "epoch": 2.3601583113456464, "grad_norm": 0.07994706975599875, "learning_rate": 2.647141177931156e-05, "loss": 0.2741, "step": 1789 }, { "epoch": 2.361477572559367, "grad_norm": 0.0803224883006672, "learning_rate": 2.6367430234880284e-05, "loss": 0.2755, "step": 1790 }, { "epoch": 2.362796833773087, "grad_norm": 0.0812118300329486, "learning_rate": 2.626362229390189e-05, "loss": 0.2766, "step": 1791 }, { "epoch": 2.3641160949868074, "grad_norm": 0.08263247187923964, "learning_rate": 2.6159988201124318e-05, "loss": 0.2883, "step": 1792 }, { "epoch": 2.3654353562005275, "grad_norm": 0.08266781090895481, "learning_rate": 2.6056528200885543e-05, "loss": 0.2822, "step": 1793 }, { "epoch": 2.366754617414248, "grad_norm": 0.07805162231761391, "learning_rate": 2.5953242537113142e-05, "loss": 0.2725, "step": 1794 }, { "epoch": 2.3680738786279685, "grad_norm": 0.07782847976566709, "learning_rate": 2.5850131453323688e-05, "loss": 0.2736, "step": 1795 }, { "epoch": 2.3693931398416885, "grad_norm": 0.0818683092419124, "learning_rate": 2.5747195192622054e-05, "loss": 0.2823, "step": 1796 }, { "epoch": 2.370712401055409, "grad_norm": 0.08148414307136499, "learning_rate": 2.564443399770101e-05, "loss": 0.2883, "step": 1797 }, { "epoch": 2.372031662269129, "grad_norm": 0.08003421754612373, "learning_rate": 2.5541848110840517e-05, "loss": 0.275, "step": 1798 }, { "epoch": 2.3733509234828496, "grad_norm": 0.07993791847680308, "learning_rate": 2.5439437773907292e-05, "loss": 0.2704, "step": 1799 }, { "epoch": 2.37467018469657, "grad_norm": 0.08162598028892494, "learning_rate": 2.5337203228354035e-05, "loss": 0.2784, "step": 1800 }, { "epoch": 2.37598944591029, "grad_norm": 0.08194665988093293, "learning_rate": 2.523514471521913e-05, "loss": 0.2792, "step": 1801 }, { "epoch": 2.3773087071240107, "grad_norm": 0.07953969665407619, "learning_rate": 2.5133262475125786e-05, "loss": 0.2723, "step": 1802 }, { "epoch": 2.3786279683377307, "grad_norm": 0.08077165098145983, "learning_rate": 2.5031556748281715e-05, "loss": 0.285, "step": 1803 }, { "epoch": 2.379947229551451, "grad_norm": 0.07980941066092634, "learning_rate": 2.493002777447846e-05, "loss": 0.2858, "step": 1804 }, { "epoch": 2.3812664907651717, "grad_norm": 0.08106188676902543, "learning_rate": 2.4828675793090748e-05, "loss": 0.2853, "step": 1805 }, { "epoch": 2.3825857519788918, "grad_norm": 0.08199295310850611, "learning_rate": 2.4727501043076128e-05, "loss": 0.2764, "step": 1806 }, { "epoch": 2.3839050131926123, "grad_norm": 0.07760195514483341, "learning_rate": 2.4626503762974164e-05, "loss": 0.267, "step": 1807 }, { "epoch": 2.3852242744063323, "grad_norm": 0.08036373454324149, "learning_rate": 2.452568419090613e-05, "loss": 0.2796, "step": 1808 }, { "epoch": 2.386543535620053, "grad_norm": 0.08101058464107791, "learning_rate": 2.4425042564574184e-05, "loss": 0.2766, "step": 1809 }, { "epoch": 2.387862796833773, "grad_norm": 0.08182235058651041, "learning_rate": 2.4324579121261047e-05, "loss": 0.2989, "step": 1810 }, { "epoch": 2.3891820580474934, "grad_norm": 0.0778442398349721, "learning_rate": 2.4224294097829335e-05, "loss": 0.2702, "step": 1811 }, { "epoch": 2.390501319261214, "grad_norm": 0.0798370536212584, "learning_rate": 2.4124187730720917e-05, "loss": 0.2861, "step": 1812 }, { "epoch": 2.391820580474934, "grad_norm": 0.07859149226638332, "learning_rate": 2.402426025595653e-05, "loss": 0.2782, "step": 1813 }, { "epoch": 2.3931398416886545, "grad_norm": 0.07951258959409493, "learning_rate": 2.3924511909135073e-05, "loss": 0.2797, "step": 1814 }, { "epoch": 2.3944591029023745, "grad_norm": 0.08054713066662195, "learning_rate": 2.382494292543319e-05, "loss": 0.2783, "step": 1815 }, { "epoch": 2.395778364116095, "grad_norm": 0.0802258203733129, "learning_rate": 2.372555353960455e-05, "loss": 0.2851, "step": 1816 }, { "epoch": 2.397097625329815, "grad_norm": 0.07924528248748484, "learning_rate": 2.3626343985979482e-05, "loss": 0.2826, "step": 1817 }, { "epoch": 2.3984168865435356, "grad_norm": 0.08143303641349425, "learning_rate": 2.3527314498464215e-05, "loss": 0.2916, "step": 1818 }, { "epoch": 2.399736147757256, "grad_norm": 0.07852422225190488, "learning_rate": 2.3428465310540526e-05, "loss": 0.2788, "step": 1819 }, { "epoch": 2.401055408970976, "grad_norm": 0.07778002342886366, "learning_rate": 2.3329796655265102e-05, "loss": 0.2724, "step": 1820 }, { "epoch": 2.4023746701846966, "grad_norm": 0.07896552728143713, "learning_rate": 2.3231308765268888e-05, "loss": 0.2821, "step": 1821 }, { "epoch": 2.4036939313984167, "grad_norm": 0.07879825185891388, "learning_rate": 2.3133001872756775e-05, "loss": 0.284, "step": 1822 }, { "epoch": 2.405013192612137, "grad_norm": 0.07911244278915457, "learning_rate": 2.3034876209506772e-05, "loss": 0.281, "step": 1823 }, { "epoch": 2.4063324538258577, "grad_norm": 0.07992814369759461, "learning_rate": 2.293693200686976e-05, "loss": 0.2939, "step": 1824 }, { "epoch": 2.4076517150395778, "grad_norm": 0.07875802630734408, "learning_rate": 2.2839169495768643e-05, "loss": 0.2786, "step": 1825 }, { "epoch": 2.4089709762532983, "grad_norm": 0.0777237632052084, "learning_rate": 2.2741588906698073e-05, "loss": 0.2802, "step": 1826 }, { "epoch": 2.4102902374670183, "grad_norm": 0.08008668091020435, "learning_rate": 2.264419046972368e-05, "loss": 0.2813, "step": 1827 }, { "epoch": 2.411609498680739, "grad_norm": 0.08073590985220855, "learning_rate": 2.2546974414481693e-05, "loss": 0.2803, "step": 1828 }, { "epoch": 2.4129287598944593, "grad_norm": 0.07833643182283842, "learning_rate": 2.2449940970178384e-05, "loss": 0.259, "step": 1829 }, { "epoch": 2.4142480211081794, "grad_norm": 0.07704795554029126, "learning_rate": 2.2353090365589348e-05, "loss": 0.2707, "step": 1830 }, { "epoch": 2.4155672823219, "grad_norm": 0.07654832906617073, "learning_rate": 2.2256422829059253e-05, "loss": 0.271, "step": 1831 }, { "epoch": 2.41688654353562, "grad_norm": 0.0794956461830079, "learning_rate": 2.2159938588501028e-05, "loss": 0.2789, "step": 1832 }, { "epoch": 2.4182058047493404, "grad_norm": 0.07652904207603532, "learning_rate": 2.2063637871395527e-05, "loss": 0.2677, "step": 1833 }, { "epoch": 2.4195250659630605, "grad_norm": 0.07899480745205946, "learning_rate": 2.1967520904790827e-05, "loss": 0.2849, "step": 1834 }, { "epoch": 2.420844327176781, "grad_norm": 0.08191255921714842, "learning_rate": 2.1871587915301896e-05, "loss": 0.2832, "step": 1835 }, { "epoch": 2.4221635883905015, "grad_norm": 0.08115178045317992, "learning_rate": 2.177583912910979e-05, "loss": 0.2719, "step": 1836 }, { "epoch": 2.4234828496042216, "grad_norm": 0.0799761152600673, "learning_rate": 2.1680274771961395e-05, "loss": 0.2818, "step": 1837 }, { "epoch": 2.424802110817942, "grad_norm": 0.0807810093473863, "learning_rate": 2.158489506916874e-05, "loss": 0.2757, "step": 1838 }, { "epoch": 2.426121372031662, "grad_norm": 0.0794371868430207, "learning_rate": 2.1489700245608437e-05, "loss": 0.2764, "step": 1839 }, { "epoch": 2.4274406332453826, "grad_norm": 0.08001901068275855, "learning_rate": 2.139469052572127e-05, "loss": 0.2721, "step": 1840 }, { "epoch": 2.4287598944591027, "grad_norm": 0.07807437339842085, "learning_rate": 2.129986613351156e-05, "loss": 0.2637, "step": 1841 }, { "epoch": 2.430079155672823, "grad_norm": 0.07787884387099665, "learning_rate": 2.1205227292546747e-05, "loss": 0.2692, "step": 1842 }, { "epoch": 2.4313984168865437, "grad_norm": 0.08051199876443327, "learning_rate": 2.1110774225956698e-05, "loss": 0.2823, "step": 1843 }, { "epoch": 2.4327176781002637, "grad_norm": 0.08288175244991892, "learning_rate": 2.1016507156433386e-05, "loss": 0.2837, "step": 1844 }, { "epoch": 2.4340369393139842, "grad_norm": 0.08158404747653787, "learning_rate": 2.092242630623016e-05, "loss": 0.2822, "step": 1845 }, { "epoch": 2.4353562005277043, "grad_norm": 0.0802516842687393, "learning_rate": 2.0828531897161384e-05, "loss": 0.2806, "step": 1846 }, { "epoch": 2.436675461741425, "grad_norm": 0.07882937619497501, "learning_rate": 2.0734824150601884e-05, "loss": 0.2792, "step": 1847 }, { "epoch": 2.4379947229551453, "grad_norm": 0.07961636529018683, "learning_rate": 2.064130328748626e-05, "loss": 0.274, "step": 1848 }, { "epoch": 2.4393139841688654, "grad_norm": 0.07954056420073018, "learning_rate": 2.054796952830865e-05, "loss": 0.2736, "step": 1849 }, { "epoch": 2.440633245382586, "grad_norm": 0.08164050592162984, "learning_rate": 2.0454823093121924e-05, "loss": 0.2851, "step": 1850 }, { "epoch": 2.441952506596306, "grad_norm": 0.0799514629491529, "learning_rate": 2.036186420153743e-05, "loss": 0.2885, "step": 1851 }, { "epoch": 2.4432717678100264, "grad_norm": 0.08049932837280491, "learning_rate": 2.02690930727242e-05, "loss": 0.2752, "step": 1852 }, { "epoch": 2.444591029023747, "grad_norm": 0.0785403414697508, "learning_rate": 2.0176509925408683e-05, "loss": 0.2739, "step": 1853 }, { "epoch": 2.445910290237467, "grad_norm": 0.08075232867039733, "learning_rate": 2.0084114977874135e-05, "loss": 0.2882, "step": 1854 }, { "epoch": 2.4472295514511875, "grad_norm": 0.08054037329497876, "learning_rate": 1.9991908447959984e-05, "loss": 0.2742, "step": 1855 }, { "epoch": 2.4485488126649075, "grad_norm": 0.07992174213827835, "learning_rate": 1.9899890553061562e-05, "loss": 0.2699, "step": 1856 }, { "epoch": 2.449868073878628, "grad_norm": 0.07879244744705563, "learning_rate": 1.9808061510129317e-05, "loss": 0.2817, "step": 1857 }, { "epoch": 2.451187335092348, "grad_norm": 0.08081404035115847, "learning_rate": 1.9716421535668583e-05, "loss": 0.2873, "step": 1858 }, { "epoch": 2.4525065963060686, "grad_norm": 0.08087772638256892, "learning_rate": 1.96249708457388e-05, "loss": 0.2755, "step": 1859 }, { "epoch": 2.453825857519789, "grad_norm": 0.08201856281542048, "learning_rate": 1.9533709655953235e-05, "loss": 0.2954, "step": 1860 }, { "epoch": 2.455145118733509, "grad_norm": 0.0795541840007406, "learning_rate": 1.944263818147828e-05, "loss": 0.2735, "step": 1861 }, { "epoch": 2.4564643799472297, "grad_norm": 0.07805811501309914, "learning_rate": 1.9351756637033093e-05, "loss": 0.2706, "step": 1862 }, { "epoch": 2.4577836411609497, "grad_norm": 0.07929308560882503, "learning_rate": 1.9261065236889066e-05, "loss": 0.2778, "step": 1863 }, { "epoch": 2.45910290237467, "grad_norm": 0.08142008340003384, "learning_rate": 1.917056419486918e-05, "loss": 0.282, "step": 1864 }, { "epoch": 2.4604221635883903, "grad_norm": 0.07761326203283928, "learning_rate": 1.908025372434773e-05, "loss": 0.2743, "step": 1865 }, { "epoch": 2.461741424802111, "grad_norm": 0.08017027877287014, "learning_rate": 1.8990134038249585e-05, "loss": 0.2828, "step": 1866 }, { "epoch": 2.4630606860158313, "grad_norm": 0.08311423261328345, "learning_rate": 1.8900205349049904e-05, "loss": 0.2722, "step": 1867 }, { "epoch": 2.4643799472295513, "grad_norm": 0.08227938903926371, "learning_rate": 1.8810467868773453e-05, "loss": 0.281, "step": 1868 }, { "epoch": 2.465699208443272, "grad_norm": 0.07986930058259482, "learning_rate": 1.8720921808994263e-05, "loss": 0.2719, "step": 1869 }, { "epoch": 2.467018469656992, "grad_norm": 0.07790688702293064, "learning_rate": 1.8631567380834957e-05, "loss": 0.2618, "step": 1870 }, { "epoch": 2.4683377308707124, "grad_norm": 0.08258794664189158, "learning_rate": 1.854240479496643e-05, "loss": 0.2808, "step": 1871 }, { "epoch": 2.469656992084433, "grad_norm": 0.0814276492094258, "learning_rate": 1.8453434261607273e-05, "loss": 0.2788, "step": 1872 }, { "epoch": 2.470976253298153, "grad_norm": 0.0811238092973631, "learning_rate": 1.8364655990523182e-05, "loss": 0.2774, "step": 1873 }, { "epoch": 2.4722955145118735, "grad_norm": 0.08289891614552794, "learning_rate": 1.8276070191026672e-05, "loss": 0.2852, "step": 1874 }, { "epoch": 2.4736147757255935, "grad_norm": 0.0859191258275514, "learning_rate": 1.818767707197636e-05, "loss": 0.2851, "step": 1875 }, { "epoch": 2.474934036939314, "grad_norm": 0.08269769855749842, "learning_rate": 1.8099476841776697e-05, "loss": 0.2793, "step": 1876 }, { "epoch": 2.4762532981530345, "grad_norm": 0.08093390896013891, "learning_rate": 1.801146970837725e-05, "loss": 0.2803, "step": 1877 }, { "epoch": 2.4775725593667546, "grad_norm": 0.078253811613984, "learning_rate": 1.7923655879272393e-05, "loss": 0.2816, "step": 1878 }, { "epoch": 2.478891820580475, "grad_norm": 0.07932439305379811, "learning_rate": 1.7836035561500698e-05, "loss": 0.2773, "step": 1879 }, { "epoch": 2.480211081794195, "grad_norm": 0.08138672795880651, "learning_rate": 1.774860896164454e-05, "loss": 0.2816, "step": 1880 }, { "epoch": 2.4815303430079156, "grad_norm": 0.08048305472762693, "learning_rate": 1.7661376285829568e-05, "loss": 0.2838, "step": 1881 }, { "epoch": 2.4828496042216357, "grad_norm": 0.08092602303500653, "learning_rate": 1.7574337739724132e-05, "loss": 0.283, "step": 1882 }, { "epoch": 2.484168865435356, "grad_norm": 0.08070319227679996, "learning_rate": 1.7487493528539024e-05, "loss": 0.2755, "step": 1883 }, { "epoch": 2.4854881266490767, "grad_norm": 0.0804943944840069, "learning_rate": 1.7400843857026705e-05, "loss": 0.2791, "step": 1884 }, { "epoch": 2.4868073878627968, "grad_norm": 0.08031582209021997, "learning_rate": 1.7314388929481083e-05, "loss": 0.2816, "step": 1885 }, { "epoch": 2.4881266490765173, "grad_norm": 0.08093769404386134, "learning_rate": 1.7228128949736843e-05, "loss": 0.2654, "step": 1886 }, { "epoch": 2.4894459102902373, "grad_norm": 0.07735144247533363, "learning_rate": 1.714206412116911e-05, "loss": 0.2682, "step": 1887 }, { "epoch": 2.490765171503958, "grad_norm": 0.08025945317431532, "learning_rate": 1.7056194646692814e-05, "loss": 0.2867, "step": 1888 }, { "epoch": 2.492084432717678, "grad_norm": 0.07961041014453235, "learning_rate": 1.6970520728762375e-05, "loss": 0.2698, "step": 1889 }, { "epoch": 2.4934036939313984, "grad_norm": 0.07963601869539944, "learning_rate": 1.6885042569371146e-05, "loss": 0.2839, "step": 1890 }, { "epoch": 2.494722955145119, "grad_norm": 0.08000372658418474, "learning_rate": 1.6799760370050875e-05, "loss": 0.2803, "step": 1891 }, { "epoch": 2.496042216358839, "grad_norm": 0.07907928307895791, "learning_rate": 1.671467433187135e-05, "loss": 0.2753, "step": 1892 }, { "epoch": 2.4973614775725594, "grad_norm": 0.08134297880870882, "learning_rate": 1.6629784655439872e-05, "loss": 0.2847, "step": 1893 }, { "epoch": 2.4986807387862795, "grad_norm": 0.07979423124205566, "learning_rate": 1.654509154090078e-05, "loss": 0.2722, "step": 1894 }, { "epoch": 2.5, "grad_norm": 0.07852606929622762, "learning_rate": 1.6460595187934923e-05, "loss": 0.2744, "step": 1895 }, { "epoch": 2.5013192612137205, "grad_norm": 0.0780059155139534, "learning_rate": 1.6376295795759333e-05, "loss": 0.2706, "step": 1896 }, { "epoch": 2.5026385224274406, "grad_norm": 0.08062598567709782, "learning_rate": 1.629219356312657e-05, "loss": 0.2719, "step": 1897 }, { "epoch": 2.503957783641161, "grad_norm": 0.08207839065038951, "learning_rate": 1.6208288688324458e-05, "loss": 0.289, "step": 1898 }, { "epoch": 2.505277044854881, "grad_norm": 0.08006624954361863, "learning_rate": 1.6124581369175396e-05, "loss": 0.267, "step": 1899 }, { "epoch": 2.5065963060686016, "grad_norm": 0.07830125369339291, "learning_rate": 1.60410718030361e-05, "loss": 0.2702, "step": 1900 }, { "epoch": 2.507915567282322, "grad_norm": 0.0779653169816885, "learning_rate": 1.5957760186797032e-05, "loss": 0.2742, "step": 1901 }, { "epoch": 2.509234828496042, "grad_norm": 0.08010346028556846, "learning_rate": 1.587464671688187e-05, "loss": 0.2823, "step": 1902 }, { "epoch": 2.5105540897097627, "grad_norm": 0.07795937691897807, "learning_rate": 1.579173158924724e-05, "loss": 0.2705, "step": 1903 }, { "epoch": 2.5118733509234827, "grad_norm": 0.07860344514974744, "learning_rate": 1.570901499938201e-05, "loss": 0.2825, "step": 1904 }, { "epoch": 2.5131926121372032, "grad_norm": 0.08201253539363894, "learning_rate": 1.5626497142307084e-05, "loss": 0.2849, "step": 1905 }, { "epoch": 2.5145118733509237, "grad_norm": 0.07998712641302033, "learning_rate": 1.5544178212574688e-05, "loss": 0.2847, "step": 1906 }, { "epoch": 2.515831134564644, "grad_norm": 0.08239384676567711, "learning_rate": 1.5462058404268155e-05, "loss": 0.2921, "step": 1907 }, { "epoch": 2.517150395778364, "grad_norm": 0.0788493703552662, "learning_rate": 1.5380137911001248e-05, "loss": 0.2673, "step": 1908 }, { "epoch": 2.5184696569920844, "grad_norm": 0.07939950719965717, "learning_rate": 1.529841692591787e-05, "loss": 0.269, "step": 1909 }, { "epoch": 2.519788918205805, "grad_norm": 0.07950027731256681, "learning_rate": 1.5216895641691542e-05, "loss": 0.2825, "step": 1910 }, { "epoch": 2.521108179419525, "grad_norm": 0.0803140645145982, "learning_rate": 1.5135574250524897e-05, "loss": 0.2705, "step": 1911 }, { "epoch": 2.5224274406332454, "grad_norm": 0.07936687251151212, "learning_rate": 1.5054452944149356e-05, "loss": 0.2787, "step": 1912 }, { "epoch": 2.5237467018469655, "grad_norm": 0.07920178482179316, "learning_rate": 1.4973531913824524e-05, "loss": 0.2735, "step": 1913 }, { "epoch": 2.525065963060686, "grad_norm": 0.07965929179568546, "learning_rate": 1.4892811350337876e-05, "loss": 0.2861, "step": 1914 }, { "epoch": 2.5263852242744065, "grad_norm": 0.08175631124189119, "learning_rate": 1.48122914440042e-05, "loss": 0.2701, "step": 1915 }, { "epoch": 2.5277044854881265, "grad_norm": 0.08238259118298483, "learning_rate": 1.473197238466525e-05, "loss": 0.2775, "step": 1916 }, { "epoch": 2.529023746701847, "grad_norm": 0.08074195321784808, "learning_rate": 1.4651854361689178e-05, "loss": 0.2738, "step": 1917 }, { "epoch": 2.530343007915567, "grad_norm": 0.0780520994219727, "learning_rate": 1.4571937563970184e-05, "loss": 0.275, "step": 1918 }, { "epoch": 2.5316622691292876, "grad_norm": 0.07939994235997708, "learning_rate": 1.4492222179928084e-05, "loss": 0.2697, "step": 1919 }, { "epoch": 2.532981530343008, "grad_norm": 0.08163563108863972, "learning_rate": 1.4412708397507724e-05, "loss": 0.2841, "step": 1920 }, { "epoch": 2.534300791556728, "grad_norm": 0.08173934028404446, "learning_rate": 1.4333396404178733e-05, "loss": 0.2835, "step": 1921 }, { "epoch": 2.5356200527704487, "grad_norm": 0.07990359920850079, "learning_rate": 1.425428638693489e-05, "loss": 0.2788, "step": 1922 }, { "epoch": 2.5369393139841687, "grad_norm": 0.07927019512048582, "learning_rate": 1.417537853229387e-05, "loss": 0.2812, "step": 1923 }, { "epoch": 2.538258575197889, "grad_norm": 0.07916892389078398, "learning_rate": 1.4096673026296613e-05, "loss": 0.2728, "step": 1924 }, { "epoch": 2.5395778364116097, "grad_norm": 0.0787157870663067, "learning_rate": 1.4018170054507085e-05, "loss": 0.289, "step": 1925 }, { "epoch": 2.54089709762533, "grad_norm": 0.08204206651207006, "learning_rate": 1.3939869802011618e-05, "loss": 0.2864, "step": 1926 }, { "epoch": 2.5422163588390503, "grad_norm": 0.07759408747849203, "learning_rate": 1.3861772453418687e-05, "loss": 0.2708, "step": 1927 }, { "epoch": 2.5435356200527703, "grad_norm": 0.08075042237810039, "learning_rate": 1.3783878192858379e-05, "loss": 0.2754, "step": 1928 }, { "epoch": 2.544854881266491, "grad_norm": 0.0807863383135216, "learning_rate": 1.370618720398189e-05, "loss": 0.2818, "step": 1929 }, { "epoch": 2.5461741424802113, "grad_norm": 0.08096752373949145, "learning_rate": 1.3628699669961243e-05, "loss": 0.2727, "step": 1930 }, { "epoch": 2.5474934036939314, "grad_norm": 0.07841142429640731, "learning_rate": 1.3551415773488685e-05, "loss": 0.2721, "step": 1931 }, { "epoch": 2.5488126649076515, "grad_norm": 0.08246512503555026, "learning_rate": 1.3474335696776453e-05, "loss": 0.2802, "step": 1932 }, { "epoch": 2.550131926121372, "grad_norm": 0.08045898626097381, "learning_rate": 1.339745962155613e-05, "loss": 0.2799, "step": 1933 }, { "epoch": 2.5514511873350925, "grad_norm": 0.08049150928341942, "learning_rate": 1.3320787729078443e-05, "loss": 0.2755, "step": 1934 }, { "epoch": 2.5527704485488125, "grad_norm": 0.08109442150582193, "learning_rate": 1.3244320200112592e-05, "loss": 0.2853, "step": 1935 }, { "epoch": 2.554089709762533, "grad_norm": 0.08077265330634763, "learning_rate": 1.316805721494604e-05, "loss": 0.283, "step": 1936 }, { "epoch": 2.555408970976253, "grad_norm": 0.08130372098785313, "learning_rate": 1.3091998953383989e-05, "loss": 0.2822, "step": 1937 }, { "epoch": 2.5567282321899736, "grad_norm": 0.07872382922260729, "learning_rate": 1.3016145594748907e-05, "loss": 0.2769, "step": 1938 }, { "epoch": 2.558047493403694, "grad_norm": 0.07984636225540091, "learning_rate": 1.294049731788023e-05, "loss": 0.2827, "step": 1939 }, { "epoch": 2.559366754617414, "grad_norm": 0.08075398089355437, "learning_rate": 1.2865054301133805e-05, "loss": 0.276, "step": 1940 }, { "epoch": 2.5606860158311346, "grad_norm": 0.07866315568884574, "learning_rate": 1.278981672238161e-05, "loss": 0.2765, "step": 1941 }, { "epoch": 2.5620052770448547, "grad_norm": 0.07865568599494499, "learning_rate": 1.27147847590112e-05, "loss": 0.2728, "step": 1942 }, { "epoch": 2.563324538258575, "grad_norm": 0.07999446081222221, "learning_rate": 1.2639958587925382e-05, "loss": 0.2772, "step": 1943 }, { "epoch": 2.5646437994722957, "grad_norm": 0.0822683976218527, "learning_rate": 1.2565338385541792e-05, "loss": 0.2783, "step": 1944 }, { "epoch": 2.5659630606860158, "grad_norm": 0.07756068684205553, "learning_rate": 1.2490924327792376e-05, "loss": 0.2748, "step": 1945 }, { "epoch": 2.5672823218997363, "grad_norm": 0.08043560482244433, "learning_rate": 1.2416716590123145e-05, "loss": 0.2752, "step": 1946 }, { "epoch": 2.5686015831134563, "grad_norm": 0.08175629471329682, "learning_rate": 1.2342715347493594e-05, "loss": 0.265, "step": 1947 }, { "epoch": 2.569920844327177, "grad_norm": 0.0810136014402916, "learning_rate": 1.2268920774376413e-05, "loss": 0.2778, "step": 1948 }, { "epoch": 2.5712401055408973, "grad_norm": 0.07985461299875352, "learning_rate": 1.2195333044756996e-05, "loss": 0.2777, "step": 1949 }, { "epoch": 2.5725593667546174, "grad_norm": 0.08060443280850661, "learning_rate": 1.2121952332133091e-05, "loss": 0.2736, "step": 1950 }, { "epoch": 2.573878627968338, "grad_norm": 0.08033918040761369, "learning_rate": 1.2048778809514328e-05, "loss": 0.2795, "step": 1951 }, { "epoch": 2.575197889182058, "grad_norm": 0.08369866110619013, "learning_rate": 1.1975812649421881e-05, "loss": 0.2854, "step": 1952 }, { "epoch": 2.5765171503957784, "grad_norm": 0.0819210196056744, "learning_rate": 1.1903054023888017e-05, "loss": 0.2939, "step": 1953 }, { "epoch": 2.577836411609499, "grad_norm": 0.08139746257950475, "learning_rate": 1.1830503104455659e-05, "loss": 0.2829, "step": 1954 }, { "epoch": 2.579155672823219, "grad_norm": 0.07807178159419773, "learning_rate": 1.1758160062178093e-05, "loss": 0.28, "step": 1955 }, { "epoch": 2.580474934036939, "grad_norm": 0.08018057240962718, "learning_rate": 1.1686025067618423e-05, "loss": 0.2768, "step": 1956 }, { "epoch": 2.5817941952506596, "grad_norm": 0.07976344924089074, "learning_rate": 1.1614098290849295e-05, "loss": 0.2742, "step": 1957 }, { "epoch": 2.58311345646438, "grad_norm": 0.07871221854929998, "learning_rate": 1.1542379901452382e-05, "loss": 0.2809, "step": 1958 }, { "epoch": 2.5844327176781, "grad_norm": 0.08197289293994192, "learning_rate": 1.1470870068518113e-05, "loss": 0.2777, "step": 1959 }, { "epoch": 2.5857519788918206, "grad_norm": 0.08015785661778672, "learning_rate": 1.1399568960645135e-05, "loss": 0.2782, "step": 1960 }, { "epoch": 2.5870712401055407, "grad_norm": 0.07784363123440043, "learning_rate": 1.132847674594003e-05, "loss": 0.2734, "step": 1961 }, { "epoch": 2.588390501319261, "grad_norm": 0.07921276714216263, "learning_rate": 1.1257593592016868e-05, "loss": 0.2756, "step": 1962 }, { "epoch": 2.5897097625329817, "grad_norm": 0.07860791772552002, "learning_rate": 1.1186919665996775e-05, "loss": 0.274, "step": 1963 }, { "epoch": 2.5910290237467017, "grad_norm": 0.08158080776632985, "learning_rate": 1.1116455134507664e-05, "loss": 0.2871, "step": 1964 }, { "epoch": 2.5923482849604222, "grad_norm": 0.08066476609887853, "learning_rate": 1.104620016368364e-05, "loss": 0.2834, "step": 1965 }, { "epoch": 2.5936675461741423, "grad_norm": 0.08042003101087304, "learning_rate": 1.097615491916485e-05, "loss": 0.2808, "step": 1966 }, { "epoch": 2.594986807387863, "grad_norm": 0.07897042896162845, "learning_rate": 1.0906319566096867e-05, "loss": 0.2738, "step": 1967 }, { "epoch": 2.5963060686015833, "grad_norm": 0.08044796611228697, "learning_rate": 1.0836694269130498e-05, "loss": 0.277, "step": 1968 }, { "epoch": 2.5976253298153034, "grad_norm": 0.08202316511097874, "learning_rate": 1.0767279192421208e-05, "loss": 0.281, "step": 1969 }, { "epoch": 2.598944591029024, "grad_norm": 0.08019918396852282, "learning_rate": 1.0698074499628885e-05, "loss": 0.2717, "step": 1970 }, { "epoch": 2.600263852242744, "grad_norm": 0.07880984957846844, "learning_rate": 1.0629080353917397e-05, "loss": 0.2664, "step": 1971 }, { "epoch": 2.6015831134564644, "grad_norm": 0.07934957864638932, "learning_rate": 1.056029691795416e-05, "loss": 0.278, "step": 1972 }, { "epoch": 2.602902374670185, "grad_norm": 0.07925040385596518, "learning_rate": 1.049172435390986e-05, "loss": 0.283, "step": 1973 }, { "epoch": 2.604221635883905, "grad_norm": 0.08074774889033534, "learning_rate": 1.0423362823457939e-05, "loss": 0.2837, "step": 1974 }, { "epoch": 2.6055408970976255, "grad_norm": 0.07950839937283274, "learning_rate": 1.0355212487774368e-05, "loss": 0.277, "step": 1975 }, { "epoch": 2.6068601583113455, "grad_norm": 0.08037388633047247, "learning_rate": 1.0287273507537099e-05, "loss": 0.2724, "step": 1976 }, { "epoch": 2.608179419525066, "grad_norm": 0.08029897233412431, "learning_rate": 1.0219546042925843e-05, "loss": 0.2756, "step": 1977 }, { "epoch": 2.6094986807387865, "grad_norm": 0.07881450246076134, "learning_rate": 1.0152030253621569e-05, "loss": 0.2683, "step": 1978 }, { "epoch": 2.6108179419525066, "grad_norm": 0.08007388203177242, "learning_rate": 1.0084726298806213e-05, "loss": 0.2794, "step": 1979 }, { "epoch": 2.6121372031662267, "grad_norm": 0.08230642710583251, "learning_rate": 1.0017634337162275e-05, "loss": 0.2898, "step": 1980 }, { "epoch": 2.613456464379947, "grad_norm": 0.08186608906059513, "learning_rate": 9.950754526872386e-06, "loss": 0.2703, "step": 1981 }, { "epoch": 2.6147757255936677, "grad_norm": 0.0806054120621213, "learning_rate": 9.88408702561906e-06, "loss": 0.2738, "step": 1982 }, { "epoch": 2.6160949868073877, "grad_norm": 0.08087611266839757, "learning_rate": 9.817631990584165e-06, "loss": 0.2796, "step": 1983 }, { "epoch": 2.6174142480211082, "grad_norm": 0.08202549206724402, "learning_rate": 9.751389578448745e-06, "loss": 0.2836, "step": 1984 }, { "epoch": 2.6187335092348283, "grad_norm": 0.0780561099426162, "learning_rate": 9.685359945392426e-06, "loss": 0.2604, "step": 1985 }, { "epoch": 2.620052770448549, "grad_norm": 0.079498735887078, "learning_rate": 9.619543247093254e-06, "loss": 0.2742, "step": 1986 }, { "epoch": 2.6213720316622693, "grad_norm": 0.07878862959397345, "learning_rate": 9.553939638727227e-06, "loss": 0.2682, "step": 1987 }, { "epoch": 2.6226912928759893, "grad_norm": 0.08156120836955717, "learning_rate": 9.488549274967872e-06, "loss": 0.2818, "step": 1988 }, { "epoch": 2.62401055408971, "grad_norm": 0.08024173074223512, "learning_rate": 9.423372309986056e-06, "loss": 0.2778, "step": 1989 }, { "epoch": 2.62532981530343, "grad_norm": 0.0820610347651181, "learning_rate": 9.3584088974494e-06, "loss": 0.2764, "step": 1990 }, { "epoch": 2.6266490765171504, "grad_norm": 0.0811485303843612, "learning_rate": 9.293659190522142e-06, "loss": 0.2689, "step": 1991 }, { "epoch": 2.627968337730871, "grad_norm": 0.07963836896424441, "learning_rate": 9.229123341864577e-06, "loss": 0.2737, "step": 1992 }, { "epoch": 2.629287598944591, "grad_norm": 0.07955841922929359, "learning_rate": 9.16480150363287e-06, "loss": 0.2792, "step": 1993 }, { "epoch": 2.6306068601583115, "grad_norm": 0.07880776659192874, "learning_rate": 9.100693827478512e-06, "loss": 0.2687, "step": 1994 }, { "epoch": 2.6319261213720315, "grad_norm": 0.08142475896750397, "learning_rate": 9.036800464548157e-06, "loss": 0.2764, "step": 1995 }, { "epoch": 2.633245382585752, "grad_norm": 0.08134895712031173, "learning_rate": 8.973121565483156e-06, "loss": 0.2817, "step": 1996 }, { "epoch": 2.6345646437994725, "grad_norm": 0.08014935692374074, "learning_rate": 8.909657280419149e-06, "loss": 0.2692, "step": 1997 }, { "epoch": 2.6358839050131926, "grad_norm": 0.0788293362745659, "learning_rate": 8.846407758985886e-06, "loss": 0.271, "step": 1998 }, { "epoch": 2.637203166226913, "grad_norm": 0.07994102918807569, "learning_rate": 8.783373150306661e-06, "loss": 0.2769, "step": 1999 }, { "epoch": 2.638522427440633, "grad_norm": 0.07934748948566807, "learning_rate": 8.720553602998172e-06, "loss": 0.2801, "step": 2000 }, { "epoch": 2.6398416886543536, "grad_norm": 0.08229468336952052, "learning_rate": 8.657949265169984e-06, "loss": 0.2668, "step": 2001 }, { "epoch": 2.641160949868074, "grad_norm": 0.08153097002886557, "learning_rate": 8.59556028442432e-06, "loss": 0.2826, "step": 2002 }, { "epoch": 2.642480211081794, "grad_norm": 0.08015241727986083, "learning_rate": 8.533386807855604e-06, "loss": 0.2819, "step": 2003 }, { "epoch": 2.6437994722955143, "grad_norm": 0.08099151432549438, "learning_rate": 8.4714289820502e-06, "loss": 0.2752, "step": 2004 }, { "epoch": 2.6451187335092348, "grad_norm": 0.07941192492455801, "learning_rate": 8.40968695308606e-06, "loss": 0.2757, "step": 2005 }, { "epoch": 2.6464379947229553, "grad_norm": 0.07928770033542366, "learning_rate": 8.348160866532272e-06, "loss": 0.2732, "step": 2006 }, { "epoch": 2.6477572559366753, "grad_norm": 0.08012787449717272, "learning_rate": 8.286850867448881e-06, "loss": 0.291, "step": 2007 }, { "epoch": 2.649076517150396, "grad_norm": 0.08119917301405938, "learning_rate": 8.225757100386388e-06, "loss": 0.275, "step": 2008 }, { "epoch": 2.650395778364116, "grad_norm": 0.08006733783933186, "learning_rate": 8.164879709385565e-06, "loss": 0.2675, "step": 2009 }, { "epoch": 2.6517150395778364, "grad_norm": 0.08459113014289865, "learning_rate": 8.10421883797694e-06, "loss": 0.3029, "step": 2010 }, { "epoch": 2.653034300791557, "grad_norm": 0.07873800764677354, "learning_rate": 8.043774629180645e-06, "loss": 0.2797, "step": 2011 }, { "epoch": 2.654353562005277, "grad_norm": 0.08022151165235215, "learning_rate": 7.983547225505904e-06, "loss": 0.2746, "step": 2012 }, { "epoch": 2.6556728232189974, "grad_norm": 0.08261604761100258, "learning_rate": 7.923536768950856e-06, "loss": 0.2754, "step": 2013 }, { "epoch": 2.6569920844327175, "grad_norm": 0.07890321224992265, "learning_rate": 7.863743401002099e-06, "loss": 0.2775, "step": 2014 }, { "epoch": 2.658311345646438, "grad_norm": 0.07699777592828849, "learning_rate": 7.804167262634387e-06, "loss": 0.2718, "step": 2015 }, { "epoch": 2.6596306068601585, "grad_norm": 0.08186338528123278, "learning_rate": 7.744808494310386e-06, "loss": 0.2753, "step": 2016 }, { "epoch": 2.6609498680738786, "grad_norm": 0.08084542047606412, "learning_rate": 7.685667235980153e-06, "loss": 0.2758, "step": 2017 }, { "epoch": 2.662269129287599, "grad_norm": 0.08212950500773165, "learning_rate": 7.626743627081057e-06, "loss": 0.2721, "step": 2018 }, { "epoch": 2.663588390501319, "grad_norm": 0.07942370412194834, "learning_rate": 7.568037806537176e-06, "loss": 0.2751, "step": 2019 }, { "epoch": 2.6649076517150396, "grad_norm": 0.08138887690614276, "learning_rate": 7.509549912759228e-06, "loss": 0.2723, "step": 2020 }, { "epoch": 2.66622691292876, "grad_norm": 0.07867098372446928, "learning_rate": 7.4512800836440525e-06, "loss": 0.2717, "step": 2021 }, { "epoch": 2.66754617414248, "grad_norm": 0.08133630074851883, "learning_rate": 7.393228456574374e-06, "loss": 0.2882, "step": 2022 }, { "epoch": 2.6688654353562007, "grad_norm": 0.08057531514481456, "learning_rate": 7.3353951684185016e-06, "loss": 0.2864, "step": 2023 }, { "epoch": 2.6701846965699207, "grad_norm": 0.08189294266152082, "learning_rate": 7.277780355529895e-06, "loss": 0.2833, "step": 2024 }, { "epoch": 2.6715039577836412, "grad_norm": 0.08089227148811994, "learning_rate": 7.220384153746995e-06, "loss": 0.2785, "step": 2025 }, { "epoch": 2.6728232189973617, "grad_norm": 0.0813621269753518, "learning_rate": 7.163206698392744e-06, "loss": 0.2772, "step": 2026 }, { "epoch": 2.674142480211082, "grad_norm": 0.07994991444840058, "learning_rate": 7.106248124274406e-06, "loss": 0.2833, "step": 2027 }, { "epoch": 2.675461741424802, "grad_norm": 0.07821111823347549, "learning_rate": 7.0495085656831495e-06, "loss": 0.2666, "step": 2028 }, { "epoch": 2.6767810026385224, "grad_norm": 0.07932359955410291, "learning_rate": 6.992988156393821e-06, "loss": 0.2701, "step": 2029 }, { "epoch": 2.678100263852243, "grad_norm": 0.07932165770221027, "learning_rate": 6.936687029664502e-06, "loss": 0.2681, "step": 2030 }, { "epoch": 2.679419525065963, "grad_norm": 0.07963551624222473, "learning_rate": 6.880605318236344e-06, "loss": 0.2764, "step": 2031 }, { "epoch": 2.6807387862796834, "grad_norm": 0.08002689978781519, "learning_rate": 6.824743154333157e-06, "loss": 0.2747, "step": 2032 }, { "epoch": 2.6820580474934035, "grad_norm": 0.07976870920523062, "learning_rate": 6.76910066966111e-06, "loss": 0.2824, "step": 2033 }, { "epoch": 2.683377308707124, "grad_norm": 0.0819126840236899, "learning_rate": 6.713677995408452e-06, "loss": 0.2782, "step": 2034 }, { "epoch": 2.6846965699208445, "grad_norm": 0.0795953295012551, "learning_rate": 6.658475262245167e-06, "loss": 0.2809, "step": 2035 }, { "epoch": 2.6860158311345645, "grad_norm": 0.07904988766807924, "learning_rate": 6.603492600322725e-06, "loss": 0.2762, "step": 2036 }, { "epoch": 2.687335092348285, "grad_norm": 0.08022552357458787, "learning_rate": 6.548730139273662e-06, "loss": 0.2739, "step": 2037 }, { "epoch": 2.688654353562005, "grad_norm": 0.08120042088692281, "learning_rate": 6.494188008211421e-06, "loss": 0.2794, "step": 2038 }, { "epoch": 2.6899736147757256, "grad_norm": 0.07834788224585981, "learning_rate": 6.439866335729961e-06, "loss": 0.2719, "step": 2039 }, { "epoch": 2.691292875989446, "grad_norm": 0.07699804427136034, "learning_rate": 6.3857652499033974e-06, "loss": 0.2679, "step": 2040 }, { "epoch": 2.692612137203166, "grad_norm": 0.08039920646553605, "learning_rate": 6.331884878285854e-06, "loss": 0.2812, "step": 2041 }, { "epoch": 2.6939313984168867, "grad_norm": 0.07768072771266223, "learning_rate": 6.278225347911026e-06, "loss": 0.2679, "step": 2042 }, { "epoch": 2.6952506596306067, "grad_norm": 0.08058317569931753, "learning_rate": 6.22478678529197e-06, "loss": 0.2696, "step": 2043 }, { "epoch": 2.6965699208443272, "grad_norm": 0.07934542642723233, "learning_rate": 6.1715693164207e-06, "loss": 0.2815, "step": 2044 }, { "epoch": 2.6978891820580477, "grad_norm": 0.07837669892606369, "learning_rate": 6.118573066768041e-06, "loss": 0.2736, "step": 2045 }, { "epoch": 2.699208443271768, "grad_norm": 0.0806289342299388, "learning_rate": 6.065798161283187e-06, "loss": 0.2686, "step": 2046 }, { "epoch": 2.7005277044854883, "grad_norm": 0.07821841970217794, "learning_rate": 6.013244724393496e-06, "loss": 0.2701, "step": 2047 }, { "epoch": 2.7018469656992083, "grad_norm": 0.07922662699205917, "learning_rate": 5.960912880004199e-06, "loss": 0.275, "step": 2048 }, { "epoch": 2.703166226912929, "grad_norm": 0.07989893487491136, "learning_rate": 5.908802751497988e-06, "loss": 0.2802, "step": 2049 }, { "epoch": 2.7044854881266494, "grad_norm": 0.0804130725420355, "learning_rate": 5.856914461734919e-06, "loss": 0.2769, "step": 2050 }, { "epoch": 2.7058047493403694, "grad_norm": 0.08028322625613954, "learning_rate": 5.805248133051943e-06, "loss": 0.2738, "step": 2051 }, { "epoch": 2.7071240105540895, "grad_norm": 0.08174435410020747, "learning_rate": 5.753803887262743e-06, "loss": 0.2688, "step": 2052 }, { "epoch": 2.70844327176781, "grad_norm": 0.08090884420997359, "learning_rate": 5.702581845657362e-06, "loss": 0.2764, "step": 2053 }, { "epoch": 2.7097625329815305, "grad_norm": 0.0782625301077583, "learning_rate": 5.651582129001986e-06, "loss": 0.2626, "step": 2054 }, { "epoch": 2.7110817941952505, "grad_norm": 0.08062935034331804, "learning_rate": 5.600804857538588e-06, "loss": 0.2801, "step": 2055 }, { "epoch": 2.712401055408971, "grad_norm": 0.08065368464011827, "learning_rate": 5.550250150984715e-06, "loss": 0.278, "step": 2056 }, { "epoch": 2.713720316622691, "grad_norm": 0.07836071569456944, "learning_rate": 5.499918128533155e-06, "loss": 0.2679, "step": 2057 }, { "epoch": 2.7150395778364116, "grad_norm": 0.07975543111284723, "learning_rate": 5.449808908851673e-06, "loss": 0.2803, "step": 2058 }, { "epoch": 2.716358839050132, "grad_norm": 0.08070952888616961, "learning_rate": 5.399922610082764e-06, "loss": 0.2716, "step": 2059 }, { "epoch": 2.717678100263852, "grad_norm": 0.08405997472757325, "learning_rate": 5.350259349843278e-06, "loss": 0.2788, "step": 2060 }, { "epoch": 2.7189973614775726, "grad_norm": 0.08206937502059726, "learning_rate": 5.300819245224275e-06, "loss": 0.2732, "step": 2061 }, { "epoch": 2.7203166226912927, "grad_norm": 0.07920540744836249, "learning_rate": 5.251602412790624e-06, "loss": 0.273, "step": 2062 }, { "epoch": 2.721635883905013, "grad_norm": 0.07930028698059864, "learning_rate": 5.202608968580846e-06, "loss": 0.2771, "step": 2063 }, { "epoch": 2.7229551451187337, "grad_norm": 0.0790109945482145, "learning_rate": 5.15383902810671e-06, "loss": 0.2701, "step": 2064 }, { "epoch": 2.7242744063324538, "grad_norm": 0.07886461502783257, "learning_rate": 5.105292706353093e-06, "loss": 0.2775, "step": 2065 }, { "epoch": 2.7255936675461743, "grad_norm": 0.08075889442074662, "learning_rate": 5.056970117777626e-06, "loss": 0.2719, "step": 2066 }, { "epoch": 2.7269129287598943, "grad_norm": 0.08184345181107058, "learning_rate": 5.008871376310409e-06, "loss": 0.2707, "step": 2067 }, { "epoch": 2.728232189973615, "grad_norm": 0.08076778647367866, "learning_rate": 4.96099659535384e-06, "loss": 0.2707, "step": 2068 }, { "epoch": 2.7295514511873353, "grad_norm": 0.08192745615568481, "learning_rate": 4.913345887782228e-06, "loss": 0.2693, "step": 2069 }, { "epoch": 2.7308707124010554, "grad_norm": 0.07802912358533373, "learning_rate": 4.865919365941629e-06, "loss": 0.2694, "step": 2070 }, { "epoch": 2.732189973614776, "grad_norm": 0.0837696668493374, "learning_rate": 4.8187171416494985e-06, "loss": 0.2779, "step": 2071 }, { "epoch": 2.733509234828496, "grad_norm": 0.08060845600771005, "learning_rate": 4.771739326194502e-06, "loss": 0.2814, "step": 2072 }, { "epoch": 2.7348284960422165, "grad_norm": 0.0792590053328996, "learning_rate": 4.7249860303361755e-06, "loss": 0.2811, "step": 2073 }, { "epoch": 2.736147757255937, "grad_norm": 0.07943853503956293, "learning_rate": 4.678457364304745e-06, "loss": 0.2687, "step": 2074 }, { "epoch": 2.737467018469657, "grad_norm": 0.08110162363830625, "learning_rate": 4.632153437800824e-06, "loss": 0.2834, "step": 2075 }, { "epoch": 2.738786279683377, "grad_norm": 0.08336131330327683, "learning_rate": 4.586074359995119e-06, "loss": 0.2854, "step": 2076 }, { "epoch": 2.7401055408970976, "grad_norm": 0.08001712419018622, "learning_rate": 4.5402202395282474e-06, "loss": 0.2834, "step": 2077 }, { "epoch": 2.741424802110818, "grad_norm": 0.07938460355913113, "learning_rate": 4.4945911845104326e-06, "loss": 0.2672, "step": 2078 }, { "epoch": 2.742744063324538, "grad_norm": 0.08175857813330538, "learning_rate": 4.449187302521263e-06, "loss": 0.2784, "step": 2079 }, { "epoch": 2.7440633245382586, "grad_norm": 0.07970712279545984, "learning_rate": 4.40400870060943e-06, "loss": 0.2689, "step": 2080 }, { "epoch": 2.7453825857519787, "grad_norm": 0.078702803980518, "learning_rate": 4.359055485292496e-06, "loss": 0.2825, "step": 2081 }, { "epoch": 2.746701846965699, "grad_norm": 0.08206610009373459, "learning_rate": 4.314327762556624e-06, "loss": 0.2753, "step": 2082 }, { "epoch": 2.7480211081794197, "grad_norm": 0.07774074884840562, "learning_rate": 4.269825637856317e-06, "loss": 0.2756, "step": 2083 }, { "epoch": 2.7493403693931397, "grad_norm": 0.0784618582986608, "learning_rate": 4.225549216114222e-06, "loss": 0.2661, "step": 2084 }, { "epoch": 2.7506596306068603, "grad_norm": 0.08454850040611238, "learning_rate": 4.181498601720801e-06, "loss": 0.2825, "step": 2085 }, { "epoch": 2.7519788918205803, "grad_norm": 0.07934717672367357, "learning_rate": 4.137673898534178e-06, "loss": 0.2778, "step": 2086 }, { "epoch": 2.753298153034301, "grad_norm": 0.08078687671243805, "learning_rate": 4.094075209879788e-06, "loss": 0.2718, "step": 2087 }, { "epoch": 2.7546174142480213, "grad_norm": 0.08169655447872393, "learning_rate": 4.050702638550275e-06, "loss": 0.2743, "step": 2088 }, { "epoch": 2.7559366754617414, "grad_norm": 0.08012158418651004, "learning_rate": 4.00755628680507e-06, "loss": 0.2759, "step": 2089 }, { "epoch": 2.757255936675462, "grad_norm": 0.08107546364108859, "learning_rate": 3.964636256370302e-06, "loss": 0.2605, "step": 2090 }, { "epoch": 2.758575197889182, "grad_norm": 0.08223290324462568, "learning_rate": 3.921942648438526e-06, "loss": 0.2789, "step": 2091 }, { "epoch": 2.7598944591029024, "grad_norm": 0.0800095951086573, "learning_rate": 3.879475563668389e-06, "loss": 0.2724, "step": 2092 }, { "epoch": 2.761213720316623, "grad_norm": 0.07971757369987557, "learning_rate": 3.837235102184533e-06, "loss": 0.2718, "step": 2093 }, { "epoch": 2.762532981530343, "grad_norm": 0.07924796200406266, "learning_rate": 3.795221363577239e-06, "loss": 0.2759, "step": 2094 }, { "epoch": 2.763852242744063, "grad_norm": 0.07879806409934384, "learning_rate": 3.753434446902282e-06, "loss": 0.2724, "step": 2095 }, { "epoch": 2.7651715039577835, "grad_norm": 0.0785881695616826, "learning_rate": 3.71187445068063e-06, "loss": 0.2607, "step": 2096 }, { "epoch": 2.766490765171504, "grad_norm": 0.08156121862104518, "learning_rate": 3.67054147289827e-06, "loss": 0.2778, "step": 2097 }, { "epoch": 2.767810026385224, "grad_norm": 0.07964162456673265, "learning_rate": 3.6294356110059157e-06, "loss": 0.2656, "step": 2098 }, { "epoch": 2.7691292875989446, "grad_norm": 0.07978717141023955, "learning_rate": 3.588556961918832e-06, "loss": 0.2677, "step": 2099 }, { "epoch": 2.7704485488126647, "grad_norm": 0.07911235507214241, "learning_rate": 3.547905622016601e-06, "loss": 0.281, "step": 2100 }, { "epoch": 2.771767810026385, "grad_norm": 0.07679285264448896, "learning_rate": 3.5074816871428106e-06, "loss": 0.2685, "step": 2101 }, { "epoch": 2.7730870712401057, "grad_norm": 0.07761290550468401, "learning_rate": 3.4672852526049794e-06, "loss": 0.2732, "step": 2102 }, { "epoch": 2.7744063324538257, "grad_norm": 0.07921454958114892, "learning_rate": 3.4273164131741753e-06, "loss": 0.2682, "step": 2103 }, { "epoch": 2.7757255936675462, "grad_norm": 0.07945117914594071, "learning_rate": 3.387575263084941e-06, "loss": 0.2709, "step": 2104 }, { "epoch": 2.7770448548812663, "grad_norm": 0.08152594717913104, "learning_rate": 3.348061896034926e-06, "loss": 0.2825, "step": 2105 }, { "epoch": 2.778364116094987, "grad_norm": 0.07857681587106083, "learning_rate": 3.308776405184777e-06, "loss": 0.2637, "step": 2106 }, { "epoch": 2.7796833773087073, "grad_norm": 0.07951377833179271, "learning_rate": 3.2697188831578575e-06, "loss": 0.2709, "step": 2107 }, { "epoch": 2.7810026385224274, "grad_norm": 0.08209489778068796, "learning_rate": 3.230889422040051e-06, "loss": 0.285, "step": 2108 }, { "epoch": 2.782321899736148, "grad_norm": 0.08023195093449574, "learning_rate": 3.1922881133795825e-06, "loss": 0.2792, "step": 2109 }, { "epoch": 2.783641160949868, "grad_norm": 0.08039724042908916, "learning_rate": 3.1539150481866843e-06, "loss": 0.283, "step": 2110 }, { "epoch": 2.7849604221635884, "grad_norm": 0.08274603217637769, "learning_rate": 3.1157703169335305e-06, "loss": 0.2818, "step": 2111 }, { "epoch": 2.786279683377309, "grad_norm": 0.08044696140592181, "learning_rate": 3.0778540095539156e-06, "loss": 0.2775, "step": 2112 }, { "epoch": 2.787598944591029, "grad_norm": 0.08089447361465678, "learning_rate": 3.0401662154430856e-06, "loss": 0.2815, "step": 2113 }, { "epoch": 2.7889182058047495, "grad_norm": 0.07940822235508017, "learning_rate": 3.0027070234575293e-06, "loss": 0.2783, "step": 2114 }, { "epoch": 2.7902374670184695, "grad_norm": 0.07957515840649088, "learning_rate": 2.9654765219147563e-06, "loss": 0.2708, "step": 2115 }, { "epoch": 2.79155672823219, "grad_norm": 0.08018283891623518, "learning_rate": 2.928474798593073e-06, "loss": 0.2822, "step": 2116 }, { "epoch": 2.7928759894459105, "grad_norm": 0.08092023967546035, "learning_rate": 2.8917019407314392e-06, "loss": 0.2721, "step": 2117 }, { "epoch": 2.7941952506596306, "grad_norm": 0.08073570510056678, "learning_rate": 2.8551580350291817e-06, "loss": 0.2784, "step": 2118 }, { "epoch": 2.7955145118733506, "grad_norm": 0.08146662053759081, "learning_rate": 2.818843167645835e-06, "loss": 0.28, "step": 2119 }, { "epoch": 2.796833773087071, "grad_norm": 0.0834635964456077, "learning_rate": 2.7827574242009437e-06, "loss": 0.2796, "step": 2120 }, { "epoch": 2.7981530343007917, "grad_norm": 0.07984463019534323, "learning_rate": 2.746900889773829e-06, "loss": 0.2747, "step": 2121 }, { "epoch": 2.7994722955145117, "grad_norm": 0.08221928773753545, "learning_rate": 2.711273648903423e-06, "loss": 0.2797, "step": 2122 }, { "epoch": 2.800791556728232, "grad_norm": 0.08020548253522393, "learning_rate": 2.6758757855880334e-06, "loss": 0.2737, "step": 2123 }, { "epoch": 2.8021108179419523, "grad_norm": 0.0817256933997834, "learning_rate": 2.6407073832851682e-06, "loss": 0.2734, "step": 2124 }, { "epoch": 2.8034300791556728, "grad_norm": 0.07937208110690892, "learning_rate": 2.6057685249113785e-06, "loss": 0.2866, "step": 2125 }, { "epoch": 2.8047493403693933, "grad_norm": 0.07872278744384463, "learning_rate": 2.5710592928419376e-06, "loss": 0.2685, "step": 2126 }, { "epoch": 2.8060686015831133, "grad_norm": 0.08013987872967124, "learning_rate": 2.536579768910818e-06, "loss": 0.2817, "step": 2127 }, { "epoch": 2.807387862796834, "grad_norm": 0.08098181323670975, "learning_rate": 2.5023300344103274e-06, "loss": 0.2831, "step": 2128 }, { "epoch": 2.808707124010554, "grad_norm": 0.07873647773298106, "learning_rate": 2.4683101700910704e-06, "loss": 0.274, "step": 2129 }, { "epoch": 2.8100263852242744, "grad_norm": 0.07973634946965383, "learning_rate": 2.434520256161632e-06, "loss": 0.2775, "step": 2130 }, { "epoch": 2.811345646437995, "grad_norm": 0.07998063445408488, "learning_rate": 2.4009603722884742e-06, "loss": 0.2712, "step": 2131 }, { "epoch": 2.812664907651715, "grad_norm": 0.08098222323624572, "learning_rate": 2.3676305975957157e-06, "loss": 0.2767, "step": 2132 }, { "epoch": 2.8139841688654355, "grad_norm": 0.08007968486601143, "learning_rate": 2.334531010664931e-06, "loss": 0.2696, "step": 2133 }, { "epoch": 2.8153034300791555, "grad_norm": 0.08197736503450284, "learning_rate": 2.301661689534995e-06, "loss": 0.2786, "step": 2134 }, { "epoch": 2.816622691292876, "grad_norm": 0.08234286372956333, "learning_rate": 2.2690227117018847e-06, "loss": 0.2817, "step": 2135 }, { "epoch": 2.8179419525065965, "grad_norm": 0.07828010944997638, "learning_rate": 2.2366141541184883e-06, "loss": 0.2661, "step": 2136 }, { "epoch": 2.8192612137203166, "grad_norm": 0.08277433836344565, "learning_rate": 2.2044360931944398e-06, "loss": 0.2816, "step": 2137 }, { "epoch": 2.820580474934037, "grad_norm": 0.08056157532243434, "learning_rate": 2.1724886047959303e-06, "loss": 0.2742, "step": 2138 }, { "epoch": 2.821899736147757, "grad_norm": 0.07932669826084948, "learning_rate": 2.1407717642455082e-06, "loss": 0.2791, "step": 2139 }, { "epoch": 2.8232189973614776, "grad_norm": 0.08176108242633139, "learning_rate": 2.109285646321979e-06, "loss": 0.2872, "step": 2140 }, { "epoch": 2.824538258575198, "grad_norm": 0.08178783184977167, "learning_rate": 2.0780303252601052e-06, "loss": 0.2763, "step": 2141 }, { "epoch": 2.825857519788918, "grad_norm": 0.07928627397552415, "learning_rate": 2.0470058747505516e-06, "loss": 0.2661, "step": 2142 }, { "epoch": 2.8271767810026383, "grad_norm": 0.0791816772905237, "learning_rate": 2.0162123679396517e-06, "loss": 0.2802, "step": 2143 }, { "epoch": 2.8284960422163588, "grad_norm": 0.08128756557519101, "learning_rate": 1.9856498774291963e-06, "loss": 0.2807, "step": 2144 }, { "epoch": 2.8298153034300793, "grad_norm": 0.0799860796064801, "learning_rate": 1.955318475276391e-06, "loss": 0.2784, "step": 2145 }, { "epoch": 2.8311345646437993, "grad_norm": 0.07991913984966056, "learning_rate": 1.92521823299352e-06, "loss": 0.2715, "step": 2146 }, { "epoch": 2.83245382585752, "grad_norm": 0.07937032637435092, "learning_rate": 1.8953492215479151e-06, "loss": 0.2667, "step": 2147 }, { "epoch": 2.83377308707124, "grad_norm": 0.08037256452483396, "learning_rate": 1.865711511361734e-06, "loss": 0.2745, "step": 2148 }, { "epoch": 2.8350923482849604, "grad_norm": 0.08052220981710956, "learning_rate": 1.8363051723117698e-06, "loss": 0.2767, "step": 2149 }, { "epoch": 2.836411609498681, "grad_norm": 0.08260582982671778, "learning_rate": 1.8071302737293295e-06, "loss": 0.2854, "step": 2150 }, { "epoch": 2.837730870712401, "grad_norm": 0.08163954305117163, "learning_rate": 1.778186884400046e-06, "loss": 0.2762, "step": 2151 }, { "epoch": 2.8390501319261214, "grad_norm": 0.08305957817933719, "learning_rate": 1.7494750725637332e-06, "loss": 0.2861, "step": 2152 }, { "epoch": 2.8403693931398415, "grad_norm": 0.0805844122514223, "learning_rate": 1.7209949059142083e-06, "loss": 0.2784, "step": 2153 }, { "epoch": 2.841688654353562, "grad_norm": 0.0789164774171779, "learning_rate": 1.6927464515991142e-06, "loss": 0.2806, "step": 2154 }, { "epoch": 2.8430079155672825, "grad_norm": 0.08096440196921265, "learning_rate": 1.6647297762198423e-06, "loss": 0.2811, "step": 2155 }, { "epoch": 2.8443271767810026, "grad_norm": 0.08180398969104963, "learning_rate": 1.6369449458312758e-06, "loss": 0.2737, "step": 2156 }, { "epoch": 2.845646437994723, "grad_norm": 0.07867319834473072, "learning_rate": 1.6093920259416696e-06, "loss": 0.2751, "step": 2157 }, { "epoch": 2.846965699208443, "grad_norm": 0.0804671903235477, "learning_rate": 1.5820710815125484e-06, "loss": 0.2697, "step": 2158 }, { "epoch": 2.8482849604221636, "grad_norm": 0.08333173360739377, "learning_rate": 1.5549821769584638e-06, "loss": 0.2741, "step": 2159 }, { "epoch": 2.849604221635884, "grad_norm": 0.0814559744711996, "learning_rate": 1.5281253761469161e-06, "loss": 0.2806, "step": 2160 }, { "epoch": 2.850923482849604, "grad_norm": 0.08172610263480651, "learning_rate": 1.5015007423981543e-06, "loss": 0.2818, "step": 2161 }, { "epoch": 2.8522427440633247, "grad_norm": 0.0809485204668772, "learning_rate": 1.475108338485065e-06, "loss": 0.2777, "step": 2162 }, { "epoch": 2.8535620052770447, "grad_norm": 0.08004122713359416, "learning_rate": 1.4489482266329956e-06, "loss": 0.271, "step": 2163 }, { "epoch": 2.8548812664907652, "grad_norm": 0.07858693280738595, "learning_rate": 1.4230204685196203e-06, "loss": 0.2745, "step": 2164 }, { "epoch": 2.8562005277044857, "grad_norm": 0.08039787828259413, "learning_rate": 1.3973251252747954e-06, "loss": 0.2699, "step": 2165 }, { "epoch": 2.857519788918206, "grad_norm": 0.07896428979171825, "learning_rate": 1.3718622574804163e-06, "loss": 0.2738, "step": 2166 }, { "epoch": 2.858839050131926, "grad_norm": 0.0792293022427774, "learning_rate": 1.3466319251702609e-06, "loss": 0.2728, "step": 2167 }, { "epoch": 2.8601583113456464, "grad_norm": 0.08130003057894879, "learning_rate": 1.3216341878298566e-06, "loss": 0.2779, "step": 2168 }, { "epoch": 2.861477572559367, "grad_norm": 0.07897426171661638, "learning_rate": 1.2968691043963699e-06, "loss": 0.2617, "step": 2169 }, { "epoch": 2.862796833773087, "grad_norm": 0.08139436688924204, "learning_rate": 1.2723367332583946e-06, "loss": 0.2736, "step": 2170 }, { "epoch": 2.8641160949868074, "grad_norm": 0.0819493647350851, "learning_rate": 1.248037132255908e-06, "loss": 0.2765, "step": 2171 }, { "epoch": 2.8654353562005275, "grad_norm": 0.07941843038047375, "learning_rate": 1.2239703586800378e-06, "loss": 0.2723, "step": 2172 }, { "epoch": 2.866754617414248, "grad_norm": 0.0810496361901509, "learning_rate": 1.2001364692730165e-06, "loss": 0.2748, "step": 2173 }, { "epoch": 2.8680738786279685, "grad_norm": 0.08251334822456363, "learning_rate": 1.1765355202279838e-06, "loss": 0.2769, "step": 2174 }, { "epoch": 2.8693931398416885, "grad_norm": 0.08023296317335994, "learning_rate": 1.1531675671888619e-06, "loss": 0.2776, "step": 2175 }, { "epoch": 2.870712401055409, "grad_norm": 0.08008285845515258, "learning_rate": 1.130032665250269e-06, "loss": 0.2696, "step": 2176 }, { "epoch": 2.872031662269129, "grad_norm": 0.08173009364578271, "learning_rate": 1.1071308689573513e-06, "loss": 0.2775, "step": 2177 }, { "epoch": 2.8733509234828496, "grad_norm": 0.07976994584664397, "learning_rate": 1.0844622323056387e-06, "loss": 0.2728, "step": 2178 }, { "epoch": 2.87467018469657, "grad_norm": 0.07924755147968063, "learning_rate": 1.0620268087409791e-06, "loss": 0.2666, "step": 2179 }, { "epoch": 2.87598944591029, "grad_norm": 0.07997090954642246, "learning_rate": 1.0398246511593268e-06, "loss": 0.2734, "step": 2180 }, { "epoch": 2.8773087071240107, "grad_norm": 0.08276375581636095, "learning_rate": 1.0178558119067315e-06, "loss": 0.2811, "step": 2181 }, { "epoch": 2.8786279683377307, "grad_norm": 0.07711917611782462, "learning_rate": 9.96120342779061e-07, "loss": 0.265, "step": 2182 }, { "epoch": 2.879947229551451, "grad_norm": 0.0816556706428798, "learning_rate": 9.746182950220562e-07, "loss": 0.2773, "step": 2183 }, { "epoch": 2.8812664907651717, "grad_norm": 0.08091765587514728, "learning_rate": 9.533497193310537e-07, "loss": 0.2802, "step": 2184 }, { "epoch": 2.8825857519788918, "grad_norm": 0.0831151358792169, "learning_rate": 9.32314665850964e-07, "loss": 0.283, "step": 2185 }, { "epoch": 2.8839050131926123, "grad_norm": 0.07832569136592407, "learning_rate": 9.11513184176116e-07, "loss": 0.2736, "step": 2186 }, { "epoch": 2.8852242744063323, "grad_norm": 0.083120186727448, "learning_rate": 8.909453233501452e-07, "loss": 0.2916, "step": 2187 }, { "epoch": 2.886543535620053, "grad_norm": 0.08449295537476079, "learning_rate": 8.706111318658838e-07, "loss": 0.2941, "step": 2188 }, { "epoch": 2.8878627968337733, "grad_norm": 0.08233689528934261, "learning_rate": 8.505106576652377e-07, "loss": 0.2739, "step": 2189 }, { "epoch": 2.8891820580474934, "grad_norm": 0.0859875699161074, "learning_rate": 8.30643948139087e-07, "loss": 0.2804, "step": 2190 }, { "epoch": 2.8905013192612135, "grad_norm": 0.08078588698297602, "learning_rate": 8.110110501271529e-07, "loss": 0.2787, "step": 2191 }, { "epoch": 2.891820580474934, "grad_norm": 0.08044072226391902, "learning_rate": 7.91612009917897e-07, "loss": 0.2726, "step": 2192 }, { "epoch": 2.8931398416886545, "grad_norm": 0.0810418252033847, "learning_rate": 7.724468732484336e-07, "loss": 0.2903, "step": 2193 }, { "epoch": 2.8944591029023745, "grad_norm": 0.08002813747901273, "learning_rate": 7.535156853043846e-07, "loss": 0.2627, "step": 2194 }, { "epoch": 2.895778364116095, "grad_norm": 0.08224253406295658, "learning_rate": 7.348184907197908e-07, "loss": 0.2869, "step": 2195 }, { "epoch": 2.897097625329815, "grad_norm": 0.08207782232369824, "learning_rate": 7.163553335770123e-07, "loss": 0.2889, "step": 2196 }, { "epoch": 2.8984168865435356, "grad_norm": 0.08168004459698293, "learning_rate": 6.981262574066394e-07, "loss": 0.2751, "step": 2197 }, { "epoch": 2.899736147757256, "grad_norm": 0.0812092593202332, "learning_rate": 6.801313051873259e-07, "loss": 0.2768, "step": 2198 }, { "epoch": 2.901055408970976, "grad_norm": 0.0765470165125419, "learning_rate": 6.623705193457897e-07, "loss": 0.2595, "step": 2199 }, { "epoch": 2.9023746701846966, "grad_norm": 0.07954233755035575, "learning_rate": 6.448439417565788e-07, "loss": 0.2714, "step": 2200 }, { "epoch": 2.9036939313984167, "grad_norm": 0.07804629136796772, "learning_rate": 6.275516137421389e-07, "loss": 0.2697, "step": 2201 }, { "epoch": 2.905013192612137, "grad_norm": 0.07805170591574559, "learning_rate": 6.10493576072535e-07, "loss": 0.2767, "step": 2202 }, { "epoch": 2.9063324538258577, "grad_norm": 0.08165678706150606, "learning_rate": 5.936698689655184e-07, "loss": 0.2738, "step": 2203 }, { "epoch": 2.9076517150395778, "grad_norm": 0.07951917011764713, "learning_rate": 5.77080532086316e-07, "loss": 0.2708, "step": 2204 }, { "epoch": 2.9089709762532983, "grad_norm": 0.08181982866793723, "learning_rate": 5.607256045475961e-07, "loss": 0.2865, "step": 2205 }, { "epoch": 2.9102902374670183, "grad_norm": 0.07972901612998953, "learning_rate": 5.446051249093698e-07, "loss": 0.2735, "step": 2206 }, { "epoch": 2.911609498680739, "grad_norm": 0.08180794403013804, "learning_rate": 5.287191311788675e-07, "loss": 0.2725, "step": 2207 }, { "epoch": 2.9129287598944593, "grad_norm": 0.08122240256366457, "learning_rate": 5.130676608104845e-07, "loss": 0.2832, "step": 2208 }, { "epoch": 2.9142480211081794, "grad_norm": 0.08268050415041092, "learning_rate": 4.976507507056916e-07, "loss": 0.2774, "step": 2209 }, { "epoch": 2.9155672823219, "grad_norm": 0.08153476824525888, "learning_rate": 4.82468437212913e-07, "loss": 0.2881, "step": 2210 }, { "epoch": 2.91688654353562, "grad_norm": 0.08167142898376217, "learning_rate": 4.6752075612748194e-07, "loss": 0.2931, "step": 2211 }, { "epoch": 2.9182058047493404, "grad_norm": 0.080089185639967, "learning_rate": 4.5280774269154115e-07, "loss": 0.2746, "step": 2212 }, { "epoch": 2.919525065963061, "grad_norm": 0.0790045839408177, "learning_rate": 4.3832943159394234e-07, "loss": 0.2626, "step": 2213 }, { "epoch": 2.920844327176781, "grad_norm": 0.08038661058482462, "learning_rate": 4.240858569701911e-07, "loss": 0.2743, "step": 2214 }, { "epoch": 2.922163588390501, "grad_norm": 0.08281897252430612, "learning_rate": 4.1007705240235783e-07, "loss": 0.2867, "step": 2215 }, { "epoch": 2.9234828496042216, "grad_norm": 0.07870342887625904, "learning_rate": 3.96303050918978e-07, "loss": 0.2757, "step": 2216 }, { "epoch": 2.924802110817942, "grad_norm": 0.07935216211945757, "learning_rate": 3.827638849950077e-07, "loss": 0.2713, "step": 2217 }, { "epoch": 2.926121372031662, "grad_norm": 0.07915410240197905, "learning_rate": 3.6945958655174583e-07, "loss": 0.2794, "step": 2218 }, { "epoch": 2.9274406332453826, "grad_norm": 0.07870486112936713, "learning_rate": 3.56390186956701e-07, "loss": 0.2807, "step": 2219 }, { "epoch": 2.9287598944591027, "grad_norm": 0.07859881315531353, "learning_rate": 3.435557170236026e-07, "loss": 0.275, "step": 2220 }, { "epoch": 2.930079155672823, "grad_norm": 0.08059907314820777, "learning_rate": 3.3095620701226737e-07, "loss": 0.2771, "step": 2221 }, { "epoch": 2.9313984168865437, "grad_norm": 0.08075811506706905, "learning_rate": 3.185916866285443e-07, "loss": 0.2761, "step": 2222 }, { "epoch": 2.9327176781002637, "grad_norm": 0.08303811114646592, "learning_rate": 3.0646218502425886e-07, "loss": 0.2774, "step": 2223 }, { "epoch": 2.9340369393139842, "grad_norm": 0.08162936188923256, "learning_rate": 2.945677307971351e-07, "loss": 0.2762, "step": 2224 }, { "epoch": 2.9353562005277043, "grad_norm": 0.08338095849994467, "learning_rate": 2.8290835199069607e-07, "loss": 0.2793, "step": 2225 }, { "epoch": 2.936675461741425, "grad_norm": 0.07900460977352994, "learning_rate": 2.7148407609427493e-07, "loss": 0.2784, "step": 2226 }, { "epoch": 2.9379947229551453, "grad_norm": 0.07973590354606731, "learning_rate": 2.6029493004285923e-07, "loss": 0.2747, "step": 2227 }, { "epoch": 2.9393139841688654, "grad_norm": 0.0800717946303067, "learning_rate": 2.493409402171132e-07, "loss": 0.2709, "step": 2228 }, { "epoch": 2.940633245382586, "grad_norm": 0.07898942537281006, "learning_rate": 2.386221324432225e-07, "loss": 0.268, "step": 2229 }, { "epoch": 2.941952506596306, "grad_norm": 0.0803944509554427, "learning_rate": 2.2813853199292746e-07, "loss": 0.2845, "step": 2230 }, { "epoch": 2.9432717678100264, "grad_norm": 0.08167893457419691, "learning_rate": 2.1789016358340076e-07, "loss": 0.2757, "step": 2231 }, { "epoch": 2.944591029023747, "grad_norm": 0.07837633241992431, "learning_rate": 2.0787705137721437e-07, "loss": 0.2702, "step": 2232 }, { "epoch": 2.945910290237467, "grad_norm": 0.08240421183753124, "learning_rate": 1.980992189822839e-07, "loss": 0.2906, "step": 2233 }, { "epoch": 2.9472295514511875, "grad_norm": 0.07866587321078851, "learning_rate": 1.8855668945177984e-07, "loss": 0.2734, "step": 2234 }, { "epoch": 2.9485488126649075, "grad_norm": 0.07956732629855476, "learning_rate": 1.7924948528412755e-07, "loss": 0.2675, "step": 2235 }, { "epoch": 2.949868073878628, "grad_norm": 0.0813402113252344, "learning_rate": 1.701776284229073e-07, "loss": 0.2713, "step": 2236 }, { "epoch": 2.9511873350923485, "grad_norm": 0.07934961767649966, "learning_rate": 1.6134114025684323e-07, "loss": 0.2736, "step": 2237 }, { "epoch": 2.9525065963060686, "grad_norm": 0.08086495591698961, "learning_rate": 1.5274004161970335e-07, "loss": 0.2774, "step": 2238 }, { "epoch": 2.9538258575197887, "grad_norm": 0.07904568008662359, "learning_rate": 1.4437435279029966e-07, "loss": 0.2788, "step": 2239 }, { "epoch": 2.955145118733509, "grad_norm": 0.08013471864402794, "learning_rate": 1.3624409349239918e-07, "loss": 0.2711, "step": 2240 }, { "epoch": 2.9564643799472297, "grad_norm": 0.08173290437794392, "learning_rate": 1.2834928289472416e-07, "loss": 0.2909, "step": 2241 }, { "epoch": 2.9577836411609497, "grad_norm": 0.08210625957568107, "learning_rate": 1.2068993961084074e-07, "loss": 0.2802, "step": 2242 }, { "epoch": 2.95910290237467, "grad_norm": 0.07944904160206251, "learning_rate": 1.1326608169920372e-07, "loss": 0.2805, "step": 2243 }, { "epoch": 2.9604221635883903, "grad_norm": 0.07999137763661669, "learning_rate": 1.0607772666302306e-07, "loss": 0.2656, "step": 2244 }, { "epoch": 2.961741424802111, "grad_norm": 0.08336284893084, "learning_rate": 9.912489145026405e-08, "loss": 0.2757, "step": 2245 }, { "epoch": 2.9630606860158313, "grad_norm": 0.0774498163296274, "learning_rate": 9.240759245364716e-08, "loss": 0.2729, "step": 2246 }, { "epoch": 2.9643799472295513, "grad_norm": 0.08249353985690891, "learning_rate": 8.592584551053718e-08, "loss": 0.2734, "step": 2247 }, { "epoch": 2.965699208443272, "grad_norm": 0.08026843767538812, "learning_rate": 7.967966590293197e-08, "loss": 0.2658, "step": 2248 }, { "epoch": 2.967018469656992, "grad_norm": 0.08232209862356267, "learning_rate": 7.366906835745147e-08, "loss": 0.2803, "step": 2249 }, { "epoch": 2.9683377308707124, "grad_norm": 0.08333837529864252, "learning_rate": 6.789406704527102e-08, "loss": 0.2813, "step": 2250 }, { "epoch": 2.969656992084433, "grad_norm": 0.07898345332770025, "learning_rate": 6.235467558211028e-08, "loss": 0.2693, "step": 2251 }, { "epoch": 2.970976253298153, "grad_norm": 0.08039466043481228, "learning_rate": 5.705090702819993e-08, "loss": 0.2765, "step": 2252 }, { "epoch": 2.9722955145118735, "grad_norm": 0.08120177090564082, "learning_rate": 5.198277388821504e-08, "loss": 0.2745, "step": 2253 }, { "epoch": 2.9736147757255935, "grad_norm": 0.08221312326257504, "learning_rate": 4.715028811128619e-08, "loss": 0.2818, "step": 2254 }, { "epoch": 2.974934036939314, "grad_norm": 0.08131382187339421, "learning_rate": 4.255346109097724e-08, "loss": 0.2814, "step": 2255 }, { "epoch": 2.9762532981530345, "grad_norm": 0.08020038761899813, "learning_rate": 3.819230366521875e-08, "loss": 0.2774, "step": 2256 }, { "epoch": 2.9775725593667546, "grad_norm": 0.08110895485910391, "learning_rate": 3.406682611630796e-08, "loss": 0.2807, "step": 2257 }, { "epoch": 2.978891820580475, "grad_norm": 0.08051512048744235, "learning_rate": 3.017703817087547e-08, "loss": 0.2894, "step": 2258 }, { "epoch": 2.980211081794195, "grad_norm": 0.08057299553087337, "learning_rate": 2.6522948999874175e-08, "loss": 0.2809, "step": 2259 }, { "epoch": 2.9815303430079156, "grad_norm": 0.07821267246340399, "learning_rate": 2.3104567218545924e-08, "loss": 0.2656, "step": 2260 }, { "epoch": 2.982849604221636, "grad_norm": 0.08322052786986561, "learning_rate": 1.992190088641044e-08, "loss": 0.2808, "step": 2261 }, { "epoch": 2.984168865435356, "grad_norm": 0.0802230286126292, "learning_rate": 1.6974957507231993e-08, "loss": 0.2801, "step": 2262 }, { "epoch": 2.9854881266490763, "grad_norm": 0.08235823034993352, "learning_rate": 1.426374402901942e-08, "loss": 0.2791, "step": 2263 }, { "epoch": 2.9868073878627968, "grad_norm": 0.07846889302418314, "learning_rate": 1.1788266844003914e-08, "loss": 0.2747, "step": 2264 }, { "epoch": 2.9881266490765173, "grad_norm": 0.07963905697978255, "learning_rate": 9.548531788605707e-09, "loss": 0.2665, "step": 2265 }, { "epoch": 2.9894459102902373, "grad_norm": 0.0830067862006687, "learning_rate": 7.544544143445186e-09, "loss": 0.2826, "step": 2266 }, { "epoch": 2.990765171503958, "grad_norm": 0.08196234688662356, "learning_rate": 5.776308633342886e-09, "loss": 0.2842, "step": 2267 }, { "epoch": 2.992084432717678, "grad_norm": 0.08152959649016855, "learning_rate": 4.2438294272528765e-09, "loss": 0.2738, "step": 2268 }, { "epoch": 2.9934036939313984, "grad_norm": 0.07939256700743008, "learning_rate": 2.947110138296072e-09, "loss": 0.2715, "step": 2269 }, { "epoch": 2.994722955145119, "grad_norm": 0.07953394789153871, "learning_rate": 1.886153823749126e-09, "loss": 0.2831, "step": 2270 }, { "epoch": 2.996042216358839, "grad_norm": 0.08314826191969563, "learning_rate": 1.0609629850222292e-09, "loss": 0.2678, "step": 2271 }, { "epoch": 2.9973614775725594, "grad_norm": 0.0811917095267708, "learning_rate": 4.715395676813117e-10, "loss": 0.2807, "step": 2272 }, { "epoch": 2.9986807387862795, "grad_norm": 0.07859033760345668, "learning_rate": 1.178849614036359e-10, "loss": 0.2782, "step": 2273 }, { "epoch": 3.0, "grad_norm": 0.0781387406799188, "learning_rate": 0.0, "loss": 0.274, "step": 2274 }, { "epoch": 3.0, "eval_loss": 0.30313608050346375, "eval_runtime": 158.3847, "eval_samples_per_second": 32.232, "eval_steps_per_second": 1.01, "step": 2274 }, { "epoch": 3.0, "step": 2274, "total_flos": 8.121740457714647e+17, "train_loss": 0.32005984397079196, "train_runtime": 25010.8841, "train_samples_per_second": 11.633, "train_steps_per_second": 0.091 } ], "logging_steps": 1, "max_steps": 2274, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.121740457714647e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }