{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9985396722375466, "eval_steps": 500, "global_step": 4620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012980691221807562, "grad_norm": 6.221705913543701, "learning_rate": 8.658008658008659e-08, "loss": 0.9647, "step": 2 }, { "epoch": 0.0025961382443615124, "grad_norm": 6.434932231903076, "learning_rate": 1.7316017316017318e-07, "loss": 0.9086, "step": 4 }, { "epoch": 0.0038942073665422685, "grad_norm": 5.885791301727295, "learning_rate": 2.597402597402598e-07, "loss": 1.044, "step": 6 }, { "epoch": 0.005192276488723025, "grad_norm": 6.61972713470459, "learning_rate": 3.4632034632034636e-07, "loss": 0.9563, "step": 8 }, { "epoch": 0.0064903456109037805, "grad_norm": 6.213660717010498, "learning_rate": 4.329004329004329e-07, "loss": 0.9333, "step": 10 }, { "epoch": 0.007788414733084537, "grad_norm": 6.075322151184082, "learning_rate": 5.194805194805196e-07, "loss": 0.9011, "step": 12 }, { "epoch": 0.009086483855265294, "grad_norm": 6.823555946350098, "learning_rate": 6.060606060606061e-07, "loss": 0.9548, "step": 14 }, { "epoch": 0.01038455297744605, "grad_norm": 6.020087242126465, "learning_rate": 6.926406926406927e-07, "loss": 0.9349, "step": 16 }, { "epoch": 0.011682622099626805, "grad_norm": 6.454049110412598, "learning_rate": 7.792207792207792e-07, "loss": 0.9455, "step": 18 }, { "epoch": 0.012980691221807561, "grad_norm": 6.042789936065674, "learning_rate": 8.658008658008658e-07, "loss": 0.964, "step": 20 }, { "epoch": 0.014278760343988317, "grad_norm": 6.42781400680542, "learning_rate": 9.523809523809525e-07, "loss": 0.9941, "step": 22 }, { "epoch": 0.015576829466169074, "grad_norm": 7.119968891143799, "learning_rate": 1.0389610389610392e-06, "loss": 0.9855, "step": 24 }, { "epoch": 0.01687489858834983, "grad_norm": 6.382406711578369, "learning_rate": 1.1255411255411256e-06, "loss": 1.0011, "step": 26 }, { "epoch": 0.018172967710530587, "grad_norm": 6.274774074554443, "learning_rate": 1.2121212121212122e-06, "loss": 0.9528, "step": 28 }, { "epoch": 0.01947103683271134, "grad_norm": 6.0175652503967285, "learning_rate": 1.2987012987012986e-06, "loss": 0.9487, "step": 30 }, { "epoch": 0.0207691059548921, "grad_norm": 5.71115779876709, "learning_rate": 1.3852813852813854e-06, "loss": 0.9224, "step": 32 }, { "epoch": 0.022067175077072853, "grad_norm": 6.351312160491943, "learning_rate": 1.471861471861472e-06, "loss": 0.977, "step": 34 }, { "epoch": 0.02336524419925361, "grad_norm": 5.8165178298950195, "learning_rate": 1.5584415584415584e-06, "loss": 0.9185, "step": 36 }, { "epoch": 0.024663313321434368, "grad_norm": 6.0321946144104, "learning_rate": 1.6450216450216453e-06, "loss": 0.9071, "step": 38 }, { "epoch": 0.025961382443615122, "grad_norm": 6.1914448738098145, "learning_rate": 1.7316017316017317e-06, "loss": 0.8436, "step": 40 }, { "epoch": 0.02725945156579588, "grad_norm": 6.654114723205566, "learning_rate": 1.8181818181818183e-06, "loss": 0.9657, "step": 42 }, { "epoch": 0.028557520687976633, "grad_norm": 6.520123481750488, "learning_rate": 1.904761904761905e-06, "loss": 0.8844, "step": 44 }, { "epoch": 0.02985558981015739, "grad_norm": 6.339972972869873, "learning_rate": 1.9913419913419915e-06, "loss": 0.9399, "step": 46 }, { "epoch": 0.03115365893233815, "grad_norm": 6.470807075500488, "learning_rate": 2.0779220779220784e-06, "loss": 0.875, "step": 48 }, { "epoch": 0.0324517280545189, "grad_norm": 5.704713344573975, "learning_rate": 2.1645021645021648e-06, "loss": 0.8692, "step": 50 }, { "epoch": 0.03374979717669966, "grad_norm": 5.687604904174805, "learning_rate": 2.251082251082251e-06, "loss": 0.8451, "step": 52 }, { "epoch": 0.03504786629888042, "grad_norm": 7.234206676483154, "learning_rate": 2.337662337662338e-06, "loss": 0.9811, "step": 54 }, { "epoch": 0.036345935421061175, "grad_norm": 5.676932334899902, "learning_rate": 2.4242424242424244e-06, "loss": 0.8899, "step": 56 }, { "epoch": 0.037644004543241925, "grad_norm": 6.458381175994873, "learning_rate": 2.510822510822511e-06, "loss": 0.7795, "step": 58 }, { "epoch": 0.03894207366542268, "grad_norm": 6.359379291534424, "learning_rate": 2.597402597402597e-06, "loss": 0.7892, "step": 60 }, { "epoch": 0.04024014278760344, "grad_norm": 6.186464309692383, "learning_rate": 2.6839826839826844e-06, "loss": 0.834, "step": 62 }, { "epoch": 0.0415382119097842, "grad_norm": 5.135880470275879, "learning_rate": 2.770562770562771e-06, "loss": 0.7017, "step": 64 }, { "epoch": 0.042836281031964955, "grad_norm": 5.825822353363037, "learning_rate": 2.8571428571428573e-06, "loss": 0.831, "step": 66 }, { "epoch": 0.044134350154145706, "grad_norm": 6.155277729034424, "learning_rate": 2.943722943722944e-06, "loss": 0.8614, "step": 68 }, { "epoch": 0.04543241927632646, "grad_norm": 5.385496139526367, "learning_rate": 3.0303030303030305e-06, "loss": 0.7658, "step": 70 }, { "epoch": 0.04673048839850722, "grad_norm": 4.945676803588867, "learning_rate": 3.116883116883117e-06, "loss": 0.6174, "step": 72 }, { "epoch": 0.04802855752068798, "grad_norm": 4.385101795196533, "learning_rate": 3.2034632034632033e-06, "loss": 0.6436, "step": 74 }, { "epoch": 0.049326626642868736, "grad_norm": 4.539665222167969, "learning_rate": 3.2900432900432905e-06, "loss": 0.7198, "step": 76 }, { "epoch": 0.050624695765049486, "grad_norm": 4.591179847717285, "learning_rate": 3.376623376623377e-06, "loss": 0.6633, "step": 78 }, { "epoch": 0.051922764887230244, "grad_norm": 3.828946113586426, "learning_rate": 3.4632034632034634e-06, "loss": 0.6397, "step": 80 }, { "epoch": 0.053220834009411, "grad_norm": 4.122572422027588, "learning_rate": 3.54978354978355e-06, "loss": 0.5394, "step": 82 }, { "epoch": 0.05451890313159176, "grad_norm": 3.9561424255371094, "learning_rate": 3.6363636363636366e-06, "loss": 0.6242, "step": 84 }, { "epoch": 0.055816972253772516, "grad_norm": 3.6515066623687744, "learning_rate": 3.722943722943723e-06, "loss": 0.5686, "step": 86 }, { "epoch": 0.05711504137595327, "grad_norm": 2.567302703857422, "learning_rate": 3.80952380952381e-06, "loss": 0.5455, "step": 88 }, { "epoch": 0.058413110498134024, "grad_norm": 2.7233035564422607, "learning_rate": 3.896103896103897e-06, "loss": 0.5528, "step": 90 }, { "epoch": 0.05971117962031478, "grad_norm": 2.6668949127197266, "learning_rate": 3.982683982683983e-06, "loss": 0.5247, "step": 92 }, { "epoch": 0.06100924874249554, "grad_norm": 1.6147193908691406, "learning_rate": 4.0692640692640695e-06, "loss": 0.4497, "step": 94 }, { "epoch": 0.0623073178646763, "grad_norm": 1.8311339616775513, "learning_rate": 4.155844155844157e-06, "loss": 0.5019, "step": 96 }, { "epoch": 0.06360538698685705, "grad_norm": 1.484650731086731, "learning_rate": 4.242424242424243e-06, "loss": 0.4461, "step": 98 }, { "epoch": 0.0649034561090378, "grad_norm": 1.1527801752090454, "learning_rate": 4.3290043290043295e-06, "loss": 0.3936, "step": 100 }, { "epoch": 0.06620152523121857, "grad_norm": 0.7213553190231323, "learning_rate": 4.415584415584416e-06, "loss": 0.3996, "step": 102 }, { "epoch": 0.06749959435339932, "grad_norm": 0.7048112750053406, "learning_rate": 4.502164502164502e-06, "loss": 0.3714, "step": 104 }, { "epoch": 0.06879766347558007, "grad_norm": 0.6584962010383606, "learning_rate": 4.5887445887445896e-06, "loss": 0.4198, "step": 106 }, { "epoch": 0.07009573259776083, "grad_norm": 0.7030335068702698, "learning_rate": 4.675324675324676e-06, "loss": 0.3815, "step": 108 }, { "epoch": 0.07139380171994159, "grad_norm": 0.4970128834247589, "learning_rate": 4.761904761904762e-06, "loss": 0.3602, "step": 110 }, { "epoch": 0.07269187084212235, "grad_norm": 1.0927718877792358, "learning_rate": 4.848484848484849e-06, "loss": 0.3811, "step": 112 }, { "epoch": 0.0739899399643031, "grad_norm": 0.6064110398292542, "learning_rate": 4.935064935064935e-06, "loss": 0.3954, "step": 114 }, { "epoch": 0.07528800908648385, "grad_norm": 0.33001700043678284, "learning_rate": 5.021645021645022e-06, "loss": 0.3487, "step": 116 }, { "epoch": 0.07658607820866462, "grad_norm": 0.32029810547828674, "learning_rate": 5.108225108225109e-06, "loss": 0.3489, "step": 118 }, { "epoch": 0.07788414733084537, "grad_norm": 0.33607786893844604, "learning_rate": 5.194805194805194e-06, "loss": 0.399, "step": 120 }, { "epoch": 0.07918221645302613, "grad_norm": 0.36514097452163696, "learning_rate": 5.281385281385282e-06, "loss": 0.3893, "step": 122 }, { "epoch": 0.08048028557520688, "grad_norm": 0.33424612879753113, "learning_rate": 5.367965367965369e-06, "loss": 0.3566, "step": 124 }, { "epoch": 0.08177835469738763, "grad_norm": 0.4200645685195923, "learning_rate": 5.4545454545454545e-06, "loss": 0.3899, "step": 126 }, { "epoch": 0.0830764238195684, "grad_norm": 0.49920353293418884, "learning_rate": 5.541125541125542e-06, "loss": 0.3799, "step": 128 }, { "epoch": 0.08437449294174915, "grad_norm": 0.35418564081192017, "learning_rate": 5.627705627705629e-06, "loss": 0.4147, "step": 130 }, { "epoch": 0.08567256206392991, "grad_norm": 0.2320559173822403, "learning_rate": 5.7142857142857145e-06, "loss": 0.3384, "step": 132 }, { "epoch": 0.08697063118611066, "grad_norm": 0.2450694888830185, "learning_rate": 5.800865800865802e-06, "loss": 0.3339, "step": 134 }, { "epoch": 0.08826870030829141, "grad_norm": 0.36250317096710205, "learning_rate": 5.887445887445888e-06, "loss": 0.4015, "step": 136 }, { "epoch": 0.08956676943047218, "grad_norm": 0.22851811349391937, "learning_rate": 5.9740259740259746e-06, "loss": 0.3732, "step": 138 }, { "epoch": 0.09086483855265293, "grad_norm": 0.2725748121738434, "learning_rate": 6.060606060606061e-06, "loss": 0.3637, "step": 140 }, { "epoch": 0.09216290767483369, "grad_norm": 0.24354153871536255, "learning_rate": 6.147186147186147e-06, "loss": 0.3413, "step": 142 }, { "epoch": 0.09346097679701444, "grad_norm": 0.2328408807516098, "learning_rate": 6.233766233766234e-06, "loss": 0.3378, "step": 144 }, { "epoch": 0.09475904591919519, "grad_norm": 0.23436664044857025, "learning_rate": 6.320346320346321e-06, "loss": 0.3291, "step": 146 }, { "epoch": 0.09605711504137596, "grad_norm": 0.27714380621910095, "learning_rate": 6.406926406926407e-06, "loss": 0.3592, "step": 148 }, { "epoch": 0.09735518416355671, "grad_norm": 0.328075647354126, "learning_rate": 6.493506493506494e-06, "loss": 0.3786, "step": 150 }, { "epoch": 0.09865325328573747, "grad_norm": 0.4423561096191406, "learning_rate": 6.580086580086581e-06, "loss": 0.3584, "step": 152 }, { "epoch": 0.09995132240791822, "grad_norm": 0.20591601729393005, "learning_rate": 6.666666666666667e-06, "loss": 0.3232, "step": 154 }, { "epoch": 0.10124939153009897, "grad_norm": 0.20472541451454163, "learning_rate": 6.753246753246754e-06, "loss": 0.3587, "step": 156 }, { "epoch": 0.10254746065227974, "grad_norm": 0.2650071382522583, "learning_rate": 6.839826839826841e-06, "loss": 0.3445, "step": 158 }, { "epoch": 0.10384552977446049, "grad_norm": 0.4035618305206299, "learning_rate": 6.926406926406927e-06, "loss": 0.3476, "step": 160 }, { "epoch": 0.10514359889664125, "grad_norm": 0.3187306225299835, "learning_rate": 7.012987012987014e-06, "loss": 0.3527, "step": 162 }, { "epoch": 0.106441668018822, "grad_norm": 0.4146274924278259, "learning_rate": 7.0995670995671e-06, "loss": 0.3232, "step": 164 }, { "epoch": 0.10773973714100275, "grad_norm": 0.6427023410797119, "learning_rate": 7.186147186147187e-06, "loss": 0.3684, "step": 166 }, { "epoch": 0.10903780626318352, "grad_norm": 0.4874240756034851, "learning_rate": 7.272727272727273e-06, "loss": 0.3592, "step": 168 }, { "epoch": 0.11033587538536427, "grad_norm": 0.3884155750274658, "learning_rate": 7.3593073593073596e-06, "loss": 0.3561, "step": 170 }, { "epoch": 0.11163394450754503, "grad_norm": 0.24177426099777222, "learning_rate": 7.445887445887446e-06, "loss": 0.3415, "step": 172 }, { "epoch": 0.11293201362972578, "grad_norm": 0.19901804625988007, "learning_rate": 7.532467532467533e-06, "loss": 0.3646, "step": 174 }, { "epoch": 0.11423008275190653, "grad_norm": 0.20472317934036255, "learning_rate": 7.61904761904762e-06, "loss": 0.3295, "step": 176 }, { "epoch": 0.1155281518740873, "grad_norm": 0.23273694515228271, "learning_rate": 7.705627705627707e-06, "loss": 0.3614, "step": 178 }, { "epoch": 0.11682622099626805, "grad_norm": 0.4663180410861969, "learning_rate": 7.792207792207793e-06, "loss": 0.3648, "step": 180 }, { "epoch": 0.11812429011844881, "grad_norm": 0.23577463626861572, "learning_rate": 7.87878787878788e-06, "loss": 0.3335, "step": 182 }, { "epoch": 0.11942235924062956, "grad_norm": 0.45609503984451294, "learning_rate": 7.965367965367966e-06, "loss": 0.3438, "step": 184 }, { "epoch": 0.12072042836281031, "grad_norm": 0.32707342505455017, "learning_rate": 8.051948051948052e-06, "loss": 0.3621, "step": 186 }, { "epoch": 0.12201849748499108, "grad_norm": 0.1876477748155594, "learning_rate": 8.138528138528139e-06, "loss": 0.3464, "step": 188 }, { "epoch": 0.12331656660717183, "grad_norm": 0.24206683039665222, "learning_rate": 8.225108225108225e-06, "loss": 0.328, "step": 190 }, { "epoch": 0.1246146357293526, "grad_norm": 0.2968093454837799, "learning_rate": 8.311688311688313e-06, "loss": 0.372, "step": 192 }, { "epoch": 0.12591270485153336, "grad_norm": 0.2258334457874298, "learning_rate": 8.398268398268398e-06, "loss": 0.3266, "step": 194 }, { "epoch": 0.1272107739737141, "grad_norm": 0.3802196979522705, "learning_rate": 8.484848484848486e-06, "loss": 0.3552, "step": 196 }, { "epoch": 0.12850884309589486, "grad_norm": 0.334238737821579, "learning_rate": 8.571428571428571e-06, "loss": 0.345, "step": 198 }, { "epoch": 0.1298069122180756, "grad_norm": 0.21422573924064636, "learning_rate": 8.658008658008659e-06, "loss": 0.3385, "step": 200 }, { "epoch": 0.13110498134025636, "grad_norm": 0.3789936602115631, "learning_rate": 8.744588744588745e-06, "loss": 0.3306, "step": 202 }, { "epoch": 0.13240305046243714, "grad_norm": 0.19019010663032532, "learning_rate": 8.831168831168832e-06, "loss": 0.3485, "step": 204 }, { "epoch": 0.1337011195846179, "grad_norm": 0.33091527223587036, "learning_rate": 8.917748917748918e-06, "loss": 0.3121, "step": 206 }, { "epoch": 0.13499918870679864, "grad_norm": 0.2778131067752838, "learning_rate": 9.004329004329005e-06, "loss": 0.3527, "step": 208 }, { "epoch": 0.1362972578289794, "grad_norm": 0.24179336428642273, "learning_rate": 9.090909090909091e-06, "loss": 0.3753, "step": 210 }, { "epoch": 0.13759532695116014, "grad_norm": 0.24708905816078186, "learning_rate": 9.177489177489179e-06, "loss": 0.3219, "step": 212 }, { "epoch": 0.13889339607334092, "grad_norm": 0.21441404521465302, "learning_rate": 9.264069264069266e-06, "loss": 0.3367, "step": 214 }, { "epoch": 0.14019146519552167, "grad_norm": 0.4769307076931, "learning_rate": 9.350649350649352e-06, "loss": 0.3676, "step": 216 }, { "epoch": 0.14148953431770242, "grad_norm": 0.2470211535692215, "learning_rate": 9.437229437229438e-06, "loss": 0.3654, "step": 218 }, { "epoch": 0.14278760343988317, "grad_norm": 0.5748988389968872, "learning_rate": 9.523809523809525e-06, "loss": 0.3355, "step": 220 }, { "epoch": 0.14408567256206392, "grad_norm": 0.2319840043783188, "learning_rate": 9.610389610389611e-06, "loss": 0.3438, "step": 222 }, { "epoch": 0.1453837416842447, "grad_norm": 0.26565733551979065, "learning_rate": 9.696969696969698e-06, "loss": 0.3592, "step": 224 }, { "epoch": 0.14668181080642545, "grad_norm": 0.22992628812789917, "learning_rate": 9.783549783549784e-06, "loss": 0.3267, "step": 226 }, { "epoch": 0.1479798799286062, "grad_norm": 0.306938499212265, "learning_rate": 9.87012987012987e-06, "loss": 0.3191, "step": 228 }, { "epoch": 0.14927794905078695, "grad_norm": 0.1975240856409073, "learning_rate": 9.956709956709958e-06, "loss": 0.3386, "step": 230 }, { "epoch": 0.1505760181729677, "grad_norm": 0.3766959011554718, "learning_rate": 9.999998719119619e-06, "loss": 0.3453, "step": 232 }, { "epoch": 0.15187408729514848, "grad_norm": 0.2567223310470581, "learning_rate": 9.999988472080506e-06, "loss": 0.3323, "step": 234 }, { "epoch": 0.15317215641732923, "grad_norm": 0.3132372498512268, "learning_rate": 9.99996797802328e-06, "loss": 0.3535, "step": 236 }, { "epoch": 0.15447022553950998, "grad_norm": 0.5064688920974731, "learning_rate": 9.99993723698994e-06, "loss": 0.3416, "step": 238 }, { "epoch": 0.15576829466169073, "grad_norm": 0.2574165463447571, "learning_rate": 9.999896249043488e-06, "loss": 0.3487, "step": 240 }, { "epoch": 0.15706636378387148, "grad_norm": 0.2975156605243683, "learning_rate": 9.999845014267928e-06, "loss": 0.3125, "step": 242 }, { "epoch": 0.15836443290605226, "grad_norm": 0.28568053245544434, "learning_rate": 9.999783532768258e-06, "loss": 0.3285, "step": 244 }, { "epoch": 0.159662502028233, "grad_norm": 0.2554554045200348, "learning_rate": 9.999711804670478e-06, "loss": 0.3314, "step": 246 }, { "epoch": 0.16096057115041376, "grad_norm": 0.427518367767334, "learning_rate": 9.99962983012159e-06, "loss": 0.3245, "step": 248 }, { "epoch": 0.1622586402725945, "grad_norm": 0.36999958753585815, "learning_rate": 9.999537609289592e-06, "loss": 0.3287, "step": 250 }, { "epoch": 0.16355670939477526, "grad_norm": 0.2572516202926636, "learning_rate": 9.999435142363484e-06, "loss": 0.3142, "step": 252 }, { "epoch": 0.16485477851695604, "grad_norm": 0.2060335874557495, "learning_rate": 9.999322429553262e-06, "loss": 0.3273, "step": 254 }, { "epoch": 0.1661528476391368, "grad_norm": 0.4580685496330261, "learning_rate": 9.999199471089918e-06, "loss": 0.3378, "step": 256 }, { "epoch": 0.16745091676131754, "grad_norm": 0.2914876937866211, "learning_rate": 9.999066267225447e-06, "loss": 0.338, "step": 258 }, { "epoch": 0.1687489858834983, "grad_norm": 0.27025333046913147, "learning_rate": 9.998922818232836e-06, "loss": 0.3277, "step": 260 }, { "epoch": 0.17004705500567904, "grad_norm": 0.4284892678260803, "learning_rate": 9.998769124406074e-06, "loss": 0.3473, "step": 262 }, { "epoch": 0.17134512412785982, "grad_norm": 0.27784469723701477, "learning_rate": 9.998605186060138e-06, "loss": 0.3421, "step": 264 }, { "epoch": 0.17264319325004057, "grad_norm": 0.26006096601486206, "learning_rate": 9.998431003531008e-06, "loss": 0.3345, "step": 266 }, { "epoch": 0.17394126237222132, "grad_norm": 0.3378351628780365, "learning_rate": 9.998246577175653e-06, "loss": 0.3235, "step": 268 }, { "epoch": 0.17523933149440207, "grad_norm": 0.2822704315185547, "learning_rate": 9.99805190737204e-06, "loss": 0.3217, "step": 270 }, { "epoch": 0.17653740061658282, "grad_norm": 0.26916196942329407, "learning_rate": 9.997846994519126e-06, "loss": 0.3524, "step": 272 }, { "epoch": 0.1778354697387636, "grad_norm": 0.2934769093990326, "learning_rate": 9.997631839036858e-06, "loss": 0.3235, "step": 274 }, { "epoch": 0.17913353886094435, "grad_norm": 0.28291717171669006, "learning_rate": 9.997406441366182e-06, "loss": 0.3144, "step": 276 }, { "epoch": 0.1804316079831251, "grad_norm": 0.48731377720832825, "learning_rate": 9.99717080196903e-06, "loss": 0.3117, "step": 278 }, { "epoch": 0.18172967710530585, "grad_norm": 0.3186780512332916, "learning_rate": 9.99692492132832e-06, "loss": 0.3214, "step": 280 }, { "epoch": 0.1830277462274866, "grad_norm": 0.33035191893577576, "learning_rate": 9.996668799947962e-06, "loss": 0.3181, "step": 282 }, { "epoch": 0.18432581534966738, "grad_norm": 0.2466474175453186, "learning_rate": 9.996402438352856e-06, "loss": 0.3278, "step": 284 }, { "epoch": 0.18562388447184813, "grad_norm": 0.3591921329498291, "learning_rate": 9.996125837088883e-06, "loss": 0.3408, "step": 286 }, { "epoch": 0.18692195359402888, "grad_norm": 0.3142857253551483, "learning_rate": 9.995838996722916e-06, "loss": 0.3084, "step": 288 }, { "epoch": 0.18822002271620963, "grad_norm": 0.24143460392951965, "learning_rate": 9.995541917842803e-06, "loss": 0.3257, "step": 290 }, { "epoch": 0.18951809183839038, "grad_norm": 0.35196101665496826, "learning_rate": 9.995234601057381e-06, "loss": 0.334, "step": 292 }, { "epoch": 0.19081616096057116, "grad_norm": 0.2780432403087616, "learning_rate": 9.994917046996472e-06, "loss": 0.3588, "step": 294 }, { "epoch": 0.1921142300827519, "grad_norm": 0.486937016248703, "learning_rate": 9.99458925631087e-06, "loss": 0.3177, "step": 296 }, { "epoch": 0.19341229920493266, "grad_norm": 0.3460686206817627, "learning_rate": 9.994251229672351e-06, "loss": 0.3205, "step": 298 }, { "epoch": 0.19471036832711341, "grad_norm": 0.3075374364852905, "learning_rate": 9.993902967773674e-06, "loss": 0.3218, "step": 300 }, { "epoch": 0.19600843744929417, "grad_norm": 0.37752580642700195, "learning_rate": 9.993544471328565e-06, "loss": 0.3387, "step": 302 }, { "epoch": 0.19730650657147494, "grad_norm": 0.27894681692123413, "learning_rate": 9.993175741071732e-06, "loss": 0.3492, "step": 304 }, { "epoch": 0.1986045756936557, "grad_norm": 0.3676518201828003, "learning_rate": 9.992796777758855e-06, "loss": 0.33, "step": 306 }, { "epoch": 0.19990264481583644, "grad_norm": 0.2856215834617615, "learning_rate": 9.992407582166582e-06, "loss": 0.3194, "step": 308 }, { "epoch": 0.2012007139380172, "grad_norm": 0.4344342350959778, "learning_rate": 9.992008155092534e-06, "loss": 0.3523, "step": 310 }, { "epoch": 0.20249878306019795, "grad_norm": 0.31622278690338135, "learning_rate": 9.991598497355304e-06, "loss": 0.3342, "step": 312 }, { "epoch": 0.20379685218237872, "grad_norm": 0.3520386219024658, "learning_rate": 9.991178609794443e-06, "loss": 0.3152, "step": 314 }, { "epoch": 0.20509492130455947, "grad_norm": 0.34654003381729126, "learning_rate": 9.990748493270474e-06, "loss": 0.3267, "step": 316 }, { "epoch": 0.20639299042674022, "grad_norm": 0.27719029784202576, "learning_rate": 9.990308148664882e-06, "loss": 0.3114, "step": 318 }, { "epoch": 0.20769105954892098, "grad_norm": 0.3424472212791443, "learning_rate": 9.989857576880113e-06, "loss": 0.3365, "step": 320 }, { "epoch": 0.20898912867110173, "grad_norm": 0.3492903411388397, "learning_rate": 9.989396778839572e-06, "loss": 0.369, "step": 322 }, { "epoch": 0.2102871977932825, "grad_norm": 0.38008520007133484, "learning_rate": 9.988925755487622e-06, "loss": 0.3094, "step": 324 }, { "epoch": 0.21158526691546325, "grad_norm": 0.4675543010234833, "learning_rate": 9.988444507789584e-06, "loss": 0.3393, "step": 326 }, { "epoch": 0.212883336037644, "grad_norm": 0.33241841197013855, "learning_rate": 9.987953036731727e-06, "loss": 0.3196, "step": 328 }, { "epoch": 0.21418140515982476, "grad_norm": 0.38456469774246216, "learning_rate": 9.98745134332128e-06, "loss": 0.3331, "step": 330 }, { "epoch": 0.2154794742820055, "grad_norm": 0.3709038197994232, "learning_rate": 9.986939428586416e-06, "loss": 0.3157, "step": 332 }, { "epoch": 0.21677754340418628, "grad_norm": 0.36491093039512634, "learning_rate": 9.986417293576257e-06, "loss": 0.3101, "step": 334 }, { "epoch": 0.21807561252636704, "grad_norm": 0.4126209616661072, "learning_rate": 9.985884939360873e-06, "loss": 0.3453, "step": 336 }, { "epoch": 0.21937368164854779, "grad_norm": 0.2743375599384308, "learning_rate": 9.985342367031272e-06, "loss": 0.3249, "step": 338 }, { "epoch": 0.22067175077072854, "grad_norm": 0.4701857268810272, "learning_rate": 9.984789577699407e-06, "loss": 0.3025, "step": 340 }, { "epoch": 0.2219698198929093, "grad_norm": 0.25151172280311584, "learning_rate": 9.984226572498173e-06, "loss": 0.3246, "step": 342 }, { "epoch": 0.22326788901509007, "grad_norm": 0.33588528633117676, "learning_rate": 9.98365335258139e-06, "loss": 0.3194, "step": 344 }, { "epoch": 0.22456595813727082, "grad_norm": 0.253648966550827, "learning_rate": 9.983069919123828e-06, "loss": 0.3495, "step": 346 }, { "epoch": 0.22586402725945157, "grad_norm": 0.26863688230514526, "learning_rate": 9.982476273321175e-06, "loss": 0.3319, "step": 348 }, { "epoch": 0.22716209638163232, "grad_norm": 0.43418222665786743, "learning_rate": 9.981872416390055e-06, "loss": 0.3757, "step": 350 }, { "epoch": 0.22846016550381307, "grad_norm": 0.33871492743492126, "learning_rate": 9.981258349568018e-06, "loss": 0.3244, "step": 352 }, { "epoch": 0.22975823462599385, "grad_norm": 0.3968394994735718, "learning_rate": 9.980634074113538e-06, "loss": 0.3182, "step": 354 }, { "epoch": 0.2310563037481746, "grad_norm": 0.32208913564682007, "learning_rate": 9.97999959130601e-06, "loss": 0.3202, "step": 356 }, { "epoch": 0.23235437287035535, "grad_norm": 0.3505564033985138, "learning_rate": 9.979354902445745e-06, "loss": 0.3164, "step": 358 }, { "epoch": 0.2336524419925361, "grad_norm": 0.3456391394138336, "learning_rate": 9.97870000885398e-06, "loss": 0.3125, "step": 360 }, { "epoch": 0.23495051111471685, "grad_norm": 0.2613707184791565, "learning_rate": 9.978034911872853e-06, "loss": 0.3136, "step": 362 }, { "epoch": 0.23624858023689763, "grad_norm": 0.4940257966518402, "learning_rate": 9.977359612865424e-06, "loss": 0.2883, "step": 364 }, { "epoch": 0.23754664935907838, "grad_norm": 0.3928545117378235, "learning_rate": 9.976674113215655e-06, "loss": 0.3132, "step": 366 }, { "epoch": 0.23884471848125913, "grad_norm": 0.2908713221549988, "learning_rate": 9.975978414328413e-06, "loss": 0.3239, "step": 368 }, { "epoch": 0.24014278760343988, "grad_norm": 0.4576404094696045, "learning_rate": 9.975272517629474e-06, "loss": 0.3089, "step": 370 }, { "epoch": 0.24144085672562063, "grad_norm": 0.3171669542789459, "learning_rate": 9.974556424565503e-06, "loss": 0.3187, "step": 372 }, { "epoch": 0.2427389258478014, "grad_norm": 0.3621912896633148, "learning_rate": 9.973830136604068e-06, "loss": 0.315, "step": 374 }, { "epoch": 0.24403699496998216, "grad_norm": 0.44670671224594116, "learning_rate": 9.973093655233633e-06, "loss": 0.3198, "step": 376 }, { "epoch": 0.2453350640921629, "grad_norm": 0.36450040340423584, "learning_rate": 9.972346981963546e-06, "loss": 0.3009, "step": 378 }, { "epoch": 0.24663313321434366, "grad_norm": 0.4079269468784332, "learning_rate": 9.971590118324047e-06, "loss": 0.312, "step": 380 }, { "epoch": 0.2479312023365244, "grad_norm": 0.3717515468597412, "learning_rate": 9.970823065866259e-06, "loss": 0.3324, "step": 382 }, { "epoch": 0.2492292714587052, "grad_norm": 0.4971848726272583, "learning_rate": 9.970045826162182e-06, "loss": 0.2901, "step": 384 }, { "epoch": 0.25052734058088594, "grad_norm": 0.39832013845443726, "learning_rate": 9.969258400804703e-06, "loss": 0.3033, "step": 386 }, { "epoch": 0.2518254097030667, "grad_norm": 0.33511894941329956, "learning_rate": 9.968460791407575e-06, "loss": 0.3168, "step": 388 }, { "epoch": 0.25312347882524744, "grad_norm": 0.5621903538703918, "learning_rate": 9.967652999605424e-06, "loss": 0.3164, "step": 390 }, { "epoch": 0.2544215479474282, "grad_norm": 0.532831072807312, "learning_rate": 9.96683502705375e-06, "loss": 0.3086, "step": 392 }, { "epoch": 0.25571961706960894, "grad_norm": 0.4372377395629883, "learning_rate": 9.966006875428909e-06, "loss": 0.3267, "step": 394 }, { "epoch": 0.2570176861917897, "grad_norm": 0.536139190196991, "learning_rate": 9.965168546428122e-06, "loss": 0.3318, "step": 396 }, { "epoch": 0.2583157553139705, "grad_norm": 0.37483111023902893, "learning_rate": 9.964320041769467e-06, "loss": 0.3129, "step": 398 }, { "epoch": 0.2596138244361512, "grad_norm": 0.2753461003303528, "learning_rate": 9.96346136319188e-06, "loss": 0.3164, "step": 400 }, { "epoch": 0.260911893558332, "grad_norm": 0.3785316050052643, "learning_rate": 9.96259251245514e-06, "loss": 0.2932, "step": 402 }, { "epoch": 0.2622099626805127, "grad_norm": 0.42932626605033875, "learning_rate": 9.96171349133988e-06, "loss": 0.3472, "step": 404 }, { "epoch": 0.2635080318026935, "grad_norm": 0.44714730978012085, "learning_rate": 9.960824301647569e-06, "loss": 0.2953, "step": 406 }, { "epoch": 0.2648061009248743, "grad_norm": 0.43149423599243164, "learning_rate": 9.959924945200525e-06, "loss": 0.2881, "step": 408 }, { "epoch": 0.266104170047055, "grad_norm": 0.3228270411491394, "learning_rate": 9.959015423841895e-06, "loss": 0.3182, "step": 410 }, { "epoch": 0.2674022391692358, "grad_norm": 0.4355909526348114, "learning_rate": 9.958095739435658e-06, "loss": 0.3049, "step": 412 }, { "epoch": 0.2687003082914165, "grad_norm": 0.33498936891555786, "learning_rate": 9.957165893866623e-06, "loss": 0.3035, "step": 414 }, { "epoch": 0.2699983774135973, "grad_norm": 0.293516606092453, "learning_rate": 9.956225889040425e-06, "loss": 0.326, "step": 416 }, { "epoch": 0.27129644653577806, "grad_norm": 0.5864387154579163, "learning_rate": 9.955275726883517e-06, "loss": 0.3342, "step": 418 }, { "epoch": 0.2725945156579588, "grad_norm": 0.5899648666381836, "learning_rate": 9.95431540934317e-06, "loss": 0.3279, "step": 420 }, { "epoch": 0.27389258478013956, "grad_norm": 0.5094327926635742, "learning_rate": 9.953344938387464e-06, "loss": 0.2923, "step": 422 }, { "epoch": 0.2751906539023203, "grad_norm": 0.3359895348548889, "learning_rate": 9.952364316005293e-06, "loss": 0.3105, "step": 424 }, { "epoch": 0.27648872302450106, "grad_norm": 0.33057889342308044, "learning_rate": 9.951373544206352e-06, "loss": 0.3125, "step": 426 }, { "epoch": 0.27778679214668184, "grad_norm": 0.30746719241142273, "learning_rate": 9.950372625021137e-06, "loss": 0.3078, "step": 428 }, { "epoch": 0.27908486126886256, "grad_norm": 0.39584919810295105, "learning_rate": 9.949361560500939e-06, "loss": 0.3285, "step": 430 }, { "epoch": 0.28038293039104334, "grad_norm": 0.3933069109916687, "learning_rate": 9.948340352717845e-06, "loss": 0.3468, "step": 432 }, { "epoch": 0.28168099951322406, "grad_norm": 0.33667418360710144, "learning_rate": 9.947309003764723e-06, "loss": 0.3087, "step": 434 }, { "epoch": 0.28297906863540484, "grad_norm": 0.3505193889141083, "learning_rate": 9.94626751575523e-06, "loss": 0.3081, "step": 436 }, { "epoch": 0.2842771377575856, "grad_norm": 0.355273962020874, "learning_rate": 9.9452158908238e-06, "loss": 0.3154, "step": 438 }, { "epoch": 0.28557520687976634, "grad_norm": 0.339298814535141, "learning_rate": 9.944154131125643e-06, "loss": 0.3011, "step": 440 }, { "epoch": 0.2868732760019471, "grad_norm": 0.4688047170639038, "learning_rate": 9.943082238836737e-06, "loss": 0.3002, "step": 442 }, { "epoch": 0.28817134512412784, "grad_norm": 0.4089771807193756, "learning_rate": 9.942000216153829e-06, "loss": 0.321, "step": 444 }, { "epoch": 0.2894694142463086, "grad_norm": 0.3933537006378174, "learning_rate": 9.940908065294421e-06, "loss": 0.3004, "step": 446 }, { "epoch": 0.2907674833684894, "grad_norm": 0.3342922031879425, "learning_rate": 9.939805788496778e-06, "loss": 0.2861, "step": 448 }, { "epoch": 0.2920655524906701, "grad_norm": 0.44521984457969666, "learning_rate": 9.93869338801992e-06, "loss": 0.3014, "step": 450 }, { "epoch": 0.2933636216128509, "grad_norm": 0.3567230701446533, "learning_rate": 9.937570866143604e-06, "loss": 0.3415, "step": 452 }, { "epoch": 0.2946616907350316, "grad_norm": 0.5693230628967285, "learning_rate": 9.936438225168336e-06, "loss": 0.3188, "step": 454 }, { "epoch": 0.2959597598572124, "grad_norm": 0.426076203584671, "learning_rate": 9.935295467415363e-06, "loss": 0.3286, "step": 456 }, { "epoch": 0.2972578289793932, "grad_norm": 0.37038612365722656, "learning_rate": 9.93414259522666e-06, "loss": 0.3127, "step": 458 }, { "epoch": 0.2985558981015739, "grad_norm": 0.5200949311256409, "learning_rate": 9.932979610964933e-06, "loss": 0.3234, "step": 460 }, { "epoch": 0.2998539672237547, "grad_norm": 0.424402117729187, "learning_rate": 9.931806517013612e-06, "loss": 0.3118, "step": 462 }, { "epoch": 0.3011520363459354, "grad_norm": 0.5034117698669434, "learning_rate": 9.930623315776848e-06, "loss": 0.3102, "step": 464 }, { "epoch": 0.3024501054681162, "grad_norm": 0.355267733335495, "learning_rate": 9.9294300096795e-06, "loss": 0.31, "step": 466 }, { "epoch": 0.30374817459029696, "grad_norm": 0.7377546429634094, "learning_rate": 9.928226601167139e-06, "loss": 0.3273, "step": 468 }, { "epoch": 0.3050462437124777, "grad_norm": 0.3086271286010742, "learning_rate": 9.927013092706044e-06, "loss": 0.3091, "step": 470 }, { "epoch": 0.30634431283465846, "grad_norm": 0.28919801115989685, "learning_rate": 9.925789486783186e-06, "loss": 0.3058, "step": 472 }, { "epoch": 0.3076423819568392, "grad_norm": 0.37627580761909485, "learning_rate": 9.924555785906235e-06, "loss": 0.3282, "step": 474 }, { "epoch": 0.30894045107901996, "grad_norm": 0.3910629153251648, "learning_rate": 9.92331199260355e-06, "loss": 0.3189, "step": 476 }, { "epoch": 0.31023852020120074, "grad_norm": 0.4212571382522583, "learning_rate": 9.922058109424168e-06, "loss": 0.2825, "step": 478 }, { "epoch": 0.31153658932338146, "grad_norm": 0.45060059428215027, "learning_rate": 9.920794138937807e-06, "loss": 0.2931, "step": 480 }, { "epoch": 0.31283465844556224, "grad_norm": 0.4592582583427429, "learning_rate": 9.919520083734862e-06, "loss": 0.2858, "step": 482 }, { "epoch": 0.31413272756774296, "grad_norm": 0.5878152847290039, "learning_rate": 9.918235946426389e-06, "loss": 0.3694, "step": 484 }, { "epoch": 0.31543079668992374, "grad_norm": 0.4250052869319916, "learning_rate": 9.916941729644112e-06, "loss": 0.2929, "step": 486 }, { "epoch": 0.3167288658121045, "grad_norm": 0.3931960463523865, "learning_rate": 9.91563743604041e-06, "loss": 0.3196, "step": 488 }, { "epoch": 0.31802693493428524, "grad_norm": 0.3579998016357422, "learning_rate": 9.914323068288312e-06, "loss": 0.2993, "step": 490 }, { "epoch": 0.319325004056466, "grad_norm": 0.4256168305873871, "learning_rate": 9.912998629081495e-06, "loss": 0.292, "step": 492 }, { "epoch": 0.32062307317864674, "grad_norm": 0.7532792091369629, "learning_rate": 9.911664121134272e-06, "loss": 0.2954, "step": 494 }, { "epoch": 0.3219211423008275, "grad_norm": 0.4556688964366913, "learning_rate": 9.910319547181601e-06, "loss": 0.3198, "step": 496 }, { "epoch": 0.3232192114230083, "grad_norm": 0.3796408772468567, "learning_rate": 9.90896490997906e-06, "loss": 0.3136, "step": 498 }, { "epoch": 0.324517280545189, "grad_norm": 0.45700427889823914, "learning_rate": 9.907600212302852e-06, "loss": 0.3099, "step": 500 }, { "epoch": 0.324517280545189, "eval_loss": 0.30662256479263306, "eval_runtime": 397.2663, "eval_samples_per_second": 26.129, "eval_steps_per_second": 3.267, "step": 500 }, { "epoch": 0.3258153496673698, "grad_norm": 0.5378230810165405, "learning_rate": 9.906225456949803e-06, "loss": 0.3103, "step": 502 }, { "epoch": 0.3271134187895505, "grad_norm": 0.4217565655708313, "learning_rate": 9.904840646737346e-06, "loss": 0.2855, "step": 504 }, { "epoch": 0.3284114879117313, "grad_norm": 0.3789873719215393, "learning_rate": 9.903445784503525e-06, "loss": 0.2838, "step": 506 }, { "epoch": 0.3297095570339121, "grad_norm": 0.41591763496398926, "learning_rate": 9.90204087310698e-06, "loss": 0.3086, "step": 508 }, { "epoch": 0.3310076261560928, "grad_norm": 0.4686451256275177, "learning_rate": 9.900625915426948e-06, "loss": 0.3103, "step": 510 }, { "epoch": 0.3323056952782736, "grad_norm": 0.588723361492157, "learning_rate": 9.899200914363256e-06, "loss": 0.3105, "step": 512 }, { "epoch": 0.3336037644004543, "grad_norm": 0.452433317899704, "learning_rate": 9.897765872836313e-06, "loss": 0.3191, "step": 514 }, { "epoch": 0.3349018335226351, "grad_norm": 0.5950521230697632, "learning_rate": 9.896320793787106e-06, "loss": 0.292, "step": 516 }, { "epoch": 0.33619990264481586, "grad_norm": 0.3550559878349304, "learning_rate": 9.89486568017719e-06, "loss": 0.3349, "step": 518 }, { "epoch": 0.3374979717669966, "grad_norm": 0.46887919306755066, "learning_rate": 9.89340053498869e-06, "loss": 0.2756, "step": 520 }, { "epoch": 0.33879604088917736, "grad_norm": 0.6027584671974182, "learning_rate": 9.891925361224284e-06, "loss": 0.3139, "step": 522 }, { "epoch": 0.3400941100113581, "grad_norm": 0.6142759323120117, "learning_rate": 9.890440161907206e-06, "loss": 0.3208, "step": 524 }, { "epoch": 0.34139217913353886, "grad_norm": 0.40327784419059753, "learning_rate": 9.888944940081236e-06, "loss": 0.2802, "step": 526 }, { "epoch": 0.34269024825571964, "grad_norm": 0.4556993842124939, "learning_rate": 9.887439698810694e-06, "loss": 0.3023, "step": 528 }, { "epoch": 0.34398831737790037, "grad_norm": 0.46487879753112793, "learning_rate": 9.885924441180435e-06, "loss": 0.3053, "step": 530 }, { "epoch": 0.34528638650008114, "grad_norm": 0.4788736402988434, "learning_rate": 9.884399170295839e-06, "loss": 0.2934, "step": 532 }, { "epoch": 0.34658445562226187, "grad_norm": 0.3936035633087158, "learning_rate": 9.88286388928281e-06, "loss": 0.2934, "step": 534 }, { "epoch": 0.34788252474444264, "grad_norm": 0.6599826812744141, "learning_rate": 9.881318601287767e-06, "loss": 0.3241, "step": 536 }, { "epoch": 0.3491805938666234, "grad_norm": 0.4173608124256134, "learning_rate": 9.879763309477633e-06, "loss": 0.2853, "step": 538 }, { "epoch": 0.35047866298880415, "grad_norm": 0.4245106279850006, "learning_rate": 9.878198017039839e-06, "loss": 0.3162, "step": 540 }, { "epoch": 0.3517767321109849, "grad_norm": 0.428534597158432, "learning_rate": 9.876622727182306e-06, "loss": 0.3084, "step": 542 }, { "epoch": 0.35307480123316565, "grad_norm": 0.38509663939476013, "learning_rate": 9.875037443133449e-06, "loss": 0.3254, "step": 544 }, { "epoch": 0.3543728703553464, "grad_norm": 0.3531460464000702, "learning_rate": 9.873442168142158e-06, "loss": 0.2758, "step": 546 }, { "epoch": 0.3556709394775272, "grad_norm": 0.4225866198539734, "learning_rate": 9.871836905477807e-06, "loss": 0.3104, "step": 548 }, { "epoch": 0.3569690085997079, "grad_norm": 0.45116230845451355, "learning_rate": 9.870221658430233e-06, "loss": 0.3151, "step": 550 }, { "epoch": 0.3582670777218887, "grad_norm": 0.5050747394561768, "learning_rate": 9.868596430309739e-06, "loss": 0.3014, "step": 552 }, { "epoch": 0.3595651468440694, "grad_norm": 0.4084848165512085, "learning_rate": 9.866961224447076e-06, "loss": 0.2963, "step": 554 }, { "epoch": 0.3608632159662502, "grad_norm": 0.6075277924537659, "learning_rate": 9.865316044193453e-06, "loss": 0.2868, "step": 556 }, { "epoch": 0.362161285088431, "grad_norm": 0.3911098539829254, "learning_rate": 9.863660892920514e-06, "loss": 0.3037, "step": 558 }, { "epoch": 0.3634593542106117, "grad_norm": 0.626965343952179, "learning_rate": 9.861995774020341e-06, "loss": 0.3069, "step": 560 }, { "epoch": 0.3647574233327925, "grad_norm": 0.49004608392715454, "learning_rate": 9.860320690905443e-06, "loss": 0.3196, "step": 562 }, { "epoch": 0.3660554924549732, "grad_norm": 0.4215310513973236, "learning_rate": 9.858635647008747e-06, "loss": 0.2655, "step": 564 }, { "epoch": 0.367353561577154, "grad_norm": 0.5000298023223877, "learning_rate": 9.856940645783599e-06, "loss": 0.3214, "step": 566 }, { "epoch": 0.36865163069933476, "grad_norm": 0.45774155855178833, "learning_rate": 9.855235690703748e-06, "loss": 0.2782, "step": 568 }, { "epoch": 0.3699496998215155, "grad_norm": 0.7206143140792847, "learning_rate": 9.85352078526334e-06, "loss": 0.3212, "step": 570 }, { "epoch": 0.37124776894369627, "grad_norm": 0.6110740900039673, "learning_rate": 9.851795932976919e-06, "loss": 0.3667, "step": 572 }, { "epoch": 0.372545838065877, "grad_norm": 0.47487127780914307, "learning_rate": 9.850061137379414e-06, "loss": 0.2955, "step": 574 }, { "epoch": 0.37384390718805777, "grad_norm": 0.42945396900177, "learning_rate": 9.848316402026125e-06, "loss": 0.2888, "step": 576 }, { "epoch": 0.37514197631023855, "grad_norm": 0.3821617662906647, "learning_rate": 9.84656173049273e-06, "loss": 0.2964, "step": 578 }, { "epoch": 0.37644004543241927, "grad_norm": 0.3967457413673401, "learning_rate": 9.844797126375265e-06, "loss": 0.2879, "step": 580 }, { "epoch": 0.37773811455460005, "grad_norm": 0.37126317620277405, "learning_rate": 9.843022593290129e-06, "loss": 0.3218, "step": 582 }, { "epoch": 0.37903618367678077, "grad_norm": 0.44353944063186646, "learning_rate": 9.84123813487406e-06, "loss": 0.2984, "step": 584 }, { "epoch": 0.38033425279896155, "grad_norm": 0.457200825214386, "learning_rate": 9.839443754784146e-06, "loss": 0.3038, "step": 586 }, { "epoch": 0.3816323219211423, "grad_norm": 0.6422001719474792, "learning_rate": 9.837639456697802e-06, "loss": 0.2895, "step": 588 }, { "epoch": 0.38293039104332305, "grad_norm": 0.6200585961341858, "learning_rate": 9.835825244312772e-06, "loss": 0.3046, "step": 590 }, { "epoch": 0.3842284601655038, "grad_norm": 0.4290887117385864, "learning_rate": 9.83400112134712e-06, "loss": 0.2772, "step": 592 }, { "epoch": 0.38552652928768455, "grad_norm": 0.4128061830997467, "learning_rate": 9.832167091539215e-06, "loss": 0.2784, "step": 594 }, { "epoch": 0.38682459840986533, "grad_norm": 0.39861446619033813, "learning_rate": 9.830323158647734e-06, "loss": 0.3072, "step": 596 }, { "epoch": 0.3881226675320461, "grad_norm": 0.46855080127716064, "learning_rate": 9.82846932645165e-06, "loss": 0.319, "step": 598 }, { "epoch": 0.38942073665422683, "grad_norm": 0.5035063624382019, "learning_rate": 9.826605598750223e-06, "loss": 0.2913, "step": 600 }, { "epoch": 0.3907188057764076, "grad_norm": 0.506700873374939, "learning_rate": 9.824731979362991e-06, "loss": 0.3144, "step": 602 }, { "epoch": 0.39201687489858833, "grad_norm": 0.5185604691505432, "learning_rate": 9.822848472129764e-06, "loss": 0.2827, "step": 604 }, { "epoch": 0.3933149440207691, "grad_norm": 0.5520429015159607, "learning_rate": 9.82095508091062e-06, "loss": 0.3019, "step": 606 }, { "epoch": 0.3946130131429499, "grad_norm": 0.6037489175796509, "learning_rate": 9.819051809585888e-06, "loss": 0.336, "step": 608 }, { "epoch": 0.3959110822651306, "grad_norm": 0.37559938430786133, "learning_rate": 9.81713866205615e-06, "loss": 0.2809, "step": 610 }, { "epoch": 0.3972091513873114, "grad_norm": 0.3614325225353241, "learning_rate": 9.815215642242224e-06, "loss": 0.2935, "step": 612 }, { "epoch": 0.3985072205094921, "grad_norm": 0.4787777066230774, "learning_rate": 9.813282754085168e-06, "loss": 0.295, "step": 614 }, { "epoch": 0.3998052896316729, "grad_norm": 0.41655001044273376, "learning_rate": 9.811340001546252e-06, "loss": 0.253, "step": 616 }, { "epoch": 0.40110335875385367, "grad_norm": 0.8925591707229614, "learning_rate": 9.809387388606977e-06, "loss": 0.3309, "step": 618 }, { "epoch": 0.4024014278760344, "grad_norm": 0.669928789138794, "learning_rate": 9.807424919269038e-06, "loss": 0.2886, "step": 620 }, { "epoch": 0.40369949699821517, "grad_norm": 0.5598891377449036, "learning_rate": 9.80545259755434e-06, "loss": 0.2967, "step": 622 }, { "epoch": 0.4049975661203959, "grad_norm": 0.5220828056335449, "learning_rate": 9.803470427504972e-06, "loss": 0.3003, "step": 624 }, { "epoch": 0.40629563524257667, "grad_norm": 0.40260300040245056, "learning_rate": 9.801478413183211e-06, "loss": 0.297, "step": 626 }, { "epoch": 0.40759370436475745, "grad_norm": 0.46311309933662415, "learning_rate": 9.799476558671513e-06, "loss": 0.3185, "step": 628 }, { "epoch": 0.40889177348693817, "grad_norm": 0.44444623589515686, "learning_rate": 9.797464868072489e-06, "loss": 0.2928, "step": 630 }, { "epoch": 0.41018984260911895, "grad_norm": 0.43029001355171204, "learning_rate": 9.795443345508915e-06, "loss": 0.2967, "step": 632 }, { "epoch": 0.41148791173129967, "grad_norm": 0.525133490562439, "learning_rate": 9.793411995123719e-06, "loss": 0.2945, "step": 634 }, { "epoch": 0.41278598085348045, "grad_norm": 0.3737124800682068, "learning_rate": 9.791370821079967e-06, "loss": 0.28, "step": 636 }, { "epoch": 0.41408404997566123, "grad_norm": 0.5362243056297302, "learning_rate": 9.789319827560854e-06, "loss": 0.3092, "step": 638 }, { "epoch": 0.41538211909784195, "grad_norm": 0.4771840572357178, "learning_rate": 9.78725901876971e-06, "loss": 0.2874, "step": 640 }, { "epoch": 0.41668018822002273, "grad_norm": 0.4128475785255432, "learning_rate": 9.78518839892997e-06, "loss": 0.2807, "step": 642 }, { "epoch": 0.41797825734220345, "grad_norm": 0.6022436618804932, "learning_rate": 9.783107972285177e-06, "loss": 0.2894, "step": 644 }, { "epoch": 0.41927632646438423, "grad_norm": 0.47519171237945557, "learning_rate": 9.78101774309898e-06, "loss": 0.2733, "step": 646 }, { "epoch": 0.420574395586565, "grad_norm": 0.7285898327827454, "learning_rate": 9.77891771565511e-06, "loss": 0.2964, "step": 648 }, { "epoch": 0.42187246470874573, "grad_norm": 0.5154335498809814, "learning_rate": 9.77680789425738e-06, "loss": 0.2737, "step": 650 }, { "epoch": 0.4231705338309265, "grad_norm": 0.7030506730079651, "learning_rate": 9.774688283229674e-06, "loss": 0.2888, "step": 652 }, { "epoch": 0.42446860295310723, "grad_norm": 0.43323034048080444, "learning_rate": 9.772558886915946e-06, "loss": 0.3104, "step": 654 }, { "epoch": 0.425766672075288, "grad_norm": 0.5499376654624939, "learning_rate": 9.770419709680193e-06, "loss": 0.3202, "step": 656 }, { "epoch": 0.4270647411974688, "grad_norm": 0.46289414167404175, "learning_rate": 9.768270755906467e-06, "loss": 0.2873, "step": 658 }, { "epoch": 0.4283628103196495, "grad_norm": 0.44146114587783813, "learning_rate": 9.766112029998847e-06, "loss": 0.3032, "step": 660 }, { "epoch": 0.4296608794418303, "grad_norm": 0.4063436686992645, "learning_rate": 9.763943536381448e-06, "loss": 0.2923, "step": 662 }, { "epoch": 0.430958948564011, "grad_norm": 0.40460067987442017, "learning_rate": 9.761765279498397e-06, "loss": 0.267, "step": 664 }, { "epoch": 0.4322570176861918, "grad_norm": 0.5318605899810791, "learning_rate": 9.759577263813833e-06, "loss": 0.3031, "step": 666 }, { "epoch": 0.43355508680837257, "grad_norm": 0.44349297881126404, "learning_rate": 9.757379493811892e-06, "loss": 0.3057, "step": 668 }, { "epoch": 0.4348531559305533, "grad_norm": 0.40534383058547974, "learning_rate": 9.755171973996705e-06, "loss": 0.2954, "step": 670 }, { "epoch": 0.43615122505273407, "grad_norm": 0.4766862690448761, "learning_rate": 9.752954708892379e-06, "loss": 0.3148, "step": 672 }, { "epoch": 0.4374492941749148, "grad_norm": 0.43945378065109253, "learning_rate": 9.750727703042994e-06, "loss": 0.3052, "step": 674 }, { "epoch": 0.43874736329709557, "grad_norm": 0.5216367840766907, "learning_rate": 9.7484909610126e-06, "loss": 0.2916, "step": 676 }, { "epoch": 0.44004543241927635, "grad_norm": 0.4955980181694031, "learning_rate": 9.74624448738519e-06, "loss": 0.2736, "step": 678 }, { "epoch": 0.4413435015414571, "grad_norm": 0.536312997341156, "learning_rate": 9.743988286764706e-06, "loss": 0.3125, "step": 680 }, { "epoch": 0.44264157066363785, "grad_norm": 0.6743183135986328, "learning_rate": 9.741722363775029e-06, "loss": 0.3188, "step": 682 }, { "epoch": 0.4439396397858186, "grad_norm": 0.42892593145370483, "learning_rate": 9.739446723059953e-06, "loss": 0.3098, "step": 684 }, { "epoch": 0.44523770890799935, "grad_norm": 0.6258504986763, "learning_rate": 9.737161369283201e-06, "loss": 0.3084, "step": 686 }, { "epoch": 0.44653577803018013, "grad_norm": 0.5092728734016418, "learning_rate": 9.734866307128395e-06, "loss": 0.2788, "step": 688 }, { "epoch": 0.44783384715236085, "grad_norm": 0.5033906698226929, "learning_rate": 9.732561541299053e-06, "loss": 0.2764, "step": 690 }, { "epoch": 0.44913191627454163, "grad_norm": 0.5485709309577942, "learning_rate": 9.730247076518584e-06, "loss": 0.3055, "step": 692 }, { "epoch": 0.45042998539672235, "grad_norm": 0.5740441083908081, "learning_rate": 9.727922917530267e-06, "loss": 0.3244, "step": 694 }, { "epoch": 0.45172805451890313, "grad_norm": 0.37472525238990784, "learning_rate": 9.725589069097257e-06, "loss": 0.2959, "step": 696 }, { "epoch": 0.4530261236410839, "grad_norm": 0.5061174631118774, "learning_rate": 9.723245536002561e-06, "loss": 0.3305, "step": 698 }, { "epoch": 0.45432419276326463, "grad_norm": 0.396668404340744, "learning_rate": 9.720892323049034e-06, "loss": 0.2844, "step": 700 }, { "epoch": 0.4556222618854454, "grad_norm": 0.5846567749977112, "learning_rate": 9.718529435059372e-06, "loss": 0.3026, "step": 702 }, { "epoch": 0.45692033100762613, "grad_norm": 0.7130705118179321, "learning_rate": 9.716156876876096e-06, "loss": 0.2986, "step": 704 }, { "epoch": 0.4582184001298069, "grad_norm": 0.46607914566993713, "learning_rate": 9.71377465336155e-06, "loss": 0.2794, "step": 706 }, { "epoch": 0.4595164692519877, "grad_norm": 0.5561748743057251, "learning_rate": 9.711382769397879e-06, "loss": 0.2665, "step": 708 }, { "epoch": 0.4608145383741684, "grad_norm": 0.4958229660987854, "learning_rate": 9.708981229887032e-06, "loss": 0.3019, "step": 710 }, { "epoch": 0.4621126074963492, "grad_norm": 0.40989038348197937, "learning_rate": 9.706570039750742e-06, "loss": 0.2849, "step": 712 }, { "epoch": 0.4634106766185299, "grad_norm": 0.4585845172405243, "learning_rate": 9.704149203930522e-06, "loss": 0.2948, "step": 714 }, { "epoch": 0.4647087457407107, "grad_norm": 0.4955282509326935, "learning_rate": 9.701718727387656e-06, "loss": 0.3011, "step": 716 }, { "epoch": 0.46600681486289147, "grad_norm": 0.5887092351913452, "learning_rate": 9.699278615103182e-06, "loss": 0.2922, "step": 718 }, { "epoch": 0.4673048839850722, "grad_norm": 0.5292546153068542, "learning_rate": 9.696828872077885e-06, "loss": 0.2895, "step": 720 }, { "epoch": 0.468602953107253, "grad_norm": 0.46292200684547424, "learning_rate": 9.694369503332292e-06, "loss": 0.2718, "step": 722 }, { "epoch": 0.4699010222294337, "grad_norm": 0.47631463408470154, "learning_rate": 9.691900513906649e-06, "loss": 0.2836, "step": 724 }, { "epoch": 0.4711990913516145, "grad_norm": 0.4282437562942505, "learning_rate": 9.689421908860928e-06, "loss": 0.3108, "step": 726 }, { "epoch": 0.47249716047379525, "grad_norm": 0.4927038252353668, "learning_rate": 9.686933693274801e-06, "loss": 0.2781, "step": 728 }, { "epoch": 0.473795229595976, "grad_norm": 0.5298264622688293, "learning_rate": 9.684435872247635e-06, "loss": 0.298, "step": 730 }, { "epoch": 0.47509329871815675, "grad_norm": 0.5173654556274414, "learning_rate": 9.681928450898492e-06, "loss": 0.3704, "step": 732 }, { "epoch": 0.4763913678403375, "grad_norm": 0.5215631723403931, "learning_rate": 9.679411434366094e-06, "loss": 0.2782, "step": 734 }, { "epoch": 0.47768943696251825, "grad_norm": 0.47444891929626465, "learning_rate": 9.676884827808843e-06, "loss": 0.2813, "step": 736 }, { "epoch": 0.47898750608469903, "grad_norm": 0.760757565498352, "learning_rate": 9.674348636404784e-06, "loss": 0.3025, "step": 738 }, { "epoch": 0.48028557520687976, "grad_norm": 0.45039957761764526, "learning_rate": 9.67180286535161e-06, "loss": 0.2833, "step": 740 }, { "epoch": 0.48158364432906053, "grad_norm": 0.446219801902771, "learning_rate": 9.669247519866645e-06, "loss": 0.3004, "step": 742 }, { "epoch": 0.48288171345124126, "grad_norm": 0.5375770330429077, "learning_rate": 9.666682605186834e-06, "loss": 0.314, "step": 744 }, { "epoch": 0.48417978257342204, "grad_norm": 0.623903214931488, "learning_rate": 9.664108126568736e-06, "loss": 0.2834, "step": 746 }, { "epoch": 0.4854778516956028, "grad_norm": 0.4831778109073639, "learning_rate": 9.66152408928851e-06, "loss": 0.3102, "step": 748 }, { "epoch": 0.48677592081778354, "grad_norm": 0.49016305804252625, "learning_rate": 9.658930498641901e-06, "loss": 0.2988, "step": 750 }, { "epoch": 0.4880739899399643, "grad_norm": 0.4612979590892792, "learning_rate": 9.656327359944237e-06, "loss": 0.292, "step": 752 }, { "epoch": 0.48937205906214504, "grad_norm": 0.3829922378063202, "learning_rate": 9.653714678530413e-06, "loss": 0.3067, "step": 754 }, { "epoch": 0.4906701281843258, "grad_norm": 0.50174480676651, "learning_rate": 9.651092459754879e-06, "loss": 0.2839, "step": 756 }, { "epoch": 0.4919681973065066, "grad_norm": 0.6914039850234985, "learning_rate": 9.64846070899163e-06, "loss": 0.3222, "step": 758 }, { "epoch": 0.4932662664286873, "grad_norm": 0.5280846953392029, "learning_rate": 9.6458194316342e-06, "loss": 0.3005, "step": 760 }, { "epoch": 0.4945643355508681, "grad_norm": 0.7020367980003357, "learning_rate": 9.643168633095647e-06, "loss": 0.2896, "step": 762 }, { "epoch": 0.4958624046730488, "grad_norm": 0.4698193073272705, "learning_rate": 9.640508318808536e-06, "loss": 0.295, "step": 764 }, { "epoch": 0.4971604737952296, "grad_norm": 0.457131028175354, "learning_rate": 9.637838494224941e-06, "loss": 0.2954, "step": 766 }, { "epoch": 0.4984585429174104, "grad_norm": 0.548276424407959, "learning_rate": 9.635159164816416e-06, "loss": 0.2918, "step": 768 }, { "epoch": 0.4997566120395911, "grad_norm": 0.5867443084716797, "learning_rate": 9.632470336074009e-06, "loss": 0.3028, "step": 770 }, { "epoch": 0.5010546811617719, "grad_norm": 0.5271953344345093, "learning_rate": 9.629772013508225e-06, "loss": 0.2828, "step": 772 }, { "epoch": 0.5023527502839527, "grad_norm": 0.5174055695533752, "learning_rate": 9.627064202649027e-06, "loss": 0.2802, "step": 774 }, { "epoch": 0.5036508194061334, "grad_norm": 0.46736016869544983, "learning_rate": 9.624346909045828e-06, "loss": 0.282, "step": 776 }, { "epoch": 0.5049488885283141, "grad_norm": 0.4548105299472809, "learning_rate": 9.62162013826747e-06, "loss": 0.2844, "step": 778 }, { "epoch": 0.5062469576504949, "grad_norm": 0.6403613090515137, "learning_rate": 9.61888389590222e-06, "loss": 0.2863, "step": 780 }, { "epoch": 0.5075450267726757, "grad_norm": 0.5032674670219421, "learning_rate": 9.616138187557758e-06, "loss": 0.292, "step": 782 }, { "epoch": 0.5088430958948564, "grad_norm": 0.5554517507553101, "learning_rate": 9.613383018861159e-06, "loss": 0.3134, "step": 784 }, { "epoch": 0.5101411650170372, "grad_norm": 0.42384248971939087, "learning_rate": 9.610618395458892e-06, "loss": 0.2793, "step": 786 }, { "epoch": 0.5114392341392179, "grad_norm": 0.6139311790466309, "learning_rate": 9.607844323016795e-06, "loss": 0.2942, "step": 788 }, { "epoch": 0.5127373032613987, "grad_norm": 0.6126854419708252, "learning_rate": 9.605060807220079e-06, "loss": 0.2851, "step": 790 }, { "epoch": 0.5140353723835794, "grad_norm": 0.5307278633117676, "learning_rate": 9.602267853773301e-06, "loss": 0.3155, "step": 792 }, { "epoch": 0.5153334415057602, "grad_norm": 0.5110380053520203, "learning_rate": 9.599465468400368e-06, "loss": 0.3176, "step": 794 }, { "epoch": 0.516631510627941, "grad_norm": 0.4387734532356262, "learning_rate": 9.596653656844507e-06, "loss": 0.2814, "step": 796 }, { "epoch": 0.5179295797501217, "grad_norm": 0.48630279302597046, "learning_rate": 9.593832424868271e-06, "loss": 0.2664, "step": 798 }, { "epoch": 0.5192276488723024, "grad_norm": 0.43809932470321655, "learning_rate": 9.591001778253514e-06, "loss": 0.2794, "step": 800 }, { "epoch": 0.5205257179944832, "grad_norm": 0.6637348532676697, "learning_rate": 9.58816172280139e-06, "loss": 0.3109, "step": 802 }, { "epoch": 0.521823787116664, "grad_norm": 0.5344498157501221, "learning_rate": 9.585312264332329e-06, "loss": 0.2855, "step": 804 }, { "epoch": 0.5231218562388448, "grad_norm": 0.6171989440917969, "learning_rate": 9.582453408686038e-06, "loss": 0.2664, "step": 806 }, { "epoch": 0.5244199253610254, "grad_norm": 0.464542955160141, "learning_rate": 9.579585161721478e-06, "loss": 0.2782, "step": 808 }, { "epoch": 0.5257179944832062, "grad_norm": 0.47406068444252014, "learning_rate": 9.576707529316857e-06, "loss": 0.2736, "step": 810 }, { "epoch": 0.527016063605387, "grad_norm": 0.5167007446289062, "learning_rate": 9.573820517369623e-06, "loss": 0.3046, "step": 812 }, { "epoch": 0.5283141327275678, "grad_norm": 0.4275253117084503, "learning_rate": 9.570924131796437e-06, "loss": 0.2862, "step": 814 }, { "epoch": 0.5296122018497486, "grad_norm": 0.5892252922058105, "learning_rate": 9.568018378533181e-06, "loss": 0.2912, "step": 816 }, { "epoch": 0.5309102709719292, "grad_norm": 0.7287339568138123, "learning_rate": 9.565103263534926e-06, "loss": 0.2937, "step": 818 }, { "epoch": 0.53220834009411, "grad_norm": 0.6369681358337402, "learning_rate": 9.562178792775936e-06, "loss": 0.3049, "step": 820 }, { "epoch": 0.5335064092162908, "grad_norm": 0.5859299302101135, "learning_rate": 9.559244972249643e-06, "loss": 0.2912, "step": 822 }, { "epoch": 0.5348044783384716, "grad_norm": 0.5471466183662415, "learning_rate": 9.556301807968645e-06, "loss": 0.2977, "step": 824 }, { "epoch": 0.5361025474606523, "grad_norm": 0.42406165599823, "learning_rate": 9.553349305964687e-06, "loss": 0.275, "step": 826 }, { "epoch": 0.537400616582833, "grad_norm": 0.6027302742004395, "learning_rate": 9.550387472288651e-06, "loss": 0.2727, "step": 828 }, { "epoch": 0.5386986857050138, "grad_norm": 0.5783841609954834, "learning_rate": 9.547416313010544e-06, "loss": 0.285, "step": 830 }, { "epoch": 0.5399967548271946, "grad_norm": 0.5259782671928406, "learning_rate": 9.544435834219486e-06, "loss": 0.2783, "step": 832 }, { "epoch": 0.5412948239493753, "grad_norm": 0.5866817235946655, "learning_rate": 9.541446042023692e-06, "loss": 0.2919, "step": 834 }, { "epoch": 0.5425928930715561, "grad_norm": 0.4397294521331787, "learning_rate": 9.538446942550468e-06, "loss": 0.2875, "step": 836 }, { "epoch": 0.5438909621937368, "grad_norm": 0.7915981411933899, "learning_rate": 9.535438541946195e-06, "loss": 0.3513, "step": 838 }, { "epoch": 0.5451890313159176, "grad_norm": 0.5959506034851074, "learning_rate": 9.532420846376316e-06, "loss": 0.2872, "step": 840 }, { "epoch": 0.5464871004380983, "grad_norm": 0.5247907042503357, "learning_rate": 9.529393862025317e-06, "loss": 0.2764, "step": 842 }, { "epoch": 0.5477851695602791, "grad_norm": 0.48218366503715515, "learning_rate": 9.526357595096727e-06, "loss": 0.2812, "step": 844 }, { "epoch": 0.5490832386824599, "grad_norm": 0.5723532438278198, "learning_rate": 9.523312051813097e-06, "loss": 0.287, "step": 846 }, { "epoch": 0.5503813078046406, "grad_norm": 0.7367391586303711, "learning_rate": 9.52025723841599e-06, "loss": 0.3058, "step": 848 }, { "epoch": 0.5516793769268213, "grad_norm": 0.6731153130531311, "learning_rate": 9.517193161165964e-06, "loss": 0.2907, "step": 850 }, { "epoch": 0.5529774460490021, "grad_norm": 0.48499786853790283, "learning_rate": 9.514119826342564e-06, "loss": 0.2864, "step": 852 }, { "epoch": 0.5542755151711829, "grad_norm": 1.0737590789794922, "learning_rate": 9.51103724024431e-06, "loss": 0.3191, "step": 854 }, { "epoch": 0.5555735842933637, "grad_norm": 0.4783880114555359, "learning_rate": 9.50794540918868e-06, "loss": 0.3082, "step": 856 }, { "epoch": 0.5568716534155443, "grad_norm": 0.5207197070121765, "learning_rate": 9.504844339512096e-06, "loss": 0.2753, "step": 858 }, { "epoch": 0.5581697225377251, "grad_norm": 0.4406615197658539, "learning_rate": 9.501734037569918e-06, "loss": 0.2925, "step": 860 }, { "epoch": 0.5594677916599059, "grad_norm": 0.5738576650619507, "learning_rate": 9.498614509736426e-06, "loss": 0.2673, "step": 862 }, { "epoch": 0.5607658607820867, "grad_norm": 0.7184911966323853, "learning_rate": 9.495485762404801e-06, "loss": 0.2912, "step": 864 }, { "epoch": 0.5620639299042675, "grad_norm": 0.4641527235507965, "learning_rate": 9.49234780198713e-06, "loss": 0.3017, "step": 866 }, { "epoch": 0.5633619990264481, "grad_norm": 0.6077806353569031, "learning_rate": 9.489200634914373e-06, "loss": 0.2869, "step": 868 }, { "epoch": 0.5646600681486289, "grad_norm": 0.6200167536735535, "learning_rate": 9.486044267636359e-06, "loss": 0.2973, "step": 870 }, { "epoch": 0.5659581372708097, "grad_norm": 0.8165602087974548, "learning_rate": 9.482878706621775e-06, "loss": 0.3121, "step": 872 }, { "epoch": 0.5672562063929905, "grad_norm": 0.5459227561950684, "learning_rate": 9.479703958358149e-06, "loss": 0.2912, "step": 874 }, { "epoch": 0.5685542755151712, "grad_norm": 0.4871876537799835, "learning_rate": 9.476520029351834e-06, "loss": 0.2893, "step": 876 }, { "epoch": 0.5698523446373519, "grad_norm": 0.7457983493804932, "learning_rate": 9.473326926128002e-06, "loss": 0.3177, "step": 878 }, { "epoch": 0.5711504137595327, "grad_norm": 0.4602726399898529, "learning_rate": 9.470124655230627e-06, "loss": 0.2968, "step": 880 }, { "epoch": 0.5724484828817135, "grad_norm": 0.7152136564254761, "learning_rate": 9.466913223222467e-06, "loss": 0.3071, "step": 882 }, { "epoch": 0.5737465520038942, "grad_norm": 0.6501118540763855, "learning_rate": 9.46369263668506e-06, "loss": 0.2779, "step": 884 }, { "epoch": 0.575044621126075, "grad_norm": 0.4788232743740082, "learning_rate": 9.4604629022187e-06, "loss": 0.2997, "step": 886 }, { "epoch": 0.5763426902482557, "grad_norm": 0.7461189031600952, "learning_rate": 9.457224026442435e-06, "loss": 0.3217, "step": 888 }, { "epoch": 0.5776407593704365, "grad_norm": 0.6262990832328796, "learning_rate": 9.453976015994043e-06, "loss": 0.2952, "step": 890 }, { "epoch": 0.5789388284926172, "grad_norm": 0.3967663645744324, "learning_rate": 9.450718877530021e-06, "loss": 0.2857, "step": 892 }, { "epoch": 0.580236897614798, "grad_norm": 0.4894019365310669, "learning_rate": 9.447452617725581e-06, "loss": 0.2889, "step": 894 }, { "epoch": 0.5815349667369788, "grad_norm": 0.8662911057472229, "learning_rate": 9.444177243274619e-06, "loss": 0.2965, "step": 896 }, { "epoch": 0.5828330358591595, "grad_norm": 0.4326671063899994, "learning_rate": 9.440892760889715e-06, "loss": 0.2618, "step": 898 }, { "epoch": 0.5841311049813402, "grad_norm": 0.6280264258384705, "learning_rate": 9.437599177302115e-06, "loss": 0.2868, "step": 900 }, { "epoch": 0.585429174103521, "grad_norm": 0.45585232973098755, "learning_rate": 9.434296499261719e-06, "loss": 0.2733, "step": 902 }, { "epoch": 0.5867272432257018, "grad_norm": 0.4711018204689026, "learning_rate": 9.43098473353706e-06, "loss": 0.2963, "step": 904 }, { "epoch": 0.5880253123478826, "grad_norm": 0.5202216506004333, "learning_rate": 9.427663886915302e-06, "loss": 0.3009, "step": 906 }, { "epoch": 0.5893233814700632, "grad_norm": 0.9075475335121155, "learning_rate": 9.42433396620221e-06, "loss": 0.3143, "step": 908 }, { "epoch": 0.590621450592244, "grad_norm": 0.7112184762954712, "learning_rate": 9.420994978222156e-06, "loss": 0.3372, "step": 910 }, { "epoch": 0.5919195197144248, "grad_norm": 0.4606819450855255, "learning_rate": 9.417646929818089e-06, "loss": 0.2799, "step": 912 }, { "epoch": 0.5932175888366056, "grad_norm": 0.6315789222717285, "learning_rate": 9.414289827851526e-06, "loss": 0.295, "step": 914 }, { "epoch": 0.5945156579587864, "grad_norm": 0.4764246940612793, "learning_rate": 9.41092367920254e-06, "loss": 0.2754, "step": 916 }, { "epoch": 0.595813727080967, "grad_norm": 0.5172373056411743, "learning_rate": 9.407548490769747e-06, "loss": 0.2986, "step": 918 }, { "epoch": 0.5971117962031478, "grad_norm": 0.729606032371521, "learning_rate": 9.404164269470282e-06, "loss": 0.3031, "step": 920 }, { "epoch": 0.5984098653253286, "grad_norm": 0.45380473136901855, "learning_rate": 9.4007710222398e-06, "loss": 0.2871, "step": 922 }, { "epoch": 0.5997079344475094, "grad_norm": 0.5831772685050964, "learning_rate": 9.397368756032445e-06, "loss": 0.2793, "step": 924 }, { "epoch": 0.6010060035696901, "grad_norm": 0.5093604922294617, "learning_rate": 9.393957477820858e-06, "loss": 0.3001, "step": 926 }, { "epoch": 0.6023040726918708, "grad_norm": 0.5217366814613342, "learning_rate": 9.390537194596135e-06, "loss": 0.2857, "step": 928 }, { "epoch": 0.6036021418140516, "grad_norm": 0.5689509510993958, "learning_rate": 9.387107913367832e-06, "loss": 0.286, "step": 930 }, { "epoch": 0.6049002109362324, "grad_norm": 0.7780814170837402, "learning_rate": 9.383669641163952e-06, "loss": 0.3082, "step": 932 }, { "epoch": 0.6061982800584131, "grad_norm": 0.8530994057655334, "learning_rate": 9.380222385030916e-06, "loss": 0.2943, "step": 934 }, { "epoch": 0.6074963491805939, "grad_norm": 0.5024810433387756, "learning_rate": 9.376766152033556e-06, "loss": 0.2841, "step": 936 }, { "epoch": 0.6087944183027746, "grad_norm": 0.6342744827270508, "learning_rate": 9.373300949255112e-06, "loss": 0.2911, "step": 938 }, { "epoch": 0.6100924874249554, "grad_norm": 0.5131987929344177, "learning_rate": 9.369826783797192e-06, "loss": 0.2688, "step": 940 }, { "epoch": 0.6113905565471361, "grad_norm": 0.527497410774231, "learning_rate": 9.366343662779784e-06, "loss": 0.2708, "step": 942 }, { "epoch": 0.6126886256693169, "grad_norm": 0.6879523396492004, "learning_rate": 9.362851593341227e-06, "loss": 0.2956, "step": 944 }, { "epoch": 0.6139866947914977, "grad_norm": 0.6231520771980286, "learning_rate": 9.359350582638193e-06, "loss": 0.2809, "step": 946 }, { "epoch": 0.6152847639136784, "grad_norm": 0.6896982789039612, "learning_rate": 9.355840637845683e-06, "loss": 0.3119, "step": 948 }, { "epoch": 0.6165828330358591, "grad_norm": 0.5351325273513794, "learning_rate": 9.352321766157011e-06, "loss": 0.2885, "step": 950 }, { "epoch": 0.6178809021580399, "grad_norm": 0.5489811301231384, "learning_rate": 9.348793974783778e-06, "loss": 0.2846, "step": 952 }, { "epoch": 0.6191789712802207, "grad_norm": 0.6425981521606445, "learning_rate": 9.345257270955873e-06, "loss": 0.3106, "step": 954 }, { "epoch": 0.6204770404024015, "grad_norm": 0.5222558975219727, "learning_rate": 9.34171166192144e-06, "loss": 0.2881, "step": 956 }, { "epoch": 0.6217751095245821, "grad_norm": 0.5315155982971191, "learning_rate": 9.338157154946887e-06, "loss": 0.3083, "step": 958 }, { "epoch": 0.6230731786467629, "grad_norm": 0.522007405757904, "learning_rate": 9.334593757316845e-06, "loss": 0.2948, "step": 960 }, { "epoch": 0.6243712477689437, "grad_norm": 0.5437946915626526, "learning_rate": 9.331021476334174e-06, "loss": 0.2916, "step": 962 }, { "epoch": 0.6256693168911245, "grad_norm": 0.935867965221405, "learning_rate": 9.327440319319933e-06, "loss": 0.294, "step": 964 }, { "epoch": 0.6269673860133053, "grad_norm": 0.5434510111808777, "learning_rate": 9.32385029361338e-06, "loss": 0.2816, "step": 966 }, { "epoch": 0.6282654551354859, "grad_norm": 0.6359426975250244, "learning_rate": 9.32025140657194e-06, "loss": 0.2717, "step": 968 }, { "epoch": 0.6295635242576667, "grad_norm": 0.5581540465354919, "learning_rate": 9.316643665571203e-06, "loss": 0.2967, "step": 970 }, { "epoch": 0.6308615933798475, "grad_norm": 0.8231354355812073, "learning_rate": 9.313027078004903e-06, "loss": 0.315, "step": 972 }, { "epoch": 0.6321596625020283, "grad_norm": 0.5670283436775208, "learning_rate": 9.30940165128491e-06, "loss": 0.2733, "step": 974 }, { "epoch": 0.633457731624209, "grad_norm": 0.554184079170227, "learning_rate": 9.305767392841194e-06, "loss": 0.2753, "step": 976 }, { "epoch": 0.6347558007463897, "grad_norm": 0.4797976315021515, "learning_rate": 9.302124310121841e-06, "loss": 0.286, "step": 978 }, { "epoch": 0.6360538698685705, "grad_norm": 0.7054647207260132, "learning_rate": 9.298472410593013e-06, "loss": 0.2823, "step": 980 }, { "epoch": 0.6373519389907513, "grad_norm": 0.8681824803352356, "learning_rate": 9.294811701738946e-06, "loss": 0.3016, "step": 982 }, { "epoch": 0.638650008112932, "grad_norm": 0.9729883670806885, "learning_rate": 9.291142191061927e-06, "loss": 0.2868, "step": 984 }, { "epoch": 0.6399480772351128, "grad_norm": 0.5139796137809753, "learning_rate": 9.287463886082277e-06, "loss": 0.2924, "step": 986 }, { "epoch": 0.6412461463572935, "grad_norm": 0.46500131487846375, "learning_rate": 9.28377679433835e-06, "loss": 0.3024, "step": 988 }, { "epoch": 0.6425442154794743, "grad_norm": 0.4761664867401123, "learning_rate": 9.280080923386501e-06, "loss": 0.3045, "step": 990 }, { "epoch": 0.643842284601655, "grad_norm": 0.48920348286628723, "learning_rate": 9.276376280801079e-06, "loss": 0.2919, "step": 992 }, { "epoch": 0.6451403537238358, "grad_norm": 0.6410265564918518, "learning_rate": 9.27266287417441e-06, "loss": 0.2805, "step": 994 }, { "epoch": 0.6464384228460166, "grad_norm": 0.5757927894592285, "learning_rate": 9.268940711116776e-06, "loss": 0.306, "step": 996 }, { "epoch": 0.6477364919681973, "grad_norm": 0.4552311599254608, "learning_rate": 9.265209799256417e-06, "loss": 0.2895, "step": 998 }, { "epoch": 0.649034561090378, "grad_norm": 0.5916186571121216, "learning_rate": 9.261470146239488e-06, "loss": 0.3073, "step": 1000 }, { "epoch": 0.649034561090378, "eval_loss": 0.2901286482810974, "eval_runtime": 397.3068, "eval_samples_per_second": 26.126, "eval_steps_per_second": 3.267, "step": 1000 }, { "epoch": 0.6503326302125588, "grad_norm": 0.5421074628829956, "learning_rate": 9.25772175973007e-06, "loss": 0.2851, "step": 1002 }, { "epoch": 0.6516306993347396, "grad_norm": 0.5432619452476501, "learning_rate": 9.253964647410134e-06, "loss": 0.2702, "step": 1004 }, { "epoch": 0.6529287684569204, "grad_norm": 0.5896894335746765, "learning_rate": 9.250198816979538e-06, "loss": 0.2927, "step": 1006 }, { "epoch": 0.654226837579101, "grad_norm": 0.8111984133720398, "learning_rate": 9.246424276156008e-06, "loss": 0.3007, "step": 1008 }, { "epoch": 0.6555249067012818, "grad_norm": 0.6752954125404358, "learning_rate": 9.242641032675118e-06, "loss": 0.2888, "step": 1010 }, { "epoch": 0.6568229758234626, "grad_norm": 0.5588261485099792, "learning_rate": 9.238849094290279e-06, "loss": 0.2968, "step": 1012 }, { "epoch": 0.6581210449456434, "grad_norm": 0.6739563345909119, "learning_rate": 9.235048468772722e-06, "loss": 0.2858, "step": 1014 }, { "epoch": 0.6594191140678242, "grad_norm": 0.5356653928756714, "learning_rate": 9.23123916391148e-06, "loss": 0.2758, "step": 1016 }, { "epoch": 0.6607171831900048, "grad_norm": 0.8876270651817322, "learning_rate": 9.227421187513375e-06, "loss": 0.3002, "step": 1018 }, { "epoch": 0.6620152523121856, "grad_norm": 0.4201999604701996, "learning_rate": 9.223594547402999e-06, "loss": 0.3116, "step": 1020 }, { "epoch": 0.6633133214343664, "grad_norm": 0.805539071559906, "learning_rate": 9.2197592514227e-06, "loss": 0.2997, "step": 1022 }, { "epoch": 0.6646113905565472, "grad_norm": 0.5493653416633606, "learning_rate": 9.215915307432565e-06, "loss": 0.2735, "step": 1024 }, { "epoch": 0.6659094596787279, "grad_norm": 0.5224273204803467, "learning_rate": 9.212062723310406e-06, "loss": 0.2785, "step": 1026 }, { "epoch": 0.6672075288009086, "grad_norm": 0.8348823189735413, "learning_rate": 9.208201506951742e-06, "loss": 0.3077, "step": 1028 }, { "epoch": 0.6685055979230894, "grad_norm": 0.5641005039215088, "learning_rate": 9.20433166626978e-06, "loss": 0.2669, "step": 1030 }, { "epoch": 0.6698036670452702, "grad_norm": 0.4589135944843292, "learning_rate": 9.200453209195404e-06, "loss": 0.3061, "step": 1032 }, { "epoch": 0.671101736167451, "grad_norm": 0.35073229670524597, "learning_rate": 9.196566143677157e-06, "loss": 0.2684, "step": 1034 }, { "epoch": 0.6723998052896317, "grad_norm": 0.5255971550941467, "learning_rate": 9.192670477681224e-06, "loss": 0.2768, "step": 1036 }, { "epoch": 0.6736978744118124, "grad_norm": 0.4588732421398163, "learning_rate": 9.188766219191415e-06, "loss": 0.2579, "step": 1038 }, { "epoch": 0.6749959435339932, "grad_norm": 0.7348477244377136, "learning_rate": 9.184853376209149e-06, "loss": 0.2804, "step": 1040 }, { "epoch": 0.676294012656174, "grad_norm": 0.5118248462677002, "learning_rate": 9.18093195675344e-06, "loss": 0.3096, "step": 1042 }, { "epoch": 0.6775920817783547, "grad_norm": 0.8831693530082703, "learning_rate": 9.177001968860878e-06, "loss": 0.419, "step": 1044 }, { "epoch": 0.6788901509005355, "grad_norm": 0.6509882807731628, "learning_rate": 9.173063420585612e-06, "loss": 0.2748, "step": 1046 }, { "epoch": 0.6801882200227162, "grad_norm": 0.6977046728134155, "learning_rate": 9.169116319999336e-06, "loss": 0.2944, "step": 1048 }, { "epoch": 0.681486289144897, "grad_norm": 0.8557306528091431, "learning_rate": 9.165160675191272e-06, "loss": 0.312, "step": 1050 }, { "epoch": 0.6827843582670777, "grad_norm": 0.6913865804672241, "learning_rate": 9.16119649426815e-06, "loss": 0.3047, "step": 1052 }, { "epoch": 0.6840824273892585, "grad_norm": 0.5484463572502136, "learning_rate": 9.157223785354197e-06, "loss": 0.3005, "step": 1054 }, { "epoch": 0.6853804965114393, "grad_norm": 0.44823819398880005, "learning_rate": 9.153242556591115e-06, "loss": 0.2943, "step": 1056 }, { "epoch": 0.68667856563362, "grad_norm": 0.48865145444869995, "learning_rate": 9.149252816138069e-06, "loss": 0.2905, "step": 1058 }, { "epoch": 0.6879766347558007, "grad_norm": 0.6097568273544312, "learning_rate": 9.145254572171662e-06, "loss": 0.312, "step": 1060 }, { "epoch": 0.6892747038779815, "grad_norm": 0.5407414436340332, "learning_rate": 9.141247832885932e-06, "loss": 0.2777, "step": 1062 }, { "epoch": 0.6905727730001623, "grad_norm": 0.5648403167724609, "learning_rate": 9.137232606492323e-06, "loss": 0.3427, "step": 1064 }, { "epoch": 0.6918708421223431, "grad_norm": 0.5602331757545471, "learning_rate": 9.133208901219676e-06, "loss": 0.2925, "step": 1066 }, { "epoch": 0.6931689112445237, "grad_norm": 0.45207473635673523, "learning_rate": 9.129176725314201e-06, "loss": 0.2749, "step": 1068 }, { "epoch": 0.6944669803667045, "grad_norm": 0.4779388904571533, "learning_rate": 9.12513608703948e-06, "loss": 0.2844, "step": 1070 }, { "epoch": 0.6957650494888853, "grad_norm": 0.6065648198127747, "learning_rate": 9.121086994676423e-06, "loss": 0.2757, "step": 1072 }, { "epoch": 0.6970631186110661, "grad_norm": 0.46534982323646545, "learning_rate": 9.11702945652328e-06, "loss": 0.2828, "step": 1074 }, { "epoch": 0.6983611877332468, "grad_norm": 0.6966667771339417, "learning_rate": 9.112963480895598e-06, "loss": 0.3004, "step": 1076 }, { "epoch": 0.6996592568554275, "grad_norm": 0.6863649487495422, "learning_rate": 9.108889076126226e-06, "loss": 0.2924, "step": 1078 }, { "epoch": 0.7009573259776083, "grad_norm": 0.6956730484962463, "learning_rate": 9.104806250565283e-06, "loss": 0.2685, "step": 1080 }, { "epoch": 0.7022553950997891, "grad_norm": 0.8401827216148376, "learning_rate": 9.100715012580142e-06, "loss": 0.3128, "step": 1082 }, { "epoch": 0.7035534642219698, "grad_norm": 0.6265568733215332, "learning_rate": 9.096615370555423e-06, "loss": 0.3002, "step": 1084 }, { "epoch": 0.7048515333441506, "grad_norm": 0.6071637272834778, "learning_rate": 9.092507332892968e-06, "loss": 0.2915, "step": 1086 }, { "epoch": 0.7061496024663313, "grad_norm": 0.6788455843925476, "learning_rate": 9.088390908011822e-06, "loss": 0.2614, "step": 1088 }, { "epoch": 0.7074476715885121, "grad_norm": 0.5349068641662598, "learning_rate": 9.084266104348219e-06, "loss": 0.2952, "step": 1090 }, { "epoch": 0.7087457407106929, "grad_norm": 0.5392186045646667, "learning_rate": 9.080132930355567e-06, "loss": 0.2878, "step": 1092 }, { "epoch": 0.7100438098328736, "grad_norm": 0.48492342233657837, "learning_rate": 9.07599139450443e-06, "loss": 0.2771, "step": 1094 }, { "epoch": 0.7113418789550544, "grad_norm": 0.4398370385169983, "learning_rate": 9.071841505282501e-06, "loss": 0.268, "step": 1096 }, { "epoch": 0.7126399480772351, "grad_norm": 0.5449391007423401, "learning_rate": 9.067683271194601e-06, "loss": 0.2935, "step": 1098 }, { "epoch": 0.7139380171994159, "grad_norm": 0.4654986560344696, "learning_rate": 9.06351670076265e-06, "loss": 0.2606, "step": 1100 }, { "epoch": 0.7152360863215966, "grad_norm": 0.48918795585632324, "learning_rate": 9.059341802525652e-06, "loss": 0.3388, "step": 1102 }, { "epoch": 0.7165341554437774, "grad_norm": 0.5506134033203125, "learning_rate": 9.055158585039678e-06, "loss": 0.2913, "step": 1104 }, { "epoch": 0.7178322245659582, "grad_norm": 0.5453973412513733, "learning_rate": 9.050967056877846e-06, "loss": 0.2862, "step": 1106 }, { "epoch": 0.7191302936881389, "grad_norm": 0.6833801865577698, "learning_rate": 9.046767226630313e-06, "loss": 0.2734, "step": 1108 }, { "epoch": 0.7204283628103196, "grad_norm": 0.5205474495887756, "learning_rate": 9.042559102904245e-06, "loss": 0.262, "step": 1110 }, { "epoch": 0.7217264319325004, "grad_norm": 1.0703086853027344, "learning_rate": 9.038342694323806e-06, "loss": 0.2986, "step": 1112 }, { "epoch": 0.7230245010546812, "grad_norm": 0.7175477743148804, "learning_rate": 9.034118009530137e-06, "loss": 0.3019, "step": 1114 }, { "epoch": 0.724322570176862, "grad_norm": 0.4632490575313568, "learning_rate": 9.029885057181343e-06, "loss": 0.2663, "step": 1116 }, { "epoch": 0.7256206392990426, "grad_norm": 0.7186155319213867, "learning_rate": 9.025643845952473e-06, "loss": 0.3115, "step": 1118 }, { "epoch": 0.7269187084212234, "grad_norm": 0.5666090846061707, "learning_rate": 9.0213943845355e-06, "loss": 0.2975, "step": 1120 }, { "epoch": 0.7282167775434042, "grad_norm": 0.6130120158195496, "learning_rate": 9.017136681639307e-06, "loss": 0.2924, "step": 1122 }, { "epoch": 0.729514846665585, "grad_norm": 0.5019729137420654, "learning_rate": 9.012870745989663e-06, "loss": 0.278, "step": 1124 }, { "epoch": 0.7308129157877657, "grad_norm": 0.6438244581222534, "learning_rate": 9.008596586329216e-06, "loss": 0.2854, "step": 1126 }, { "epoch": 0.7321109849099464, "grad_norm": 0.7025477886199951, "learning_rate": 9.004314211417461e-06, "loss": 0.3087, "step": 1128 }, { "epoch": 0.7334090540321272, "grad_norm": 0.4508492052555084, "learning_rate": 9.000023630030734e-06, "loss": 0.2885, "step": 1130 }, { "epoch": 0.734707123154308, "grad_norm": 0.5872870683670044, "learning_rate": 8.995724850962189e-06, "loss": 0.3107, "step": 1132 }, { "epoch": 0.7360051922764888, "grad_norm": 0.5650608539581299, "learning_rate": 8.99141788302178e-06, "loss": 0.3029, "step": 1134 }, { "epoch": 0.7373032613986695, "grad_norm": 0.5341686010360718, "learning_rate": 8.987102735036241e-06, "loss": 0.2886, "step": 1136 }, { "epoch": 0.7386013305208502, "grad_norm": 0.49814504384994507, "learning_rate": 8.982779415849076e-06, "loss": 0.2532, "step": 1138 }, { "epoch": 0.739899399643031, "grad_norm": 0.6889177560806274, "learning_rate": 8.978447934320526e-06, "loss": 0.2863, "step": 1140 }, { "epoch": 0.7411974687652118, "grad_norm": 0.5664680600166321, "learning_rate": 8.97410829932757e-06, "loss": 0.2794, "step": 1142 }, { "epoch": 0.7424955378873925, "grad_norm": 0.6941066980361938, "learning_rate": 8.969760519763891e-06, "loss": 0.2838, "step": 1144 }, { "epoch": 0.7437936070095733, "grad_norm": 0.5983604788780212, "learning_rate": 8.965404604539864e-06, "loss": 0.3035, "step": 1146 }, { "epoch": 0.745091676131754, "grad_norm": 0.7246621251106262, "learning_rate": 8.96104056258254e-06, "loss": 0.2739, "step": 1148 }, { "epoch": 0.7463897452539348, "grad_norm": 0.5326955318450928, "learning_rate": 8.956668402835618e-06, "loss": 0.2943, "step": 1150 }, { "epoch": 0.7476878143761155, "grad_norm": 0.44938555359840393, "learning_rate": 8.952288134259443e-06, "loss": 0.2727, "step": 1152 }, { "epoch": 0.7489858834982963, "grad_norm": 0.495539128780365, "learning_rate": 8.947899765830973e-06, "loss": 0.2775, "step": 1154 }, { "epoch": 0.7502839526204771, "grad_norm": 0.6510003805160522, "learning_rate": 8.94350330654377e-06, "loss": 0.2798, "step": 1156 }, { "epoch": 0.7515820217426578, "grad_norm": 0.518806517124176, "learning_rate": 8.93909876540797e-06, "loss": 0.2805, "step": 1158 }, { "epoch": 0.7528800908648385, "grad_norm": 0.7767261266708374, "learning_rate": 8.934686151450276e-06, "loss": 0.2929, "step": 1160 }, { "epoch": 0.7541781599870193, "grad_norm": 0.5557768940925598, "learning_rate": 8.930265473713939e-06, "loss": 0.2922, "step": 1162 }, { "epoch": 0.7554762291092001, "grad_norm": 0.7437509894371033, "learning_rate": 8.925836741258729e-06, "loss": 0.3149, "step": 1164 }, { "epoch": 0.7567742982313809, "grad_norm": 0.602512776851654, "learning_rate": 8.921399963160934e-06, "loss": 0.2828, "step": 1166 }, { "epoch": 0.7580723673535615, "grad_norm": 0.6104228496551514, "learning_rate": 8.916955148513317e-06, "loss": 0.2781, "step": 1168 }, { "epoch": 0.7593704364757423, "grad_norm": 0.5656735897064209, "learning_rate": 8.912502306425121e-06, "loss": 0.3243, "step": 1170 }, { "epoch": 0.7606685055979231, "grad_norm": 0.564076840877533, "learning_rate": 8.908041446022038e-06, "loss": 0.2718, "step": 1172 }, { "epoch": 0.7619665747201039, "grad_norm": 0.5532398223876953, "learning_rate": 8.903572576446193e-06, "loss": 0.2702, "step": 1174 }, { "epoch": 0.7632646438422847, "grad_norm": 0.6288996338844299, "learning_rate": 8.899095706856122e-06, "loss": 0.261, "step": 1176 }, { "epoch": 0.7645627129644653, "grad_norm": 0.5717208385467529, "learning_rate": 8.894610846426763e-06, "loss": 0.3029, "step": 1178 }, { "epoch": 0.7658607820866461, "grad_norm": 0.5458253026008606, "learning_rate": 8.890118004349423e-06, "loss": 0.2828, "step": 1180 }, { "epoch": 0.7671588512088269, "grad_norm": 0.6043146252632141, "learning_rate": 8.885617189831772e-06, "loss": 0.2635, "step": 1182 }, { "epoch": 0.7684569203310077, "grad_norm": 0.6473830342292786, "learning_rate": 8.881108412097816e-06, "loss": 0.2921, "step": 1184 }, { "epoch": 0.7697549894531884, "grad_norm": 0.7034235596656799, "learning_rate": 8.876591680387883e-06, "loss": 0.3111, "step": 1186 }, { "epoch": 0.7710530585753691, "grad_norm": 0.7677924633026123, "learning_rate": 8.872067003958597e-06, "loss": 0.2794, "step": 1188 }, { "epoch": 0.7723511276975499, "grad_norm": 0.9395731687545776, "learning_rate": 8.867534392082873e-06, "loss": 0.2879, "step": 1190 }, { "epoch": 0.7736491968197307, "grad_norm": 0.6890425682067871, "learning_rate": 8.862993854049879e-06, "loss": 0.2914, "step": 1192 }, { "epoch": 0.7749472659419114, "grad_norm": 0.4863192141056061, "learning_rate": 8.858445399165033e-06, "loss": 0.2758, "step": 1194 }, { "epoch": 0.7762453350640922, "grad_norm": 0.5594764351844788, "learning_rate": 8.853889036749978e-06, "loss": 0.2943, "step": 1196 }, { "epoch": 0.7775434041862729, "grad_norm": 0.590385377407074, "learning_rate": 8.849324776142558e-06, "loss": 0.2929, "step": 1198 }, { "epoch": 0.7788414733084537, "grad_norm": 0.8073956966400146, "learning_rate": 8.84475262669681e-06, "loss": 0.3345, "step": 1200 }, { "epoch": 0.7801395424306344, "grad_norm": 0.6071555614471436, "learning_rate": 8.840172597782934e-06, "loss": 0.2723, "step": 1202 }, { "epoch": 0.7814376115528152, "grad_norm": 0.5159868001937866, "learning_rate": 8.83558469878728e-06, "loss": 0.2733, "step": 1204 }, { "epoch": 0.782735680674996, "grad_norm": 0.6181891560554504, "learning_rate": 8.830988939112326e-06, "loss": 0.2891, "step": 1206 }, { "epoch": 0.7840337497971767, "grad_norm": 0.8166855573654175, "learning_rate": 8.82638532817666e-06, "loss": 0.2962, "step": 1208 }, { "epoch": 0.7853318189193574, "grad_norm": 1.0593408346176147, "learning_rate": 8.82177387541496e-06, "loss": 0.2814, "step": 1210 }, { "epoch": 0.7866298880415382, "grad_norm": 0.5648901462554932, "learning_rate": 8.817154590277976e-06, "loss": 0.2903, "step": 1212 }, { "epoch": 0.787927957163719, "grad_norm": 0.5895318984985352, "learning_rate": 8.812527482232513e-06, "loss": 0.2842, "step": 1214 }, { "epoch": 0.7892260262858998, "grad_norm": 0.687621533870697, "learning_rate": 8.807892560761399e-06, "loss": 0.2718, "step": 1216 }, { "epoch": 0.7905240954080804, "grad_norm": 0.5297589302062988, "learning_rate": 8.803249835363486e-06, "loss": 0.2762, "step": 1218 }, { "epoch": 0.7918221645302612, "grad_norm": 0.4767840504646301, "learning_rate": 8.798599315553611e-06, "loss": 0.2846, "step": 1220 }, { "epoch": 0.793120233652442, "grad_norm": 0.8247559070587158, "learning_rate": 8.79394101086259e-06, "loss": 0.2998, "step": 1222 }, { "epoch": 0.7944183027746228, "grad_norm": 0.518978476524353, "learning_rate": 8.789274930837189e-06, "loss": 0.3102, "step": 1224 }, { "epoch": 0.7957163718968036, "grad_norm": 0.5800621509552002, "learning_rate": 8.784601085040114e-06, "loss": 0.2624, "step": 1226 }, { "epoch": 0.7970144410189842, "grad_norm": 0.6104470491409302, "learning_rate": 8.779919483049982e-06, "loss": 0.2812, "step": 1228 }, { "epoch": 0.798312510141165, "grad_norm": 0.8473309278488159, "learning_rate": 8.775230134461307e-06, "loss": 0.2945, "step": 1230 }, { "epoch": 0.7996105792633458, "grad_norm": 0.6173761487007141, "learning_rate": 8.770533048884483e-06, "loss": 0.2897, "step": 1232 }, { "epoch": 0.8009086483855266, "grad_norm": 0.6139901280403137, "learning_rate": 8.76582823594575e-06, "loss": 0.2894, "step": 1234 }, { "epoch": 0.8022067175077073, "grad_norm": 0.49528780579566956, "learning_rate": 8.761115705287195e-06, "loss": 0.2973, "step": 1236 }, { "epoch": 0.803504786629888, "grad_norm": 0.7021246552467346, "learning_rate": 8.756395466566718e-06, "loss": 0.3278, "step": 1238 }, { "epoch": 0.8048028557520688, "grad_norm": 0.6768953204154968, "learning_rate": 8.751667529458014e-06, "loss": 0.2904, "step": 1240 }, { "epoch": 0.8061009248742496, "grad_norm": 0.6045536994934082, "learning_rate": 8.746931903650558e-06, "loss": 0.317, "step": 1242 }, { "epoch": 0.8073989939964303, "grad_norm": 0.6364895105361938, "learning_rate": 8.74218859884958e-06, "loss": 0.2707, "step": 1244 }, { "epoch": 0.8086970631186111, "grad_norm": 0.5393198132514954, "learning_rate": 8.737437624776047e-06, "loss": 0.294, "step": 1246 }, { "epoch": 0.8099951322407918, "grad_norm": 0.500775158405304, "learning_rate": 8.732678991166647e-06, "loss": 0.2636, "step": 1248 }, { "epoch": 0.8112932013629726, "grad_norm": 0.5400915741920471, "learning_rate": 8.727912707773764e-06, "loss": 0.266, "step": 1250 }, { "epoch": 0.8125912704851533, "grad_norm": 0.8606507182121277, "learning_rate": 8.723138784365459e-06, "loss": 0.3018, "step": 1252 }, { "epoch": 0.8138893396073341, "grad_norm": 0.5905175805091858, "learning_rate": 8.71835723072545e-06, "loss": 0.2696, "step": 1254 }, { "epoch": 0.8151874087295149, "grad_norm": 0.6802726984024048, "learning_rate": 8.71356805665309e-06, "loss": 0.2734, "step": 1256 }, { "epoch": 0.8164854778516956, "grad_norm": 0.7192598581314087, "learning_rate": 8.708771271963356e-06, "loss": 0.2909, "step": 1258 }, { "epoch": 0.8177835469738763, "grad_norm": 0.8091411590576172, "learning_rate": 8.703966886486819e-06, "loss": 0.2904, "step": 1260 }, { "epoch": 0.8190816160960571, "grad_norm": 0.8599343299865723, "learning_rate": 8.699154910069624e-06, "loss": 0.2633, "step": 1262 }, { "epoch": 0.8203796852182379, "grad_norm": 0.6553810238838196, "learning_rate": 8.694335352573476e-06, "loss": 0.2688, "step": 1264 }, { "epoch": 0.8216777543404187, "grad_norm": 0.7036312818527222, "learning_rate": 8.68950822387562e-06, "loss": 0.2903, "step": 1266 }, { "epoch": 0.8229758234625993, "grad_norm": 1.0410854816436768, "learning_rate": 8.684673533868808e-06, "loss": 0.3346, "step": 1268 }, { "epoch": 0.8242738925847801, "grad_norm": 0.5900399088859558, "learning_rate": 8.679831292461297e-06, "loss": 0.2833, "step": 1270 }, { "epoch": 0.8255719617069609, "grad_norm": 0.6472350358963013, "learning_rate": 8.674981509576819e-06, "loss": 0.2736, "step": 1272 }, { "epoch": 0.8268700308291417, "grad_norm": 0.6742092370986938, "learning_rate": 8.670124195154557e-06, "loss": 0.2993, "step": 1274 }, { "epoch": 0.8281680999513225, "grad_norm": 0.6098873615264893, "learning_rate": 8.665259359149132e-06, "loss": 0.2668, "step": 1276 }, { "epoch": 0.8294661690735031, "grad_norm": 0.5756238698959351, "learning_rate": 8.66038701153058e-06, "loss": 0.2764, "step": 1278 }, { "epoch": 0.8307642381956839, "grad_norm": 0.5642629265785217, "learning_rate": 8.655507162284331e-06, "loss": 0.2659, "step": 1280 }, { "epoch": 0.8320623073178647, "grad_norm": 0.5651534795761108, "learning_rate": 8.650619821411188e-06, "loss": 0.2929, "step": 1282 }, { "epoch": 0.8333603764400455, "grad_norm": 0.6836010217666626, "learning_rate": 8.64572499892731e-06, "loss": 0.2643, "step": 1284 }, { "epoch": 0.8346584455622262, "grad_norm": 0.7692021727561951, "learning_rate": 8.640822704864183e-06, "loss": 0.2596, "step": 1286 }, { "epoch": 0.8359565146844069, "grad_norm": 0.5894228219985962, "learning_rate": 8.635912949268614e-06, "loss": 0.2964, "step": 1288 }, { "epoch": 0.8372545838065877, "grad_norm": 0.5140237808227539, "learning_rate": 8.630995742202695e-06, "loss": 0.259, "step": 1290 }, { "epoch": 0.8385526529287685, "grad_norm": 0.6199679970741272, "learning_rate": 8.62607109374379e-06, "loss": 0.2837, "step": 1292 }, { "epoch": 0.8398507220509492, "grad_norm": 0.5351325273513794, "learning_rate": 8.621139013984519e-06, "loss": 0.2697, "step": 1294 }, { "epoch": 0.84114879117313, "grad_norm": 0.5435934662818909, "learning_rate": 8.616199513032723e-06, "loss": 0.273, "step": 1296 }, { "epoch": 0.8424468602953107, "grad_norm": 0.5445231199264526, "learning_rate": 8.611252601011457e-06, "loss": 0.2838, "step": 1298 }, { "epoch": 0.8437449294174915, "grad_norm": 0.6098053455352783, "learning_rate": 8.606298288058967e-06, "loss": 0.2826, "step": 1300 }, { "epoch": 0.8450429985396722, "grad_norm": 0.7896488904953003, "learning_rate": 8.601336584328659e-06, "loss": 0.3067, "step": 1302 }, { "epoch": 0.846341067661853, "grad_norm": 0.6145248413085938, "learning_rate": 8.596367499989093e-06, "loss": 0.2476, "step": 1304 }, { "epoch": 0.8476391367840338, "grad_norm": 0.5983669757843018, "learning_rate": 8.59139104522395e-06, "loss": 0.2877, "step": 1306 }, { "epoch": 0.8489372059062145, "grad_norm": 0.6240435242652893, "learning_rate": 8.58640723023202e-06, "loss": 0.2564, "step": 1308 }, { "epoch": 0.8502352750283952, "grad_norm": 0.6200278997421265, "learning_rate": 8.581416065227176e-06, "loss": 0.2837, "step": 1310 }, { "epoch": 0.851533344150576, "grad_norm": 0.590140700340271, "learning_rate": 8.576417560438348e-06, "loss": 0.2641, "step": 1312 }, { "epoch": 0.8528314132727568, "grad_norm": 0.9220361113548279, "learning_rate": 8.571411726109518e-06, "loss": 0.2914, "step": 1314 }, { "epoch": 0.8541294823949376, "grad_norm": 0.6970017552375793, "learning_rate": 8.566398572499685e-06, "loss": 0.3028, "step": 1316 }, { "epoch": 0.8554275515171182, "grad_norm": 0.5961741805076599, "learning_rate": 8.561378109882844e-06, "loss": 0.2782, "step": 1318 }, { "epoch": 0.856725620639299, "grad_norm": 0.7446039319038391, "learning_rate": 8.556350348547978e-06, "loss": 0.2835, "step": 1320 }, { "epoch": 0.8580236897614798, "grad_norm": 0.6202130317687988, "learning_rate": 8.551315298799017e-06, "loss": 0.286, "step": 1322 }, { "epoch": 0.8593217588836606, "grad_norm": 0.8216232657432556, "learning_rate": 8.546272970954838e-06, "loss": 0.2777, "step": 1324 }, { "epoch": 0.8606198280058414, "grad_norm": 0.49645790457725525, "learning_rate": 8.54122337534923e-06, "loss": 0.2823, "step": 1326 }, { "epoch": 0.861917897128022, "grad_norm": 0.6926692724227905, "learning_rate": 8.536166522330875e-06, "loss": 0.3023, "step": 1328 }, { "epoch": 0.8632159662502028, "grad_norm": 0.7954425811767578, "learning_rate": 8.53110242226333e-06, "loss": 0.3134, "step": 1330 }, { "epoch": 0.8645140353723836, "grad_norm": 0.5701652765274048, "learning_rate": 8.526031085525004e-06, "loss": 0.2787, "step": 1332 }, { "epoch": 0.8658121044945644, "grad_norm": 0.5190241932868958, "learning_rate": 8.520952522509139e-06, "loss": 0.2768, "step": 1334 }, { "epoch": 0.8671101736167451, "grad_norm": 0.576443612575531, "learning_rate": 8.51586674362378e-06, "loss": 0.2645, "step": 1336 }, { "epoch": 0.8684082427389258, "grad_norm": 0.5375766754150391, "learning_rate": 8.510773759291768e-06, "loss": 0.2733, "step": 1338 }, { "epoch": 0.8697063118611066, "grad_norm": 0.5133069753646851, "learning_rate": 8.505673579950708e-06, "loss": 0.2898, "step": 1340 }, { "epoch": 0.8710043809832874, "grad_norm": 0.8139728903770447, "learning_rate": 8.500566216052948e-06, "loss": 0.3018, "step": 1342 }, { "epoch": 0.8723024501054681, "grad_norm": 0.7572596669197083, "learning_rate": 8.495451678065563e-06, "loss": 0.2862, "step": 1344 }, { "epoch": 0.8736005192276489, "grad_norm": 0.8263686895370483, "learning_rate": 8.490329976470329e-06, "loss": 0.3138, "step": 1346 }, { "epoch": 0.8748985883498296, "grad_norm": 0.9575554132461548, "learning_rate": 8.485201121763706e-06, "loss": 0.2916, "step": 1348 }, { "epoch": 0.8761966574720104, "grad_norm": 0.5792005062103271, "learning_rate": 8.48006512445681e-06, "loss": 0.2875, "step": 1350 }, { "epoch": 0.8774947265941911, "grad_norm": 0.6314175128936768, "learning_rate": 8.474921995075399e-06, "loss": 0.306, "step": 1352 }, { "epoch": 0.8787927957163719, "grad_norm": 0.8024908900260925, "learning_rate": 8.46977174415984e-06, "loss": 0.3745, "step": 1354 }, { "epoch": 0.8800908648385527, "grad_norm": 0.6125353574752808, "learning_rate": 8.464614382265107e-06, "loss": 0.288, "step": 1356 }, { "epoch": 0.8813889339607334, "grad_norm": 0.4953877031803131, "learning_rate": 8.459449919960737e-06, "loss": 0.2685, "step": 1358 }, { "epoch": 0.8826870030829141, "grad_norm": 0.681485652923584, "learning_rate": 8.454278367830823e-06, "loss": 0.2615, "step": 1360 }, { "epoch": 0.8839850722050949, "grad_norm": 0.8042451739311218, "learning_rate": 8.449099736473986e-06, "loss": 0.2736, "step": 1362 }, { "epoch": 0.8852831413272757, "grad_norm": 0.5504181385040283, "learning_rate": 8.443914036503356e-06, "loss": 0.2824, "step": 1364 }, { "epoch": 0.8865812104494565, "grad_norm": 0.5905902981758118, "learning_rate": 8.438721278546553e-06, "loss": 0.2967, "step": 1366 }, { "epoch": 0.8878792795716371, "grad_norm": 0.5147536396980286, "learning_rate": 8.433521473245653e-06, "loss": 0.2812, "step": 1368 }, { "epoch": 0.8891773486938179, "grad_norm": 0.6061310768127441, "learning_rate": 8.428314631257186e-06, "loss": 0.2829, "step": 1370 }, { "epoch": 0.8904754178159987, "grad_norm": 0.6898365616798401, "learning_rate": 8.423100763252094e-06, "loss": 0.2688, "step": 1372 }, { "epoch": 0.8917734869381795, "grad_norm": 0.6006032824516296, "learning_rate": 8.417879879915724e-06, "loss": 0.2975, "step": 1374 }, { "epoch": 0.8930715560603603, "grad_norm": 0.8392308950424194, "learning_rate": 8.412651991947795e-06, "loss": 0.279, "step": 1376 }, { "epoch": 0.8943696251825409, "grad_norm": 0.627784013748169, "learning_rate": 8.407417110062389e-06, "loss": 0.2653, "step": 1378 }, { "epoch": 0.8956676943047217, "grad_norm": 0.6972095966339111, "learning_rate": 8.40217524498791e-06, "loss": 0.2688, "step": 1380 }, { "epoch": 0.8969657634269025, "grad_norm": 0.6680583357810974, "learning_rate": 8.396926407467085e-06, "loss": 0.2901, "step": 1382 }, { "epoch": 0.8982638325490833, "grad_norm": 0.5485673546791077, "learning_rate": 8.391670608256925e-06, "loss": 0.2836, "step": 1384 }, { "epoch": 0.899561901671264, "grad_norm": 0.6606987118721008, "learning_rate": 8.386407858128707e-06, "loss": 0.269, "step": 1386 }, { "epoch": 0.9008599707934447, "grad_norm": 0.6488548517227173, "learning_rate": 8.381138167867955e-06, "loss": 0.2669, "step": 1388 }, { "epoch": 0.9021580399156255, "grad_norm": 0.7695783376693726, "learning_rate": 8.375861548274417e-06, "loss": 0.2737, "step": 1390 }, { "epoch": 0.9034561090378063, "grad_norm": 0.5541255474090576, "learning_rate": 8.370578010162043e-06, "loss": 0.2705, "step": 1392 }, { "epoch": 0.904754178159987, "grad_norm": 0.8548727035522461, "learning_rate": 8.365287564358956e-06, "loss": 0.3155, "step": 1394 }, { "epoch": 0.9060522472821678, "grad_norm": 0.7566190958023071, "learning_rate": 8.359990221707444e-06, "loss": 0.3073, "step": 1396 }, { "epoch": 0.9073503164043485, "grad_norm": 0.6765993237495422, "learning_rate": 8.354685993063923e-06, "loss": 0.297, "step": 1398 }, { "epoch": 0.9086483855265293, "grad_norm": 0.6941819787025452, "learning_rate": 8.349374889298923e-06, "loss": 0.2877, "step": 1400 }, { "epoch": 0.90994645464871, "grad_norm": 0.75313800573349, "learning_rate": 8.344056921297064e-06, "loss": 0.2736, "step": 1402 }, { "epoch": 0.9112445237708908, "grad_norm": 0.6858447790145874, "learning_rate": 8.338732099957038e-06, "loss": 0.2845, "step": 1404 }, { "epoch": 0.9125425928930716, "grad_norm": 0.5937078595161438, "learning_rate": 8.333400436191575e-06, "loss": 0.3002, "step": 1406 }, { "epoch": 0.9138406620152523, "grad_norm": 0.5123778581619263, "learning_rate": 8.32806194092743e-06, "loss": 0.308, "step": 1408 }, { "epoch": 0.915138731137433, "grad_norm": 0.6879609227180481, "learning_rate": 8.322716625105363e-06, "loss": 0.2705, "step": 1410 }, { "epoch": 0.9164368002596138, "grad_norm": 0.5689347386360168, "learning_rate": 8.317364499680107e-06, "loss": 0.2987, "step": 1412 }, { "epoch": 0.9177348693817946, "grad_norm": 0.6719285249710083, "learning_rate": 8.312005575620355e-06, "loss": 0.2762, "step": 1414 }, { "epoch": 0.9190329385039754, "grad_norm": 0.44851255416870117, "learning_rate": 8.306639863908725e-06, "loss": 0.2866, "step": 1416 }, { "epoch": 0.920331007626156, "grad_norm": 0.5113731026649475, "learning_rate": 8.301267375541757e-06, "loss": 0.2559, "step": 1418 }, { "epoch": 0.9216290767483368, "grad_norm": 0.568748414516449, "learning_rate": 8.295888121529873e-06, "loss": 0.2788, "step": 1420 }, { "epoch": 0.9229271458705176, "grad_norm": 0.7482309341430664, "learning_rate": 8.290502112897357e-06, "loss": 0.2776, "step": 1422 }, { "epoch": 0.9242252149926984, "grad_norm": 0.685269832611084, "learning_rate": 8.285109360682344e-06, "loss": 0.2707, "step": 1424 }, { "epoch": 0.9255232841148792, "grad_norm": 0.6623191237449646, "learning_rate": 8.279709875936784e-06, "loss": 0.2605, "step": 1426 }, { "epoch": 0.9268213532370598, "grad_norm": 0.6568692326545715, "learning_rate": 8.274303669726427e-06, "loss": 0.2973, "step": 1428 }, { "epoch": 0.9281194223592406, "grad_norm": 0.6378563642501831, "learning_rate": 8.268890753130794e-06, "loss": 0.2931, "step": 1430 }, { "epoch": 0.9294174914814214, "grad_norm": 0.5852165818214417, "learning_rate": 8.263471137243165e-06, "loss": 0.2657, "step": 1432 }, { "epoch": 0.9307155606036022, "grad_norm": 0.8133228421211243, "learning_rate": 8.258044833170545e-06, "loss": 0.2942, "step": 1434 }, { "epoch": 0.9320136297257829, "grad_norm": 0.6137779355049133, "learning_rate": 8.252611852033648e-06, "loss": 0.2881, "step": 1436 }, { "epoch": 0.9333116988479636, "grad_norm": 0.5792057514190674, "learning_rate": 8.247172204966867e-06, "loss": 0.2671, "step": 1438 }, { "epoch": 0.9346097679701444, "grad_norm": 0.5968283414840698, "learning_rate": 8.241725903118264e-06, "loss": 0.2762, "step": 1440 }, { "epoch": 0.9359078370923252, "grad_norm": 0.6384152770042419, "learning_rate": 8.236272957649534e-06, "loss": 0.2634, "step": 1442 }, { "epoch": 0.937205906214506, "grad_norm": 0.5112971067428589, "learning_rate": 8.23081337973599e-06, "loss": 0.2865, "step": 1444 }, { "epoch": 0.9385039753366867, "grad_norm": 0.7261037826538086, "learning_rate": 8.225347180566534e-06, "loss": 0.2877, "step": 1446 }, { "epoch": 0.9398020444588674, "grad_norm": 0.6861732602119446, "learning_rate": 8.219874371343643e-06, "loss": 0.2983, "step": 1448 }, { "epoch": 0.9411001135810482, "grad_norm": 0.5197873711585999, "learning_rate": 8.214394963283336e-06, "loss": 0.2823, "step": 1450 }, { "epoch": 0.942398182703229, "grad_norm": 0.5893886685371399, "learning_rate": 8.208908967615159e-06, "loss": 0.2662, "step": 1452 }, { "epoch": 0.9436962518254097, "grad_norm": 0.5803912281990051, "learning_rate": 8.203416395582156e-06, "loss": 0.2865, "step": 1454 }, { "epoch": 0.9449943209475905, "grad_norm": 0.6083657145500183, "learning_rate": 8.197917258440851e-06, "loss": 0.2672, "step": 1456 }, { "epoch": 0.9462923900697712, "grad_norm": 0.5100261569023132, "learning_rate": 8.192411567461222e-06, "loss": 0.2724, "step": 1458 }, { "epoch": 0.947590459191952, "grad_norm": 0.5970443487167358, "learning_rate": 8.186899333926676e-06, "loss": 0.2941, "step": 1460 }, { "epoch": 0.9488885283141327, "grad_norm": 0.6493470072746277, "learning_rate": 8.181380569134034e-06, "loss": 0.2864, "step": 1462 }, { "epoch": 0.9501865974363135, "grad_norm": 1.1178690195083618, "learning_rate": 8.175855284393495e-06, "loss": 0.3056, "step": 1464 }, { "epoch": 0.9514846665584943, "grad_norm": 0.66445392370224, "learning_rate": 8.170323491028625e-06, "loss": 0.2697, "step": 1466 }, { "epoch": 0.952782735680675, "grad_norm": 0.5957862734794617, "learning_rate": 8.164785200376326e-06, "loss": 0.2943, "step": 1468 }, { "epoch": 0.9540808048028557, "grad_norm": 0.7215313911437988, "learning_rate": 8.15924042378682e-06, "loss": 0.2711, "step": 1470 }, { "epoch": 0.9553788739250365, "grad_norm": 0.5567473769187927, "learning_rate": 8.153689172623618e-06, "loss": 0.2826, "step": 1472 }, { "epoch": 0.9566769430472173, "grad_norm": 0.607883095741272, "learning_rate": 8.148131458263499e-06, "loss": 0.2845, "step": 1474 }, { "epoch": 0.9579750121693981, "grad_norm": 0.9432318806648254, "learning_rate": 8.142567292096488e-06, "loss": 0.313, "step": 1476 }, { "epoch": 0.9592730812915787, "grad_norm": 0.6713499426841736, "learning_rate": 8.136996685525837e-06, "loss": 0.2771, "step": 1478 }, { "epoch": 0.9605711504137595, "grad_norm": 0.6611443758010864, "learning_rate": 8.131419649967993e-06, "loss": 0.2901, "step": 1480 }, { "epoch": 0.9618692195359403, "grad_norm": 0.48653215169906616, "learning_rate": 8.125836196852577e-06, "loss": 0.2918, "step": 1482 }, { "epoch": 0.9631672886581211, "grad_norm": 0.49231061339378357, "learning_rate": 8.120246337622364e-06, "loss": 0.2967, "step": 1484 }, { "epoch": 0.9644653577803018, "grad_norm": 0.6124557256698608, "learning_rate": 8.114650083733263e-06, "loss": 0.2774, "step": 1486 }, { "epoch": 0.9657634269024825, "grad_norm": 0.597593367099762, "learning_rate": 8.109047446654276e-06, "loss": 0.2772, "step": 1488 }, { "epoch": 0.9670614960246633, "grad_norm": 0.9487907290458679, "learning_rate": 8.103438437867502e-06, "loss": 0.2764, "step": 1490 }, { "epoch": 0.9683595651468441, "grad_norm": 0.6041224598884583, "learning_rate": 8.097823068868085e-06, "loss": 0.261, "step": 1492 }, { "epoch": 0.9696576342690248, "grad_norm": 0.5877594947814941, "learning_rate": 8.092201351164213e-06, "loss": 0.2564, "step": 1494 }, { "epoch": 0.9709557033912056, "grad_norm": 0.9975115060806274, "learning_rate": 8.086573296277078e-06, "loss": 0.2714, "step": 1496 }, { "epoch": 0.9722537725133863, "grad_norm": 0.6410690546035767, "learning_rate": 8.080938915740863e-06, "loss": 0.2975, "step": 1498 }, { "epoch": 0.9735518416355671, "grad_norm": 0.6526374220848083, "learning_rate": 8.075298221102714e-06, "loss": 0.263, "step": 1500 }, { "epoch": 0.9735518416355671, "eval_loss": 0.28459566831588745, "eval_runtime": 397.2374, "eval_samples_per_second": 26.13, "eval_steps_per_second": 3.268, "step": 1500 }, { "epoch": 0.9748499107577479, "grad_norm": 0.6227244734764099, "learning_rate": 8.06965122392272e-06, "loss": 0.2659, "step": 1502 }, { "epoch": 0.9761479798799286, "grad_norm": 0.7231420278549194, "learning_rate": 8.063997935773885e-06, "loss": 0.2634, "step": 1504 }, { "epoch": 0.9774460490021094, "grad_norm": 0.6238067150115967, "learning_rate": 8.058338368242103e-06, "loss": 0.2721, "step": 1506 }, { "epoch": 0.9787441181242901, "grad_norm": 0.7857838273048401, "learning_rate": 8.052672532926137e-06, "loss": 0.3416, "step": 1508 }, { "epoch": 0.9800421872464709, "grad_norm": 0.6195446848869324, "learning_rate": 8.0470004414376e-06, "loss": 0.2882, "step": 1510 }, { "epoch": 0.9813402563686516, "grad_norm": 0.8230568766593933, "learning_rate": 8.041322105400923e-06, "loss": 0.3002, "step": 1512 }, { "epoch": 0.9826383254908324, "grad_norm": 0.6055063009262085, "learning_rate": 8.035637536453336e-06, "loss": 0.28, "step": 1514 }, { "epoch": 0.9839363946130132, "grad_norm": 0.71924889087677, "learning_rate": 8.029946746244839e-06, "loss": 0.2914, "step": 1516 }, { "epoch": 0.9852344637351939, "grad_norm": 0.6978484988212585, "learning_rate": 8.024249746438189e-06, "loss": 0.2927, "step": 1518 }, { "epoch": 0.9865325328573746, "grad_norm": 0.6936092972755432, "learning_rate": 8.018546548708862e-06, "loss": 0.2897, "step": 1520 }, { "epoch": 0.9878306019795554, "grad_norm": 0.7579458355903625, "learning_rate": 8.01283716474504e-06, "loss": 0.2712, "step": 1522 }, { "epoch": 0.9891286711017362, "grad_norm": 0.6067072749137878, "learning_rate": 8.007121606247583e-06, "loss": 0.2851, "step": 1524 }, { "epoch": 0.990426740223917, "grad_norm": 0.672527551651001, "learning_rate": 8.001399884930004e-06, "loss": 0.3041, "step": 1526 }, { "epoch": 0.9917248093460976, "grad_norm": 0.8299069404602051, "learning_rate": 7.995672012518444e-06, "loss": 0.2754, "step": 1528 }, { "epoch": 0.9930228784682784, "grad_norm": 0.5839980244636536, "learning_rate": 7.989938000751655e-06, "loss": 0.3113, "step": 1530 }, { "epoch": 0.9943209475904592, "grad_norm": 0.48691216111183167, "learning_rate": 7.98419786138097e-06, "loss": 0.2699, "step": 1532 }, { "epoch": 0.99561901671264, "grad_norm": 0.6445747017860413, "learning_rate": 7.978451606170275e-06, "loss": 0.2637, "step": 1534 }, { "epoch": 0.9969170858348207, "grad_norm": 0.623403787612915, "learning_rate": 7.972699246895996e-06, "loss": 0.2596, "step": 1536 }, { "epoch": 0.9982151549570014, "grad_norm": 0.7055762410163879, "learning_rate": 7.966940795347066e-06, "loss": 0.266, "step": 1538 }, { "epoch": 0.9995132240791822, "grad_norm": 0.625282883644104, "learning_rate": 7.961176263324902e-06, "loss": 0.2673, "step": 1540 }, { "epoch": 1.0008112932013629, "grad_norm": 0.6170228123664856, "learning_rate": 7.955405662643384e-06, "loss": 0.2841, "step": 1542 }, { "epoch": 1.0021093623235438, "grad_norm": 0.6153374314308167, "learning_rate": 7.94962900512883e-06, "loss": 0.3138, "step": 1544 }, { "epoch": 1.0034074314457244, "grad_norm": 0.6280811429023743, "learning_rate": 7.94384630261997e-06, "loss": 0.2884, "step": 1546 }, { "epoch": 1.0047055005679053, "grad_norm": 0.7206188440322876, "learning_rate": 7.938057566967926e-06, "loss": 0.2595, "step": 1548 }, { "epoch": 1.006003569690086, "grad_norm": 0.6341105699539185, "learning_rate": 7.932262810036176e-06, "loss": 0.2691, "step": 1550 }, { "epoch": 1.0073016388122666, "grad_norm": 0.6815624833106995, "learning_rate": 7.926462043700544e-06, "loss": 0.2787, "step": 1552 }, { "epoch": 1.0085997079344475, "grad_norm": 0.6890838742256165, "learning_rate": 7.920655279849173e-06, "loss": 0.2769, "step": 1554 }, { "epoch": 1.0098977770566282, "grad_norm": 1.006568193435669, "learning_rate": 7.91484253038249e-06, "loss": 0.3032, "step": 1556 }, { "epoch": 1.011195846178809, "grad_norm": 0.7289845943450928, "learning_rate": 7.909023807213193e-06, "loss": 0.2488, "step": 1558 }, { "epoch": 1.0124939153009898, "grad_norm": 0.6092172861099243, "learning_rate": 7.90319912226622e-06, "loss": 0.2688, "step": 1560 }, { "epoch": 1.0137919844231704, "grad_norm": 0.6097801327705383, "learning_rate": 7.897368487478733e-06, "loss": 0.2819, "step": 1562 }, { "epoch": 1.0150900535453513, "grad_norm": 1.2450780868530273, "learning_rate": 7.89153191480008e-06, "loss": 0.269, "step": 1564 }, { "epoch": 1.016388122667532, "grad_norm": 0.700198769569397, "learning_rate": 7.885689416191785e-06, "loss": 0.2708, "step": 1566 }, { "epoch": 1.0176861917897129, "grad_norm": 0.6829163432121277, "learning_rate": 7.87984100362751e-06, "loss": 0.2616, "step": 1568 }, { "epoch": 1.0189842609118935, "grad_norm": 0.7198750376701355, "learning_rate": 7.87398668909304e-06, "loss": 0.2651, "step": 1570 }, { "epoch": 1.0202823300340742, "grad_norm": 0.9876133799552917, "learning_rate": 7.868126484586261e-06, "loss": 0.2926, "step": 1572 }, { "epoch": 1.021580399156255, "grad_norm": 0.547264039516449, "learning_rate": 7.862260402117121e-06, "loss": 0.2558, "step": 1574 }, { "epoch": 1.0228784682784358, "grad_norm": 0.866314709186554, "learning_rate": 7.85638845370762e-06, "loss": 0.2688, "step": 1576 }, { "epoch": 1.0241765374006166, "grad_norm": 0.7893325686454773, "learning_rate": 7.850510651391778e-06, "loss": 0.3133, "step": 1578 }, { "epoch": 1.0254746065227973, "grad_norm": 0.8353627324104309, "learning_rate": 7.844627007215613e-06, "loss": 0.2799, "step": 1580 }, { "epoch": 1.026772675644978, "grad_norm": 0.6759359240531921, "learning_rate": 7.838737533237111e-06, "loss": 0.3038, "step": 1582 }, { "epoch": 1.0280707447671589, "grad_norm": 0.693621814250946, "learning_rate": 7.832842241526212e-06, "loss": 0.3311, "step": 1584 }, { "epoch": 1.0293688138893395, "grad_norm": 0.856414258480072, "learning_rate": 7.826941144164777e-06, "loss": 0.2808, "step": 1586 }, { "epoch": 1.0306668830115204, "grad_norm": 0.7658901810646057, "learning_rate": 7.82103425324656e-06, "loss": 0.2671, "step": 1588 }, { "epoch": 1.031964952133701, "grad_norm": 0.6982436776161194, "learning_rate": 7.815121580877197e-06, "loss": 0.254, "step": 1590 }, { "epoch": 1.0332630212558818, "grad_norm": 0.6291922926902771, "learning_rate": 7.809203139174167e-06, "loss": 0.2561, "step": 1592 }, { "epoch": 1.0345610903780627, "grad_norm": 0.6522267460823059, "learning_rate": 7.803278940266772e-06, "loss": 0.2777, "step": 1594 }, { "epoch": 1.0358591595002433, "grad_norm": 0.6874932646751404, "learning_rate": 7.797348996296116e-06, "loss": 0.3095, "step": 1596 }, { "epoch": 1.0371572286224242, "grad_norm": 0.6713000535964966, "learning_rate": 7.791413319415076e-06, "loss": 0.2871, "step": 1598 }, { "epoch": 1.0384552977446049, "grad_norm": 0.5834596753120422, "learning_rate": 7.785471921788276e-06, "loss": 0.2798, "step": 1600 }, { "epoch": 1.0397533668667855, "grad_norm": 0.64345782995224, "learning_rate": 7.779524815592068e-06, "loss": 0.2793, "step": 1602 }, { "epoch": 1.0410514359889664, "grad_norm": 0.6583820581436157, "learning_rate": 7.773572013014502e-06, "loss": 0.2934, "step": 1604 }, { "epoch": 1.042349505111147, "grad_norm": 0.6981980204582214, "learning_rate": 7.767613526255296e-06, "loss": 0.2829, "step": 1606 }, { "epoch": 1.043647574233328, "grad_norm": 0.6999492049217224, "learning_rate": 7.761649367525828e-06, "loss": 0.278, "step": 1608 }, { "epoch": 1.0449456433555087, "grad_norm": 0.5379292368888855, "learning_rate": 7.755679549049093e-06, "loss": 0.2693, "step": 1610 }, { "epoch": 1.0462437124776893, "grad_norm": 0.5985593199729919, "learning_rate": 7.749704083059686e-06, "loss": 0.256, "step": 1612 }, { "epoch": 1.0475417815998702, "grad_norm": 0.6079729795455933, "learning_rate": 7.743722981803777e-06, "loss": 0.2799, "step": 1614 }, { "epoch": 1.0488398507220509, "grad_norm": 0.5461024045944214, "learning_rate": 7.737736257539086e-06, "loss": 0.2732, "step": 1616 }, { "epoch": 1.0501379198442318, "grad_norm": 0.7636616230010986, "learning_rate": 7.731743922534854e-06, "loss": 0.2501, "step": 1618 }, { "epoch": 1.0514359889664124, "grad_norm": 0.6093832850456238, "learning_rate": 7.725745989071825e-06, "loss": 0.2506, "step": 1620 }, { "epoch": 1.052734058088593, "grad_norm": 0.6643341779708862, "learning_rate": 7.71974246944221e-06, "loss": 0.2851, "step": 1622 }, { "epoch": 1.054032127210774, "grad_norm": 0.5813004970550537, "learning_rate": 7.713733375949677e-06, "loss": 0.2762, "step": 1624 }, { "epoch": 1.0553301963329547, "grad_norm": 1.1216081380844116, "learning_rate": 7.707718720909308e-06, "loss": 0.2896, "step": 1626 }, { "epoch": 1.0566282654551356, "grad_norm": 0.8359357118606567, "learning_rate": 7.70169851664759e-06, "loss": 0.2917, "step": 1628 }, { "epoch": 1.0579263345773162, "grad_norm": 0.8841298222541809, "learning_rate": 7.695672775502379e-06, "loss": 0.2921, "step": 1630 }, { "epoch": 1.0592244036994969, "grad_norm": 0.7251891493797302, "learning_rate": 7.689641509822879e-06, "loss": 0.2729, "step": 1632 }, { "epoch": 1.0605224728216778, "grad_norm": 0.5894423127174377, "learning_rate": 7.683604731969616e-06, "loss": 0.2742, "step": 1634 }, { "epoch": 1.0618205419438584, "grad_norm": 0.6796072721481323, "learning_rate": 7.677562454314414e-06, "loss": 0.2904, "step": 1636 }, { "epoch": 1.0631186110660393, "grad_norm": 0.8476268649101257, "learning_rate": 7.671514689240366e-06, "loss": 0.2788, "step": 1638 }, { "epoch": 1.06441668018822, "grad_norm": 1.0308847427368164, "learning_rate": 7.665461449141814e-06, "loss": 0.2862, "step": 1640 }, { "epoch": 1.0657147493104007, "grad_norm": 0.8409659266471863, "learning_rate": 7.659402746424316e-06, "loss": 0.2834, "step": 1642 }, { "epoch": 1.0670128184325816, "grad_norm": 0.5841651558876038, "learning_rate": 7.653338593504632e-06, "loss": 0.2763, "step": 1644 }, { "epoch": 1.0683108875547622, "grad_norm": 0.6023411154747009, "learning_rate": 7.647269002810683e-06, "loss": 0.2746, "step": 1646 }, { "epoch": 1.0696089566769431, "grad_norm": 0.639076292514801, "learning_rate": 7.641193986781542e-06, "loss": 0.2658, "step": 1648 }, { "epoch": 1.0709070257991238, "grad_norm": 0.8287789225578308, "learning_rate": 7.635113557867395e-06, "loss": 0.2985, "step": 1650 }, { "epoch": 1.0722050949213044, "grad_norm": 0.8636316061019897, "learning_rate": 7.629027728529527e-06, "loss": 0.3108, "step": 1652 }, { "epoch": 1.0735031640434853, "grad_norm": 0.5215837955474854, "learning_rate": 7.622936511240284e-06, "loss": 0.2599, "step": 1654 }, { "epoch": 1.074801233165666, "grad_norm": 0.8347405195236206, "learning_rate": 7.616839918483061e-06, "loss": 0.2597, "step": 1656 }, { "epoch": 1.076099302287847, "grad_norm": 0.5234360694885254, "learning_rate": 7.610737962752264e-06, "loss": 0.2746, "step": 1658 }, { "epoch": 1.0773973714100276, "grad_norm": 0.5826201438903809, "learning_rate": 7.604630656553292e-06, "loss": 0.2735, "step": 1660 }, { "epoch": 1.0786954405322082, "grad_norm": 0.7408040165901184, "learning_rate": 7.598518012402509e-06, "loss": 0.2885, "step": 1662 }, { "epoch": 1.0799935096543891, "grad_norm": 0.6896710395812988, "learning_rate": 7.592400042827221e-06, "loss": 0.2782, "step": 1664 }, { "epoch": 1.0812915787765698, "grad_norm": 1.0489345788955688, "learning_rate": 7.586276760365645e-06, "loss": 0.3131, "step": 1666 }, { "epoch": 1.0825896478987507, "grad_norm": 0.827761709690094, "learning_rate": 7.580148177566886e-06, "loss": 0.2775, "step": 1668 }, { "epoch": 1.0838877170209313, "grad_norm": 0.6957029700279236, "learning_rate": 7.574014306990916e-06, "loss": 0.2797, "step": 1670 }, { "epoch": 1.085185786143112, "grad_norm": 0.6354756951332092, "learning_rate": 7.5678751612085344e-06, "loss": 0.2805, "step": 1672 }, { "epoch": 1.086483855265293, "grad_norm": 0.5801124572753906, "learning_rate": 7.561730752801364e-06, "loss": 0.2718, "step": 1674 }, { "epoch": 1.0877819243874736, "grad_norm": 0.7270207405090332, "learning_rate": 7.555581094361804e-06, "loss": 0.2876, "step": 1676 }, { "epoch": 1.0890799935096545, "grad_norm": 0.5913581252098083, "learning_rate": 7.549426198493014e-06, "loss": 0.2697, "step": 1678 }, { "epoch": 1.0903780626318351, "grad_norm": 0.5647186636924744, "learning_rate": 7.543266077808893e-06, "loss": 0.309, "step": 1680 }, { "epoch": 1.0916761317540158, "grad_norm": 0.6343077421188354, "learning_rate": 7.5371007449340406e-06, "loss": 0.2493, "step": 1682 }, { "epoch": 1.0929742008761967, "grad_norm": 0.6789856553077698, "learning_rate": 7.530930212503742e-06, "loss": 0.2574, "step": 1684 }, { "epoch": 1.0942722699983773, "grad_norm": 0.6660630702972412, "learning_rate": 7.524754493163939e-06, "loss": 0.2819, "step": 1686 }, { "epoch": 1.0955703391205582, "grad_norm": 0.6563140153884888, "learning_rate": 7.5185735995712025e-06, "loss": 0.2781, "step": 1688 }, { "epoch": 1.096868408242739, "grad_norm": 0.5954127907752991, "learning_rate": 7.512387544392706e-06, "loss": 0.2991, "step": 1690 }, { "epoch": 1.0981664773649196, "grad_norm": 0.6786440014839172, "learning_rate": 7.506196340306204e-06, "loss": 0.2739, "step": 1692 }, { "epoch": 1.0994645464871005, "grad_norm": 0.7437634468078613, "learning_rate": 7.500000000000001e-06, "loss": 0.2987, "step": 1694 }, { "epoch": 1.1007626156092811, "grad_norm": 0.6413884162902832, "learning_rate": 7.4937985361729285e-06, "loss": 0.2653, "step": 1696 }, { "epoch": 1.102060684731462, "grad_norm": 0.5322827696800232, "learning_rate": 7.487591961534319e-06, "loss": 0.266, "step": 1698 }, { "epoch": 1.1033587538536427, "grad_norm": 0.6884098649024963, "learning_rate": 7.481380288803976e-06, "loss": 0.2856, "step": 1700 }, { "epoch": 1.1046568229758233, "grad_norm": 0.6576172709465027, "learning_rate": 7.475163530712157e-06, "loss": 0.2797, "step": 1702 }, { "epoch": 1.1059548920980042, "grad_norm": 0.6505524516105652, "learning_rate": 7.468941699999535e-06, "loss": 0.2654, "step": 1704 }, { "epoch": 1.107252961220185, "grad_norm": 0.6331707835197449, "learning_rate": 7.4627148094171844e-06, "loss": 0.2634, "step": 1706 }, { "epoch": 1.1085510303423658, "grad_norm": 0.5772790908813477, "learning_rate": 7.456482871726545e-06, "loss": 0.289, "step": 1708 }, { "epoch": 1.1098490994645465, "grad_norm": 0.6612452864646912, "learning_rate": 7.450245899699401e-06, "loss": 0.2829, "step": 1710 }, { "epoch": 1.1111471685867271, "grad_norm": 0.7573365569114685, "learning_rate": 7.4440039061178585e-06, "loss": 0.2774, "step": 1712 }, { "epoch": 1.112445237708908, "grad_norm": 0.6036496758460999, "learning_rate": 7.437756903774307e-06, "loss": 0.2714, "step": 1714 }, { "epoch": 1.1137433068310887, "grad_norm": 0.6847440600395203, "learning_rate": 7.431504905471407e-06, "loss": 0.2739, "step": 1716 }, { "epoch": 1.1150413759532696, "grad_norm": 0.7789754271507263, "learning_rate": 7.425247924022058e-06, "loss": 0.2952, "step": 1718 }, { "epoch": 1.1163394450754502, "grad_norm": 0.7801019549369812, "learning_rate": 7.4189859722493665e-06, "loss": 0.2624, "step": 1720 }, { "epoch": 1.117637514197631, "grad_norm": 0.6115939021110535, "learning_rate": 7.412719062986632e-06, "loss": 0.2753, "step": 1722 }, { "epoch": 1.1189355833198118, "grad_norm": 0.6163650155067444, "learning_rate": 7.406447209077307e-06, "loss": 0.2787, "step": 1724 }, { "epoch": 1.1202336524419925, "grad_norm": 1.0455377101898193, "learning_rate": 7.4001704233749864e-06, "loss": 0.2695, "step": 1726 }, { "epoch": 1.1215317215641734, "grad_norm": 0.5705534815788269, "learning_rate": 7.393888718743362e-06, "loss": 0.2659, "step": 1728 }, { "epoch": 1.122829790686354, "grad_norm": 0.6186498999595642, "learning_rate": 7.387602108056214e-06, "loss": 0.2687, "step": 1730 }, { "epoch": 1.1241278598085347, "grad_norm": 0.5981277227401733, "learning_rate": 7.381310604197375e-06, "loss": 0.2812, "step": 1732 }, { "epoch": 1.1254259289307156, "grad_norm": 0.6603893637657166, "learning_rate": 7.375014220060705e-06, "loss": 0.2904, "step": 1734 }, { "epoch": 1.1267239980528962, "grad_norm": 0.5768886208534241, "learning_rate": 7.368712968550068e-06, "loss": 0.2549, "step": 1736 }, { "epoch": 1.1280220671750771, "grad_norm": 0.7435418367385864, "learning_rate": 7.362406862579299e-06, "loss": 0.2909, "step": 1738 }, { "epoch": 1.1293201362972578, "grad_norm": 0.8667522668838501, "learning_rate": 7.3560959150721844e-06, "loss": 0.3026, "step": 1740 }, { "epoch": 1.1306182054194385, "grad_norm": 0.8828291893005371, "learning_rate": 7.3497801389624345e-06, "loss": 0.3033, "step": 1742 }, { "epoch": 1.1319162745416194, "grad_norm": 0.7345783114433289, "learning_rate": 7.343459547193652e-06, "loss": 0.2697, "step": 1744 }, { "epoch": 1.1332143436638, "grad_norm": 0.6791905760765076, "learning_rate": 7.337134152719312e-06, "loss": 0.3237, "step": 1746 }, { "epoch": 1.134512412785981, "grad_norm": 0.69078528881073, "learning_rate": 7.330803968502728e-06, "loss": 0.2797, "step": 1748 }, { "epoch": 1.1358104819081616, "grad_norm": 0.6449896097183228, "learning_rate": 7.324469007517035e-06, "loss": 0.2741, "step": 1750 }, { "epoch": 1.1371085510303423, "grad_norm": 0.7471843957901001, "learning_rate": 7.318129282745152e-06, "loss": 0.2988, "step": 1752 }, { "epoch": 1.1384066201525231, "grad_norm": 0.8137723803520203, "learning_rate": 7.311784807179769e-06, "loss": 0.2828, "step": 1754 }, { "epoch": 1.1397046892747038, "grad_norm": 0.6233217716217041, "learning_rate": 7.305435593823303e-06, "loss": 0.2832, "step": 1756 }, { "epoch": 1.1410027583968847, "grad_norm": 0.8680141568183899, "learning_rate": 7.299081655687885e-06, "loss": 0.2854, "step": 1758 }, { "epoch": 1.1423008275190654, "grad_norm": 0.7370233535766602, "learning_rate": 7.29272300579533e-06, "loss": 0.3256, "step": 1760 }, { "epoch": 1.143598896641246, "grad_norm": 0.6261857151985168, "learning_rate": 7.2863596571771086e-06, "loss": 0.2525, "step": 1762 }, { "epoch": 1.144896965763427, "grad_norm": 0.8367358446121216, "learning_rate": 7.279991622874319e-06, "loss": 0.2779, "step": 1764 }, { "epoch": 1.1461950348856076, "grad_norm": 0.7411924600601196, "learning_rate": 7.273618915937666e-06, "loss": 0.2994, "step": 1766 }, { "epoch": 1.1474931040077885, "grad_norm": 0.732494056224823, "learning_rate": 7.267241549427426e-06, "loss": 0.2847, "step": 1768 }, { "epoch": 1.1487911731299691, "grad_norm": 0.8433805108070374, "learning_rate": 7.260859536413429e-06, "loss": 0.2655, "step": 1770 }, { "epoch": 1.1500892422521498, "grad_norm": 0.9636470675468445, "learning_rate": 7.254472889975025e-06, "loss": 0.3023, "step": 1772 }, { "epoch": 1.1513873113743307, "grad_norm": 0.717646598815918, "learning_rate": 7.248081623201061e-06, "loss": 0.2527, "step": 1774 }, { "epoch": 1.1526853804965114, "grad_norm": 0.7321946024894714, "learning_rate": 7.241685749189851e-06, "loss": 0.2737, "step": 1776 }, { "epoch": 1.1539834496186923, "grad_norm": 0.6342837810516357, "learning_rate": 7.235285281049154e-06, "loss": 0.2523, "step": 1778 }, { "epoch": 1.155281518740873, "grad_norm": 0.5659118890762329, "learning_rate": 7.228880231896143e-06, "loss": 0.2631, "step": 1780 }, { "epoch": 1.1565795878630536, "grad_norm": 0.8062835931777954, "learning_rate": 7.22247061485738e-06, "loss": 0.2834, "step": 1782 }, { "epoch": 1.1578776569852345, "grad_norm": 0.8629544377326965, "learning_rate": 7.216056443068784e-06, "loss": 0.277, "step": 1784 }, { "epoch": 1.1591757261074151, "grad_norm": 0.6428398489952087, "learning_rate": 7.209637729675615e-06, "loss": 0.2768, "step": 1786 }, { "epoch": 1.160473795229596, "grad_norm": 0.751363217830658, "learning_rate": 7.203214487832437e-06, "loss": 0.2637, "step": 1788 }, { "epoch": 1.1617718643517767, "grad_norm": 0.9838536977767944, "learning_rate": 7.196786730703097e-06, "loss": 0.3142, "step": 1790 }, { "epoch": 1.1630699334739574, "grad_norm": 0.7683244943618774, "learning_rate": 7.190354471460692e-06, "loss": 0.2858, "step": 1792 }, { "epoch": 1.1643680025961383, "grad_norm": 0.8046476244926453, "learning_rate": 7.18391772328755e-06, "loss": 0.2964, "step": 1794 }, { "epoch": 1.165666071718319, "grad_norm": 0.7937556505203247, "learning_rate": 7.177476499375193e-06, "loss": 0.2975, "step": 1796 }, { "epoch": 1.1669641408404998, "grad_norm": 0.6757269501686096, "learning_rate": 7.171030812924325e-06, "loss": 0.2483, "step": 1798 }, { "epoch": 1.1682622099626805, "grad_norm": 0.5440795421600342, "learning_rate": 7.164580677144781e-06, "loss": 0.3115, "step": 1800 }, { "epoch": 1.1695602790848612, "grad_norm": 0.7921977639198303, "learning_rate": 7.158126105255533e-06, "loss": 0.268, "step": 1802 }, { "epoch": 1.170858348207042, "grad_norm": 0.6896278858184814, "learning_rate": 7.151667110484626e-06, "loss": 0.259, "step": 1804 }, { "epoch": 1.1721564173292227, "grad_norm": 0.7437115907669067, "learning_rate": 7.145203706069183e-06, "loss": 0.2568, "step": 1806 }, { "epoch": 1.1734544864514036, "grad_norm": 0.554347038269043, "learning_rate": 7.138735905255355e-06, "loss": 0.2737, "step": 1808 }, { "epoch": 1.1747525555735843, "grad_norm": 0.9953756332397461, "learning_rate": 7.132263721298312e-06, "loss": 0.2722, "step": 1810 }, { "epoch": 1.176050624695765, "grad_norm": 0.8220161199569702, "learning_rate": 7.125787167462197e-06, "loss": 0.2953, "step": 1812 }, { "epoch": 1.1773486938179458, "grad_norm": 0.732215940952301, "learning_rate": 7.119306257020117e-06, "loss": 0.2789, "step": 1814 }, { "epoch": 1.1786467629401265, "grad_norm": 0.6108527779579163, "learning_rate": 7.1128210032541e-06, "loss": 0.2711, "step": 1816 }, { "epoch": 1.1799448320623074, "grad_norm": 0.6442737579345703, "learning_rate": 7.106331419455085e-06, "loss": 0.2666, "step": 1818 }, { "epoch": 1.181242901184488, "grad_norm": 0.5654114484786987, "learning_rate": 7.099837518922873e-06, "loss": 0.263, "step": 1820 }, { "epoch": 1.1825409703066687, "grad_norm": 0.6771984100341797, "learning_rate": 7.093339314966124e-06, "loss": 0.2644, "step": 1822 }, { "epoch": 1.1838390394288496, "grad_norm": 0.9465017914772034, "learning_rate": 7.086836820902305e-06, "loss": 0.3016, "step": 1824 }, { "epoch": 1.1851371085510303, "grad_norm": 0.6316088438034058, "learning_rate": 7.080330050057687e-06, "loss": 0.2895, "step": 1826 }, { "epoch": 1.1864351776732112, "grad_norm": 0.5392777323722839, "learning_rate": 7.073819015767298e-06, "loss": 0.2905, "step": 1828 }, { "epoch": 1.1877332467953918, "grad_norm": 0.7234270572662354, "learning_rate": 7.067303731374907e-06, "loss": 0.2787, "step": 1830 }, { "epoch": 1.1890313159175725, "grad_norm": 0.9397181868553162, "learning_rate": 7.06078421023299e-06, "loss": 0.2957, "step": 1832 }, { "epoch": 1.1903293850397534, "grad_norm": 0.7759517431259155, "learning_rate": 7.054260465702712e-06, "loss": 0.3027, "step": 1834 }, { "epoch": 1.191627454161934, "grad_norm": 0.5716654062271118, "learning_rate": 7.047732511153885e-06, "loss": 0.2786, "step": 1836 }, { "epoch": 1.192925523284115, "grad_norm": 0.6282373070716858, "learning_rate": 7.041200359964957e-06, "loss": 0.2688, "step": 1838 }, { "epoch": 1.1942235924062956, "grad_norm": 0.654155433177948, "learning_rate": 7.034664025522972e-06, "loss": 0.2892, "step": 1840 }, { "epoch": 1.1955216615284763, "grad_norm": 0.6796264052391052, "learning_rate": 7.028123521223548e-06, "loss": 0.2734, "step": 1842 }, { "epoch": 1.1968197306506572, "grad_norm": 0.595437228679657, "learning_rate": 7.021578860470848e-06, "loss": 0.2713, "step": 1844 }, { "epoch": 1.1981177997728378, "grad_norm": 0.8119726777076721, "learning_rate": 7.015030056677559e-06, "loss": 0.2737, "step": 1846 }, { "epoch": 1.1994158688950187, "grad_norm": 0.5636496543884277, "learning_rate": 7.008477123264849e-06, "loss": 0.2742, "step": 1848 }, { "epoch": 1.2007139380171994, "grad_norm": 0.6880688071250916, "learning_rate": 7.001920073662356e-06, "loss": 0.2817, "step": 1850 }, { "epoch": 1.20201200713938, "grad_norm": 0.9881495833396912, "learning_rate": 6.995358921308153e-06, "loss": 0.2897, "step": 1852 }, { "epoch": 1.203310076261561, "grad_norm": 0.8301553130149841, "learning_rate": 6.98879367964872e-06, "loss": 0.2635, "step": 1854 }, { "epoch": 1.2046081453837416, "grad_norm": 0.6604253053665161, "learning_rate": 6.9822243621389175e-06, "loss": 0.2854, "step": 1856 }, { "epoch": 1.2059062145059225, "grad_norm": 0.6378334164619446, "learning_rate": 6.975650982241962e-06, "loss": 0.2532, "step": 1858 }, { "epoch": 1.2072042836281032, "grad_norm": 0.90976881980896, "learning_rate": 6.969073553429388e-06, "loss": 0.2617, "step": 1860 }, { "epoch": 1.2085023527502838, "grad_norm": 0.8062371015548706, "learning_rate": 6.96249208918104e-06, "loss": 0.266, "step": 1862 }, { "epoch": 1.2098004218724647, "grad_norm": 0.6593420505523682, "learning_rate": 6.9559066029850195e-06, "loss": 0.2674, "step": 1864 }, { "epoch": 1.2110984909946454, "grad_norm": 0.8178257346153259, "learning_rate": 6.949317108337681e-06, "loss": 0.284, "step": 1866 }, { "epoch": 1.2123965601168263, "grad_norm": 0.6504294872283936, "learning_rate": 6.9427236187435886e-06, "loss": 0.2793, "step": 1868 }, { "epoch": 1.213694629239007, "grad_norm": 0.6841611862182617, "learning_rate": 6.936126147715494e-06, "loss": 0.2575, "step": 1870 }, { "epoch": 1.2149926983611876, "grad_norm": 0.6120657920837402, "learning_rate": 6.92952470877431e-06, "loss": 0.2694, "step": 1872 }, { "epoch": 1.2162907674833685, "grad_norm": 0.5534852147102356, "learning_rate": 6.92291931544908e-06, "loss": 0.2698, "step": 1874 }, { "epoch": 1.2175888366055492, "grad_norm": 0.5726101994514465, "learning_rate": 6.916309981276954e-06, "loss": 0.2648, "step": 1876 }, { "epoch": 1.21888690572773, "grad_norm": 0.8797536492347717, "learning_rate": 6.909696719803156e-06, "loss": 0.2825, "step": 1878 }, { "epoch": 1.2201849748499107, "grad_norm": 1.3035683631896973, "learning_rate": 6.903079544580957e-06, "loss": 0.303, "step": 1880 }, { "epoch": 1.2214830439720914, "grad_norm": 1.0807839632034302, "learning_rate": 6.896458469171654e-06, "loss": 0.2844, "step": 1882 }, { "epoch": 1.2227811130942723, "grad_norm": 0.5816299915313721, "learning_rate": 6.889833507144534e-06, "loss": 0.2603, "step": 1884 }, { "epoch": 1.224079182216453, "grad_norm": 0.6241100430488586, "learning_rate": 6.8832046720768475e-06, "loss": 0.2808, "step": 1886 }, { "epoch": 1.2253772513386338, "grad_norm": 0.7606406211853027, "learning_rate": 6.876571977553786e-06, "loss": 0.264, "step": 1888 }, { "epoch": 1.2266753204608145, "grad_norm": 0.8786008954048157, "learning_rate": 6.869935437168449e-06, "loss": 0.2692, "step": 1890 }, { "epoch": 1.2279733895829952, "grad_norm": 0.5967274308204651, "learning_rate": 6.863295064521816e-06, "loss": 0.263, "step": 1892 }, { "epoch": 1.229271458705176, "grad_norm": 0.7956132292747498, "learning_rate": 6.8566508732227255e-06, "loss": 0.2627, "step": 1894 }, { "epoch": 1.2305695278273567, "grad_norm": 0.7813339233398438, "learning_rate": 6.850002876887836e-06, "loss": 0.2727, "step": 1896 }, { "epoch": 1.2318675969495376, "grad_norm": 0.6777212023735046, "learning_rate": 6.843351089141606e-06, "loss": 0.2513, "step": 1898 }, { "epoch": 1.2331656660717183, "grad_norm": 1.0501703023910522, "learning_rate": 6.836695523616268e-06, "loss": 0.2591, "step": 1900 }, { "epoch": 1.234463735193899, "grad_norm": 0.8571747541427612, "learning_rate": 6.83003619395179e-06, "loss": 0.2628, "step": 1902 }, { "epoch": 1.2357618043160798, "grad_norm": 1.218461513519287, "learning_rate": 6.82337311379586e-06, "loss": 0.32, "step": 1904 }, { "epoch": 1.2370598734382605, "grad_norm": 0.7305260300636292, "learning_rate": 6.81670629680385e-06, "loss": 0.2718, "step": 1906 }, { "epoch": 1.2383579425604414, "grad_norm": 0.6194341778755188, "learning_rate": 6.81003575663879e-06, "loss": 0.2498, "step": 1908 }, { "epoch": 1.239656011682622, "grad_norm": 0.8357636332511902, "learning_rate": 6.803361506971341e-06, "loss": 0.282, "step": 1910 }, { "epoch": 1.2409540808048027, "grad_norm": 0.696173906326294, "learning_rate": 6.796683561479766e-06, "loss": 0.3304, "step": 1912 }, { "epoch": 1.2422521499269836, "grad_norm": 0.5937722325325012, "learning_rate": 6.7900019338499005e-06, "loss": 0.2574, "step": 1914 }, { "epoch": 1.2435502190491643, "grad_norm": 0.5384775400161743, "learning_rate": 6.78331663777513e-06, "loss": 0.2678, "step": 1916 }, { "epoch": 1.2448482881713452, "grad_norm": 0.5718085169792175, "learning_rate": 6.776627686956354e-06, "loss": 0.2802, "step": 1918 }, { "epoch": 1.2461463572935259, "grad_norm": 0.5267000794410706, "learning_rate": 6.7699350951019685e-06, "loss": 0.2645, "step": 1920 }, { "epoch": 1.2474444264157065, "grad_norm": 0.8117934465408325, "learning_rate": 6.7632388759278225e-06, "loss": 0.287, "step": 1922 }, { "epoch": 1.2487424955378874, "grad_norm": 0.7576075196266174, "learning_rate": 6.756539043157204e-06, "loss": 0.2884, "step": 1924 }, { "epoch": 1.250040564660068, "grad_norm": 0.6339088082313538, "learning_rate": 6.74983561052081e-06, "loss": 0.2639, "step": 1926 }, { "epoch": 1.251338633782249, "grad_norm": 0.7152970433235168, "learning_rate": 6.743128591756707e-06, "loss": 0.2553, "step": 1928 }, { "epoch": 1.2526367029044296, "grad_norm": 0.7142999172210693, "learning_rate": 6.7364180006103165e-06, "loss": 0.2811, "step": 1930 }, { "epoch": 1.2539347720266103, "grad_norm": 0.6179227232933044, "learning_rate": 6.729703850834381e-06, "loss": 0.2279, "step": 1932 }, { "epoch": 1.2552328411487912, "grad_norm": 0.5081775188446045, "learning_rate": 6.722986156188935e-06, "loss": 0.2953, "step": 1934 }, { "epoch": 1.2565309102709719, "grad_norm": 0.9362536072731018, "learning_rate": 6.716264930441279e-06, "loss": 0.4065, "step": 1936 }, { "epoch": 1.2578289793931527, "grad_norm": 0.6047359108924866, "learning_rate": 6.7095401873659446e-06, "loss": 0.2841, "step": 1938 }, { "epoch": 1.2591270485153334, "grad_norm": 0.7063091397285461, "learning_rate": 6.702811940744681e-06, "loss": 0.2624, "step": 1940 }, { "epoch": 1.260425117637514, "grad_norm": 0.7677826881408691, "learning_rate": 6.696080204366411e-06, "loss": 0.2677, "step": 1942 }, { "epoch": 1.261723186759695, "grad_norm": 0.7509384751319885, "learning_rate": 6.689344992027213e-06, "loss": 0.3026, "step": 1944 }, { "epoch": 1.2630212558818756, "grad_norm": 0.6702812314033508, "learning_rate": 6.682606317530284e-06, "loss": 0.2581, "step": 1946 }, { "epoch": 1.2643193250040565, "grad_norm": 0.6308957934379578, "learning_rate": 6.675864194685924e-06, "loss": 0.2977, "step": 1948 }, { "epoch": 1.2656173941262372, "grad_norm": 0.8863910436630249, "learning_rate": 6.669118637311491e-06, "loss": 0.2826, "step": 1950 }, { "epoch": 1.2669154632484179, "grad_norm": 0.6605067253112793, "learning_rate": 6.6623696592313926e-06, "loss": 0.2624, "step": 1952 }, { "epoch": 1.2682135323705988, "grad_norm": 0.7147417664527893, "learning_rate": 6.655617274277035e-06, "loss": 0.2712, "step": 1954 }, { "epoch": 1.2695116014927794, "grad_norm": 0.6210743188858032, "learning_rate": 6.648861496286818e-06, "loss": 0.2676, "step": 1956 }, { "epoch": 1.2708096706149603, "grad_norm": 0.5406560301780701, "learning_rate": 6.6421023391060845e-06, "loss": 0.2514, "step": 1958 }, { "epoch": 1.272107739737141, "grad_norm": 0.8908967971801758, "learning_rate": 6.635339816587109e-06, "loss": 0.2516, "step": 1960 }, { "epoch": 1.2734058088593216, "grad_norm": 0.7854211330413818, "learning_rate": 6.628573942589062e-06, "loss": 0.2536, "step": 1962 }, { "epoch": 1.2747038779815025, "grad_norm": 0.7821478843688965, "learning_rate": 6.621804730977983e-06, "loss": 0.2981, "step": 1964 }, { "epoch": 1.2760019471036832, "grad_norm": 0.6973228454589844, "learning_rate": 6.6150321956267495e-06, "loss": 0.2637, "step": 1966 }, { "epoch": 1.277300016225864, "grad_norm": 1.100315809249878, "learning_rate": 6.608256350415052e-06, "loss": 0.2901, "step": 1968 }, { "epoch": 1.2785980853480448, "grad_norm": 0.7265414595603943, "learning_rate": 6.6014772092293656e-06, "loss": 0.2693, "step": 1970 }, { "epoch": 1.2798961544702254, "grad_norm": 0.7700524926185608, "learning_rate": 6.594694785962917e-06, "loss": 0.2721, "step": 1972 }, { "epoch": 1.2811942235924063, "grad_norm": 0.6772423386573792, "learning_rate": 6.587909094515663e-06, "loss": 0.2378, "step": 1974 }, { "epoch": 1.282492292714587, "grad_norm": 0.8507439494132996, "learning_rate": 6.581120148794255e-06, "loss": 0.3059, "step": 1976 }, { "epoch": 1.2837903618367679, "grad_norm": 1.0275970697402954, "learning_rate": 6.5743279627120145e-06, "loss": 0.2677, "step": 1978 }, { "epoch": 1.2850884309589485, "grad_norm": 0.5820989608764648, "learning_rate": 6.567532550188908e-06, "loss": 0.2427, "step": 1980 }, { "epoch": 1.2863865000811292, "grad_norm": 0.7007371187210083, "learning_rate": 6.560733925151504e-06, "loss": 0.2604, "step": 1982 }, { "epoch": 1.28768456920331, "grad_norm": 0.8188920021057129, "learning_rate": 6.553932101532967e-06, "loss": 0.2684, "step": 1984 }, { "epoch": 1.2889826383254908, "grad_norm": 0.8642311096191406, "learning_rate": 6.547127093273009e-06, "loss": 0.296, "step": 1986 }, { "epoch": 1.2902807074476716, "grad_norm": 0.6796236634254456, "learning_rate": 6.5403189143178725e-06, "loss": 0.2301, "step": 1988 }, { "epoch": 1.2915787765698523, "grad_norm": 0.7118216156959534, "learning_rate": 6.533507578620293e-06, "loss": 0.2742, "step": 1990 }, { "epoch": 1.292876845692033, "grad_norm": 0.6332305073738098, "learning_rate": 6.526693100139481e-06, "loss": 0.2765, "step": 1992 }, { "epoch": 1.2941749148142139, "grad_norm": 0.7948624491691589, "learning_rate": 6.519875492841086e-06, "loss": 0.2771, "step": 1994 }, { "epoch": 1.2954729839363945, "grad_norm": 0.652149498462677, "learning_rate": 6.513054770697167e-06, "loss": 0.2646, "step": 1996 }, { "epoch": 1.2967710530585754, "grad_norm": 0.7546085119247437, "learning_rate": 6.5062309476861714e-06, "loss": 0.2531, "step": 1998 }, { "epoch": 1.298069122180756, "grad_norm": 1.0073802471160889, "learning_rate": 6.499404037792899e-06, "loss": 0.2822, "step": 2000 }, { "epoch": 1.298069122180756, "eval_loss": 0.2830822765827179, "eval_runtime": 397.2088, "eval_samples_per_second": 26.132, "eval_steps_per_second": 3.268, "step": 2000 }, { "epoch": 1.2993671913029368, "grad_norm": 0.9446436166763306, "learning_rate": 6.492574055008474e-06, "loss": 0.2798, "step": 2002 }, { "epoch": 1.3006652604251177, "grad_norm": 0.8931697607040405, "learning_rate": 6.485741013330321e-06, "loss": 0.2681, "step": 2004 }, { "epoch": 1.3019633295472983, "grad_norm": 0.776228666305542, "learning_rate": 6.4789049267621325e-06, "loss": 0.2482, "step": 2006 }, { "epoch": 1.3032613986694792, "grad_norm": 0.7612467408180237, "learning_rate": 6.472065809313842e-06, "loss": 0.302, "step": 2008 }, { "epoch": 1.3045594677916599, "grad_norm": 0.7268627882003784, "learning_rate": 6.465223675001593e-06, "loss": 0.2658, "step": 2010 }, { "epoch": 1.3058575369138405, "grad_norm": 0.6127284169197083, "learning_rate": 6.458378537847713e-06, "loss": 0.2606, "step": 2012 }, { "epoch": 1.3071556060360214, "grad_norm": 0.8818506598472595, "learning_rate": 6.451530411880684e-06, "loss": 0.2817, "step": 2014 }, { "epoch": 1.308453675158202, "grad_norm": 0.7983253002166748, "learning_rate": 6.444679311135112e-06, "loss": 0.2862, "step": 2016 }, { "epoch": 1.309751744280383, "grad_norm": 1.010633945465088, "learning_rate": 6.437825249651699e-06, "loss": 0.2799, "step": 2018 }, { "epoch": 1.3110498134025637, "grad_norm": 0.7234138250350952, "learning_rate": 6.430968241477219e-06, "loss": 0.2753, "step": 2020 }, { "epoch": 1.3123478825247443, "grad_norm": 0.7758423089981079, "learning_rate": 6.424108300664479e-06, "loss": 0.3015, "step": 2022 }, { "epoch": 1.3136459516469252, "grad_norm": 0.5169479250907898, "learning_rate": 6.417245441272299e-06, "loss": 0.282, "step": 2024 }, { "epoch": 1.3149440207691059, "grad_norm": 0.6473793983459473, "learning_rate": 6.410379677365481e-06, "loss": 0.2677, "step": 2026 }, { "epoch": 1.3162420898912868, "grad_norm": 0.7481542229652405, "learning_rate": 6.403511023014778e-06, "loss": 0.281, "step": 2028 }, { "epoch": 1.3175401590134674, "grad_norm": 0.5950220227241516, "learning_rate": 6.396639492296868e-06, "loss": 0.27, "step": 2030 }, { "epoch": 1.318838228135648, "grad_norm": 0.6593016982078552, "learning_rate": 6.389765099294325e-06, "loss": 0.2539, "step": 2032 }, { "epoch": 1.320136297257829, "grad_norm": 0.6176759600639343, "learning_rate": 6.382887858095585e-06, "loss": 0.2436, "step": 2034 }, { "epoch": 1.3214343663800097, "grad_norm": 0.6842327117919922, "learning_rate": 6.376007782794926e-06, "loss": 0.2596, "step": 2036 }, { "epoch": 1.3227324355021906, "grad_norm": 0.6928659081459045, "learning_rate": 6.369124887492429e-06, "loss": 0.2521, "step": 2038 }, { "epoch": 1.3240305046243712, "grad_norm": 0.8345635533332825, "learning_rate": 6.3622391862939606e-06, "loss": 0.2541, "step": 2040 }, { "epoch": 1.3253285737465519, "grad_norm": 0.7090135216712952, "learning_rate": 6.3553506933111295e-06, "loss": 0.2714, "step": 2042 }, { "epoch": 1.3266266428687328, "grad_norm": 0.944474995136261, "learning_rate": 6.348459422661276e-06, "loss": 0.2619, "step": 2044 }, { "epoch": 1.3279247119909134, "grad_norm": 0.8054224848747253, "learning_rate": 6.341565388467425e-06, "loss": 0.2676, "step": 2046 }, { "epoch": 1.3292227811130943, "grad_norm": 1.0661146640777588, "learning_rate": 6.334668604858268e-06, "loss": 0.3463, "step": 2048 }, { "epoch": 1.330520850235275, "grad_norm": 0.7750746011734009, "learning_rate": 6.3277690859681275e-06, "loss": 0.2432, "step": 2050 }, { "epoch": 1.3318189193574557, "grad_norm": 0.8616805076599121, "learning_rate": 6.320866845936942e-06, "loss": 0.2953, "step": 2052 }, { "epoch": 1.3331169884796366, "grad_norm": 0.6890926957130432, "learning_rate": 6.313961898910214e-06, "loss": 0.2865, "step": 2054 }, { "epoch": 1.3344150576018172, "grad_norm": 0.6274803280830383, "learning_rate": 6.307054259039003e-06, "loss": 0.2527, "step": 2056 }, { "epoch": 1.3357131267239981, "grad_norm": 0.7185997366905212, "learning_rate": 6.300143940479881e-06, "loss": 0.3088, "step": 2058 }, { "epoch": 1.3370111958461788, "grad_norm": 0.8126703500747681, "learning_rate": 6.293230957394916e-06, "loss": 0.2668, "step": 2060 }, { "epoch": 1.3383092649683594, "grad_norm": 0.6337158679962158, "learning_rate": 6.2863153239516306e-06, "loss": 0.2568, "step": 2062 }, { "epoch": 1.3396073340905403, "grad_norm": 0.8735512495040894, "learning_rate": 6.279397054322983e-06, "loss": 0.2833, "step": 2064 }, { "epoch": 1.340905403212721, "grad_norm": 0.747967004776001, "learning_rate": 6.272476162687331e-06, "loss": 0.2687, "step": 2066 }, { "epoch": 1.342203472334902, "grad_norm": 0.8506736755371094, "learning_rate": 6.265552663228411e-06, "loss": 0.267, "step": 2068 }, { "epoch": 1.3435015414570826, "grad_norm": 0.7105609178543091, "learning_rate": 6.2586265701352976e-06, "loss": 0.2798, "step": 2070 }, { "epoch": 1.3447996105792632, "grad_norm": 0.749955952167511, "learning_rate": 6.251697897602384e-06, "loss": 0.2969, "step": 2072 }, { "epoch": 1.3460976797014441, "grad_norm": 0.7074192762374878, "learning_rate": 6.244766659829351e-06, "loss": 0.2559, "step": 2074 }, { "epoch": 1.3473957488236248, "grad_norm": 0.6345508098602295, "learning_rate": 6.2378328710211345e-06, "loss": 0.2672, "step": 2076 }, { "epoch": 1.3486938179458057, "grad_norm": 0.6274634003639221, "learning_rate": 6.2308965453878975e-06, "loss": 0.2577, "step": 2078 }, { "epoch": 1.3499918870679863, "grad_norm": 0.9168699383735657, "learning_rate": 6.223957697145006e-06, "loss": 0.2663, "step": 2080 }, { "epoch": 1.351289956190167, "grad_norm": 1.073569893836975, "learning_rate": 6.217016340512993e-06, "loss": 0.2913, "step": 2082 }, { "epoch": 1.352588025312348, "grad_norm": 0.7373440265655518, "learning_rate": 6.210072489717533e-06, "loss": 0.2454, "step": 2084 }, { "epoch": 1.3538860944345286, "grad_norm": 0.9055160284042358, "learning_rate": 6.203126158989411e-06, "loss": 0.2941, "step": 2086 }, { "epoch": 1.3551841635567095, "grad_norm": 0.8349823951721191, "learning_rate": 6.196177362564495e-06, "loss": 0.2972, "step": 2088 }, { "epoch": 1.3564822326788901, "grad_norm": 0.6581112742424011, "learning_rate": 6.189226114683708e-06, "loss": 0.2669, "step": 2090 }, { "epoch": 1.3577803018010708, "grad_norm": 0.8073927760124207, "learning_rate": 6.1822724295929924e-06, "loss": 0.2761, "step": 2092 }, { "epoch": 1.3590783709232517, "grad_norm": 0.9573937058448792, "learning_rate": 6.175316321543291e-06, "loss": 0.2835, "step": 2094 }, { "epoch": 1.3603764400454323, "grad_norm": 0.5541600584983826, "learning_rate": 6.168357804790509e-06, "loss": 0.2673, "step": 2096 }, { "epoch": 1.3616745091676132, "grad_norm": 0.5873492956161499, "learning_rate": 6.161396893595489e-06, "loss": 0.3062, "step": 2098 }, { "epoch": 1.362972578289794, "grad_norm": 0.7199841737747192, "learning_rate": 6.154433602223979e-06, "loss": 0.266, "step": 2100 }, { "epoch": 1.3642706474119746, "grad_norm": 0.8412846922874451, "learning_rate": 6.14746794494661e-06, "loss": 0.2661, "step": 2102 }, { "epoch": 1.3655687165341555, "grad_norm": 0.5512349009513855, "learning_rate": 6.140499936038854e-06, "loss": 0.2475, "step": 2104 }, { "epoch": 1.3668667856563361, "grad_norm": 0.6925185918807983, "learning_rate": 6.13352958978101e-06, "loss": 0.262, "step": 2106 }, { "epoch": 1.368164854778517, "grad_norm": 0.7893130779266357, "learning_rate": 6.126556920458162e-06, "loss": 0.2791, "step": 2108 }, { "epoch": 1.3694629239006977, "grad_norm": 0.7951237559318542, "learning_rate": 6.1195819423601565e-06, "loss": 0.2641, "step": 2110 }, { "epoch": 1.3707609930228783, "grad_norm": 0.8454503417015076, "learning_rate": 6.112604669781572e-06, "loss": 0.2421, "step": 2112 }, { "epoch": 1.3720590621450592, "grad_norm": 1.0187815427780151, "learning_rate": 6.105625117021692e-06, "loss": 0.2731, "step": 2114 }, { "epoch": 1.37335713126724, "grad_norm": 0.6526299715042114, "learning_rate": 6.0986432983844645e-06, "loss": 0.3148, "step": 2116 }, { "epoch": 1.3746552003894208, "grad_norm": 0.697232723236084, "learning_rate": 6.091659228178492e-06, "loss": 0.268, "step": 2118 }, { "epoch": 1.3759532695116015, "grad_norm": 0.8213191032409668, "learning_rate": 6.084672920716983e-06, "loss": 0.2678, "step": 2120 }, { "epoch": 1.3772513386337821, "grad_norm": 0.8158425092697144, "learning_rate": 6.077684390317737e-06, "loss": 0.2961, "step": 2122 }, { "epoch": 1.378549407755963, "grad_norm": 0.8254956007003784, "learning_rate": 6.070693651303105e-06, "loss": 0.2859, "step": 2124 }, { "epoch": 1.3798474768781437, "grad_norm": 0.7043136358261108, "learning_rate": 6.0637007179999665e-06, "loss": 0.2805, "step": 2126 }, { "epoch": 1.3811455460003246, "grad_norm": 0.6378942728042603, "learning_rate": 6.056705604739696e-06, "loss": 0.2624, "step": 2128 }, { "epoch": 1.3824436151225052, "grad_norm": 0.6072874069213867, "learning_rate": 6.049708325858139e-06, "loss": 0.2719, "step": 2130 }, { "epoch": 1.383741684244686, "grad_norm": 0.6928173899650574, "learning_rate": 6.0427088956955745e-06, "loss": 0.2759, "step": 2132 }, { "epoch": 1.3850397533668668, "grad_norm": 0.7546768188476562, "learning_rate": 6.035707328596698e-06, "loss": 0.2531, "step": 2134 }, { "epoch": 1.3863378224890475, "grad_norm": 1.3784035444259644, "learning_rate": 6.028703638910574e-06, "loss": 0.2961, "step": 2136 }, { "epoch": 1.3876358916112284, "grad_norm": 0.7331580519676208, "learning_rate": 6.021697840990628e-06, "loss": 0.267, "step": 2138 }, { "epoch": 1.388933960733409, "grad_norm": 0.6743558049201965, "learning_rate": 6.014689949194594e-06, "loss": 0.2585, "step": 2140 }, { "epoch": 1.3902320298555897, "grad_norm": 0.7402589321136475, "learning_rate": 6.0076799778845105e-06, "loss": 0.291, "step": 2142 }, { "epoch": 1.3915300989777706, "grad_norm": 1.4501358270645142, "learning_rate": 6.000667941426668e-06, "loss": 0.3113, "step": 2144 }, { "epoch": 1.3928281680999512, "grad_norm": 0.7894465923309326, "learning_rate": 5.9936538541915925e-06, "loss": 0.2622, "step": 2146 }, { "epoch": 1.3941262372221321, "grad_norm": 0.6070988178253174, "learning_rate": 5.986637730554014e-06, "loss": 0.2807, "step": 2148 }, { "epoch": 1.3954243063443128, "grad_norm": 0.9005820155143738, "learning_rate": 5.979619584892834e-06, "loss": 0.2892, "step": 2150 }, { "epoch": 1.3967223754664935, "grad_norm": 0.6825641989707947, "learning_rate": 5.972599431591098e-06, "loss": 0.2674, "step": 2152 }, { "epoch": 1.3980204445886744, "grad_norm": 0.6085848212242126, "learning_rate": 5.96557728503597e-06, "loss": 0.2722, "step": 2154 }, { "epoch": 1.399318513710855, "grad_norm": 0.7618375420570374, "learning_rate": 5.958553159618693e-06, "loss": 0.2749, "step": 2156 }, { "epoch": 1.400616582833036, "grad_norm": 0.7533162832260132, "learning_rate": 5.951527069734569e-06, "loss": 0.2741, "step": 2158 }, { "epoch": 1.4019146519552166, "grad_norm": 0.8005886077880859, "learning_rate": 5.944499029782924e-06, "loss": 0.2676, "step": 2160 }, { "epoch": 1.4032127210773973, "grad_norm": 0.7042413949966431, "learning_rate": 5.937469054167084e-06, "loss": 0.2935, "step": 2162 }, { "epoch": 1.4045107901995781, "grad_norm": 0.6408364772796631, "learning_rate": 5.930437157294339e-06, "loss": 0.254, "step": 2164 }, { "epoch": 1.4058088593217588, "grad_norm": 0.8525874614715576, "learning_rate": 5.923403353575916e-06, "loss": 0.288, "step": 2166 }, { "epoch": 1.4071069284439397, "grad_norm": 0.7489825487136841, "learning_rate": 5.9163676574269526e-06, "loss": 0.2749, "step": 2168 }, { "epoch": 1.4084049975661204, "grad_norm": 0.6510657668113708, "learning_rate": 5.9093300832664625e-06, "loss": 0.2957, "step": 2170 }, { "epoch": 1.409703066688301, "grad_norm": 0.71758633852005, "learning_rate": 5.902290645517308e-06, "loss": 0.2733, "step": 2172 }, { "epoch": 1.411001135810482, "grad_norm": 0.6123653054237366, "learning_rate": 5.895249358606173e-06, "loss": 0.2885, "step": 2174 }, { "epoch": 1.4122992049326626, "grad_norm": 1.0535272359848022, "learning_rate": 5.888206236963529e-06, "loss": 0.2965, "step": 2176 }, { "epoch": 1.4135972740548435, "grad_norm": 0.6721482872962952, "learning_rate": 5.88116129502361e-06, "loss": 0.2694, "step": 2178 }, { "epoch": 1.4148953431770241, "grad_norm": 0.7783011794090271, "learning_rate": 5.874114547224375e-06, "loss": 0.2917, "step": 2180 }, { "epoch": 1.4161934122992048, "grad_norm": 0.7448126077651978, "learning_rate": 5.867066008007492e-06, "loss": 0.2675, "step": 2182 }, { "epoch": 1.4174914814213857, "grad_norm": 1.007434368133545, "learning_rate": 5.860015691818292e-06, "loss": 0.2784, "step": 2184 }, { "epoch": 1.4187895505435664, "grad_norm": 0.6640110611915588, "learning_rate": 5.852963613105757e-06, "loss": 0.2684, "step": 2186 }, { "epoch": 1.4200876196657473, "grad_norm": 0.6505919694900513, "learning_rate": 5.8459097863224705e-06, "loss": 0.2666, "step": 2188 }, { "epoch": 1.421385688787928, "grad_norm": 0.652265727519989, "learning_rate": 5.838854225924607e-06, "loss": 0.2666, "step": 2190 }, { "epoch": 1.4226837579101086, "grad_norm": 0.5974420309066772, "learning_rate": 5.83179694637189e-06, "loss": 0.2814, "step": 2192 }, { "epoch": 1.4239818270322895, "grad_norm": 0.6164116859436035, "learning_rate": 5.824737962127569e-06, "loss": 0.2959, "step": 2194 }, { "epoch": 1.4252798961544701, "grad_norm": 0.6848245859146118, "learning_rate": 5.817677287658382e-06, "loss": 0.2596, "step": 2196 }, { "epoch": 1.426577965276651, "grad_norm": 0.8903093934059143, "learning_rate": 5.810614937434537e-06, "loss": 0.2917, "step": 2198 }, { "epoch": 1.4278760343988317, "grad_norm": 0.7246087789535522, "learning_rate": 5.803550925929673e-06, "loss": 0.2568, "step": 2200 }, { "epoch": 1.4291741035210124, "grad_norm": 0.626372754573822, "learning_rate": 5.796485267620834e-06, "loss": 0.2721, "step": 2202 }, { "epoch": 1.4304721726431933, "grad_norm": 0.6185775399208069, "learning_rate": 5.789417976988437e-06, "loss": 0.2733, "step": 2204 }, { "epoch": 1.431770241765374, "grad_norm": 0.6580395698547363, "learning_rate": 5.782349068516249e-06, "loss": 0.2609, "step": 2206 }, { "epoch": 1.4330683108875548, "grad_norm": 0.6881176233291626, "learning_rate": 5.7752785566913484e-06, "loss": 0.2513, "step": 2208 }, { "epoch": 1.4343663800097355, "grad_norm": 0.6333670616149902, "learning_rate": 5.768206456004103e-06, "loss": 0.2582, "step": 2210 }, { "epoch": 1.4356644491319162, "grad_norm": 0.7055801153182983, "learning_rate": 5.761132780948132e-06, "loss": 0.2935, "step": 2212 }, { "epoch": 1.436962518254097, "grad_norm": 0.5895374417304993, "learning_rate": 5.754057546020289e-06, "loss": 0.2929, "step": 2214 }, { "epoch": 1.4382605873762777, "grad_norm": 0.5848456025123596, "learning_rate": 5.746980765720613e-06, "loss": 0.2434, "step": 2216 }, { "epoch": 1.4395586564984586, "grad_norm": 0.7759333848953247, "learning_rate": 5.739902454552323e-06, "loss": 0.2631, "step": 2218 }, { "epoch": 1.4408567256206393, "grad_norm": 0.8770084977149963, "learning_rate": 5.732822627021765e-06, "loss": 0.2899, "step": 2220 }, { "epoch": 1.44215479474282, "grad_norm": 0.7373355031013489, "learning_rate": 5.725741297638399e-06, "loss": 0.2414, "step": 2222 }, { "epoch": 1.4434528638650008, "grad_norm": 0.9495701193809509, "learning_rate": 5.718658480914757e-06, "loss": 0.2969, "step": 2224 }, { "epoch": 1.4447509329871815, "grad_norm": 1.1291700601577759, "learning_rate": 5.711574191366427e-06, "loss": 0.2761, "step": 2226 }, { "epoch": 1.4460490021093624, "grad_norm": 0.7184433341026306, "learning_rate": 5.704488443512008e-06, "loss": 0.2484, "step": 2228 }, { "epoch": 1.447347071231543, "grad_norm": 0.8845110535621643, "learning_rate": 5.697401251873092e-06, "loss": 0.268, "step": 2230 }, { "epoch": 1.4486451403537237, "grad_norm": 0.8232345581054688, "learning_rate": 5.690312630974229e-06, "loss": 0.3071, "step": 2232 }, { "epoch": 1.4499432094759046, "grad_norm": 0.5671408772468567, "learning_rate": 5.683222595342898e-06, "loss": 0.2503, "step": 2234 }, { "epoch": 1.4512412785980853, "grad_norm": 0.6496191620826721, "learning_rate": 5.676131159509477e-06, "loss": 0.2885, "step": 2236 }, { "epoch": 1.4525393477202662, "grad_norm": 0.6415663361549377, "learning_rate": 5.6690383380072136e-06, "loss": 0.2726, "step": 2238 }, { "epoch": 1.4538374168424468, "grad_norm": 0.8236604332923889, "learning_rate": 5.661944145372193e-06, "loss": 0.2842, "step": 2240 }, { "epoch": 1.4551354859646275, "grad_norm": 1.2532596588134766, "learning_rate": 5.654848596143319e-06, "loss": 0.3024, "step": 2242 }, { "epoch": 1.4564335550868084, "grad_norm": 0.786945641040802, "learning_rate": 5.647751704862263e-06, "loss": 0.2567, "step": 2244 }, { "epoch": 1.457731624208989, "grad_norm": 0.6400529146194458, "learning_rate": 5.640653486073457e-06, "loss": 0.2706, "step": 2246 }, { "epoch": 1.45902969333117, "grad_norm": 0.6810324788093567, "learning_rate": 5.633553954324047e-06, "loss": 0.2708, "step": 2248 }, { "epoch": 1.4603277624533506, "grad_norm": 0.8678742051124573, "learning_rate": 5.626453124163876e-06, "loss": 0.2866, "step": 2250 }, { "epoch": 1.4616258315755313, "grad_norm": 0.6219431161880493, "learning_rate": 5.619351010145442e-06, "loss": 0.2831, "step": 2252 }, { "epoch": 1.4629239006977122, "grad_norm": 0.6800902485847473, "learning_rate": 5.612247626823878e-06, "loss": 0.2671, "step": 2254 }, { "epoch": 1.4642219698198928, "grad_norm": 0.9607003331184387, "learning_rate": 5.605142988756917e-06, "loss": 0.3037, "step": 2256 }, { "epoch": 1.4655200389420737, "grad_norm": 0.8659271597862244, "learning_rate": 5.598037110504863e-06, "loss": 0.274, "step": 2258 }, { "epoch": 1.4668181080642544, "grad_norm": 0.9243478775024414, "learning_rate": 5.590930006630561e-06, "loss": 0.2573, "step": 2260 }, { "epoch": 1.468116177186435, "grad_norm": 0.7669197916984558, "learning_rate": 5.583821691699373e-06, "loss": 0.3405, "step": 2262 }, { "epoch": 1.469414246308616, "grad_norm": 0.7261797189712524, "learning_rate": 5.576712180279134e-06, "loss": 0.2989, "step": 2264 }, { "epoch": 1.4707123154307966, "grad_norm": 0.6921839714050293, "learning_rate": 5.56960148694014e-06, "loss": 0.2783, "step": 2266 }, { "epoch": 1.4720103845529775, "grad_norm": 1.0927634239196777, "learning_rate": 5.562489626255104e-06, "loss": 0.2967, "step": 2268 }, { "epoch": 1.4733084536751582, "grad_norm": 0.7308538556098938, "learning_rate": 5.555376612799131e-06, "loss": 0.2963, "step": 2270 }, { "epoch": 1.4746065227973388, "grad_norm": 0.6555987000465393, "learning_rate": 5.5482624611496936e-06, "loss": 0.2616, "step": 2272 }, { "epoch": 1.4759045919195197, "grad_norm": 0.8785131573677063, "learning_rate": 5.541147185886591e-06, "loss": 0.2902, "step": 2274 }, { "epoch": 1.4772026610417004, "grad_norm": 0.8525524735450745, "learning_rate": 5.534030801591928e-06, "loss": 0.3078, "step": 2276 }, { "epoch": 1.4785007301638813, "grad_norm": 0.5884180665016174, "learning_rate": 5.5269133228500825e-06, "loss": 0.284, "step": 2278 }, { "epoch": 1.479798799286062, "grad_norm": 0.9287165403366089, "learning_rate": 5.519794764247673e-06, "loss": 0.2579, "step": 2280 }, { "epoch": 1.4810968684082426, "grad_norm": 1.0357251167297363, "learning_rate": 5.512675140373537e-06, "loss": 0.2767, "step": 2282 }, { "epoch": 1.4823949375304235, "grad_norm": 0.9528429508209229, "learning_rate": 5.505554465818687e-06, "loss": 0.2907, "step": 2284 }, { "epoch": 1.4836930066526042, "grad_norm": 0.71951824426651, "learning_rate": 5.498432755176295e-06, "loss": 0.2774, "step": 2286 }, { "epoch": 1.484991075774785, "grad_norm": 0.9653683304786682, "learning_rate": 5.4913100230416536e-06, "loss": 0.2883, "step": 2288 }, { "epoch": 1.4862891448969657, "grad_norm": 0.9004492163658142, "learning_rate": 5.48418628401215e-06, "loss": 0.2679, "step": 2290 }, { "epoch": 1.4875872140191464, "grad_norm": 0.5753763914108276, "learning_rate": 5.477061552687234e-06, "loss": 0.2659, "step": 2292 }, { "epoch": 1.4888852831413273, "grad_norm": 0.6553775072097778, "learning_rate": 5.469935843668389e-06, "loss": 0.2701, "step": 2294 }, { "epoch": 1.490183352263508, "grad_norm": 0.7805309295654297, "learning_rate": 5.462809171559104e-06, "loss": 0.254, "step": 2296 }, { "epoch": 1.4914814213856888, "grad_norm": 0.8218640685081482, "learning_rate": 5.455681550964839e-06, "loss": 0.2669, "step": 2298 }, { "epoch": 1.4927794905078695, "grad_norm": 1.0237478017807007, "learning_rate": 5.448552996492999e-06, "loss": 0.2817, "step": 2300 }, { "epoch": 1.4940775596300502, "grad_norm": 1.2428785562515259, "learning_rate": 5.441423522752904e-06, "loss": 0.3027, "step": 2302 }, { "epoch": 1.495375628752231, "grad_norm": 0.6950944662094116, "learning_rate": 5.4342931443557576e-06, "loss": 0.2879, "step": 2304 }, { "epoch": 1.4966736978744117, "grad_norm": 1.057386040687561, "learning_rate": 5.427161875914616e-06, "loss": 0.2711, "step": 2306 }, { "epoch": 1.4979717669965926, "grad_norm": 0.7146075367927551, "learning_rate": 5.420029732044359e-06, "loss": 0.2671, "step": 2308 }, { "epoch": 1.4992698361187733, "grad_norm": 0.6171428561210632, "learning_rate": 5.412896727361663e-06, "loss": 0.2914, "step": 2310 }, { "epoch": 1.500567905240954, "grad_norm": 0.7253311276435852, "learning_rate": 5.405762876484967e-06, "loss": 0.3456, "step": 2312 }, { "epoch": 1.5018659743631348, "grad_norm": 0.6064237952232361, "learning_rate": 5.3986281940344445e-06, "loss": 0.2846, "step": 2314 }, { "epoch": 1.5031640434853157, "grad_norm": 0.6278650760650635, "learning_rate": 5.391492694631975e-06, "loss": 0.2562, "step": 2316 }, { "epoch": 1.5044621126074964, "grad_norm": 0.7003477811813354, "learning_rate": 5.3843563929011065e-06, "loss": 0.2796, "step": 2318 }, { "epoch": 1.505760181729677, "grad_norm": 0.597042441368103, "learning_rate": 5.37721930346704e-06, "loss": 0.2833, "step": 2320 }, { "epoch": 1.5070582508518577, "grad_norm": 0.7248748540878296, "learning_rate": 5.370081440956582e-06, "loss": 0.2784, "step": 2322 }, { "epoch": 1.5083563199740386, "grad_norm": 0.5374922156333923, "learning_rate": 5.362942819998131e-06, "loss": 0.2797, "step": 2324 }, { "epoch": 1.5096543890962195, "grad_norm": 0.7135380506515503, "learning_rate": 5.3558034552216355e-06, "loss": 0.2791, "step": 2326 }, { "epoch": 1.5109524582184002, "grad_norm": 0.8197761178016663, "learning_rate": 5.348663361258568e-06, "loss": 0.3011, "step": 2328 }, { "epoch": 1.5122505273405809, "grad_norm": 0.9654028415679932, "learning_rate": 5.341522552741897e-06, "loss": 0.2688, "step": 2330 }, { "epoch": 1.5135485964627615, "grad_norm": 0.7758445739746094, "learning_rate": 5.334381044306057e-06, "loss": 0.2733, "step": 2332 }, { "epoch": 1.5148466655849424, "grad_norm": 0.6773033738136292, "learning_rate": 5.327238850586912e-06, "loss": 0.2544, "step": 2334 }, { "epoch": 1.5161447347071233, "grad_norm": 0.8551132678985596, "learning_rate": 5.320095986221736e-06, "loss": 0.2687, "step": 2336 }, { "epoch": 1.517442803829304, "grad_norm": 0.9141733050346375, "learning_rate": 5.312952465849173e-06, "loss": 0.2662, "step": 2338 }, { "epoch": 1.5187408729514846, "grad_norm": 0.8719170689582825, "learning_rate": 5.3058083041092145e-06, "loss": 0.2806, "step": 2340 }, { "epoch": 1.5200389420736653, "grad_norm": 0.6118455529212952, "learning_rate": 5.2986635156431645e-06, "loss": 0.2654, "step": 2342 }, { "epoch": 1.5213370111958462, "grad_norm": 0.7894203662872314, "learning_rate": 5.291518115093612e-06, "loss": 0.2538, "step": 2344 }, { "epoch": 1.522635080318027, "grad_norm": 0.6997982859611511, "learning_rate": 5.2843721171044e-06, "loss": 0.2754, "step": 2346 }, { "epoch": 1.5239331494402077, "grad_norm": 0.9888088703155518, "learning_rate": 5.277225536320601e-06, "loss": 0.2915, "step": 2348 }, { "epoch": 1.5252312185623884, "grad_norm": 0.7454066872596741, "learning_rate": 5.27007838738847e-06, "loss": 0.2879, "step": 2350 }, { "epoch": 1.526529287684569, "grad_norm": 0.8018419146537781, "learning_rate": 5.262930684955439e-06, "loss": 0.2726, "step": 2352 }, { "epoch": 1.52782735680675, "grad_norm": 0.8444095849990845, "learning_rate": 5.255782443670068e-06, "loss": 0.2974, "step": 2354 }, { "epoch": 1.5291254259289309, "grad_norm": 0.807758092880249, "learning_rate": 5.24863367818202e-06, "loss": 0.288, "step": 2356 }, { "epoch": 1.5304234950511115, "grad_norm": 0.7994396686553955, "learning_rate": 5.241484403142036e-06, "loss": 0.2523, "step": 2358 }, { "epoch": 1.5317215641732922, "grad_norm": 0.8152909278869629, "learning_rate": 5.234334633201903e-06, "loss": 0.2698, "step": 2360 }, { "epoch": 1.5330196332954729, "grad_norm": 0.8949559926986694, "learning_rate": 5.227184383014414e-06, "loss": 0.2649, "step": 2362 }, { "epoch": 1.5343177024176538, "grad_norm": 0.7753397226333618, "learning_rate": 5.220033667233356e-06, "loss": 0.2605, "step": 2364 }, { "epoch": 1.5356157715398346, "grad_norm": 0.661039412021637, "learning_rate": 5.212882500513462e-06, "loss": 0.2504, "step": 2366 }, { "epoch": 1.5369138406620153, "grad_norm": 0.9915288686752319, "learning_rate": 5.205730897510396e-06, "loss": 0.2714, "step": 2368 }, { "epoch": 1.538211909784196, "grad_norm": 0.777660071849823, "learning_rate": 5.198578872880709e-06, "loss": 0.2732, "step": 2370 }, { "epoch": 1.5395099789063766, "grad_norm": 0.7751399278640747, "learning_rate": 5.191426441281824e-06, "loss": 0.2807, "step": 2372 }, { "epoch": 1.5408080480285575, "grad_norm": 0.7880353331565857, "learning_rate": 5.184273617371991e-06, "loss": 0.2797, "step": 2374 }, { "epoch": 1.5421061171507384, "grad_norm": 0.7058007717132568, "learning_rate": 5.177120415810271e-06, "loss": 0.2322, "step": 2376 }, { "epoch": 1.543404186272919, "grad_norm": 0.7940738201141357, "learning_rate": 5.169966851256489e-06, "loss": 0.2598, "step": 2378 }, { "epoch": 1.5447022553950998, "grad_norm": 0.6759476661682129, "learning_rate": 5.162812938371226e-06, "loss": 0.2913, "step": 2380 }, { "epoch": 1.5460003245172804, "grad_norm": 0.844478189945221, "learning_rate": 5.155658691815766e-06, "loss": 0.2568, "step": 2382 }, { "epoch": 1.5472983936394613, "grad_norm": 0.8355162143707275, "learning_rate": 5.1485041262520845e-06, "loss": 0.2721, "step": 2384 }, { "epoch": 1.5485964627616422, "grad_norm": 0.9037420749664307, "learning_rate": 5.141349256342805e-06, "loss": 0.2795, "step": 2386 }, { "epoch": 1.5498945318838229, "grad_norm": 0.787811279296875, "learning_rate": 5.134194096751182e-06, "loss": 0.2665, "step": 2388 }, { "epoch": 1.5511926010060035, "grad_norm": 0.7204416990280151, "learning_rate": 5.1270386621410565e-06, "loss": 0.2719, "step": 2390 }, { "epoch": 1.5524906701281842, "grad_norm": 0.9914586544036865, "learning_rate": 5.1198829671768355e-06, "loss": 0.3155, "step": 2392 }, { "epoch": 1.553788739250365, "grad_norm": 0.7270455360412598, "learning_rate": 5.112727026523461e-06, "loss": 0.2679, "step": 2394 }, { "epoch": 1.555086808372546, "grad_norm": 0.7560290694236755, "learning_rate": 5.105570854846378e-06, "loss": 0.2526, "step": 2396 }, { "epoch": 1.5563848774947266, "grad_norm": 0.673557460308075, "learning_rate": 5.098414466811504e-06, "loss": 0.2522, "step": 2398 }, { "epoch": 1.5576829466169073, "grad_norm": 0.6697165369987488, "learning_rate": 5.091257877085201e-06, "loss": 0.2628, "step": 2400 }, { "epoch": 1.558981015739088, "grad_norm": 0.8556097149848938, "learning_rate": 5.08410110033424e-06, "loss": 0.2877, "step": 2402 }, { "epoch": 1.5602790848612689, "grad_norm": 0.6153810024261475, "learning_rate": 5.0769441512257845e-06, "loss": 0.2445, "step": 2404 }, { "epoch": 1.5615771539834498, "grad_norm": 0.7087406516075134, "learning_rate": 5.069787044427341e-06, "loss": 0.3031, "step": 2406 }, { "epoch": 1.5628752231056304, "grad_norm": 0.7914629578590393, "learning_rate": 5.062629794606748e-06, "loss": 0.2809, "step": 2408 }, { "epoch": 1.564173292227811, "grad_norm": 0.586275041103363, "learning_rate": 5.055472416432129e-06, "loss": 0.2504, "step": 2410 }, { "epoch": 1.5654713613499918, "grad_norm": 0.5806017518043518, "learning_rate": 5.0483149245718785e-06, "loss": 0.2532, "step": 2412 }, { "epoch": 1.5667694304721727, "grad_norm": 0.6169623732566833, "learning_rate": 5.041157333694617e-06, "loss": 0.2551, "step": 2414 }, { "epoch": 1.5680674995943535, "grad_norm": 0.7911830544471741, "learning_rate": 5.033999658469174e-06, "loss": 0.2816, "step": 2416 }, { "epoch": 1.5693655687165342, "grad_norm": 0.8229092359542847, "learning_rate": 5.0268419135645455e-06, "loss": 0.2455, "step": 2418 }, { "epoch": 1.5706636378387149, "grad_norm": 0.8928384184837341, "learning_rate": 5.019684113649877e-06, "loss": 0.2633, "step": 2420 }, { "epoch": 1.5719617069608955, "grad_norm": 1.5598856210708618, "learning_rate": 5.01252627339442e-06, "loss": 0.2848, "step": 2422 }, { "epoch": 1.5732597760830764, "grad_norm": 0.8572347164154053, "learning_rate": 5.005368407467515e-06, "loss": 0.2831, "step": 2424 }, { "epoch": 1.5745578452052573, "grad_norm": 0.870752215385437, "learning_rate": 4.99821053053855e-06, "loss": 0.3355, "step": 2426 }, { "epoch": 1.575855914327438, "grad_norm": 0.7191547155380249, "learning_rate": 4.991052657276938e-06, "loss": 0.2789, "step": 2428 }, { "epoch": 1.5771539834496187, "grad_norm": 0.8099519610404968, "learning_rate": 4.9838948023520864e-06, "loss": 0.263, "step": 2430 }, { "epoch": 1.5784520525717993, "grad_norm": 1.112536072731018, "learning_rate": 4.9767369804333616e-06, "loss": 0.3136, "step": 2432 }, { "epoch": 1.5797501216939802, "grad_norm": 0.7345004081726074, "learning_rate": 4.9695792061900615e-06, "loss": 0.2599, "step": 2434 }, { "epoch": 1.581048190816161, "grad_norm": 0.7431607246398926, "learning_rate": 4.9624214942913916e-06, "loss": 0.2789, "step": 2436 }, { "epoch": 1.5823462599383418, "grad_norm": 0.6821392774581909, "learning_rate": 4.9552638594064234e-06, "loss": 0.2611, "step": 2438 }, { "epoch": 1.5836443290605224, "grad_norm": 0.7300307750701904, "learning_rate": 4.948106316204077e-06, "loss": 0.2964, "step": 2440 }, { "epoch": 1.584942398182703, "grad_norm": 0.6654735207557678, "learning_rate": 4.940948879353078e-06, "loss": 0.2764, "step": 2442 }, { "epoch": 1.586240467304884, "grad_norm": 0.6538625955581665, "learning_rate": 4.933791563521939e-06, "loss": 0.2803, "step": 2444 }, { "epoch": 1.5875385364270649, "grad_norm": 0.6428682804107666, "learning_rate": 4.926634383378923e-06, "loss": 0.2786, "step": 2446 }, { "epoch": 1.5888366055492456, "grad_norm": 0.6169781684875488, "learning_rate": 4.919477353592015e-06, "loss": 0.2553, "step": 2448 }, { "epoch": 1.5901346746714262, "grad_norm": 0.6271160840988159, "learning_rate": 4.912320488828887e-06, "loss": 0.2518, "step": 2450 }, { "epoch": 1.5914327437936069, "grad_norm": 0.6366329789161682, "learning_rate": 4.905163803756885e-06, "loss": 0.2852, "step": 2452 }, { "epoch": 1.5927308129157878, "grad_norm": 0.7276479005813599, "learning_rate": 4.898007313042975e-06, "loss": 0.2689, "step": 2454 }, { "epoch": 1.5940288820379687, "grad_norm": 1.1389663219451904, "learning_rate": 4.890851031353727e-06, "loss": 0.3095, "step": 2456 }, { "epoch": 1.5953269511601493, "grad_norm": 0.7491416931152344, "learning_rate": 4.883694973355288e-06, "loss": 0.2693, "step": 2458 }, { "epoch": 1.59662502028233, "grad_norm": 0.7409158945083618, "learning_rate": 4.87653915371334e-06, "loss": 0.2863, "step": 2460 }, { "epoch": 1.5979230894045107, "grad_norm": 0.7384921908378601, "learning_rate": 4.869383587093083e-06, "loss": 0.274, "step": 2462 }, { "epoch": 1.5992211585266916, "grad_norm": 0.9084856510162354, "learning_rate": 4.862228288159191e-06, "loss": 0.2902, "step": 2464 }, { "epoch": 1.6005192276488724, "grad_norm": 0.8034806847572327, "learning_rate": 4.8550732715757965e-06, "loss": 0.2569, "step": 2466 }, { "epoch": 1.6018172967710531, "grad_norm": 0.6543619632720947, "learning_rate": 4.847918552006448e-06, "loss": 0.2678, "step": 2468 }, { "epoch": 1.6031153658932338, "grad_norm": 1.1536222696304321, "learning_rate": 4.840764144114089e-06, "loss": 0.2929, "step": 2470 }, { "epoch": 1.6044134350154144, "grad_norm": 0.7316953539848328, "learning_rate": 4.833610062561022e-06, "loss": 0.2914, "step": 2472 }, { "epoch": 1.6057115041375953, "grad_norm": 0.784407913684845, "learning_rate": 4.826456322008884e-06, "loss": 0.2652, "step": 2474 }, { "epoch": 1.6070095732597762, "grad_norm": 0.8138547539710999, "learning_rate": 4.819302937118606e-06, "loss": 0.2737, "step": 2476 }, { "epoch": 1.608307642381957, "grad_norm": 0.6371288895606995, "learning_rate": 4.8121499225503974e-06, "loss": 0.2527, "step": 2478 }, { "epoch": 1.6096057115041376, "grad_norm": 0.7160821557044983, "learning_rate": 4.804997292963706e-06, "loss": 0.2672, "step": 2480 }, { "epoch": 1.6109037806263182, "grad_norm": 0.7064367532730103, "learning_rate": 4.79784506301719e-06, "loss": 0.2624, "step": 2482 }, { "epoch": 1.6122018497484991, "grad_norm": 0.5695556402206421, "learning_rate": 4.79069324736869e-06, "loss": 0.2595, "step": 2484 }, { "epoch": 1.61349991887068, "grad_norm": 0.6478171348571777, "learning_rate": 4.783541860675195e-06, "loss": 0.2615, "step": 2486 }, { "epoch": 1.6147979879928607, "grad_norm": 0.8635448813438416, "learning_rate": 4.776390917592819e-06, "loss": 0.2896, "step": 2488 }, { "epoch": 1.6160960571150413, "grad_norm": 1.418860912322998, "learning_rate": 4.7692404327767615e-06, "loss": 0.3497, "step": 2490 }, { "epoch": 1.617394126237222, "grad_norm": 0.9336181282997131, "learning_rate": 4.762090420881289e-06, "loss": 0.2819, "step": 2492 }, { "epoch": 1.618692195359403, "grad_norm": 0.6734920144081116, "learning_rate": 4.754940896559692e-06, "loss": 0.2961, "step": 2494 }, { "epoch": 1.6199902644815838, "grad_norm": 0.5942052602767944, "learning_rate": 4.747791874464269e-06, "loss": 0.277, "step": 2496 }, { "epoch": 1.6212883336037645, "grad_norm": 0.5411677956581116, "learning_rate": 4.740643369246286e-06, "loss": 0.2755, "step": 2498 }, { "epoch": 1.6225864027259451, "grad_norm": 0.5986727476119995, "learning_rate": 4.7334953955559425e-06, "loss": 0.2693, "step": 2500 }, { "epoch": 1.6225864027259451, "eval_loss": 0.27872952818870544, "eval_runtime": 397.1807, "eval_samples_per_second": 26.134, "eval_steps_per_second": 3.268, "step": 2500 }, { "epoch": 1.6238844718481258, "grad_norm": 0.883634626865387, "learning_rate": 4.726347968042364e-06, "loss": 0.292, "step": 2502 }, { "epoch": 1.6251825409703067, "grad_norm": 0.636574923992157, "learning_rate": 4.71920110135354e-06, "loss": 0.2514, "step": 2504 }, { "epoch": 1.6264806100924876, "grad_norm": 1.0073038339614868, "learning_rate": 4.712054810136327e-06, "loss": 0.2847, "step": 2506 }, { "epoch": 1.6277786792146682, "grad_norm": 0.8760421872138977, "learning_rate": 4.704909109036387e-06, "loss": 0.2916, "step": 2508 }, { "epoch": 1.629076748336849, "grad_norm": 0.6921551823616028, "learning_rate": 4.6977640126981835e-06, "loss": 0.2634, "step": 2510 }, { "epoch": 1.6303748174590296, "grad_norm": 1.0730628967285156, "learning_rate": 4.690619535764934e-06, "loss": 0.2693, "step": 2512 }, { "epoch": 1.6316728865812105, "grad_norm": 0.719224214553833, "learning_rate": 4.683475692878592e-06, "loss": 0.2713, "step": 2514 }, { "epoch": 1.6329709557033913, "grad_norm": 0.7900041341781616, "learning_rate": 4.676332498679806e-06, "loss": 0.2529, "step": 2516 }, { "epoch": 1.634269024825572, "grad_norm": 0.6915913820266724, "learning_rate": 4.669189967807901e-06, "loss": 0.2482, "step": 2518 }, { "epoch": 1.6355670939477527, "grad_norm": 1.0098533630371094, "learning_rate": 4.662048114900837e-06, "loss": 0.2655, "step": 2520 }, { "epoch": 1.6368651630699333, "grad_norm": 0.7695287466049194, "learning_rate": 4.654906954595186e-06, "loss": 0.2485, "step": 2522 }, { "epoch": 1.6381632321921142, "grad_norm": 0.8169167637825012, "learning_rate": 4.647766501526104e-06, "loss": 0.2513, "step": 2524 }, { "epoch": 1.6394613013142951, "grad_norm": 1.0432205200195312, "learning_rate": 4.640626770327293e-06, "loss": 0.2324, "step": 2526 }, { "epoch": 1.6407593704364758, "grad_norm": 0.9282649159431458, "learning_rate": 4.633487775630981e-06, "loss": 0.2408, "step": 2528 }, { "epoch": 1.6420574395586565, "grad_norm": 1.0055458545684814, "learning_rate": 4.626349532067879e-06, "loss": 0.3035, "step": 2530 }, { "epoch": 1.6433555086808371, "grad_norm": 0.7849979996681213, "learning_rate": 4.6192120542671665e-06, "loss": 0.2976, "step": 2532 }, { "epoch": 1.644653577803018, "grad_norm": 0.7660190463066101, "learning_rate": 4.612075356856447e-06, "loss": 0.293, "step": 2534 }, { "epoch": 1.645951646925199, "grad_norm": 0.8642041087150574, "learning_rate": 4.604939454461732e-06, "loss": 0.2945, "step": 2536 }, { "epoch": 1.6472497160473796, "grad_norm": 0.8616300821304321, "learning_rate": 4.597804361707395e-06, "loss": 0.2309, "step": 2538 }, { "epoch": 1.6485477851695602, "grad_norm": 0.7627285122871399, "learning_rate": 4.5906700932161575e-06, "loss": 0.2467, "step": 2540 }, { "epoch": 1.649845854291741, "grad_norm": 0.9800201654434204, "learning_rate": 4.583536663609049e-06, "loss": 0.312, "step": 2542 }, { "epoch": 1.6511439234139218, "grad_norm": 0.7120026350021362, "learning_rate": 4.576404087505375e-06, "loss": 0.2897, "step": 2544 }, { "epoch": 1.6524419925361027, "grad_norm": 0.9634262323379517, "learning_rate": 4.569272379522704e-06, "loss": 0.3055, "step": 2546 }, { "epoch": 1.6537400616582834, "grad_norm": 0.9010637402534485, "learning_rate": 4.562141554276811e-06, "loss": 0.276, "step": 2548 }, { "epoch": 1.655038130780464, "grad_norm": 0.8885979056358337, "learning_rate": 4.555011626381674e-06, "loss": 0.2735, "step": 2550 }, { "epoch": 1.6563361999026447, "grad_norm": 0.9717615842819214, "learning_rate": 4.5478826104494225e-06, "loss": 0.2741, "step": 2552 }, { "epoch": 1.6576342690248256, "grad_norm": 0.7608319520950317, "learning_rate": 4.540754521090325e-06, "loss": 0.2811, "step": 2554 }, { "epoch": 1.6589323381470065, "grad_norm": 0.6534138321876526, "learning_rate": 4.533627372912744e-06, "loss": 0.2456, "step": 2556 }, { "epoch": 1.6602304072691871, "grad_norm": 0.6963424682617188, "learning_rate": 4.52650118052312e-06, "loss": 0.2737, "step": 2558 }, { "epoch": 1.6615284763913678, "grad_norm": 0.7808846235275269, "learning_rate": 4.519375958525927e-06, "loss": 0.2714, "step": 2560 }, { "epoch": 1.6628265455135485, "grad_norm": 0.6302852034568787, "learning_rate": 4.512251721523659e-06, "loss": 0.259, "step": 2562 }, { "epoch": 1.6641246146357294, "grad_norm": 0.9222784042358398, "learning_rate": 4.5051284841167845e-06, "loss": 0.2871, "step": 2564 }, { "epoch": 1.6654226837579102, "grad_norm": 0.7034717798233032, "learning_rate": 4.498006260903724e-06, "loss": 0.2865, "step": 2566 }, { "epoch": 1.666720752880091, "grad_norm": 0.7048061490058899, "learning_rate": 4.4908850664808245e-06, "loss": 0.275, "step": 2568 }, { "epoch": 1.6680188220022716, "grad_norm": 0.830504834651947, "learning_rate": 4.483764915442318e-06, "loss": 0.2521, "step": 2570 }, { "epoch": 1.6693168911244523, "grad_norm": 0.6978550553321838, "learning_rate": 4.476645822380304e-06, "loss": 0.2506, "step": 2572 }, { "epoch": 1.6706149602466331, "grad_norm": 0.6006783246994019, "learning_rate": 4.46952780188471e-06, "loss": 0.2572, "step": 2574 }, { "epoch": 1.671913029368814, "grad_norm": 0.6841716170310974, "learning_rate": 4.462410868543268e-06, "loss": 0.274, "step": 2576 }, { "epoch": 1.6732110984909947, "grad_norm": 0.785740315914154, "learning_rate": 4.4552950369414785e-06, "loss": 0.2701, "step": 2578 }, { "epoch": 1.6745091676131754, "grad_norm": 0.7412464618682861, "learning_rate": 4.448180321662588e-06, "loss": 0.2673, "step": 2580 }, { "epoch": 1.675807236735356, "grad_norm": 0.9385430812835693, "learning_rate": 4.441066737287553e-06, "loss": 0.255, "step": 2582 }, { "epoch": 1.677105305857537, "grad_norm": 0.9223133325576782, "learning_rate": 4.433954298395012e-06, "loss": 0.2764, "step": 2584 }, { "epoch": 1.6784033749797178, "grad_norm": 1.079526424407959, "learning_rate": 4.426843019561259e-06, "loss": 0.2886, "step": 2586 }, { "epoch": 1.6797014441018985, "grad_norm": 0.6183680295944214, "learning_rate": 4.419732915360203e-06, "loss": 0.2679, "step": 2588 }, { "epoch": 1.6809995132240791, "grad_norm": 0.698919951915741, "learning_rate": 4.4126240003633565e-06, "loss": 0.2615, "step": 2590 }, { "epoch": 1.6822975823462598, "grad_norm": 0.8784648776054382, "learning_rate": 4.405516289139785e-06, "loss": 0.2762, "step": 2592 }, { "epoch": 1.6835956514684407, "grad_norm": 0.9501413702964783, "learning_rate": 4.398409796256096e-06, "loss": 0.3234, "step": 2594 }, { "epoch": 1.6848937205906216, "grad_norm": 0.8343080878257751, "learning_rate": 4.391304536276389e-06, "loss": 0.2732, "step": 2596 }, { "epoch": 1.6861917897128023, "grad_norm": 0.8229643106460571, "learning_rate": 4.384200523762249e-06, "loss": 0.2667, "step": 2598 }, { "epoch": 1.687489858834983, "grad_norm": 1.0842139720916748, "learning_rate": 4.377097773272694e-06, "loss": 0.3088, "step": 2600 }, { "epoch": 1.6887879279571636, "grad_norm": 0.9638790488243103, "learning_rate": 4.369996299364164e-06, "loss": 0.2569, "step": 2602 }, { "epoch": 1.6900859970793445, "grad_norm": 0.7487169504165649, "learning_rate": 4.362896116590475e-06, "loss": 0.2725, "step": 2604 }, { "epoch": 1.6913840662015254, "grad_norm": 0.7520002126693726, "learning_rate": 4.355797239502807e-06, "loss": 0.2715, "step": 2606 }, { "epoch": 1.692682135323706, "grad_norm": 1.0945136547088623, "learning_rate": 4.348699682649652e-06, "loss": 0.2864, "step": 2608 }, { "epoch": 1.6939802044458867, "grad_norm": 0.7077198624610901, "learning_rate": 4.3416034605768035e-06, "loss": 0.2416, "step": 2610 }, { "epoch": 1.6952782735680674, "grad_norm": 0.6732199192047119, "learning_rate": 4.33450858782732e-06, "loss": 0.2529, "step": 2612 }, { "epoch": 1.6965763426902483, "grad_norm": 0.7461745738983154, "learning_rate": 4.327415078941491e-06, "loss": 0.2769, "step": 2614 }, { "epoch": 1.6978744118124292, "grad_norm": 0.8414039015769958, "learning_rate": 4.320322948456814e-06, "loss": 0.285, "step": 2616 }, { "epoch": 1.6991724809346098, "grad_norm": 0.8390476703643799, "learning_rate": 4.313232210907959e-06, "loss": 0.2937, "step": 2618 }, { "epoch": 1.7004705500567905, "grad_norm": 0.7394710183143616, "learning_rate": 4.306142880826746e-06, "loss": 0.2396, "step": 2620 }, { "epoch": 1.7017686191789712, "grad_norm": 0.8404092788696289, "learning_rate": 4.299054972742104e-06, "loss": 0.2964, "step": 2622 }, { "epoch": 1.703066688301152, "grad_norm": 0.7553304433822632, "learning_rate": 4.2919685011800535e-06, "loss": 0.2767, "step": 2624 }, { "epoch": 1.704364757423333, "grad_norm": 0.8209068775177002, "learning_rate": 4.284883480663665e-06, "loss": 0.2821, "step": 2626 }, { "epoch": 1.7056628265455136, "grad_norm": 0.8213204145431519, "learning_rate": 4.277799925713043e-06, "loss": 0.3174, "step": 2628 }, { "epoch": 1.7069608956676943, "grad_norm": 0.7265504598617554, "learning_rate": 4.270717850845279e-06, "loss": 0.2827, "step": 2630 }, { "epoch": 1.708258964789875, "grad_norm": 0.701632022857666, "learning_rate": 4.2636372705744425e-06, "loss": 0.2733, "step": 2632 }, { "epoch": 1.7095570339120558, "grad_norm": 0.7622504234313965, "learning_rate": 4.25655819941153e-06, "loss": 0.2591, "step": 2634 }, { "epoch": 1.7108551030342367, "grad_norm": 0.7175800800323486, "learning_rate": 4.249480651864448e-06, "loss": 0.2492, "step": 2636 }, { "epoch": 1.7121531721564174, "grad_norm": 0.6563390493392944, "learning_rate": 4.242404642437985e-06, "loss": 0.2572, "step": 2638 }, { "epoch": 1.713451241278598, "grad_norm": 0.7086949944496155, "learning_rate": 4.23533018563377e-06, "loss": 0.273, "step": 2640 }, { "epoch": 1.7147493104007787, "grad_norm": 0.7614745497703552, "learning_rate": 4.228257295950258e-06, "loss": 0.2644, "step": 2642 }, { "epoch": 1.7160473795229596, "grad_norm": 0.8298172354698181, "learning_rate": 4.221185987882684e-06, "loss": 0.2743, "step": 2644 }, { "epoch": 1.7173454486451405, "grad_norm": 0.902599573135376, "learning_rate": 4.214116275923051e-06, "loss": 0.302, "step": 2646 }, { "epoch": 1.7186435177673212, "grad_norm": 1.1521806716918945, "learning_rate": 4.207048174560079e-06, "loss": 0.2949, "step": 2648 }, { "epoch": 1.7199415868895018, "grad_norm": 0.7773716449737549, "learning_rate": 4.1999816982792e-06, "loss": 0.2486, "step": 2650 }, { "epoch": 1.7212396560116825, "grad_norm": 0.9626412987709045, "learning_rate": 4.192916861562503e-06, "loss": 0.2829, "step": 2652 }, { "epoch": 1.7225377251338634, "grad_norm": 0.7856587171554565, "learning_rate": 4.1858536788887314e-06, "loss": 0.2953, "step": 2654 }, { "epoch": 1.7238357942560443, "grad_norm": 0.697591245174408, "learning_rate": 4.178792164733225e-06, "loss": 0.2451, "step": 2656 }, { "epoch": 1.725133863378225, "grad_norm": 0.8723238706588745, "learning_rate": 4.171732333567911e-06, "loss": 0.2616, "step": 2658 }, { "epoch": 1.7264319325004056, "grad_norm": 0.7141104340553284, "learning_rate": 4.1646741998612676e-06, "loss": 0.2783, "step": 2660 }, { "epoch": 1.7277300016225863, "grad_norm": 0.8903067111968994, "learning_rate": 4.15761777807829e-06, "loss": 0.2693, "step": 2662 }, { "epoch": 1.7290280707447672, "grad_norm": 0.6853131651878357, "learning_rate": 4.150563082680472e-06, "loss": 0.25, "step": 2664 }, { "epoch": 1.730326139866948, "grad_norm": 0.737055242061615, "learning_rate": 4.143510128125762e-06, "loss": 0.2766, "step": 2666 }, { "epoch": 1.7316242089891287, "grad_norm": 0.6878398060798645, "learning_rate": 4.136458928868546e-06, "loss": 0.2595, "step": 2668 }, { "epoch": 1.7329222781113094, "grad_norm": 0.6739474534988403, "learning_rate": 4.129409499359609e-06, "loss": 0.2848, "step": 2670 }, { "epoch": 1.73422034723349, "grad_norm": 0.7432988882064819, "learning_rate": 4.122361854046112e-06, "loss": 0.2812, "step": 2672 }, { "epoch": 1.735518416355671, "grad_norm": 0.7747724056243896, "learning_rate": 4.115316007371557e-06, "loss": 0.3149, "step": 2674 }, { "epoch": 1.7368164854778518, "grad_norm": 0.752185583114624, "learning_rate": 4.108271973775763e-06, "loss": 0.257, "step": 2676 }, { "epoch": 1.7381145546000325, "grad_norm": 0.7205533981323242, "learning_rate": 4.1012297676948295e-06, "loss": 0.2777, "step": 2678 }, { "epoch": 1.7394126237222132, "grad_norm": 0.9242103099822998, "learning_rate": 4.094189403561112e-06, "loss": 0.3011, "step": 2680 }, { "epoch": 1.7407106928443938, "grad_norm": 0.8891807794570923, "learning_rate": 4.087150895803192e-06, "loss": 0.2612, "step": 2682 }, { "epoch": 1.7420087619665747, "grad_norm": 0.528108537197113, "learning_rate": 4.080114258845846e-06, "loss": 0.2741, "step": 2684 }, { "epoch": 1.7433068310887556, "grad_norm": 0.8966034650802612, "learning_rate": 4.073079507110019e-06, "loss": 0.2686, "step": 2686 }, { "epoch": 1.7446049002109363, "grad_norm": 0.7042858004570007, "learning_rate": 4.066046655012786e-06, "loss": 0.2715, "step": 2688 }, { "epoch": 1.745902969333117, "grad_norm": 0.6862479448318481, "learning_rate": 4.059015716967335e-06, "loss": 0.2473, "step": 2690 }, { "epoch": 1.7472010384552976, "grad_norm": 0.8353973031044006, "learning_rate": 4.051986707382928e-06, "loss": 0.2954, "step": 2692 }, { "epoch": 1.7484991075774785, "grad_norm": 0.6343300342559814, "learning_rate": 4.044959640664878e-06, "loss": 0.2758, "step": 2694 }, { "epoch": 1.7497971766996594, "grad_norm": 0.6705842018127441, "learning_rate": 4.037934531214512e-06, "loss": 0.2385, "step": 2696 }, { "epoch": 1.75109524582184, "grad_norm": 0.6428258419036865, "learning_rate": 4.030911393429149e-06, "loss": 0.2513, "step": 2698 }, { "epoch": 1.7523933149440207, "grad_norm": 0.7831487655639648, "learning_rate": 4.023890241702068e-06, "loss": 0.2638, "step": 2700 }, { "epoch": 1.7536913840662014, "grad_norm": 1.0063369274139404, "learning_rate": 4.016871090422471e-06, "loss": 0.3345, "step": 2702 }, { "epoch": 1.7549894531883823, "grad_norm": 0.6533817648887634, "learning_rate": 4.00985395397547e-06, "loss": 0.2658, "step": 2704 }, { "epoch": 1.7562875223105632, "grad_norm": 0.7007792592048645, "learning_rate": 4.002838846742039e-06, "loss": 0.2506, "step": 2706 }, { "epoch": 1.7575855914327438, "grad_norm": 0.7208094596862793, "learning_rate": 3.995825783099002e-06, "loss": 0.3119, "step": 2708 }, { "epoch": 1.7588836605549245, "grad_norm": 0.6511527895927429, "learning_rate": 3.988814777418985e-06, "loss": 0.288, "step": 2710 }, { "epoch": 1.7601817296771052, "grad_norm": 0.755549430847168, "learning_rate": 3.981805844070407e-06, "loss": 0.2932, "step": 2712 }, { "epoch": 1.761479798799286, "grad_norm": 0.6259021759033203, "learning_rate": 3.97479899741743e-06, "loss": 0.2659, "step": 2714 }, { "epoch": 1.762777867921467, "grad_norm": 0.7175931930541992, "learning_rate": 3.9677942518199465e-06, "loss": 0.2678, "step": 2716 }, { "epoch": 1.7640759370436476, "grad_norm": 0.6321528553962708, "learning_rate": 3.960791621633539e-06, "loss": 0.3125, "step": 2718 }, { "epoch": 1.7653740061658283, "grad_norm": 0.7054442167282104, "learning_rate": 3.953791121209458e-06, "loss": 0.2666, "step": 2720 }, { "epoch": 1.766672075288009, "grad_norm": 0.8013783693313599, "learning_rate": 3.946792764894587e-06, "loss": 0.2754, "step": 2722 }, { "epoch": 1.7679701444101898, "grad_norm": 0.7462177276611328, "learning_rate": 3.939796567031414e-06, "loss": 0.3176, "step": 2724 }, { "epoch": 1.7692682135323707, "grad_norm": 0.7922548055648804, "learning_rate": 3.932802541958008e-06, "loss": 0.242, "step": 2726 }, { "epoch": 1.7705662826545514, "grad_norm": 0.5902887582778931, "learning_rate": 3.92581070400798e-06, "loss": 0.2755, "step": 2728 }, { "epoch": 1.771864351776732, "grad_norm": 0.8159043192863464, "learning_rate": 3.918821067510464e-06, "loss": 0.2818, "step": 2730 }, { "epoch": 1.7731624208989127, "grad_norm": 0.8275859355926514, "learning_rate": 3.911833646790077e-06, "loss": 0.2606, "step": 2732 }, { "epoch": 1.7744604900210936, "grad_norm": 0.8391454815864563, "learning_rate": 3.9048484561668976e-06, "loss": 0.2722, "step": 2734 }, { "epoch": 1.7757585591432745, "grad_norm": 0.9506736397743225, "learning_rate": 3.897865509956434e-06, "loss": 0.2787, "step": 2736 }, { "epoch": 1.7770566282654552, "grad_norm": 1.100533127784729, "learning_rate": 3.890884822469595e-06, "loss": 0.2915, "step": 2738 }, { "epoch": 1.7783546973876359, "grad_norm": 0.6861360669136047, "learning_rate": 3.883906408012659e-06, "loss": 0.2667, "step": 2740 }, { "epoch": 1.7796527665098165, "grad_norm": 0.9265029430389404, "learning_rate": 3.876930280887247e-06, "loss": 0.2786, "step": 2742 }, { "epoch": 1.7809508356319974, "grad_norm": 0.7722248435020447, "learning_rate": 3.869956455390295e-06, "loss": 0.2638, "step": 2744 }, { "epoch": 1.7822489047541783, "grad_norm": 0.6469891667366028, "learning_rate": 3.862984945814013e-06, "loss": 0.2385, "step": 2746 }, { "epoch": 1.783546973876359, "grad_norm": 0.5821743607521057, "learning_rate": 3.856015766445877e-06, "loss": 0.2664, "step": 2748 }, { "epoch": 1.7848450429985396, "grad_norm": 0.9658638834953308, "learning_rate": 3.8490489315685764e-06, "loss": 0.3075, "step": 2750 }, { "epoch": 1.7861431121207203, "grad_norm": 0.8133390545845032, "learning_rate": 3.842084455460007e-06, "loss": 0.2654, "step": 2752 }, { "epoch": 1.7874411812429012, "grad_norm": 0.8790935277938843, "learning_rate": 3.835122352393216e-06, "loss": 0.2715, "step": 2754 }, { "epoch": 1.788739250365082, "grad_norm": 0.8830332159996033, "learning_rate": 3.828162636636404e-06, "loss": 0.2511, "step": 2756 }, { "epoch": 1.7900373194872627, "grad_norm": 0.735605776309967, "learning_rate": 3.821205322452863e-06, "loss": 0.2568, "step": 2758 }, { "epoch": 1.7913353886094434, "grad_norm": 0.7210475206375122, "learning_rate": 3.814250424100978e-06, "loss": 0.2665, "step": 2760 }, { "epoch": 1.792633457731624, "grad_norm": 1.0224586725234985, "learning_rate": 3.8072979558341693e-06, "loss": 0.2838, "step": 2762 }, { "epoch": 1.793931526853805, "grad_norm": 1.085744857788086, "learning_rate": 3.800347931900889e-06, "loss": 0.3261, "step": 2764 }, { "epoch": 1.7952295959759859, "grad_norm": 0.6781581044197083, "learning_rate": 3.79340036654457e-06, "loss": 0.2587, "step": 2766 }, { "epoch": 1.7965276650981665, "grad_norm": 0.8727030158042908, "learning_rate": 3.786455274003612e-06, "loss": 0.2865, "step": 2768 }, { "epoch": 1.7978257342203472, "grad_norm": 0.803399384021759, "learning_rate": 3.779512668511346e-06, "loss": 0.276, "step": 2770 }, { "epoch": 1.7991238033425279, "grad_norm": 0.5774478316307068, "learning_rate": 3.7725725642960047e-06, "loss": 0.2788, "step": 2772 }, { "epoch": 1.8004218724647088, "grad_norm": 0.6861681342124939, "learning_rate": 3.7656349755806978e-06, "loss": 0.2672, "step": 2774 }, { "epoch": 1.8017199415868896, "grad_norm": 0.7429419159889221, "learning_rate": 3.7586999165833743e-06, "loss": 0.2658, "step": 2776 }, { "epoch": 1.8030180107090703, "grad_norm": 0.7567784190177917, "learning_rate": 3.751767401516805e-06, "loss": 0.2643, "step": 2778 }, { "epoch": 1.804316079831251, "grad_norm": 0.65522700548172, "learning_rate": 3.744837444588542e-06, "loss": 0.2576, "step": 2780 }, { "epoch": 1.8056141489534316, "grad_norm": 0.8535295128822327, "learning_rate": 3.737910060000899e-06, "loss": 0.3216, "step": 2782 }, { "epoch": 1.8069122180756125, "grad_norm": 0.857086718082428, "learning_rate": 3.7309852619509136e-06, "loss": 0.2236, "step": 2784 }, { "epoch": 1.8082102871977934, "grad_norm": 0.7500776648521423, "learning_rate": 3.7240630646303262e-06, "loss": 0.2673, "step": 2786 }, { "epoch": 1.809508356319974, "grad_norm": 0.7844164967536926, "learning_rate": 3.717143482225547e-06, "loss": 0.2546, "step": 2788 }, { "epoch": 1.8108064254421548, "grad_norm": 0.7926374673843384, "learning_rate": 3.71022652891762e-06, "loss": 0.2777, "step": 2790 }, { "epoch": 1.8121044945643354, "grad_norm": 1.038275122642517, "learning_rate": 3.7033122188822156e-06, "loss": 0.3107, "step": 2792 }, { "epoch": 1.8134025636865163, "grad_norm": 0.7840460538864136, "learning_rate": 3.696400566289571e-06, "loss": 0.2984, "step": 2794 }, { "epoch": 1.8147006328086972, "grad_norm": 0.6379064917564392, "learning_rate": 3.689491585304491e-06, "loss": 0.2737, "step": 2796 }, { "epoch": 1.8159987019308779, "grad_norm": 0.7306720018386841, "learning_rate": 3.6825852900862922e-06, "loss": 0.2777, "step": 2798 }, { "epoch": 1.8172967710530585, "grad_norm": 0.8343081474304199, "learning_rate": 3.675681694788801e-06, "loss": 0.2641, "step": 2800 }, { "epoch": 1.8185948401752392, "grad_norm": 0.8888320326805115, "learning_rate": 3.6687808135602955e-06, "loss": 0.2589, "step": 2802 }, { "epoch": 1.81989290929742, "grad_norm": 0.725276529788971, "learning_rate": 3.661882660543507e-06, "loss": 0.3076, "step": 2804 }, { "epoch": 1.821190978419601, "grad_norm": 0.725104570388794, "learning_rate": 3.6549872498755614e-06, "loss": 0.3061, "step": 2806 }, { "epoch": 1.8224890475417816, "grad_norm": 0.6446887850761414, "learning_rate": 3.6480945956879765e-06, "loss": 0.2599, "step": 2808 }, { "epoch": 1.8237871166639623, "grad_norm": 0.7069984078407288, "learning_rate": 3.6412047121066115e-06, "loss": 0.286, "step": 2810 }, { "epoch": 1.825085185786143, "grad_norm": 0.723194420337677, "learning_rate": 3.6343176132516513e-06, "loss": 0.2974, "step": 2812 }, { "epoch": 1.8263832549083239, "grad_norm": 0.635728120803833, "learning_rate": 3.627433313237576e-06, "loss": 0.2568, "step": 2814 }, { "epoch": 1.8276813240305048, "grad_norm": 0.9081708788871765, "learning_rate": 3.6205518261731247e-06, "loss": 0.2544, "step": 2816 }, { "epoch": 1.8289793931526854, "grad_norm": 0.6836335062980652, "learning_rate": 3.6136731661612777e-06, "loss": 0.2782, "step": 2818 }, { "epoch": 1.830277462274866, "grad_norm": 0.665378987789154, "learning_rate": 3.606797347299216e-06, "loss": 0.2845, "step": 2820 }, { "epoch": 1.8315755313970468, "grad_norm": 0.7148879170417786, "learning_rate": 3.599924383678301e-06, "loss": 0.2519, "step": 2822 }, { "epoch": 1.8328736005192277, "grad_norm": 0.5942399501800537, "learning_rate": 3.59305428938404e-06, "loss": 0.2538, "step": 2824 }, { "epoch": 1.8341716696414085, "grad_norm": 0.9848334789276123, "learning_rate": 3.5861870784960657e-06, "loss": 0.2738, "step": 2826 }, { "epoch": 1.8354697387635892, "grad_norm": 0.8185152411460876, "learning_rate": 3.5793227650880928e-06, "loss": 0.2767, "step": 2828 }, { "epoch": 1.8367678078857699, "grad_norm": 0.723704993724823, "learning_rate": 3.572461363227906e-06, "loss": 0.2652, "step": 2830 }, { "epoch": 1.8380658770079505, "grad_norm": 0.774885356426239, "learning_rate": 3.565602886977318e-06, "loss": 0.2643, "step": 2832 }, { "epoch": 1.8393639461301314, "grad_norm": 0.9404758214950562, "learning_rate": 3.558747350392146e-06, "loss": 0.2753, "step": 2834 }, { "epoch": 1.8406620152523123, "grad_norm": 0.7198779582977295, "learning_rate": 3.5518947675221865e-06, "loss": 0.2718, "step": 2836 }, { "epoch": 1.841960084374493, "grad_norm": 0.8264615535736084, "learning_rate": 3.545045152411178e-06, "loss": 0.2577, "step": 2838 }, { "epoch": 1.8432581534966737, "grad_norm": 0.6123945713043213, "learning_rate": 3.5381985190967804e-06, "loss": 0.2474, "step": 2840 }, { "epoch": 1.8445562226188543, "grad_norm": 0.7886740565299988, "learning_rate": 3.531354881610539e-06, "loss": 0.2661, "step": 2842 }, { "epoch": 1.8458542917410352, "grad_norm": 0.7328199744224548, "learning_rate": 3.5245142539778655e-06, "loss": 0.2481, "step": 2844 }, { "epoch": 1.847152360863216, "grad_norm": 0.9207030534744263, "learning_rate": 3.517676650217995e-06, "loss": 0.2796, "step": 2846 }, { "epoch": 1.8484504299853968, "grad_norm": 0.7281858921051025, "learning_rate": 3.5108420843439734e-06, "loss": 0.2761, "step": 2848 }, { "epoch": 1.8497484991075774, "grad_norm": 0.8021315932273865, "learning_rate": 3.504010570362612e-06, "loss": 0.2833, "step": 2850 }, { "epoch": 1.851046568229758, "grad_norm": 0.7008223533630371, "learning_rate": 3.4971821222744797e-06, "loss": 0.2786, "step": 2852 }, { "epoch": 1.852344637351939, "grad_norm": 0.9246177673339844, "learning_rate": 3.490356754073849e-06, "loss": 0.2745, "step": 2854 }, { "epoch": 1.8536427064741199, "grad_norm": 0.8540204763412476, "learning_rate": 3.483534479748688e-06, "loss": 0.3045, "step": 2856 }, { "epoch": 1.8549407755963006, "grad_norm": 0.7283492088317871, "learning_rate": 3.476715313280624e-06, "loss": 0.2449, "step": 2858 }, { "epoch": 1.8562388447184812, "grad_norm": 0.7360115051269531, "learning_rate": 3.469899268644913e-06, "loss": 0.2703, "step": 2860 }, { "epoch": 1.8575369138406619, "grad_norm": 0.9895793199539185, "learning_rate": 3.4630863598104137e-06, "loss": 0.28, "step": 2862 }, { "epoch": 1.8588349829628428, "grad_norm": 0.7656643390655518, "learning_rate": 3.4562766007395577e-06, "loss": 0.2899, "step": 2864 }, { "epoch": 1.8601330520850237, "grad_norm": 0.8977930545806885, "learning_rate": 3.4494700053883236e-06, "loss": 0.2631, "step": 2866 }, { "epoch": 1.8614311212072043, "grad_norm": 0.9405370950698853, "learning_rate": 3.442666587706203e-06, "loss": 0.2701, "step": 2868 }, { "epoch": 1.862729190329385, "grad_norm": 0.666086733341217, "learning_rate": 3.4358663616361775e-06, "loss": 0.2466, "step": 2870 }, { "epoch": 1.8640272594515657, "grad_norm": 0.676517128944397, "learning_rate": 3.4290693411146882e-06, "loss": 0.2553, "step": 2872 }, { "epoch": 1.8653253285737466, "grad_norm": 1.009494662284851, "learning_rate": 3.4222755400716044e-06, "loss": 0.2899, "step": 2874 }, { "epoch": 1.8666233976959274, "grad_norm": 0.7408162951469421, "learning_rate": 3.4154849724302e-06, "loss": 0.2894, "step": 2876 }, { "epoch": 1.8679214668181081, "grad_norm": 0.9103520512580872, "learning_rate": 3.408697652107119e-06, "loss": 0.2962, "step": 2878 }, { "epoch": 1.8692195359402888, "grad_norm": 1.039710521697998, "learning_rate": 3.401913593012355e-06, "loss": 0.3098, "step": 2880 }, { "epoch": 1.8705176050624694, "grad_norm": 0.9186581373214722, "learning_rate": 3.395132809049212e-06, "loss": 0.2906, "step": 2882 }, { "epoch": 1.8718156741846503, "grad_norm": 0.9944295287132263, "learning_rate": 3.3883553141142884e-06, "loss": 0.3008, "step": 2884 }, { "epoch": 1.8731137433068312, "grad_norm": 0.6864521503448486, "learning_rate": 3.381581122097437e-06, "loss": 0.2834, "step": 2886 }, { "epoch": 1.874411812429012, "grad_norm": 0.6224085092544556, "learning_rate": 3.3748102468817443e-06, "loss": 0.2737, "step": 2888 }, { "epoch": 1.8757098815511926, "grad_norm": 0.7485184669494629, "learning_rate": 3.368042702343498e-06, "loss": 0.2732, "step": 2890 }, { "epoch": 1.8770079506733732, "grad_norm": 0.8181449770927429, "learning_rate": 3.361278502352161e-06, "loss": 0.2429, "step": 2892 }, { "epoch": 1.8783060197955541, "grad_norm": 0.893894374370575, "learning_rate": 3.35451766077034e-06, "loss": 0.2662, "step": 2894 }, { "epoch": 1.879604088917735, "grad_norm": 0.7209556698799133, "learning_rate": 3.347760191453763e-06, "loss": 0.2861, "step": 2896 }, { "epoch": 1.8809021580399157, "grad_norm": 0.8114523887634277, "learning_rate": 3.3410061082512422e-06, "loss": 0.2741, "step": 2898 }, { "epoch": 1.8822002271620963, "grad_norm": 0.6998618841171265, "learning_rate": 3.334255425004649e-06, "loss": 0.2586, "step": 2900 }, { "epoch": 1.883498296284277, "grad_norm": 0.7631589770317078, "learning_rate": 3.327508155548896e-06, "loss": 0.2606, "step": 2902 }, { "epoch": 1.884796365406458, "grad_norm": 0.9548107981681824, "learning_rate": 3.3207643137118872e-06, "loss": 0.3079, "step": 2904 }, { "epoch": 1.8860944345286388, "grad_norm": 0.7325564026832581, "learning_rate": 3.3140239133145145e-06, "loss": 0.2661, "step": 2906 }, { "epoch": 1.8873925036508195, "grad_norm": 0.9025892615318298, "learning_rate": 3.3072869681706067e-06, "loss": 0.2801, "step": 2908 }, { "epoch": 1.8886905727730001, "grad_norm": 1.025099277496338, "learning_rate": 3.3005534920869175e-06, "loss": 0.2808, "step": 2910 }, { "epoch": 1.8899886418951808, "grad_norm": 0.7414609789848328, "learning_rate": 3.293823498863087e-06, "loss": 0.2824, "step": 2912 }, { "epoch": 1.8912867110173617, "grad_norm": 0.6572422981262207, "learning_rate": 3.2870970022916213e-06, "loss": 0.2834, "step": 2914 }, { "epoch": 1.8925847801395426, "grad_norm": 0.6427735090255737, "learning_rate": 3.2803740161578566e-06, "loss": 0.2683, "step": 2916 }, { "epoch": 1.8938828492617232, "grad_norm": 0.7726417779922485, "learning_rate": 3.27365455423994e-06, "loss": 0.2793, "step": 2918 }, { "epoch": 1.895180918383904, "grad_norm": 1.1550577878952026, "learning_rate": 3.2669386303087907e-06, "loss": 0.3337, "step": 2920 }, { "epoch": 1.8964789875060846, "grad_norm": 0.636168360710144, "learning_rate": 3.2602262581280785e-06, "loss": 0.2798, "step": 2922 }, { "epoch": 1.8977770566282655, "grad_norm": 0.9127852320671082, "learning_rate": 3.2535174514541966e-06, "loss": 0.2866, "step": 2924 }, { "epoch": 1.8990751257504463, "grad_norm": 0.8961077332496643, "learning_rate": 3.2468122240362287e-06, "loss": 0.282, "step": 2926 }, { "epoch": 1.900373194872627, "grad_norm": 0.7684800624847412, "learning_rate": 3.2401105896159245e-06, "loss": 0.275, "step": 2928 }, { "epoch": 1.9016712639948077, "grad_norm": 0.7318122982978821, "learning_rate": 3.233412561927668e-06, "loss": 0.2453, "step": 2930 }, { "epoch": 1.9029693331169883, "grad_norm": 0.669316291809082, "learning_rate": 3.226718154698455e-06, "loss": 0.2663, "step": 2932 }, { "epoch": 1.9042674022391692, "grad_norm": 0.6498284935951233, "learning_rate": 3.220027381647857e-06, "loss": 0.2709, "step": 2934 }, { "epoch": 1.9055654713613501, "grad_norm": 0.7346310019493103, "learning_rate": 3.2133402564880033e-06, "loss": 0.2812, "step": 2936 }, { "epoch": 1.9068635404835308, "grad_norm": 1.1631336212158203, "learning_rate": 3.206656792923542e-06, "loss": 0.3061, "step": 2938 }, { "epoch": 1.9081616096057115, "grad_norm": 0.9604735374450684, "learning_rate": 3.1999770046516198e-06, "loss": 0.2374, "step": 2940 }, { "epoch": 1.9094596787278921, "grad_norm": 0.6796661019325256, "learning_rate": 3.1933009053618523e-06, "loss": 0.2553, "step": 2942 }, { "epoch": 1.910757747850073, "grad_norm": 0.7576373815536499, "learning_rate": 3.186628508736287e-06, "loss": 0.283, "step": 2944 }, { "epoch": 1.912055816972254, "grad_norm": 0.6638970375061035, "learning_rate": 3.179959828449397e-06, "loss": 0.2776, "step": 2946 }, { "epoch": 1.9133538860944346, "grad_norm": 0.834779679775238, "learning_rate": 3.173294878168025e-06, "loss": 0.2652, "step": 2948 }, { "epoch": 1.9146519552166152, "grad_norm": 0.7802719473838806, "learning_rate": 3.1666336715513813e-06, "loss": 0.2588, "step": 2950 }, { "epoch": 1.915950024338796, "grad_norm": 0.6618738770484924, "learning_rate": 3.159976222250992e-06, "loss": 0.237, "step": 2952 }, { "epoch": 1.9172480934609768, "grad_norm": 0.6236530542373657, "learning_rate": 3.1533225439106965e-06, "loss": 0.2831, "step": 2954 }, { "epoch": 1.9185461625831577, "grad_norm": 0.7212898135185242, "learning_rate": 3.146672650166592e-06, "loss": 0.2607, "step": 2956 }, { "epoch": 1.9198442317053384, "grad_norm": 1.160123348236084, "learning_rate": 3.140026554647032e-06, "loss": 0.2671, "step": 2958 }, { "epoch": 1.921142300827519, "grad_norm": 0.80462646484375, "learning_rate": 3.133384270972574e-06, "loss": 0.2661, "step": 2960 }, { "epoch": 1.9224403699496997, "grad_norm": 0.7213415503501892, "learning_rate": 3.1267458127559767e-06, "loss": 0.2483, "step": 2962 }, { "epoch": 1.9237384390718806, "grad_norm": 0.7708879113197327, "learning_rate": 3.1201111936021467e-06, "loss": 0.2642, "step": 2964 }, { "epoch": 1.9250365081940615, "grad_norm": 0.8336282968521118, "learning_rate": 3.1134804271081286e-06, "loss": 0.2714, "step": 2966 }, { "epoch": 1.9263345773162421, "grad_norm": 0.9941635727882385, "learning_rate": 3.106853526863073e-06, "loss": 0.3011, "step": 2968 }, { "epoch": 1.9276326464384228, "grad_norm": 0.7116511464118958, "learning_rate": 3.1002305064482006e-06, "loss": 0.278, "step": 2970 }, { "epoch": 1.9289307155606035, "grad_norm": 0.6688032150268555, "learning_rate": 3.0936113794367884e-06, "loss": 0.2694, "step": 2972 }, { "epoch": 1.9302287846827844, "grad_norm": 0.8202400803565979, "learning_rate": 3.086996159394129e-06, "loss": 0.3062, "step": 2974 }, { "epoch": 1.9315268538049652, "grad_norm": 0.7836296558380127, "learning_rate": 3.08038485987751e-06, "loss": 0.2632, "step": 2976 }, { "epoch": 1.932824922927146, "grad_norm": 0.9554230570793152, "learning_rate": 3.073777494436182e-06, "loss": 0.2622, "step": 2978 }, { "epoch": 1.9341229920493266, "grad_norm": 0.8061546087265015, "learning_rate": 3.067174076611339e-06, "loss": 0.2654, "step": 2980 }, { "epoch": 1.9354210611715073, "grad_norm": 0.8891945481300354, "learning_rate": 3.0605746199360755e-06, "loss": 0.3035, "step": 2982 }, { "epoch": 1.9367191302936881, "grad_norm": 0.6460965871810913, "learning_rate": 3.053979137935378e-06, "loss": 0.2594, "step": 2984 }, { "epoch": 1.938017199415869, "grad_norm": 0.7804474830627441, "learning_rate": 3.0473876441260786e-06, "loss": 0.2955, "step": 2986 }, { "epoch": 1.9393152685380497, "grad_norm": 0.8243818283081055, "learning_rate": 3.040800152016841e-06, "loss": 0.2742, "step": 2988 }, { "epoch": 1.9406133376602304, "grad_norm": 1.0873937606811523, "learning_rate": 3.0342166751081294e-06, "loss": 0.2933, "step": 2990 }, { "epoch": 1.941911406782411, "grad_norm": 0.8437172770500183, "learning_rate": 3.0276372268921694e-06, "loss": 0.3111, "step": 2992 }, { "epoch": 1.943209475904592, "grad_norm": 1.0183093547821045, "learning_rate": 3.021061820852945e-06, "loss": 0.2913, "step": 2994 }, { "epoch": 1.9445075450267728, "grad_norm": 0.6063757538795471, "learning_rate": 3.0144904704661413e-06, "loss": 0.2621, "step": 2996 }, { "epoch": 1.9458056141489535, "grad_norm": 1.3950034379959106, "learning_rate": 3.0079231891991444e-06, "loss": 0.2973, "step": 2998 }, { "epoch": 1.9471036832711341, "grad_norm": 1.1956634521484375, "learning_rate": 3.001359990510991e-06, "loss": 0.2741, "step": 3000 }, { "epoch": 1.9471036832711341, "eval_loss": 0.2778223752975464, "eval_runtime": 397.2435, "eval_samples_per_second": 26.13, "eval_steps_per_second": 3.268, "step": 3000 }, { "epoch": 1.9484017523933148, "grad_norm": 0.7920111417770386, "learning_rate": 2.994800887852359e-06, "loss": 0.2839, "step": 3002 }, { "epoch": 1.9496998215154957, "grad_norm": 0.729102611541748, "learning_rate": 2.9882458946655242e-06, "loss": 0.2539, "step": 3004 }, { "epoch": 1.9509978906376766, "grad_norm": 0.7524065375328064, "learning_rate": 2.981695024384349e-06, "loss": 0.3291, "step": 3006 }, { "epoch": 1.9522959597598573, "grad_norm": 0.6487220525741577, "learning_rate": 2.9751482904342375e-06, "loss": 0.259, "step": 3008 }, { "epoch": 1.953594028882038, "grad_norm": 0.8851397633552551, "learning_rate": 2.9686057062321226e-06, "loss": 0.2597, "step": 3010 }, { "epoch": 1.9548920980042186, "grad_norm": 0.7598209381103516, "learning_rate": 2.9620672851864313e-06, "loss": 0.2794, "step": 3012 }, { "epoch": 1.9561901671263995, "grad_norm": 0.8073711395263672, "learning_rate": 2.9555330406970568e-06, "loss": 0.2657, "step": 3014 }, { "epoch": 1.9574882362485804, "grad_norm": 0.5883775353431702, "learning_rate": 2.949002986155336e-06, "loss": 0.2925, "step": 3016 }, { "epoch": 1.958786305370761, "grad_norm": 0.8856020569801331, "learning_rate": 2.942477134944016e-06, "loss": 0.2935, "step": 3018 }, { "epoch": 1.9600843744929417, "grad_norm": 0.7591295838356018, "learning_rate": 2.935955500437232e-06, "loss": 0.243, "step": 3020 }, { "epoch": 1.9613824436151224, "grad_norm": 0.6521121263504028, "learning_rate": 2.9294380960004743e-06, "loss": 0.2755, "step": 3022 }, { "epoch": 1.9626805127373033, "grad_norm": 1.1657289266586304, "learning_rate": 2.9229249349905686e-06, "loss": 0.2734, "step": 3024 }, { "epoch": 1.9639785818594842, "grad_norm": 0.9355776906013489, "learning_rate": 2.916416030755639e-06, "loss": 0.2854, "step": 3026 }, { "epoch": 1.9652766509816648, "grad_norm": 0.7111332416534424, "learning_rate": 2.9099113966350893e-06, "loss": 0.2612, "step": 3028 }, { "epoch": 1.9665747201038455, "grad_norm": 0.6845047473907471, "learning_rate": 2.9034110459595716e-06, "loss": 0.2754, "step": 3030 }, { "epoch": 1.9678727892260262, "grad_norm": 0.7980267405509949, "learning_rate": 2.8969149920509574e-06, "loss": 0.2891, "step": 3032 }, { "epoch": 1.969170858348207, "grad_norm": 0.8722070455551147, "learning_rate": 2.8904232482223147e-06, "loss": 0.2631, "step": 3034 }, { "epoch": 1.970468927470388, "grad_norm": 1.019988775253296, "learning_rate": 2.8839358277778758e-06, "loss": 0.2803, "step": 3036 }, { "epoch": 1.9717669965925686, "grad_norm": 0.853305995464325, "learning_rate": 2.8774527440130173e-06, "loss": 0.2823, "step": 3038 }, { "epoch": 1.9730650657147493, "grad_norm": 0.9285976886749268, "learning_rate": 2.8709740102142236e-06, "loss": 0.2726, "step": 3040 }, { "epoch": 1.97436313483693, "grad_norm": 0.6591308116912842, "learning_rate": 2.8644996396590657e-06, "loss": 0.2722, "step": 3042 }, { "epoch": 1.9756612039591108, "grad_norm": 0.912419319152832, "learning_rate": 2.858029645616171e-06, "loss": 0.2837, "step": 3044 }, { "epoch": 1.9769592730812917, "grad_norm": 0.7600494027137756, "learning_rate": 2.8515640413452026e-06, "loss": 0.2769, "step": 3046 }, { "epoch": 1.9782573422034724, "grad_norm": 0.662398636341095, "learning_rate": 2.8451028400968233e-06, "loss": 0.273, "step": 3048 }, { "epoch": 1.979555411325653, "grad_norm": 0.9580662846565247, "learning_rate": 2.8386460551126716e-06, "loss": 0.2722, "step": 3050 }, { "epoch": 1.9808534804478337, "grad_norm": 0.734804630279541, "learning_rate": 2.8321936996253368e-06, "loss": 0.26, "step": 3052 }, { "epoch": 1.9821515495700146, "grad_norm": 0.8870983123779297, "learning_rate": 2.8257457868583283e-06, "loss": 0.2546, "step": 3054 }, { "epoch": 1.9834496186921955, "grad_norm": 0.8266785144805908, "learning_rate": 2.8193023300260565e-06, "loss": 0.2937, "step": 3056 }, { "epoch": 1.9847476878143762, "grad_norm": 0.680275559425354, "learning_rate": 2.8128633423337932e-06, "loss": 0.2691, "step": 3058 }, { "epoch": 1.9860457569365568, "grad_norm": 0.8035503029823303, "learning_rate": 2.806428836977654e-06, "loss": 0.2721, "step": 3060 }, { "epoch": 1.9873438260587375, "grad_norm": 0.6094067096710205, "learning_rate": 2.7999988271445643e-06, "loss": 0.2658, "step": 3062 }, { "epoch": 1.9886418951809184, "grad_norm": 0.9088681936264038, "learning_rate": 2.7935733260122467e-06, "loss": 0.2387, "step": 3064 }, { "epoch": 1.9899399643030993, "grad_norm": 0.9523714184761047, "learning_rate": 2.787152346749173e-06, "loss": 0.2961, "step": 3066 }, { "epoch": 1.99123803342528, "grad_norm": 0.8670559525489807, "learning_rate": 2.780735902514552e-06, "loss": 0.2645, "step": 3068 }, { "epoch": 1.9925361025474606, "grad_norm": 0.9355019330978394, "learning_rate": 2.774324006458296e-06, "loss": 0.2813, "step": 3070 }, { "epoch": 1.9938341716696413, "grad_norm": 0.7674852609634399, "learning_rate": 2.7679166717210027e-06, "loss": 0.2564, "step": 3072 }, { "epoch": 1.9951322407918222, "grad_norm": 0.9387212991714478, "learning_rate": 2.7615139114339175e-06, "loss": 0.2708, "step": 3074 }, { "epoch": 1.996430309914003, "grad_norm": 0.7877328395843506, "learning_rate": 2.7551157387189033e-06, "loss": 0.2948, "step": 3076 }, { "epoch": 1.9977283790361837, "grad_norm": 0.8506510257720947, "learning_rate": 2.748722166688437e-06, "loss": 0.2682, "step": 3078 }, { "epoch": 1.9990264481583644, "grad_norm": 0.9709959030151367, "learning_rate": 2.7423332084455543e-06, "loss": 0.3032, "step": 3080 }, { "epoch": 2.000324517280545, "grad_norm": 0.7936059832572937, "learning_rate": 2.7359488770838415e-06, "loss": 0.2602, "step": 3082 }, { "epoch": 2.0016225864027257, "grad_norm": 0.6516163349151611, "learning_rate": 2.7295691856873973e-06, "loss": 0.266, "step": 3084 }, { "epoch": 2.002920655524907, "grad_norm": 0.8180418014526367, "learning_rate": 2.723194147330819e-06, "loss": 0.2627, "step": 3086 }, { "epoch": 2.0042187246470875, "grad_norm": 0.8955652713775635, "learning_rate": 2.7168237750791602e-06, "loss": 0.2643, "step": 3088 }, { "epoch": 2.005516793769268, "grad_norm": 0.8380647897720337, "learning_rate": 2.710458081987918e-06, "loss": 0.2463, "step": 3090 }, { "epoch": 2.006814862891449, "grad_norm": 0.9310481548309326, "learning_rate": 2.7040970811029916e-06, "loss": 0.2654, "step": 3092 }, { "epoch": 2.0081129320136295, "grad_norm": 0.8562743067741394, "learning_rate": 2.697740785460675e-06, "loss": 0.261, "step": 3094 }, { "epoch": 2.0094110011358106, "grad_norm": 0.8164252042770386, "learning_rate": 2.6913892080876116e-06, "loss": 0.3146, "step": 3096 }, { "epoch": 2.0107090702579913, "grad_norm": 0.8310542702674866, "learning_rate": 2.685042362000776e-06, "loss": 0.2625, "step": 3098 }, { "epoch": 2.012007139380172, "grad_norm": 0.7564728856086731, "learning_rate": 2.678700260207449e-06, "loss": 0.2417, "step": 3100 }, { "epoch": 2.0133052085023526, "grad_norm": 0.7017265558242798, "learning_rate": 2.6723629157051844e-06, "loss": 0.2722, "step": 3102 }, { "epoch": 2.0146032776245333, "grad_norm": 0.7249272465705872, "learning_rate": 2.666030341481792e-06, "loss": 0.2618, "step": 3104 }, { "epoch": 2.0159013467467144, "grad_norm": 0.89203941822052, "learning_rate": 2.6597025505153017e-06, "loss": 0.2662, "step": 3106 }, { "epoch": 2.017199415868895, "grad_norm": 1.2427937984466553, "learning_rate": 2.6533795557739407e-06, "loss": 0.2589, "step": 3108 }, { "epoch": 2.0184974849910757, "grad_norm": 0.9459607601165771, "learning_rate": 2.6470613702161047e-06, "loss": 0.2776, "step": 3110 }, { "epoch": 2.0197955541132564, "grad_norm": 1.0149006843566895, "learning_rate": 2.64074800679034e-06, "loss": 0.2584, "step": 3112 }, { "epoch": 2.021093623235437, "grad_norm": 0.6701650619506836, "learning_rate": 2.634439478435305e-06, "loss": 0.2662, "step": 3114 }, { "epoch": 2.022391692357618, "grad_norm": 0.773186981678009, "learning_rate": 2.6281357980797493e-06, "loss": 0.2615, "step": 3116 }, { "epoch": 2.023689761479799, "grad_norm": 0.5617316961288452, "learning_rate": 2.621836978642489e-06, "loss": 0.2392, "step": 3118 }, { "epoch": 2.0249878306019795, "grad_norm": 0.96043860912323, "learning_rate": 2.6155430330323756e-06, "loss": 0.2613, "step": 3120 }, { "epoch": 2.02628589972416, "grad_norm": 0.7758943438529968, "learning_rate": 2.609253974148278e-06, "loss": 0.2582, "step": 3122 }, { "epoch": 2.027583968846341, "grad_norm": 0.7272630333900452, "learning_rate": 2.6029698148790392e-06, "loss": 0.2844, "step": 3124 }, { "epoch": 2.028882037968522, "grad_norm": 0.9311795234680176, "learning_rate": 2.596690568103474e-06, "loss": 0.2775, "step": 3126 }, { "epoch": 2.0301801070907026, "grad_norm": 0.7327467799186707, "learning_rate": 2.590416246690319e-06, "loss": 0.2682, "step": 3128 }, { "epoch": 2.0314781762128833, "grad_norm": 0.6938290596008301, "learning_rate": 2.584146863498226e-06, "loss": 0.3005, "step": 3130 }, { "epoch": 2.032776245335064, "grad_norm": 0.8273631930351257, "learning_rate": 2.5778824313757136e-06, "loss": 0.2775, "step": 3132 }, { "epoch": 2.0340743144572446, "grad_norm": 0.9417901635169983, "learning_rate": 2.571622963161168e-06, "loss": 0.2496, "step": 3134 }, { "epoch": 2.0353723835794257, "grad_norm": 0.7254298329353333, "learning_rate": 2.5653684716827904e-06, "loss": 0.2411, "step": 3136 }, { "epoch": 2.0366704527016064, "grad_norm": 0.8049398064613342, "learning_rate": 2.559118969758595e-06, "loss": 0.2608, "step": 3138 }, { "epoch": 2.037968521823787, "grad_norm": 0.8243045210838318, "learning_rate": 2.5528744701963563e-06, "loss": 0.282, "step": 3140 }, { "epoch": 2.0392665909459677, "grad_norm": 1.036625862121582, "learning_rate": 2.5466349857936047e-06, "loss": 0.2733, "step": 3142 }, { "epoch": 2.0405646600681484, "grad_norm": 0.891358494758606, "learning_rate": 2.5404005293375955e-06, "loss": 0.2865, "step": 3144 }, { "epoch": 2.0418627291903295, "grad_norm": 0.8360962271690369, "learning_rate": 2.5341711136052728e-06, "loss": 0.2655, "step": 3146 }, { "epoch": 2.04316079831251, "grad_norm": 1.049869418144226, "learning_rate": 2.5279467513632537e-06, "loss": 0.2551, "step": 3148 }, { "epoch": 2.044458867434691, "grad_norm": 0.7041304111480713, "learning_rate": 2.5217274553677975e-06, "loss": 0.2483, "step": 3150 }, { "epoch": 2.0457569365568715, "grad_norm": 0.8953042030334473, "learning_rate": 2.5155132383647835e-06, "loss": 0.2499, "step": 3152 }, { "epoch": 2.047055005679052, "grad_norm": 0.7215287089347839, "learning_rate": 2.509304113089679e-06, "loss": 0.2665, "step": 3154 }, { "epoch": 2.0483530748012333, "grad_norm": 0.9666706919670105, "learning_rate": 2.5031000922675164e-06, "loss": 0.2914, "step": 3156 }, { "epoch": 2.049651143923414, "grad_norm": 0.8082274794578552, "learning_rate": 2.496901188612866e-06, "loss": 0.2408, "step": 3158 }, { "epoch": 2.0509492130455946, "grad_norm": 0.8339118361473083, "learning_rate": 2.4907074148298167e-06, "loss": 0.2526, "step": 3160 }, { "epoch": 2.0522472821677753, "grad_norm": 1.1327399015426636, "learning_rate": 2.4845187836119404e-06, "loss": 0.2894, "step": 3162 }, { "epoch": 2.053545351289956, "grad_norm": 0.7870436310768127, "learning_rate": 2.478335307642264e-06, "loss": 0.2533, "step": 3164 }, { "epoch": 2.054843420412137, "grad_norm": 1.2192871570587158, "learning_rate": 2.472156999593259e-06, "loss": 0.2974, "step": 3166 }, { "epoch": 2.0561414895343177, "grad_norm": 1.3366024494171143, "learning_rate": 2.4659838721268005e-06, "loss": 0.2667, "step": 3168 }, { "epoch": 2.0574395586564984, "grad_norm": 1.1170718669891357, "learning_rate": 2.4598159378941507e-06, "loss": 0.2828, "step": 3170 }, { "epoch": 2.058737627778679, "grad_norm": 1.0761653184890747, "learning_rate": 2.4536532095359185e-06, "loss": 0.2748, "step": 3172 }, { "epoch": 2.0600356969008597, "grad_norm": 0.802746057510376, "learning_rate": 2.4474956996820564e-06, "loss": 0.2395, "step": 3174 }, { "epoch": 2.061333766023041, "grad_norm": 0.892058789730072, "learning_rate": 2.4413434209518137e-06, "loss": 0.2343, "step": 3176 }, { "epoch": 2.0626318351452215, "grad_norm": 0.8063054084777832, "learning_rate": 2.435196385953727e-06, "loss": 0.2663, "step": 3178 }, { "epoch": 2.063929904267402, "grad_norm": 0.6660009622573853, "learning_rate": 2.4290546072855733e-06, "loss": 0.283, "step": 3180 }, { "epoch": 2.065227973389583, "grad_norm": 1.094445824623108, "learning_rate": 2.4229180975343702e-06, "loss": 0.2816, "step": 3182 }, { "epoch": 2.0665260425117635, "grad_norm": 1.0832817554473877, "learning_rate": 2.4167868692763314e-06, "loss": 0.2757, "step": 3184 }, { "epoch": 2.0678241116339446, "grad_norm": 0.9673899412155151, "learning_rate": 2.410660935076846e-06, "loss": 0.3478, "step": 3186 }, { "epoch": 2.0691221807561253, "grad_norm": 0.7864675521850586, "learning_rate": 2.404540307490456e-06, "loss": 0.3163, "step": 3188 }, { "epoch": 2.070420249878306, "grad_norm": 0.8471720218658447, "learning_rate": 2.3984249990608237e-06, "loss": 0.2687, "step": 3190 }, { "epoch": 2.0717183190004866, "grad_norm": 0.6562231779098511, "learning_rate": 2.3923150223207176e-06, "loss": 0.2628, "step": 3192 }, { "epoch": 2.0730163881226673, "grad_norm": 0.7097077965736389, "learning_rate": 2.3862103897919724e-06, "loss": 0.2622, "step": 3194 }, { "epoch": 2.0743144572448484, "grad_norm": 0.8782818913459778, "learning_rate": 2.3801111139854744e-06, "loss": 0.265, "step": 3196 }, { "epoch": 2.075612526367029, "grad_norm": 0.7384313941001892, "learning_rate": 2.3740172074011268e-06, "loss": 0.2983, "step": 3198 }, { "epoch": 2.0769105954892098, "grad_norm": 0.8707220554351807, "learning_rate": 2.367928682527838e-06, "loss": 0.2668, "step": 3200 }, { "epoch": 2.0782086646113904, "grad_norm": 0.8706156611442566, "learning_rate": 2.3618455518434785e-06, "loss": 0.2747, "step": 3202 }, { "epoch": 2.079506733733571, "grad_norm": 0.7681968212127686, "learning_rate": 2.3557678278148675e-06, "loss": 0.2707, "step": 3204 }, { "epoch": 2.080804802855752, "grad_norm": 0.6772947311401367, "learning_rate": 2.3496955228977437e-06, "loss": 0.2685, "step": 3206 }, { "epoch": 2.082102871977933, "grad_norm": 0.8043773174285889, "learning_rate": 2.343628649536737e-06, "loss": 0.2678, "step": 3208 }, { "epoch": 2.0834009411001135, "grad_norm": 0.8824250102043152, "learning_rate": 2.337567220165353e-06, "loss": 0.2678, "step": 3210 }, { "epoch": 2.084699010222294, "grad_norm": 1.1022475957870483, "learning_rate": 2.331511247205933e-06, "loss": 0.2483, "step": 3212 }, { "epoch": 2.085997079344475, "grad_norm": 0.9529703259468079, "learning_rate": 2.3254607430696393e-06, "loss": 0.2752, "step": 3214 }, { "epoch": 2.087295148466656, "grad_norm": 0.6713064312934875, "learning_rate": 2.319415720156422e-06, "loss": 0.2422, "step": 3216 }, { "epoch": 2.0885932175888366, "grad_norm": 1.3775089979171753, "learning_rate": 2.313376190855009e-06, "loss": 0.2887, "step": 3218 }, { "epoch": 2.0898912867110173, "grad_norm": 0.6152358651161194, "learning_rate": 2.307342167542854e-06, "loss": 0.266, "step": 3220 }, { "epoch": 2.091189355833198, "grad_norm": 1.066241979598999, "learning_rate": 2.3013136625861397e-06, "loss": 0.2566, "step": 3222 }, { "epoch": 2.0924874249553786, "grad_norm": 1.37330162525177, "learning_rate": 2.2952906883397317e-06, "loss": 0.2735, "step": 3224 }, { "epoch": 2.0937854940775598, "grad_norm": 0.7524691820144653, "learning_rate": 2.289273257147169e-06, "loss": 0.2345, "step": 3226 }, { "epoch": 2.0950835631997404, "grad_norm": 0.7679069638252258, "learning_rate": 2.2832613813406197e-06, "loss": 0.2645, "step": 3228 }, { "epoch": 2.096381632321921, "grad_norm": 1.000174641609192, "learning_rate": 2.2772550732408718e-06, "loss": 0.2768, "step": 3230 }, { "epoch": 2.0976797014441018, "grad_norm": 0.8310347199440002, "learning_rate": 2.271254345157307e-06, "loss": 0.2859, "step": 3232 }, { "epoch": 2.0989777705662824, "grad_norm": 0.752550482749939, "learning_rate": 2.265259209387867e-06, "loss": 0.2599, "step": 3234 }, { "epoch": 2.1002758396884635, "grad_norm": 0.7717733383178711, "learning_rate": 2.2592696782190317e-06, "loss": 0.2774, "step": 3236 }, { "epoch": 2.101573908810644, "grad_norm": 1.1514959335327148, "learning_rate": 2.2532857639257945e-06, "loss": 0.2707, "step": 3238 }, { "epoch": 2.102871977932825, "grad_norm": 0.7649049162864685, "learning_rate": 2.247307478771643e-06, "loss": 0.2651, "step": 3240 }, { "epoch": 2.1041700470550055, "grad_norm": 0.8774634599685669, "learning_rate": 2.241334835008524e-06, "loss": 0.2611, "step": 3242 }, { "epoch": 2.105468116177186, "grad_norm": 0.7372756600379944, "learning_rate": 2.2353678448768223e-06, "loss": 0.2691, "step": 3244 }, { "epoch": 2.1067661852993673, "grad_norm": 0.8805445432662964, "learning_rate": 2.229406520605336e-06, "loss": 0.2967, "step": 3246 }, { "epoch": 2.108064254421548, "grad_norm": 0.8618983626365662, "learning_rate": 2.2234508744112564e-06, "loss": 0.2758, "step": 3248 }, { "epoch": 2.1093623235437287, "grad_norm": 0.7669619917869568, "learning_rate": 2.217500918500133e-06, "loss": 0.2688, "step": 3250 }, { "epoch": 2.1106603926659093, "grad_norm": 0.9338579773902893, "learning_rate": 2.211556665065854e-06, "loss": 0.28, "step": 3252 }, { "epoch": 2.11195846178809, "grad_norm": 0.6930002570152283, "learning_rate": 2.205618126290623e-06, "loss": 0.2752, "step": 3254 }, { "epoch": 2.113256530910271, "grad_norm": 0.6933246850967407, "learning_rate": 2.1996853143449285e-06, "loss": 0.2759, "step": 3256 }, { "epoch": 2.1145546000324518, "grad_norm": 0.8742470741271973, "learning_rate": 2.193758241387529e-06, "loss": 0.252, "step": 3258 }, { "epoch": 2.1158526691546324, "grad_norm": 0.7028313279151917, "learning_rate": 2.187836919565416e-06, "loss": 0.266, "step": 3260 }, { "epoch": 2.117150738276813, "grad_norm": 0.8576483130455017, "learning_rate": 2.181921361013794e-06, "loss": 0.2854, "step": 3262 }, { "epoch": 2.1184488073989938, "grad_norm": 0.9929690957069397, "learning_rate": 2.176011577856058e-06, "loss": 0.261, "step": 3264 }, { "epoch": 2.119746876521175, "grad_norm": 0.9712622761726379, "learning_rate": 2.1701075822037703e-06, "loss": 0.2664, "step": 3266 }, { "epoch": 2.1210449456433556, "grad_norm": 0.8639007210731506, "learning_rate": 2.1642093861566265e-06, "loss": 0.2673, "step": 3268 }, { "epoch": 2.122343014765536, "grad_norm": 1.0651408433914185, "learning_rate": 2.158317001802439e-06, "loss": 0.2955, "step": 3270 }, { "epoch": 2.123641083887717, "grad_norm": 0.8414129018783569, "learning_rate": 2.15243044121711e-06, "loss": 0.2704, "step": 3272 }, { "epoch": 2.1249391530098976, "grad_norm": 0.9708347916603088, "learning_rate": 2.146549716464603e-06, "loss": 0.2669, "step": 3274 }, { "epoch": 2.1262372221320787, "grad_norm": 0.5966877937316895, "learning_rate": 2.140674839596931e-06, "loss": 0.2788, "step": 3276 }, { "epoch": 2.1275352912542593, "grad_norm": 0.76587975025177, "learning_rate": 2.1348058226541072e-06, "loss": 0.2626, "step": 3278 }, { "epoch": 2.12883336037644, "grad_norm": 0.6742775440216064, "learning_rate": 2.1289426776641507e-06, "loss": 0.2682, "step": 3280 }, { "epoch": 2.1301314294986207, "grad_norm": 1.5974807739257812, "learning_rate": 2.1230854166430374e-06, "loss": 0.2793, "step": 3282 }, { "epoch": 2.1314294986208013, "grad_norm": 1.0378100872039795, "learning_rate": 2.1172340515946872e-06, "loss": 0.2653, "step": 3284 }, { "epoch": 2.1327275677429824, "grad_norm": 0.804088294506073, "learning_rate": 2.1113885945109334e-06, "loss": 0.2675, "step": 3286 }, { "epoch": 2.134025636865163, "grad_norm": 0.8223013877868652, "learning_rate": 2.10554905737151e-06, "loss": 0.2668, "step": 3288 }, { "epoch": 2.135323705987344, "grad_norm": 0.8943898677825928, "learning_rate": 2.09971545214401e-06, "loss": 0.2811, "step": 3290 }, { "epoch": 2.1366217751095244, "grad_norm": 0.7888131737709045, "learning_rate": 2.093887790783873e-06, "loss": 0.2488, "step": 3292 }, { "epoch": 2.137919844231705, "grad_norm": 0.7093624472618103, "learning_rate": 2.088066085234357e-06, "loss": 0.2577, "step": 3294 }, { "epoch": 2.1392179133538862, "grad_norm": 0.8687063455581665, "learning_rate": 2.082250347426513e-06, "loss": 0.274, "step": 3296 }, { "epoch": 2.140515982476067, "grad_norm": 0.8763734102249146, "learning_rate": 2.0764405892791637e-06, "loss": 0.2707, "step": 3298 }, { "epoch": 2.1418140515982476, "grad_norm": 0.7043830156326294, "learning_rate": 2.0706368226988772e-06, "loss": 0.2649, "step": 3300 }, { "epoch": 2.1431121207204282, "grad_norm": 0.8492320775985718, "learning_rate": 2.064839059579939e-06, "loss": 0.273, "step": 3302 }, { "epoch": 2.144410189842609, "grad_norm": 0.7296220660209656, "learning_rate": 2.0590473118043326e-06, "loss": 0.2746, "step": 3304 }, { "epoch": 2.14570825896479, "grad_norm": 0.9824730157852173, "learning_rate": 2.053261591241717e-06, "loss": 0.2578, "step": 3306 }, { "epoch": 2.1470063280869707, "grad_norm": 0.9059824347496033, "learning_rate": 2.047481909749395e-06, "loss": 0.2731, "step": 3308 }, { "epoch": 2.1483043972091513, "grad_norm": 0.6464698314666748, "learning_rate": 2.041708279172294e-06, "loss": 0.2665, "step": 3310 }, { "epoch": 2.149602466331332, "grad_norm": 0.7821316719055176, "learning_rate": 2.0359407113429385e-06, "loss": 0.2779, "step": 3312 }, { "epoch": 2.1509005354535127, "grad_norm": 0.8479833602905273, "learning_rate": 2.0301792180814344e-06, "loss": 0.283, "step": 3314 }, { "epoch": 2.152198604575694, "grad_norm": 0.8406021595001221, "learning_rate": 2.024423811195434e-06, "loss": 0.2415, "step": 3316 }, { "epoch": 2.1534966736978745, "grad_norm": 1.0872950553894043, "learning_rate": 2.01867450248011e-06, "loss": 0.2928, "step": 3318 }, { "epoch": 2.154794742820055, "grad_norm": 0.7076984643936157, "learning_rate": 2.01293130371815e-06, "loss": 0.2859, "step": 3320 }, { "epoch": 2.156092811942236, "grad_norm": 0.885038435459137, "learning_rate": 2.00719422667971e-06, "loss": 0.2626, "step": 3322 }, { "epoch": 2.1573908810644165, "grad_norm": 1.0061466693878174, "learning_rate": 2.001463283122408e-06, "loss": 0.2611, "step": 3324 }, { "epoch": 2.1586889501865976, "grad_norm": 0.8061853647232056, "learning_rate": 1.995738484791281e-06, "loss": 0.3256, "step": 3326 }, { "epoch": 2.1599870193087782, "grad_norm": 1.1084691286087036, "learning_rate": 1.9900198434187838e-06, "loss": 0.2752, "step": 3328 }, { "epoch": 2.161285088430959, "grad_norm": 0.6730486750602722, "learning_rate": 1.984307370724744e-06, "loss": 0.2769, "step": 3330 }, { "epoch": 2.1625831575531396, "grad_norm": 0.7870195508003235, "learning_rate": 1.978601078416357e-06, "loss": 0.2388, "step": 3332 }, { "epoch": 2.1638812266753202, "grad_norm": 0.8111714124679565, "learning_rate": 1.972900978188137e-06, "loss": 0.2708, "step": 3334 }, { "epoch": 2.1651792957975013, "grad_norm": 0.6921493411064148, "learning_rate": 1.9672070817219242e-06, "loss": 0.2843, "step": 3336 }, { "epoch": 2.166477364919682, "grad_norm": 0.8148288130760193, "learning_rate": 1.9615194006868347e-06, "loss": 0.284, "step": 3338 }, { "epoch": 2.1677754340418627, "grad_norm": 0.748518168926239, "learning_rate": 1.9558379467392503e-06, "loss": 0.2709, "step": 3340 }, { "epoch": 2.1690735031640433, "grad_norm": 0.8002697229385376, "learning_rate": 1.9501627315227887e-06, "loss": 0.2822, "step": 3342 }, { "epoch": 2.170371572286224, "grad_norm": 0.8966932892799377, "learning_rate": 1.9444937666682834e-06, "loss": 0.2502, "step": 3344 }, { "epoch": 2.171669641408405, "grad_norm": 0.8819925785064697, "learning_rate": 1.9388310637937606e-06, "loss": 0.2594, "step": 3346 }, { "epoch": 2.172967710530586, "grad_norm": 0.8028013110160828, "learning_rate": 1.9331746345044095e-06, "loss": 0.3132, "step": 3348 }, { "epoch": 2.1742657796527665, "grad_norm": 0.7339478135108948, "learning_rate": 1.9275244903925632e-06, "loss": 0.2747, "step": 3350 }, { "epoch": 2.175563848774947, "grad_norm": 0.7158710956573486, "learning_rate": 1.921880643037673e-06, "loss": 0.2959, "step": 3352 }, { "epoch": 2.176861917897128, "grad_norm": 0.8162385821342468, "learning_rate": 1.9162431040062897e-06, "loss": 0.2702, "step": 3354 }, { "epoch": 2.178159987019309, "grad_norm": 0.7591844797134399, "learning_rate": 1.910611884852031e-06, "loss": 0.2829, "step": 3356 }, { "epoch": 2.1794580561414896, "grad_norm": 0.6242380142211914, "learning_rate": 1.9049869971155655e-06, "loss": 0.2555, "step": 3358 }, { "epoch": 2.1807561252636702, "grad_norm": 0.7767975926399231, "learning_rate": 1.8993684523245842e-06, "loss": 0.2585, "step": 3360 }, { "epoch": 2.182054194385851, "grad_norm": 1.2382186651229858, "learning_rate": 1.8937562619937783e-06, "loss": 0.2915, "step": 3362 }, { "epoch": 2.1833522635080316, "grad_norm": 0.6860133409500122, "learning_rate": 1.8881504376248222e-06, "loss": 0.2548, "step": 3364 }, { "epoch": 2.1846503326302127, "grad_norm": 0.9623684883117676, "learning_rate": 1.8825509907063328e-06, "loss": 0.2938, "step": 3366 }, { "epoch": 2.1859484017523934, "grad_norm": 0.7736422419548035, "learning_rate": 1.8769579327138682e-06, "loss": 0.2704, "step": 3368 }, { "epoch": 2.187246470874574, "grad_norm": 0.9744384288787842, "learning_rate": 1.871371275109885e-06, "loss": 0.2251, "step": 3370 }, { "epoch": 2.1885445399967547, "grad_norm": 0.793778121471405, "learning_rate": 1.865791029343731e-06, "loss": 0.2466, "step": 3372 }, { "epoch": 2.1898426091189354, "grad_norm": 0.7892866730690002, "learning_rate": 1.8602172068516011e-06, "loss": 0.2739, "step": 3374 }, { "epoch": 2.1911406782411165, "grad_norm": 1.2817727327346802, "learning_rate": 1.8546498190565399e-06, "loss": 0.2697, "step": 3376 }, { "epoch": 2.192438747363297, "grad_norm": 0.8368136286735535, "learning_rate": 1.8490888773683935e-06, "loss": 0.2844, "step": 3378 }, { "epoch": 2.193736816485478, "grad_norm": 1.3145065307617188, "learning_rate": 1.843534393183809e-06, "loss": 0.3197, "step": 3380 }, { "epoch": 2.1950348856076585, "grad_norm": 1.037109136581421, "learning_rate": 1.8379863778861862e-06, "loss": 0.2584, "step": 3382 }, { "epoch": 2.196332954729839, "grad_norm": 0.9381075501441956, "learning_rate": 1.8324448428456742e-06, "loss": 0.2387, "step": 3384 }, { "epoch": 2.1976310238520202, "grad_norm": 0.9714658856391907, "learning_rate": 1.8269097994191448e-06, "loss": 0.3058, "step": 3386 }, { "epoch": 2.198929092974201, "grad_norm": 0.6904110908508301, "learning_rate": 1.8213812589501611e-06, "loss": 0.2861, "step": 3388 }, { "epoch": 2.2002271620963816, "grad_norm": 0.7859236001968384, "learning_rate": 1.815859232768959e-06, "loss": 0.2467, "step": 3390 }, { "epoch": 2.2015252312185623, "grad_norm": 1.1572074890136719, "learning_rate": 1.810343732192424e-06, "loss": 0.2457, "step": 3392 }, { "epoch": 2.202823300340743, "grad_norm": 0.7067339420318604, "learning_rate": 1.8048347685240724e-06, "loss": 0.2599, "step": 3394 }, { "epoch": 2.204121369462924, "grad_norm": 0.7361140251159668, "learning_rate": 1.7993323530540185e-06, "loss": 0.2689, "step": 3396 }, { "epoch": 2.2054194385851047, "grad_norm": 0.7630223631858826, "learning_rate": 1.7938364970589584e-06, "loss": 0.263, "step": 3398 }, { "epoch": 2.2067175077072854, "grad_norm": 1.1738401651382446, "learning_rate": 1.7883472118021433e-06, "loss": 0.2558, "step": 3400 }, { "epoch": 2.208015576829466, "grad_norm": 0.9067807197570801, "learning_rate": 1.7828645085333645e-06, "loss": 0.261, "step": 3402 }, { "epoch": 2.2093136459516467, "grad_norm": 0.8910717964172363, "learning_rate": 1.7773883984889178e-06, "loss": 0.281, "step": 3404 }, { "epoch": 2.210611715073828, "grad_norm": 0.965601921081543, "learning_rate": 1.7719188928915882e-06, "loss": 0.2535, "step": 3406 }, { "epoch": 2.2119097841960085, "grad_norm": 0.9750633239746094, "learning_rate": 1.7664560029506268e-06, "loss": 0.2876, "step": 3408 }, { "epoch": 2.213207853318189, "grad_norm": 0.8328800201416016, "learning_rate": 1.760999739861724e-06, "loss": 0.2769, "step": 3410 }, { "epoch": 2.21450592244037, "grad_norm": 0.7959421873092651, "learning_rate": 1.7555501148069942e-06, "loss": 0.2634, "step": 3412 }, { "epoch": 2.2158039915625505, "grad_norm": 0.8195016980171204, "learning_rate": 1.7501071389549429e-06, "loss": 0.2779, "step": 3414 }, { "epoch": 2.2171020606847316, "grad_norm": 1.1042840480804443, "learning_rate": 1.7446708234604498e-06, "loss": 0.2676, "step": 3416 }, { "epoch": 2.2184001298069123, "grad_norm": 0.752224862575531, "learning_rate": 1.7392411794647445e-06, "loss": 0.2387, "step": 3418 }, { "epoch": 2.219698198929093, "grad_norm": 0.7471106648445129, "learning_rate": 1.7338182180953889e-06, "loss": 0.2586, "step": 3420 }, { "epoch": 2.2209962680512736, "grad_norm": 0.9740477800369263, "learning_rate": 1.7284019504662393e-06, "loss": 0.2784, "step": 3422 }, { "epoch": 2.2222943371734543, "grad_norm": 0.7991481423377991, "learning_rate": 1.7229923876774441e-06, "loss": 0.2768, "step": 3424 }, { "epoch": 2.2235924062956354, "grad_norm": 0.7909055948257446, "learning_rate": 1.7175895408154037e-06, "loss": 0.2938, "step": 3426 }, { "epoch": 2.224890475417816, "grad_norm": 0.9303129315376282, "learning_rate": 1.7121934209527619e-06, "loss": 0.2542, "step": 3428 }, { "epoch": 2.2261885445399967, "grad_norm": 0.83087557554245, "learning_rate": 1.7068040391483676e-06, "loss": 0.2796, "step": 3430 }, { "epoch": 2.2274866136621774, "grad_norm": 0.8012251257896423, "learning_rate": 1.7014214064472646e-06, "loss": 0.2713, "step": 3432 }, { "epoch": 2.228784682784358, "grad_norm": 0.7880580425262451, "learning_rate": 1.696045533880668e-06, "loss": 0.2755, "step": 3434 }, { "epoch": 2.230082751906539, "grad_norm": 0.7117996215820312, "learning_rate": 1.6906764324659346e-06, "loss": 0.2549, "step": 3436 }, { "epoch": 2.23138082102872, "grad_norm": 0.6848440170288086, "learning_rate": 1.6853141132065465e-06, "loss": 0.2603, "step": 3438 }, { "epoch": 2.2326788901509005, "grad_norm": 0.9833489656448364, "learning_rate": 1.6799585870920827e-06, "loss": 0.2928, "step": 3440 }, { "epoch": 2.233976959273081, "grad_norm": 0.9094142317771912, "learning_rate": 1.6746098650982072e-06, "loss": 0.2714, "step": 3442 }, { "epoch": 2.235275028395262, "grad_norm": 0.8841666579246521, "learning_rate": 1.6692679581866334e-06, "loss": 0.2838, "step": 3444 }, { "epoch": 2.236573097517443, "grad_norm": 0.8807998895645142, "learning_rate": 1.66393287730511e-06, "loss": 0.2668, "step": 3446 }, { "epoch": 2.2378711666396236, "grad_norm": 0.7663515210151672, "learning_rate": 1.658604633387395e-06, "loss": 0.2676, "step": 3448 }, { "epoch": 2.2391692357618043, "grad_norm": 0.7444564700126648, "learning_rate": 1.6532832373532388e-06, "loss": 0.2746, "step": 3450 }, { "epoch": 2.240467304883985, "grad_norm": 0.7776057124137878, "learning_rate": 1.6479687001083532e-06, "loss": 0.2641, "step": 3452 }, { "epoch": 2.2417653740061656, "grad_norm": 0.7325299382209778, "learning_rate": 1.642661032544396e-06, "loss": 0.2614, "step": 3454 }, { "epoch": 2.2430634431283467, "grad_norm": 0.6522035002708435, "learning_rate": 1.6373602455389443e-06, "loss": 0.2729, "step": 3456 }, { "epoch": 2.2443615122505274, "grad_norm": 0.7399782538414001, "learning_rate": 1.632066349955474e-06, "loss": 0.2504, "step": 3458 }, { "epoch": 2.245659581372708, "grad_norm": 0.6232579350471497, "learning_rate": 1.6267793566433426e-06, "loss": 0.2755, "step": 3460 }, { "epoch": 2.2469576504948887, "grad_norm": 0.7867412567138672, "learning_rate": 1.6214992764377563e-06, "loss": 0.2825, "step": 3462 }, { "epoch": 2.2482557196170694, "grad_norm": 0.8099661469459534, "learning_rate": 1.6162261201597557e-06, "loss": 0.2612, "step": 3464 }, { "epoch": 2.2495537887392505, "grad_norm": 1.3284928798675537, "learning_rate": 1.6109598986161895e-06, "loss": 0.2777, "step": 3466 }, { "epoch": 2.250851857861431, "grad_norm": 0.6383387446403503, "learning_rate": 1.605700622599699e-06, "loss": 0.2734, "step": 3468 }, { "epoch": 2.252149926983612, "grad_norm": 0.6762887239456177, "learning_rate": 1.600448302888688e-06, "loss": 0.2698, "step": 3470 }, { "epoch": 2.2534479961057925, "grad_norm": 0.9691365361213684, "learning_rate": 1.5952029502473032e-06, "loss": 0.2492, "step": 3472 }, { "epoch": 2.254746065227973, "grad_norm": 0.7580543756484985, "learning_rate": 1.5899645754254144e-06, "loss": 0.2981, "step": 3474 }, { "epoch": 2.2560441343501543, "grad_norm": 0.9878669381141663, "learning_rate": 1.5847331891585888e-06, "loss": 0.2533, "step": 3476 }, { "epoch": 2.257342203472335, "grad_norm": 1.5268748998641968, "learning_rate": 1.5795088021680787e-06, "loss": 0.2961, "step": 3478 }, { "epoch": 2.2586402725945156, "grad_norm": 0.6784735918045044, "learning_rate": 1.5742914251607794e-06, "loss": 0.2729, "step": 3480 }, { "epoch": 2.2599383417166963, "grad_norm": 0.823279857635498, "learning_rate": 1.5690810688292318e-06, "loss": 0.2825, "step": 3482 }, { "epoch": 2.261236410838877, "grad_norm": 0.9951549172401428, "learning_rate": 1.5638777438515817e-06, "loss": 0.2991, "step": 3484 }, { "epoch": 2.262534479961058, "grad_norm": 0.7729899883270264, "learning_rate": 1.5586814608915673e-06, "loss": 0.2566, "step": 3486 }, { "epoch": 2.2638325490832387, "grad_norm": 0.8917664885520935, "learning_rate": 1.553492230598493e-06, "loss": 0.2669, "step": 3488 }, { "epoch": 2.2651306182054194, "grad_norm": 1.0834978818893433, "learning_rate": 1.548310063607213e-06, "loss": 0.2728, "step": 3490 }, { "epoch": 2.2664286873276, "grad_norm": 0.8776866793632507, "learning_rate": 1.5431349705381021e-06, "loss": 0.2554, "step": 3492 }, { "epoch": 2.2677267564497807, "grad_norm": 0.9426636695861816, "learning_rate": 1.5379669619970393e-06, "loss": 0.3027, "step": 3494 }, { "epoch": 2.269024825571962, "grad_norm": 0.9937571287155151, "learning_rate": 1.5328060485753853e-06, "loss": 0.2696, "step": 3496 }, { "epoch": 2.2703228946941425, "grad_norm": 0.6763808131217957, "learning_rate": 1.5276522408499567e-06, "loss": 0.2754, "step": 3498 }, { "epoch": 2.271620963816323, "grad_norm": 0.9222772121429443, "learning_rate": 1.5225055493830132e-06, "loss": 0.2869, "step": 3500 }, { "epoch": 2.271620963816323, "eval_loss": 0.27621185779571533, "eval_runtime": 397.1778, "eval_samples_per_second": 26.134, "eval_steps_per_second": 3.268, "step": 3500 }, { "epoch": 2.272919032938504, "grad_norm": 0.7856032252311707, "learning_rate": 1.5173659847222266e-06, "loss": 0.246, "step": 3502 }, { "epoch": 2.2742171020606845, "grad_norm": 0.864673376083374, "learning_rate": 1.5122335574006624e-06, "loss": 0.2711, "step": 3504 }, { "epoch": 2.2755151711828656, "grad_norm": 0.8020566701889038, "learning_rate": 1.5071082779367591e-06, "loss": 0.2614, "step": 3506 }, { "epoch": 2.2768132403050463, "grad_norm": 0.7000688314437866, "learning_rate": 1.5019901568343097e-06, "loss": 0.2613, "step": 3508 }, { "epoch": 2.278111309427227, "grad_norm": 1.1961288452148438, "learning_rate": 1.4968792045824338e-06, "loss": 0.2558, "step": 3510 }, { "epoch": 2.2794093785494076, "grad_norm": 1.1285680532455444, "learning_rate": 1.4917754316555594e-06, "loss": 0.2652, "step": 3512 }, { "epoch": 2.2807074476715883, "grad_norm": 0.8875559568405151, "learning_rate": 1.4866788485133988e-06, "loss": 0.2397, "step": 3514 }, { "epoch": 2.2820055167937694, "grad_norm": 0.761859655380249, "learning_rate": 1.4815894656009361e-06, "loss": 0.3022, "step": 3516 }, { "epoch": 2.28330358591595, "grad_norm": 0.9377442002296448, "learning_rate": 1.4765072933483949e-06, "loss": 0.2533, "step": 3518 }, { "epoch": 2.2846016550381307, "grad_norm": 0.8231542110443115, "learning_rate": 1.4714323421712163e-06, "loss": 0.2559, "step": 3520 }, { "epoch": 2.2858997241603114, "grad_norm": 0.9686898589134216, "learning_rate": 1.4663646224700534e-06, "loss": 0.2718, "step": 3522 }, { "epoch": 2.287197793282492, "grad_norm": 1.1106756925582886, "learning_rate": 1.4613041446307286e-06, "loss": 0.3095, "step": 3524 }, { "epoch": 2.288495862404673, "grad_norm": 0.903839111328125, "learning_rate": 1.4562509190242335e-06, "loss": 0.2847, "step": 3526 }, { "epoch": 2.289793931526854, "grad_norm": 0.8948979377746582, "learning_rate": 1.4512049560066837e-06, "loss": 0.2474, "step": 3528 }, { "epoch": 2.2910920006490345, "grad_norm": 0.8135784864425659, "learning_rate": 1.4461662659193231e-06, "loss": 0.2492, "step": 3530 }, { "epoch": 2.292390069771215, "grad_norm": 0.7863655686378479, "learning_rate": 1.441134859088482e-06, "loss": 0.2213, "step": 3532 }, { "epoch": 2.293688138893396, "grad_norm": 0.6662296056747437, "learning_rate": 1.4361107458255723e-06, "loss": 0.2913, "step": 3534 }, { "epoch": 2.294986208015577, "grad_norm": 0.918990433216095, "learning_rate": 1.431093936427047e-06, "loss": 0.2464, "step": 3536 }, { "epoch": 2.2962842771377576, "grad_norm": 0.8023175597190857, "learning_rate": 1.4260844411744024e-06, "loss": 0.2401, "step": 3538 }, { "epoch": 2.2975823462599383, "grad_norm": 0.7579688429832458, "learning_rate": 1.421082270334138e-06, "loss": 0.2774, "step": 3540 }, { "epoch": 2.298880415382119, "grad_norm": 0.7564871311187744, "learning_rate": 1.4160874341577447e-06, "loss": 0.2813, "step": 3542 }, { "epoch": 2.3001784845042996, "grad_norm": 0.7343440055847168, "learning_rate": 1.4110999428816808e-06, "loss": 0.2676, "step": 3544 }, { "epoch": 2.3014765536264807, "grad_norm": 0.8719520568847656, "learning_rate": 1.4061198067273519e-06, "loss": 0.2745, "step": 3546 }, { "epoch": 2.3027746227486614, "grad_norm": 0.9057812094688416, "learning_rate": 1.4011470359010936e-06, "loss": 0.2941, "step": 3548 }, { "epoch": 2.304072691870842, "grad_norm": 1.108269453048706, "learning_rate": 1.3961816405941415e-06, "loss": 0.286, "step": 3550 }, { "epoch": 2.3053707609930227, "grad_norm": 0.7684711813926697, "learning_rate": 1.3912236309826193e-06, "loss": 0.2542, "step": 3552 }, { "epoch": 2.3066688301152034, "grad_norm": 0.6782795786857605, "learning_rate": 1.3862730172275114e-06, "loss": 0.2659, "step": 3554 }, { "epoch": 2.3079668992373845, "grad_norm": 0.7142642140388489, "learning_rate": 1.3813298094746491e-06, "loss": 0.2356, "step": 3556 }, { "epoch": 2.309264968359565, "grad_norm": 0.8519942760467529, "learning_rate": 1.3763940178546836e-06, "loss": 0.2621, "step": 3558 }, { "epoch": 2.310563037481746, "grad_norm": 0.7501600980758667, "learning_rate": 1.3714656524830661e-06, "loss": 0.2612, "step": 3560 }, { "epoch": 2.3118611066039265, "grad_norm": 0.7509642243385315, "learning_rate": 1.3665447234600309e-06, "loss": 0.2781, "step": 3562 }, { "epoch": 2.313159175726107, "grad_norm": 0.9416132569313049, "learning_rate": 1.361631240870569e-06, "loss": 0.3125, "step": 3564 }, { "epoch": 2.3144572448482883, "grad_norm": 0.7632682919502258, "learning_rate": 1.3567252147844167e-06, "loss": 0.2533, "step": 3566 }, { "epoch": 2.315755313970469, "grad_norm": 0.7034697532653809, "learning_rate": 1.3518266552560195e-06, "loss": 0.2614, "step": 3568 }, { "epoch": 2.3170533830926496, "grad_norm": 1.1077580451965332, "learning_rate": 1.3469355723245303e-06, "loss": 0.297, "step": 3570 }, { "epoch": 2.3183514522148303, "grad_norm": 0.7333670258522034, "learning_rate": 1.3420519760137724e-06, "loss": 0.3037, "step": 3572 }, { "epoch": 2.319649521337011, "grad_norm": 1.1022008657455444, "learning_rate": 1.3371758763322335e-06, "loss": 0.3035, "step": 3574 }, { "epoch": 2.320947590459192, "grad_norm": 1.1581895351409912, "learning_rate": 1.3323072832730267e-06, "loss": 0.2775, "step": 3576 }, { "epoch": 2.3222456595813727, "grad_norm": 0.7750434279441833, "learning_rate": 1.327446206813892e-06, "loss": 0.2662, "step": 3578 }, { "epoch": 2.3235437287035534, "grad_norm": 1.1053221225738525, "learning_rate": 1.3225926569171572e-06, "loss": 0.246, "step": 3580 }, { "epoch": 2.324841797825734, "grad_norm": 0.8594192266464233, "learning_rate": 1.317746643529732e-06, "loss": 0.2774, "step": 3582 }, { "epoch": 2.3261398669479147, "grad_norm": 0.7061358094215393, "learning_rate": 1.3129081765830725e-06, "loss": 0.2718, "step": 3584 }, { "epoch": 2.327437936070096, "grad_norm": 0.8806636333465576, "learning_rate": 1.3080772659931728e-06, "loss": 0.2763, "step": 3586 }, { "epoch": 2.3287360051922765, "grad_norm": 0.8151506185531616, "learning_rate": 1.3032539216605456e-06, "loss": 0.2299, "step": 3588 }, { "epoch": 2.330034074314457, "grad_norm": 0.7236583232879639, "learning_rate": 1.29843815347019e-06, "loss": 0.2772, "step": 3590 }, { "epoch": 2.331332143436638, "grad_norm": 0.8179826140403748, "learning_rate": 1.2936299712915828e-06, "loss": 0.2672, "step": 3592 }, { "epoch": 2.3326302125588185, "grad_norm": 0.7915949821472168, "learning_rate": 1.2888293849786503e-06, "loss": 0.2682, "step": 3594 }, { "epoch": 2.3339282816809996, "grad_norm": 0.7968785762786865, "learning_rate": 1.2840364043697572e-06, "loss": 0.2512, "step": 3596 }, { "epoch": 2.3352263508031803, "grad_norm": 0.787616491317749, "learning_rate": 1.2792510392876777e-06, "loss": 0.2794, "step": 3598 }, { "epoch": 2.336524419925361, "grad_norm": 0.7420127987861633, "learning_rate": 1.2744732995395776e-06, "loss": 0.257, "step": 3600 }, { "epoch": 2.3378224890475416, "grad_norm": 0.7289307117462158, "learning_rate": 1.2697031949169952e-06, "loss": 0.2504, "step": 3602 }, { "epoch": 2.3391205581697223, "grad_norm": 1.2782809734344482, "learning_rate": 1.2649407351958264e-06, "loss": 0.3404, "step": 3604 }, { "epoch": 2.3404186272919034, "grad_norm": 0.9840972423553467, "learning_rate": 1.260185930136294e-06, "loss": 0.2635, "step": 3606 }, { "epoch": 2.341716696414084, "grad_norm": 0.8543412089347839, "learning_rate": 1.255438789482935e-06, "loss": 0.2662, "step": 3608 }, { "epoch": 2.3430147655362648, "grad_norm": 1.3372197151184082, "learning_rate": 1.2506993229645798e-06, "loss": 0.2771, "step": 3610 }, { "epoch": 2.3443128346584454, "grad_norm": 0.9460824131965637, "learning_rate": 1.245967540294329e-06, "loss": 0.2792, "step": 3612 }, { "epoch": 2.345610903780626, "grad_norm": 1.0383739471435547, "learning_rate": 1.2412434511695392e-06, "loss": 0.3055, "step": 3614 }, { "epoch": 2.346908972902807, "grad_norm": 0.7190493941307068, "learning_rate": 1.2365270652717986e-06, "loss": 0.2579, "step": 3616 }, { "epoch": 2.348207042024988, "grad_norm": 0.9406570792198181, "learning_rate": 1.2318183922669065e-06, "loss": 0.2913, "step": 3618 }, { "epoch": 2.3495051111471685, "grad_norm": 0.9685686230659485, "learning_rate": 1.2271174418048553e-06, "loss": 0.2819, "step": 3620 }, { "epoch": 2.350803180269349, "grad_norm": 0.9473106265068054, "learning_rate": 1.2224242235198163e-06, "loss": 0.2511, "step": 3622 }, { "epoch": 2.35210124939153, "grad_norm": 0.7247506976127625, "learning_rate": 1.2177387470301055e-06, "loss": 0.2551, "step": 3624 }, { "epoch": 2.353399318513711, "grad_norm": 0.8432331085205078, "learning_rate": 1.2130610219381811e-06, "loss": 0.2587, "step": 3626 }, { "epoch": 2.3546973876358916, "grad_norm": 0.910476803779602, "learning_rate": 1.2083910578306107e-06, "loss": 0.2527, "step": 3628 }, { "epoch": 2.3559954567580723, "grad_norm": 0.7169373631477356, "learning_rate": 1.2037288642780575e-06, "loss": 0.2771, "step": 3630 }, { "epoch": 2.357293525880253, "grad_norm": 0.8528012633323669, "learning_rate": 1.1990744508352604e-06, "loss": 0.2568, "step": 3632 }, { "epoch": 2.3585915950024336, "grad_norm": 0.7306681275367737, "learning_rate": 1.194427827041011e-06, "loss": 0.2774, "step": 3634 }, { "epoch": 2.3598896641246148, "grad_norm": 0.9561393857002258, "learning_rate": 1.1897890024181418e-06, "loss": 0.2523, "step": 3636 }, { "epoch": 2.3611877332467954, "grad_norm": 0.796177327632904, "learning_rate": 1.185157986473498e-06, "loss": 0.2668, "step": 3638 }, { "epoch": 2.362485802368976, "grad_norm": 0.9648365378379822, "learning_rate": 1.1805347886979219e-06, "loss": 0.235, "step": 3640 }, { "epoch": 2.3637838714911568, "grad_norm": 0.7857531905174255, "learning_rate": 1.175919418566232e-06, "loss": 0.2411, "step": 3642 }, { "epoch": 2.3650819406133374, "grad_norm": 0.723564088344574, "learning_rate": 1.1713118855372096e-06, "loss": 0.2738, "step": 3644 }, { "epoch": 2.3663800097355185, "grad_norm": 1.0970890522003174, "learning_rate": 1.16671219905357e-06, "loss": 0.3061, "step": 3646 }, { "epoch": 2.367678078857699, "grad_norm": 0.9193376898765564, "learning_rate": 1.1621203685419485e-06, "loss": 0.2437, "step": 3648 }, { "epoch": 2.36897614797988, "grad_norm": 0.8713465929031372, "learning_rate": 1.1575364034128817e-06, "loss": 0.2784, "step": 3650 }, { "epoch": 2.3702742171020605, "grad_norm": 1.1179596185684204, "learning_rate": 1.1529603130607837e-06, "loss": 0.2475, "step": 3652 }, { "epoch": 2.371572286224241, "grad_norm": 0.9668818712234497, "learning_rate": 1.1483921068639353e-06, "loss": 0.2812, "step": 3654 }, { "epoch": 2.3728703553464223, "grad_norm": 0.7467620968818665, "learning_rate": 1.1438317941844557e-06, "loss": 0.2609, "step": 3656 }, { "epoch": 2.374168424468603, "grad_norm": 1.1956732273101807, "learning_rate": 1.139279384368287e-06, "loss": 0.3126, "step": 3658 }, { "epoch": 2.3754664935907837, "grad_norm": 0.8619592785835266, "learning_rate": 1.1347348867451745e-06, "loss": 0.2551, "step": 3660 }, { "epoch": 2.3767645627129643, "grad_norm": 1.0478953123092651, "learning_rate": 1.1301983106286535e-06, "loss": 0.2886, "step": 3662 }, { "epoch": 2.378062631835145, "grad_norm": 0.7662360668182373, "learning_rate": 1.125669665316018e-06, "loss": 0.2658, "step": 3664 }, { "epoch": 2.379360700957326, "grad_norm": 1.110558271408081, "learning_rate": 1.1211489600883124e-06, "loss": 0.2806, "step": 3666 }, { "epoch": 2.3806587700795068, "grad_norm": 1.1200811862945557, "learning_rate": 1.1166362042103056e-06, "loss": 0.2729, "step": 3668 }, { "epoch": 2.3819568392016874, "grad_norm": 0.9004538059234619, "learning_rate": 1.112131406930481e-06, "loss": 0.2582, "step": 3670 }, { "epoch": 2.383254908323868, "grad_norm": 0.8330298066139221, "learning_rate": 1.1076345774810077e-06, "loss": 0.2603, "step": 3672 }, { "epoch": 2.3845529774460488, "grad_norm": 0.7739983797073364, "learning_rate": 1.1031457250777206e-06, "loss": 0.2602, "step": 3674 }, { "epoch": 2.38585104656823, "grad_norm": 0.7264171242713928, "learning_rate": 1.0986648589201153e-06, "loss": 0.268, "step": 3676 }, { "epoch": 2.3871491156904106, "grad_norm": 0.8838381767272949, "learning_rate": 1.0941919881913142e-06, "loss": 0.2493, "step": 3678 }, { "epoch": 2.388447184812591, "grad_norm": 1.031373143196106, "learning_rate": 1.0897271220580597e-06, "loss": 0.2839, "step": 3680 }, { "epoch": 2.389745253934772, "grad_norm": 0.8070496916770935, "learning_rate": 1.0852702696706807e-06, "loss": 0.2649, "step": 3682 }, { "epoch": 2.3910433230569526, "grad_norm": 0.9026498198509216, "learning_rate": 1.0808214401630913e-06, "loss": 0.2702, "step": 3684 }, { "epoch": 2.3923413921791337, "grad_norm": 0.7816824913024902, "learning_rate": 1.0763806426527584e-06, "loss": 0.276, "step": 3686 }, { "epoch": 2.3936394613013143, "grad_norm": 0.9101266264915466, "learning_rate": 1.0719478862406896e-06, "loss": 0.2347, "step": 3688 }, { "epoch": 2.394937530423495, "grad_norm": 0.7900228500366211, "learning_rate": 1.0675231800114116e-06, "loss": 0.2757, "step": 3690 }, { "epoch": 2.3962355995456757, "grad_norm": 0.7837464213371277, "learning_rate": 1.0631065330329575e-06, "loss": 0.243, "step": 3692 }, { "epoch": 2.3975336686678563, "grad_norm": 0.8877315521240234, "learning_rate": 1.0586979543568388e-06, "loss": 0.2634, "step": 3694 }, { "epoch": 2.3988317377900374, "grad_norm": 1.4940063953399658, "learning_rate": 1.0542974530180327e-06, "loss": 0.2619, "step": 3696 }, { "epoch": 2.400129806912218, "grad_norm": 1.229384422302246, "learning_rate": 1.049905038034964e-06, "loss": 0.2835, "step": 3698 }, { "epoch": 2.401427876034399, "grad_norm": 0.84694504737854, "learning_rate": 1.0455207184094834e-06, "loss": 0.2643, "step": 3700 }, { "epoch": 2.4027259451565794, "grad_norm": 0.8757997751235962, "learning_rate": 1.0411445031268553e-06, "loss": 0.2622, "step": 3702 }, { "epoch": 2.40402401427876, "grad_norm": 1.2360705137252808, "learning_rate": 1.0367764011557291e-06, "loss": 0.2426, "step": 3704 }, { "epoch": 2.4053220834009412, "grad_norm": 0.9006813168525696, "learning_rate": 1.0324164214481302e-06, "loss": 0.2626, "step": 3706 }, { "epoch": 2.406620152523122, "grad_norm": 0.8661423921585083, "learning_rate": 1.0280645729394368e-06, "loss": 0.2372, "step": 3708 }, { "epoch": 2.4079182216453026, "grad_norm": 0.7156113386154175, "learning_rate": 1.0237208645483648e-06, "loss": 0.2353, "step": 3710 }, { "epoch": 2.4092162907674832, "grad_norm": 0.8351601362228394, "learning_rate": 1.0193853051769465e-06, "loss": 0.262, "step": 3712 }, { "epoch": 2.410514359889664, "grad_norm": 0.883045494556427, "learning_rate": 1.0150579037105123e-06, "loss": 0.2676, "step": 3714 }, { "epoch": 2.411812429011845, "grad_norm": 0.7174056172370911, "learning_rate": 1.0107386690176751e-06, "loss": 0.2692, "step": 3716 }, { "epoch": 2.4131104981340257, "grad_norm": 0.7943129539489746, "learning_rate": 1.00642760995031e-06, "loss": 0.2575, "step": 3718 }, { "epoch": 2.4144085672562063, "grad_norm": 0.741243839263916, "learning_rate": 1.0021247353435408e-06, "loss": 0.2671, "step": 3720 }, { "epoch": 2.415706636378387, "grad_norm": 0.7853082418441772, "learning_rate": 9.978300540157099e-07, "loss": 0.267, "step": 3722 }, { "epoch": 2.4170047055005677, "grad_norm": 1.0844582319259644, "learning_rate": 9.935435747683758e-07, "loss": 0.2815, "step": 3724 }, { "epoch": 2.418302774622749, "grad_norm": 1.2301080226898193, "learning_rate": 9.892653063862834e-07, "loss": 0.2681, "step": 3726 }, { "epoch": 2.4196008437449295, "grad_norm": 1.3361799716949463, "learning_rate": 9.84995257637355e-07, "loss": 0.2892, "step": 3728 }, { "epoch": 2.42089891286711, "grad_norm": 1.0597312450408936, "learning_rate": 9.807334372726595e-07, "loss": 0.26, "step": 3730 }, { "epoch": 2.422196981989291, "grad_norm": 0.8728967308998108, "learning_rate": 9.764798540264103e-07, "loss": 0.263, "step": 3732 }, { "epoch": 2.4234950511114715, "grad_norm": 0.8494249582290649, "learning_rate": 9.722345166159342e-07, "loss": 0.2498, "step": 3734 }, { "epoch": 2.4247931202336526, "grad_norm": 0.7726752758026123, "learning_rate": 9.679974337416654e-07, "loss": 0.2638, "step": 3736 }, { "epoch": 2.4260911893558332, "grad_norm": 0.7557317018508911, "learning_rate": 9.637686140871121e-07, "loss": 0.2799, "step": 3738 }, { "epoch": 2.427389258478014, "grad_norm": 0.97322678565979, "learning_rate": 9.595480663188528e-07, "loss": 0.2682, "step": 3740 }, { "epoch": 2.4286873276001946, "grad_norm": 0.8444679379463196, "learning_rate": 9.55335799086517e-07, "loss": 0.2902, "step": 3742 }, { "epoch": 2.4299853967223752, "grad_norm": 1.0076559782028198, "learning_rate": 9.511318210227577e-07, "loss": 0.2573, "step": 3744 }, { "epoch": 2.4312834658445563, "grad_norm": 0.7156585454940796, "learning_rate": 9.469361407432431e-07, "loss": 0.2505, "step": 3746 }, { "epoch": 2.432581534966737, "grad_norm": 0.8687481880187988, "learning_rate": 9.427487668466345e-07, "loss": 0.2638, "step": 3748 }, { "epoch": 2.4338796040889177, "grad_norm": 1.1978042125701904, "learning_rate": 9.385697079145734e-07, "loss": 0.2814, "step": 3750 }, { "epoch": 2.4351776732110983, "grad_norm": 0.8107622861862183, "learning_rate": 9.34398972511656e-07, "loss": 0.2353, "step": 3752 }, { "epoch": 2.436475742333279, "grad_norm": 0.9831713438034058, "learning_rate": 9.302365691854231e-07, "loss": 0.27, "step": 3754 }, { "epoch": 2.43777381145546, "grad_norm": 0.7902975082397461, "learning_rate": 9.26082506466337e-07, "loss": 0.2902, "step": 3756 }, { "epoch": 2.439071880577641, "grad_norm": 0.760802149772644, "learning_rate": 9.21936792867772e-07, "loss": 0.2643, "step": 3758 }, { "epoch": 2.4403699496998215, "grad_norm": 1.1900564432144165, "learning_rate": 9.177994368859866e-07, "loss": 0.2724, "step": 3760 }, { "epoch": 2.441668018822002, "grad_norm": 0.7975614070892334, "learning_rate": 9.136704470001101e-07, "loss": 0.2639, "step": 3762 }, { "epoch": 2.442966087944183, "grad_norm": 0.8505778312683105, "learning_rate": 9.095498316721324e-07, "loss": 0.2385, "step": 3764 }, { "epoch": 2.444264157066364, "grad_norm": 0.8243236541748047, "learning_rate": 9.054375993468745e-07, "loss": 0.2741, "step": 3766 }, { "epoch": 2.4455622261885446, "grad_norm": 1.0620031356811523, "learning_rate": 9.013337584519827e-07, "loss": 0.2733, "step": 3768 }, { "epoch": 2.4468602953107252, "grad_norm": 0.663241446018219, "learning_rate": 8.972383173978987e-07, "loss": 0.2699, "step": 3770 }, { "epoch": 2.448158364432906, "grad_norm": 0.7733364701271057, "learning_rate": 8.931512845778568e-07, "loss": 0.2653, "step": 3772 }, { "epoch": 2.4494564335550866, "grad_norm": 0.7479226589202881, "learning_rate": 8.890726683678541e-07, "loss": 0.2577, "step": 3774 }, { "epoch": 2.4507545026772677, "grad_norm": 0.9609542489051819, "learning_rate": 8.850024771266441e-07, "loss": 0.2745, "step": 3776 }, { "epoch": 2.4520525717994484, "grad_norm": 1.2257568836212158, "learning_rate": 8.809407191957054e-07, "loss": 0.2575, "step": 3778 }, { "epoch": 2.453350640921629, "grad_norm": 1.09929358959198, "learning_rate": 8.768874028992431e-07, "loss": 0.2881, "step": 3780 }, { "epoch": 2.4546487100438097, "grad_norm": 1.0986305475234985, "learning_rate": 8.728425365441556e-07, "loss": 0.2766, "step": 3782 }, { "epoch": 2.4559467791659904, "grad_norm": 1.0011721849441528, "learning_rate": 8.688061284200266e-07, "loss": 0.2575, "step": 3784 }, { "epoch": 2.4572448482881715, "grad_norm": 0.8661705255508423, "learning_rate": 8.647781867991034e-07, "loss": 0.2842, "step": 3786 }, { "epoch": 2.458542917410352, "grad_norm": 1.1256672143936157, "learning_rate": 8.607587199362826e-07, "loss": 0.2697, "step": 3788 }, { "epoch": 2.459840986532533, "grad_norm": 0.9281749725341797, "learning_rate": 8.567477360690962e-07, "loss": 0.2469, "step": 3790 }, { "epoch": 2.4611390556547135, "grad_norm": 0.8208118677139282, "learning_rate": 8.527452434176858e-07, "loss": 0.272, "step": 3792 }, { "epoch": 2.462437124776894, "grad_norm": 0.8958832025527954, "learning_rate": 8.487512501847933e-07, "loss": 0.2427, "step": 3794 }, { "epoch": 2.4637351938990752, "grad_norm": 0.8788226246833801, "learning_rate": 8.447657645557417e-07, "loss": 0.2595, "step": 3796 }, { "epoch": 2.465033263021256, "grad_norm": 0.9506201148033142, "learning_rate": 8.407887946984195e-07, "loss": 0.2898, "step": 3798 }, { "epoch": 2.4663313321434366, "grad_norm": 0.6115549206733704, "learning_rate": 8.368203487632615e-07, "loss": 0.2376, "step": 3800 }, { "epoch": 2.4676294012656173, "grad_norm": 0.8967217206954956, "learning_rate": 8.328604348832337e-07, "loss": 0.2668, "step": 3802 }, { "epoch": 2.468927470387798, "grad_norm": 0.7681214809417725, "learning_rate": 8.28909061173817e-07, "loss": 0.2482, "step": 3804 }, { "epoch": 2.470225539509979, "grad_norm": 0.7351979613304138, "learning_rate": 8.24966235732988e-07, "loss": 0.2599, "step": 3806 }, { "epoch": 2.4715236086321597, "grad_norm": 0.9117375016212463, "learning_rate": 8.210319666412087e-07, "loss": 0.2625, "step": 3808 }, { "epoch": 2.4728216777543404, "grad_norm": 0.8857051730155945, "learning_rate": 8.171062619614017e-07, "loss": 0.284, "step": 3810 }, { "epoch": 2.474119746876521, "grad_norm": 0.9182189106941223, "learning_rate": 8.131891297389388e-07, "loss": 0.2502, "step": 3812 }, { "epoch": 2.4754178159987017, "grad_norm": 0.8733800053596497, "learning_rate": 8.092805780016228e-07, "loss": 0.2536, "step": 3814 }, { "epoch": 2.476715885120883, "grad_norm": 0.8436048626899719, "learning_rate": 8.053806147596743e-07, "loss": 0.2729, "step": 3816 }, { "epoch": 2.4780139542430635, "grad_norm": 0.8773142099380493, "learning_rate": 8.014892480057096e-07, "loss": 0.2708, "step": 3818 }, { "epoch": 2.479312023365244, "grad_norm": 0.7254986763000488, "learning_rate": 7.976064857147281e-07, "loss": 0.2531, "step": 3820 }, { "epoch": 2.480610092487425, "grad_norm": 0.9052957892417908, "learning_rate": 7.937323358440935e-07, "loss": 0.2864, "step": 3822 }, { "epoch": 2.4819081616096055, "grad_norm": 0.796226441860199, "learning_rate": 7.898668063335252e-07, "loss": 0.2681, "step": 3824 }, { "epoch": 2.4832062307317866, "grad_norm": 0.739266037940979, "learning_rate": 7.860099051050679e-07, "loss": 0.2678, "step": 3826 }, { "epoch": 2.4845042998539673, "grad_norm": 0.8353003263473511, "learning_rate": 7.821616400630866e-07, "loss": 0.3012, "step": 3828 }, { "epoch": 2.485802368976148, "grad_norm": 0.8049766421318054, "learning_rate": 7.783220190942514e-07, "loss": 0.2668, "step": 3830 }, { "epoch": 2.4871004380983286, "grad_norm": 0.8848294615745544, "learning_rate": 7.744910500675106e-07, "loss": 0.246, "step": 3832 }, { "epoch": 2.4883985072205093, "grad_norm": 0.9075513482093811, "learning_rate": 7.706687408340862e-07, "loss": 0.2694, "step": 3834 }, { "epoch": 2.4896965763426904, "grad_norm": 0.8864970207214355, "learning_rate": 7.668550992274476e-07, "loss": 0.2577, "step": 3836 }, { "epoch": 2.490994645464871, "grad_norm": 1.0597831010818481, "learning_rate": 7.630501330633072e-07, "loss": 0.2659, "step": 3838 }, { "epoch": 2.4922927145870517, "grad_norm": 1.1199214458465576, "learning_rate": 7.59253850139593e-07, "loss": 0.2711, "step": 3840 }, { "epoch": 2.4935907837092324, "grad_norm": 0.8803431987762451, "learning_rate": 7.554662582364392e-07, "loss": 0.2809, "step": 3842 }, { "epoch": 2.494888852831413, "grad_norm": 0.8627513647079468, "learning_rate": 7.516873651161661e-07, "loss": 0.2686, "step": 3844 }, { "epoch": 2.496186921953594, "grad_norm": 0.7798545360565186, "learning_rate": 7.47917178523272e-07, "loss": 0.2636, "step": 3846 }, { "epoch": 2.497484991075775, "grad_norm": 1.1237711906433105, "learning_rate": 7.441557061844074e-07, "loss": 0.2663, "step": 3848 }, { "epoch": 2.4987830601979555, "grad_norm": 0.8979664444923401, "learning_rate": 7.404029558083653e-07, "loss": 0.2482, "step": 3850 }, { "epoch": 2.500081129320136, "grad_norm": 0.9474239349365234, "learning_rate": 7.366589350860631e-07, "loss": 0.2763, "step": 3852 }, { "epoch": 2.501379198442317, "grad_norm": 0.8459784984588623, "learning_rate": 7.329236516905274e-07, "loss": 0.2642, "step": 3854 }, { "epoch": 2.502677267564498, "grad_norm": 0.8267672061920166, "learning_rate": 7.291971132768816e-07, "loss": 0.2674, "step": 3856 }, { "epoch": 2.5039753366866786, "grad_norm": 0.9268800616264343, "learning_rate": 7.254793274823235e-07, "loss": 0.2393, "step": 3858 }, { "epoch": 2.5052734058088593, "grad_norm": 0.7126993536949158, "learning_rate": 7.217703019261135e-07, "loss": 0.2663, "step": 3860 }, { "epoch": 2.50657147493104, "grad_norm": 0.6624277830123901, "learning_rate": 7.180700442095595e-07, "loss": 0.2515, "step": 3862 }, { "epoch": 2.5078695440532206, "grad_norm": 1.3616409301757812, "learning_rate": 7.143785619160026e-07, "loss": 0.3, "step": 3864 }, { "epoch": 2.5091676131754017, "grad_norm": 0.940330445766449, "learning_rate": 7.106958626107957e-07, "loss": 0.2779, "step": 3866 }, { "epoch": 2.5104656822975824, "grad_norm": 0.8096235394477844, "learning_rate": 7.070219538412936e-07, "loss": 0.2665, "step": 3868 }, { "epoch": 2.511763751419763, "grad_norm": 0.7483378052711487, "learning_rate": 7.033568431368359e-07, "loss": 0.2509, "step": 3870 }, { "epoch": 2.5130618205419437, "grad_norm": 0.8158838748931885, "learning_rate": 6.997005380087301e-07, "loss": 0.2826, "step": 3872 }, { "epoch": 2.5143598896641244, "grad_norm": 0.8278487920761108, "learning_rate": 6.960530459502418e-07, "loss": 0.2939, "step": 3874 }, { "epoch": 2.5156579587863055, "grad_norm": 0.8360022306442261, "learning_rate": 6.924143744365669e-07, "loss": 0.2707, "step": 3876 }, { "epoch": 2.516956027908486, "grad_norm": 0.810482382774353, "learning_rate": 6.887845309248326e-07, "loss": 0.2871, "step": 3878 }, { "epoch": 2.518254097030667, "grad_norm": 0.8793767094612122, "learning_rate": 6.851635228540693e-07, "loss": 0.2662, "step": 3880 }, { "epoch": 2.5195521661528475, "grad_norm": 0.7943509817123413, "learning_rate": 6.81551357645201e-07, "loss": 0.267, "step": 3882 }, { "epoch": 2.520850235275028, "grad_norm": 0.6648900508880615, "learning_rate": 6.779480427010288e-07, "loss": 0.2726, "step": 3884 }, { "epoch": 2.5221483043972093, "grad_norm": 1.6382938623428345, "learning_rate": 6.743535854062183e-07, "loss": 0.302, "step": 3886 }, { "epoch": 2.52344637351939, "grad_norm": 0.7505214214324951, "learning_rate": 6.707679931272787e-07, "loss": 0.2444, "step": 3888 }, { "epoch": 2.5247444426415706, "grad_norm": 0.9377225041389465, "learning_rate": 6.671912732125535e-07, "loss": 0.2746, "step": 3890 }, { "epoch": 2.5260425117637513, "grad_norm": 0.9834662675857544, "learning_rate": 6.63623432992202e-07, "loss": 0.2557, "step": 3892 }, { "epoch": 2.527340580885932, "grad_norm": 0.6412943005561829, "learning_rate": 6.600644797781847e-07, "loss": 0.2518, "step": 3894 }, { "epoch": 2.528638650008113, "grad_norm": 0.8995785117149353, "learning_rate": 6.565144208642521e-07, "loss": 0.2638, "step": 3896 }, { "epoch": 2.5299367191302937, "grad_norm": 0.8700165748596191, "learning_rate": 6.529732635259234e-07, "loss": 0.2358, "step": 3898 }, { "epoch": 2.5312347882524744, "grad_norm": 0.7336729168891907, "learning_rate": 6.494410150204766e-07, "loss": 0.2363, "step": 3900 }, { "epoch": 2.532532857374655, "grad_norm": 0.7733325958251953, "learning_rate": 6.459176825869296e-07, "loss": 0.2531, "step": 3902 }, { "epoch": 2.5338309264968357, "grad_norm": 0.8309046030044556, "learning_rate": 6.424032734460311e-07, "loss": 0.2485, "step": 3904 }, { "epoch": 2.535128995619017, "grad_norm": 1.298288106918335, "learning_rate": 6.388977948002406e-07, "loss": 0.2615, "step": 3906 }, { "epoch": 2.5364270647411975, "grad_norm": 0.7044572234153748, "learning_rate": 6.354012538337145e-07, "loss": 0.2508, "step": 3908 }, { "epoch": 2.537725133863378, "grad_norm": 1.0138155221939087, "learning_rate": 6.31913657712292e-07, "loss": 0.2568, "step": 3910 }, { "epoch": 2.539023202985559, "grad_norm": 1.071236252784729, "learning_rate": 6.284350135834838e-07, "loss": 0.2932, "step": 3912 }, { "epoch": 2.5403212721077395, "grad_norm": 0.7313894629478455, "learning_rate": 6.249653285764529e-07, "loss": 0.2524, "step": 3914 }, { "epoch": 2.5416193412299206, "grad_norm": 0.9270567297935486, "learning_rate": 6.215046098019967e-07, "loss": 0.2784, "step": 3916 }, { "epoch": 2.5429174103521013, "grad_norm": 1.2433044910430908, "learning_rate": 6.180528643525446e-07, "loss": 0.2705, "step": 3918 }, { "epoch": 2.544215479474282, "grad_norm": 0.8145274519920349, "learning_rate": 6.146100993021308e-07, "loss": 0.2837, "step": 3920 }, { "epoch": 2.5455135485964626, "grad_norm": 1.0433731079101562, "learning_rate": 6.111763217063893e-07, "loss": 0.2846, "step": 3922 }, { "epoch": 2.5468116177186433, "grad_norm": 0.7822602987289429, "learning_rate": 6.077515386025284e-07, "loss": 0.2581, "step": 3924 }, { "epoch": 2.5481096868408244, "grad_norm": 0.8020090460777283, "learning_rate": 6.043357570093311e-07, "loss": 0.2656, "step": 3926 }, { "epoch": 2.549407755963005, "grad_norm": 0.8298524022102356, "learning_rate": 6.00928983927126e-07, "loss": 0.2548, "step": 3928 }, { "epoch": 2.5507058250851857, "grad_norm": 0.8167527914047241, "learning_rate": 5.975312263377853e-07, "loss": 0.2474, "step": 3930 }, { "epoch": 2.5520038942073664, "grad_norm": 0.8012738227844238, "learning_rate": 5.941424912046978e-07, "loss": 0.2428, "step": 3932 }, { "epoch": 2.553301963329547, "grad_norm": 0.73492830991745, "learning_rate": 5.907627854727688e-07, "loss": 0.2586, "step": 3934 }, { "epoch": 2.554600032451728, "grad_norm": 0.8508820533752441, "learning_rate": 5.873921160683943e-07, "loss": 0.2899, "step": 3936 }, { "epoch": 2.555898101573909, "grad_norm": 0.8964642882347107, "learning_rate": 5.84030489899452e-07, "loss": 0.2733, "step": 3938 }, { "epoch": 2.5571961706960895, "grad_norm": 1.2108659744262695, "learning_rate": 5.806779138552876e-07, "loss": 0.2743, "step": 3940 }, { "epoch": 2.55849423981827, "grad_norm": 0.8052947521209717, "learning_rate": 5.773343948066962e-07, "loss": 0.2698, "step": 3942 }, { "epoch": 2.559792308940451, "grad_norm": 1.39752995967865, "learning_rate": 5.739999396059165e-07, "loss": 0.2824, "step": 3944 }, { "epoch": 2.561090378062632, "grad_norm": 1.0916484594345093, "learning_rate": 5.706745550866072e-07, "loss": 0.29, "step": 3946 }, { "epoch": 2.5623884471848126, "grad_norm": 0.7653573155403137, "learning_rate": 5.673582480638395e-07, "loss": 0.2711, "step": 3948 }, { "epoch": 2.5636865163069933, "grad_norm": 0.7510469555854797, "learning_rate": 5.640510253340803e-07, "loss": 0.2688, "step": 3950 }, { "epoch": 2.564984585429174, "grad_norm": 0.9265086650848389, "learning_rate": 5.607528936751799e-07, "loss": 0.2781, "step": 3952 }, { "epoch": 2.5662826545513546, "grad_norm": 0.8457607626914978, "learning_rate": 5.574638598463578e-07, "loss": 0.2395, "step": 3954 }, { "epoch": 2.5675807236735357, "grad_norm": 0.7980404496192932, "learning_rate": 5.541839305881853e-07, "loss": 0.2816, "step": 3956 }, { "epoch": 2.5688787927957164, "grad_norm": 1.0162101984024048, "learning_rate": 5.509131126225787e-07, "loss": 0.2468, "step": 3958 }, { "epoch": 2.570176861917897, "grad_norm": 0.9657500982284546, "learning_rate": 5.476514126527771e-07, "loss": 0.2378, "step": 3960 }, { "epoch": 2.5714749310400777, "grad_norm": 0.6601144671440125, "learning_rate": 5.443988373633397e-07, "loss": 0.2674, "step": 3962 }, { "epoch": 2.5727730001622584, "grad_norm": 0.7905781865119934, "learning_rate": 5.411553934201169e-07, "loss": 0.2499, "step": 3964 }, { "epoch": 2.5740710692844395, "grad_norm": 0.8233057856559753, "learning_rate": 5.379210874702534e-07, "loss": 0.2633, "step": 3966 }, { "epoch": 2.57536913840662, "grad_norm": 1.04177725315094, "learning_rate": 5.346959261421597e-07, "loss": 0.284, "step": 3968 }, { "epoch": 2.576667207528801, "grad_norm": 0.8093863725662231, "learning_rate": 5.314799160455126e-07, "loss": 0.2669, "step": 3970 }, { "epoch": 2.5779652766509815, "grad_norm": 1.3493350744247437, "learning_rate": 5.282730637712252e-07, "loss": 0.2906, "step": 3972 }, { "epoch": 2.579263345773162, "grad_norm": 0.9302629828453064, "learning_rate": 5.250753758914506e-07, "loss": 0.2836, "step": 3974 }, { "epoch": 2.5805614148953433, "grad_norm": 0.8583089113235474, "learning_rate": 5.218868589595555e-07, "loss": 0.2809, "step": 3976 }, { "epoch": 2.581859484017524, "grad_norm": 0.8777998089790344, "learning_rate": 5.187075195101154e-07, "loss": 0.267, "step": 3978 }, { "epoch": 2.5831575531397046, "grad_norm": 0.8318948149681091, "learning_rate": 5.155373640588923e-07, "loss": 0.2617, "step": 3980 }, { "epoch": 2.5844556222618853, "grad_norm": 1.012461543083191, "learning_rate": 5.123763991028291e-07, "loss": 0.2791, "step": 3982 }, { "epoch": 2.585753691384066, "grad_norm": 0.662935197353363, "learning_rate": 5.09224631120036e-07, "loss": 0.273, "step": 3984 }, { "epoch": 2.587051760506247, "grad_norm": 0.8553810715675354, "learning_rate": 5.060820665697719e-07, "loss": 0.2847, "step": 3986 }, { "epoch": 2.5883498296284277, "grad_norm": 0.9301263093948364, "learning_rate": 5.029487118924342e-07, "loss": 0.2745, "step": 3988 }, { "epoch": 2.5896478987506084, "grad_norm": 0.7946049571037292, "learning_rate": 4.998245735095459e-07, "loss": 0.2855, "step": 3990 }, { "epoch": 2.590945967872789, "grad_norm": 0.8933957815170288, "learning_rate": 4.967096578237435e-07, "loss": 0.2507, "step": 3992 }, { "epoch": 2.5922440369949697, "grad_norm": 0.8850719332695007, "learning_rate": 4.936039712187602e-07, "loss": 0.2662, "step": 3994 }, { "epoch": 2.593542106117151, "grad_norm": 1.1123497486114502, "learning_rate": 4.905075200594167e-07, "loss": 0.3292, "step": 3996 }, { "epoch": 2.5948401752393315, "grad_norm": 0.8042243123054504, "learning_rate": 4.874203106916048e-07, "loss": 0.249, "step": 3998 }, { "epoch": 2.596138244361512, "grad_norm": 0.8544642925262451, "learning_rate": 4.843423494422783e-07, "loss": 0.2339, "step": 4000 }, { "epoch": 2.596138244361512, "eval_loss": 0.2755909264087677, "eval_runtime": 397.3707, "eval_samples_per_second": 26.122, "eval_steps_per_second": 3.266, "step": 4000 }, { "epoch": 2.597436313483693, "grad_norm": 0.9791273474693298, "learning_rate": 4.812736426194369e-07, "loss": 0.2525, "step": 4002 }, { "epoch": 2.5987343826058735, "grad_norm": 0.9136351346969604, "learning_rate": 4.782141965121129e-07, "loss": 0.2906, "step": 4004 }, { "epoch": 2.6000324517280546, "grad_norm": 0.7500069737434387, "learning_rate": 4.751640173903616e-07, "loss": 0.2733, "step": 4006 }, { "epoch": 2.6013305208502353, "grad_norm": 0.8558971881866455, "learning_rate": 4.721231115052438e-07, "loss": 0.2695, "step": 4008 }, { "epoch": 2.602628589972416, "grad_norm": 0.7947621941566467, "learning_rate": 4.690914850888195e-07, "loss": 0.2625, "step": 4010 }, { "epoch": 2.6039266590945966, "grad_norm": 0.8564437031745911, "learning_rate": 4.660691443541282e-07, "loss": 0.2469, "step": 4012 }, { "epoch": 2.6052247282167773, "grad_norm": 0.9768598079681396, "learning_rate": 4.6305609549518127e-07, "loss": 0.2646, "step": 4014 }, { "epoch": 2.6065227973389584, "grad_norm": 1.0655745267868042, "learning_rate": 4.600523446869437e-07, "loss": 0.2682, "step": 4016 }, { "epoch": 2.607820866461139, "grad_norm": 0.7878484129905701, "learning_rate": 4.570578980853302e-07, "loss": 0.3075, "step": 4018 }, { "epoch": 2.6091189355833198, "grad_norm": 0.7727589011192322, "learning_rate": 4.540727618271834e-07, "loss": 0.2345, "step": 4020 }, { "epoch": 2.6104170047055004, "grad_norm": 0.7853579521179199, "learning_rate": 4.510969420302669e-07, "loss": 0.2578, "step": 4022 }, { "epoch": 2.611715073827681, "grad_norm": 0.7429612278938293, "learning_rate": 4.4813044479325054e-07, "loss": 0.2895, "step": 4024 }, { "epoch": 2.613013142949862, "grad_norm": 0.9233436584472656, "learning_rate": 4.4517327619569784e-07, "loss": 0.327, "step": 4026 }, { "epoch": 2.614311212072043, "grad_norm": 0.8992781043052673, "learning_rate": 4.4222544229805543e-07, "loss": 0.2586, "step": 4028 }, { "epoch": 2.6156092811942235, "grad_norm": 0.853420078754425, "learning_rate": 4.3928694914163736e-07, "loss": 0.2482, "step": 4030 }, { "epoch": 2.616907350316404, "grad_norm": 0.8318231701850891, "learning_rate": 4.363578027486187e-07, "loss": 0.2839, "step": 4032 }, { "epoch": 2.618205419438585, "grad_norm": 0.9198233485221863, "learning_rate": 4.334380091220147e-07, "loss": 0.3108, "step": 4034 }, { "epoch": 2.619503488560766, "grad_norm": 0.8474653363227844, "learning_rate": 4.3052757424567547e-07, "loss": 0.2619, "step": 4036 }, { "epoch": 2.6208015576829466, "grad_norm": 0.9051506519317627, "learning_rate": 4.276265040842692e-07, "loss": 0.2912, "step": 4038 }, { "epoch": 2.6220996268051273, "grad_norm": 0.9365050792694092, "learning_rate": 4.2473480458327496e-07, "loss": 0.2813, "step": 4040 }, { "epoch": 2.623397695927308, "grad_norm": 1.2990094423294067, "learning_rate": 4.2185248166896564e-07, "loss": 0.3004, "step": 4042 }, { "epoch": 2.6246957650494886, "grad_norm": 1.0054802894592285, "learning_rate": 4.189795412483971e-07, "loss": 0.2797, "step": 4044 }, { "epoch": 2.6259938341716698, "grad_norm": 0.7491018176078796, "learning_rate": 4.16115989209398e-07, "loss": 0.241, "step": 4046 }, { "epoch": 2.6272919032938504, "grad_norm": 0.67268306016922, "learning_rate": 4.132618314205544e-07, "loss": 0.2594, "step": 4048 }, { "epoch": 2.628589972416031, "grad_norm": 0.9772464036941528, "learning_rate": 4.1041707373120354e-07, "loss": 0.2722, "step": 4050 }, { "epoch": 2.6298880415382118, "grad_norm": 1.0358226299285889, "learning_rate": 4.07581721971414e-07, "loss": 0.2884, "step": 4052 }, { "epoch": 2.6311861106603924, "grad_norm": 0.974997341632843, "learning_rate": 4.047557819519793e-07, "loss": 0.2789, "step": 4054 }, { "epoch": 2.6324841797825735, "grad_norm": 0.8397787809371948, "learning_rate": 4.0193925946440447e-07, "loss": 0.2729, "step": 4056 }, { "epoch": 2.633782248904754, "grad_norm": 0.8037217855453491, "learning_rate": 3.991321602808956e-07, "loss": 0.278, "step": 4058 }, { "epoch": 2.635080318026935, "grad_norm": 0.9947704076766968, "learning_rate": 3.963344901543437e-07, "loss": 0.2657, "step": 4060 }, { "epoch": 2.6363783871491155, "grad_norm": 1.3484654426574707, "learning_rate": 3.935462548183183e-07, "loss": 0.2844, "step": 4062 }, { "epoch": 2.637676456271296, "grad_norm": 0.9467418789863586, "learning_rate": 3.9076745998705034e-07, "loss": 0.2801, "step": 4064 }, { "epoch": 2.6389745253934773, "grad_norm": 0.9333597421646118, "learning_rate": 3.879981113554271e-07, "loss": 0.2833, "step": 4066 }, { "epoch": 2.640272594515658, "grad_norm": 0.7955147624015808, "learning_rate": 3.852382145989758e-07, "loss": 0.266, "step": 4068 }, { "epoch": 2.6415706636378387, "grad_norm": 0.8043116927146912, "learning_rate": 3.8248777537384763e-07, "loss": 0.2556, "step": 4070 }, { "epoch": 2.6428687327600193, "grad_norm": 0.7456397414207458, "learning_rate": 3.797467993168197e-07, "loss": 0.258, "step": 4072 }, { "epoch": 2.6441668018822, "grad_norm": 1.1371774673461914, "learning_rate": 3.7701529204526856e-07, "loss": 0.2667, "step": 4074 }, { "epoch": 2.645464871004381, "grad_norm": 1.0636049509048462, "learning_rate": 3.742932591571713e-07, "loss": 0.2703, "step": 4076 }, { "epoch": 2.6467629401265618, "grad_norm": 0.7904603481292725, "learning_rate": 3.7158070623108056e-07, "loss": 0.258, "step": 4078 }, { "epoch": 2.6480610092487424, "grad_norm": 0.8348304033279419, "learning_rate": 3.6887763882612835e-07, "loss": 0.2472, "step": 4080 }, { "epoch": 2.649359078370923, "grad_norm": 1.1288135051727295, "learning_rate": 3.661840624820029e-07, "loss": 0.2798, "step": 4082 }, { "epoch": 2.6506571474931038, "grad_norm": 0.7080700397491455, "learning_rate": 3.6349998271894116e-07, "loss": 0.2707, "step": 4084 }, { "epoch": 2.651955216615285, "grad_norm": 0.7422959804534912, "learning_rate": 3.60825405037718e-07, "loss": 0.2786, "step": 4086 }, { "epoch": 2.6532532857374655, "grad_norm": 0.9330370426177979, "learning_rate": 3.581603349196372e-07, "loss": 0.2628, "step": 4088 }, { "epoch": 2.654551354859646, "grad_norm": 0.7587841749191284, "learning_rate": 3.5550477782651473e-07, "loss": 0.2526, "step": 4090 }, { "epoch": 2.655849423981827, "grad_norm": 1.2852766513824463, "learning_rate": 3.528587392006716e-07, "loss": 0.3049, "step": 4092 }, { "epoch": 2.6571474931040076, "grad_norm": 1.0950839519500732, "learning_rate": 3.502222244649212e-07, "loss": 0.2487, "step": 4094 }, { "epoch": 2.6584455622261887, "grad_norm": 0.89836585521698, "learning_rate": 3.4759523902255845e-07, "loss": 0.2534, "step": 4096 }, { "epoch": 2.6597436313483693, "grad_norm": 0.8535431027412415, "learning_rate": 3.449777882573502e-07, "loss": 0.249, "step": 4098 }, { "epoch": 2.66104170047055, "grad_norm": 0.8421931266784668, "learning_rate": 3.423698775335216e-07, "loss": 0.2943, "step": 4100 }, { "epoch": 2.6623397695927307, "grad_norm": 0.7945136427879333, "learning_rate": 3.397715121957468e-07, "loss": 0.2527, "step": 4102 }, { "epoch": 2.6636378387149113, "grad_norm": 0.8033533692359924, "learning_rate": 3.3718269756913615e-07, "loss": 0.2809, "step": 4104 }, { "epoch": 2.6649359078370924, "grad_norm": 0.7833284735679626, "learning_rate": 3.346034389592295e-07, "loss": 0.2667, "step": 4106 }, { "epoch": 2.666233976959273, "grad_norm": 0.8529292941093445, "learning_rate": 3.3203374165198085e-07, "loss": 0.2702, "step": 4108 }, { "epoch": 2.667532046081454, "grad_norm": 0.823445200920105, "learning_rate": 3.294736109137486e-07, "loss": 0.2635, "step": 4110 }, { "epoch": 2.6688301152036344, "grad_norm": 0.7254524230957031, "learning_rate": 3.269230519912869e-07, "loss": 0.2634, "step": 4112 }, { "epoch": 2.670128184325815, "grad_norm": 0.779386043548584, "learning_rate": 3.243820701117306e-07, "loss": 0.2539, "step": 4114 }, { "epoch": 2.6714262534479962, "grad_norm": 0.6093817949295044, "learning_rate": 3.2185067048259245e-07, "loss": 0.2431, "step": 4116 }, { "epoch": 2.672724322570177, "grad_norm": 0.8845010995864868, "learning_rate": 3.193288582917403e-07, "loss": 0.2638, "step": 4118 }, { "epoch": 2.6740223916923576, "grad_norm": 0.8316082954406738, "learning_rate": 3.1681663870739955e-07, "loss": 0.2862, "step": 4120 }, { "epoch": 2.6753204608145382, "grad_norm": 0.7334588766098022, "learning_rate": 3.143140168781328e-07, "loss": 0.2704, "step": 4122 }, { "epoch": 2.676618529936719, "grad_norm": 1.2161976099014282, "learning_rate": 3.118209979328363e-07, "loss": 0.3115, "step": 4124 }, { "epoch": 2.6779165990589, "grad_norm": 0.8920618295669556, "learning_rate": 3.0933758698072023e-07, "loss": 0.2533, "step": 4126 }, { "epoch": 2.6792146681810807, "grad_norm": 0.9487091898918152, "learning_rate": 3.068637891113108e-07, "loss": 0.2419, "step": 4128 }, { "epoch": 2.6805127373032613, "grad_norm": 0.814480721950531, "learning_rate": 3.0439960939442794e-07, "loss": 0.2442, "step": 4130 }, { "epoch": 2.681810806425442, "grad_norm": 0.9836001396179199, "learning_rate": 3.0194505288018484e-07, "loss": 0.247, "step": 4132 }, { "epoch": 2.6831088755476227, "grad_norm": 0.8221400380134583, "learning_rate": 2.9950012459896704e-07, "loss": 0.2429, "step": 4134 }, { "epoch": 2.684406944669804, "grad_norm": 0.7867438793182373, "learning_rate": 2.97064829561432e-07, "loss": 0.2674, "step": 4136 }, { "epoch": 2.6857050137919845, "grad_norm": 1.0275496244430542, "learning_rate": 2.946391727584952e-07, "loss": 0.2538, "step": 4138 }, { "epoch": 2.687003082914165, "grad_norm": 0.7452818155288696, "learning_rate": 2.922231591613162e-07, "loss": 0.2396, "step": 4140 }, { "epoch": 2.688301152036346, "grad_norm": 0.944563627243042, "learning_rate": 2.8981679372129424e-07, "loss": 0.2951, "step": 4142 }, { "epoch": 2.6895992211585265, "grad_norm": 0.9158068895339966, "learning_rate": 2.874200813700534e-07, "loss": 0.2588, "step": 4144 }, { "epoch": 2.6908972902807076, "grad_norm": 0.8479757905006409, "learning_rate": 2.850330270194379e-07, "loss": 0.2753, "step": 4146 }, { "epoch": 2.6921953594028882, "grad_norm": 0.725744903087616, "learning_rate": 2.8265563556149625e-07, "loss": 0.2749, "step": 4148 }, { "epoch": 2.693493428525069, "grad_norm": 0.996493935585022, "learning_rate": 2.802879118684737e-07, "loss": 0.3023, "step": 4150 }, { "epoch": 2.6947914976472496, "grad_norm": 0.8474781513214111, "learning_rate": 2.779298607928033e-07, "loss": 0.2784, "step": 4152 }, { "epoch": 2.6960895667694302, "grad_norm": 1.4088493585586548, "learning_rate": 2.7558148716709475e-07, "loss": 0.3228, "step": 4154 }, { "epoch": 2.6973876358916113, "grad_norm": 0.7158757448196411, "learning_rate": 2.7324279580412507e-07, "loss": 0.2565, "step": 4156 }, { "epoch": 2.698685705013792, "grad_norm": 0.8544564843177795, "learning_rate": 2.7091379149682683e-07, "loss": 0.2855, "step": 4158 }, { "epoch": 2.6999837741359727, "grad_norm": 0.7177453637123108, "learning_rate": 2.685944790182815e-07, "loss": 0.2486, "step": 4160 }, { "epoch": 2.7012818432581533, "grad_norm": 1.0798395872116089, "learning_rate": 2.6628486312170687e-07, "loss": 0.2943, "step": 4162 }, { "epoch": 2.702579912380334, "grad_norm": 0.8036947250366211, "learning_rate": 2.6398494854045055e-07, "loss": 0.2827, "step": 4164 }, { "epoch": 2.703877981502515, "grad_norm": 0.834876298904419, "learning_rate": 2.6169473998797377e-07, "loss": 0.2664, "step": 4166 }, { "epoch": 2.705176050624696, "grad_norm": 0.7443352937698364, "learning_rate": 2.5941424215785216e-07, "loss": 0.2779, "step": 4168 }, { "epoch": 2.7064741197468765, "grad_norm": 0.9498503804206848, "learning_rate": 2.5714345972375486e-07, "loss": 0.2951, "step": 4170 }, { "epoch": 2.707772188869057, "grad_norm": 0.7738152742385864, "learning_rate": 2.548823973394449e-07, "loss": 0.2705, "step": 4172 }, { "epoch": 2.709070257991238, "grad_norm": 2.0925073623657227, "learning_rate": 2.526310596387588e-07, "loss": 0.3127, "step": 4174 }, { "epoch": 2.710368327113419, "grad_norm": 0.9136667847633362, "learning_rate": 2.5038945123560976e-07, "loss": 0.2405, "step": 4176 }, { "epoch": 2.7116663962355996, "grad_norm": 0.8882178068161011, "learning_rate": 2.4815757672396744e-07, "loss": 0.2901, "step": 4178 }, { "epoch": 2.7129644653577802, "grad_norm": 1.0070935487747192, "learning_rate": 2.459354406778547e-07, "loss": 0.2955, "step": 4180 }, { "epoch": 2.714262534479961, "grad_norm": 0.7236667275428772, "learning_rate": 2.4372304765133525e-07, "loss": 0.2737, "step": 4182 }, { "epoch": 2.7155606036021416, "grad_norm": 0.7948781847953796, "learning_rate": 2.4152040217850556e-07, "loss": 0.2925, "step": 4184 }, { "epoch": 2.7168586727243227, "grad_norm": 1.1693648099899292, "learning_rate": 2.393275087734864e-07, "loss": 0.3126, "step": 4186 }, { "epoch": 2.7181567418465034, "grad_norm": 1.0163978338241577, "learning_rate": 2.3714437193041174e-07, "loss": 0.3123, "step": 4188 }, { "epoch": 2.719454810968684, "grad_norm": 0.9018874168395996, "learning_rate": 2.3497099612341977e-07, "loss": 0.2726, "step": 4190 }, { "epoch": 2.7207528800908647, "grad_norm": 0.9755562543869019, "learning_rate": 2.328073858066454e-07, "loss": 0.2862, "step": 4192 }, { "epoch": 2.7220509492130454, "grad_norm": 0.8333150148391724, "learning_rate": 2.3065354541421003e-07, "loss": 0.2792, "step": 4194 }, { "epoch": 2.7233490183352265, "grad_norm": 0.8577569127082825, "learning_rate": 2.2850947936021163e-07, "loss": 0.2703, "step": 4196 }, { "epoch": 2.724647087457407, "grad_norm": 0.9458657503128052, "learning_rate": 2.263751920387175e-07, "loss": 0.2639, "step": 4198 }, { "epoch": 2.725945156579588, "grad_norm": 0.7938470840454102, "learning_rate": 2.242506878237538e-07, "loss": 0.2568, "step": 4200 }, { "epoch": 2.7272432257017685, "grad_norm": 0.9100823998451233, "learning_rate": 2.2213597106929608e-07, "loss": 0.2909, "step": 4202 }, { "epoch": 2.728541294823949, "grad_norm": 0.8512047529220581, "learning_rate": 2.2003104610926474e-07, "loss": 0.2691, "step": 4204 }, { "epoch": 2.7298393639461302, "grad_norm": 0.9035998582839966, "learning_rate": 2.1793591725750851e-07, "loss": 0.3188, "step": 4206 }, { "epoch": 2.731137433068311, "grad_norm": 0.9787468314170837, "learning_rate": 2.1585058880780328e-07, "loss": 0.2788, "step": 4208 }, { "epoch": 2.7324355021904916, "grad_norm": 0.9367960691452026, "learning_rate": 2.1377506503383716e-07, "loss": 0.2692, "step": 4210 }, { "epoch": 2.7337335713126723, "grad_norm": 0.9618814587593079, "learning_rate": 2.1170935018920758e-07, "loss": 0.277, "step": 4212 }, { "epoch": 2.735031640434853, "grad_norm": 0.961176335811615, "learning_rate": 2.0965344850740698e-07, "loss": 0.2933, "step": 4214 }, { "epoch": 2.736329709557034, "grad_norm": 0.7064293622970581, "learning_rate": 2.0760736420181726e-07, "loss": 0.2389, "step": 4216 }, { "epoch": 2.7376277786792147, "grad_norm": 0.8214148879051208, "learning_rate": 2.055711014657008e-07, "loss": 0.2922, "step": 4218 }, { "epoch": 2.7389258478013954, "grad_norm": 0.7367089986801147, "learning_rate": 2.0354466447219224e-07, "loss": 0.2578, "step": 4220 }, { "epoch": 2.740223916923576, "grad_norm": 1.016435146331787, "learning_rate": 2.015280573742867e-07, "loss": 0.2941, "step": 4222 }, { "epoch": 2.7415219860457567, "grad_norm": 1.1729692220687866, "learning_rate": 1.9952128430483718e-07, "loss": 0.2824, "step": 4224 }, { "epoch": 2.742820055167938, "grad_norm": 0.8945146203041077, "learning_rate": 1.9752434937654051e-07, "loss": 0.2417, "step": 4226 }, { "epoch": 2.7441181242901185, "grad_norm": 0.8259299993515015, "learning_rate": 1.9553725668193192e-07, "loss": 0.2707, "step": 4228 }, { "epoch": 2.745416193412299, "grad_norm": 0.7634604573249817, "learning_rate": 1.93560010293376e-07, "loss": 0.2651, "step": 4230 }, { "epoch": 2.74671426253448, "grad_norm": 0.7292312383651733, "learning_rate": 1.9159261426305698e-07, "loss": 0.2528, "step": 4232 }, { "epoch": 2.7480123316566605, "grad_norm": 0.7373026609420776, "learning_rate": 1.8963507262297398e-07, "loss": 0.2609, "step": 4234 }, { "epoch": 2.7493104007788416, "grad_norm": 0.8174372911453247, "learning_rate": 1.8768738938492903e-07, "loss": 0.2632, "step": 4236 }, { "epoch": 2.7506084699010223, "grad_norm": 0.8417197465896606, "learning_rate": 1.8574956854051973e-07, "loss": 0.2776, "step": 4238 }, { "epoch": 2.751906539023203, "grad_norm": 0.9213493466377258, "learning_rate": 1.8382161406113208e-07, "loss": 0.3046, "step": 4240 }, { "epoch": 2.7532046081453836, "grad_norm": 0.8253766298294067, "learning_rate": 1.8190352989793325e-07, "loss": 0.265, "step": 4242 }, { "epoch": 2.7545026772675643, "grad_norm": 0.6478874683380127, "learning_rate": 1.7999531998186049e-07, "loss": 0.2168, "step": 4244 }, { "epoch": 2.7558007463897454, "grad_norm": 0.7077654004096985, "learning_rate": 1.78096988223615e-07, "loss": 0.26, "step": 4246 }, { "epoch": 2.757098815511926, "grad_norm": 0.7686173319816589, "learning_rate": 1.762085385136536e-07, "loss": 0.2698, "step": 4248 }, { "epoch": 2.7583968846341067, "grad_norm": 0.7813746333122253, "learning_rate": 1.743299747221805e-07, "loss": 0.2896, "step": 4250 }, { "epoch": 2.7596949537562874, "grad_norm": 1.1637264490127563, "learning_rate": 1.7246130069914102e-07, "loss": 0.3085, "step": 4252 }, { "epoch": 2.760993022878468, "grad_norm": 0.8240634202957153, "learning_rate": 1.706025202742112e-07, "loss": 0.2632, "step": 4254 }, { "epoch": 2.762291092000649, "grad_norm": 1.096686840057373, "learning_rate": 1.6875363725679052e-07, "loss": 0.2745, "step": 4256 }, { "epoch": 2.76358916112283, "grad_norm": 0.7117223143577576, "learning_rate": 1.6691465543599462e-07, "loss": 0.2803, "step": 4258 }, { "epoch": 2.7648872302450105, "grad_norm": 0.7192531824111938, "learning_rate": 1.6508557858064988e-07, "loss": 0.2522, "step": 4260 }, { "epoch": 2.766185299367191, "grad_norm": 0.9329693913459778, "learning_rate": 1.632664104392806e-07, "loss": 0.282, "step": 4262 }, { "epoch": 2.767483368489372, "grad_norm": 0.8950532078742981, "learning_rate": 1.614571547401056e-07, "loss": 0.2727, "step": 4264 }, { "epoch": 2.768781437611553, "grad_norm": 0.8049120306968689, "learning_rate": 1.596578151910283e-07, "loss": 0.2674, "step": 4266 }, { "epoch": 2.7700795067337336, "grad_norm": 0.6812533736228943, "learning_rate": 1.5786839547963008e-07, "loss": 0.2783, "step": 4268 }, { "epoch": 2.7713775758559143, "grad_norm": 0.7448317408561707, "learning_rate": 1.5608889927316407e-07, "loss": 0.2506, "step": 4270 }, { "epoch": 2.772675644978095, "grad_norm": 0.7623326778411865, "learning_rate": 1.5431933021854196e-07, "loss": 0.2629, "step": 4272 }, { "epoch": 2.7739737141002756, "grad_norm": 0.7743139266967773, "learning_rate": 1.5255969194233556e-07, "loss": 0.2715, "step": 4274 }, { "epoch": 2.7752717832224567, "grad_norm": 0.6970307230949402, "learning_rate": 1.508099880507613e-07, "loss": 0.2492, "step": 4276 }, { "epoch": 2.7765698523446374, "grad_norm": 1.094951868057251, "learning_rate": 1.4907022212967803e-07, "loss": 0.2641, "step": 4278 }, { "epoch": 2.777867921466818, "grad_norm": 1.4840141534805298, "learning_rate": 1.4734039774457476e-07, "loss": 0.2772, "step": 4280 }, { "epoch": 2.7791659905889987, "grad_norm": 0.884491503238678, "learning_rate": 1.4562051844056902e-07, "loss": 0.2481, "step": 4282 }, { "epoch": 2.7804640597111794, "grad_norm": 0.721507728099823, "learning_rate": 1.439105877423963e-07, "loss": 0.2431, "step": 4284 }, { "epoch": 2.7817621288333605, "grad_norm": 0.7842427492141724, "learning_rate": 1.4221060915440176e-07, "loss": 0.2693, "step": 4286 }, { "epoch": 2.783060197955541, "grad_norm": 0.9018750190734863, "learning_rate": 1.4052058616053633e-07, "loss": 0.2468, "step": 4288 }, { "epoch": 2.784358267077722, "grad_norm": 0.8325943946838379, "learning_rate": 1.388405222243472e-07, "loss": 0.2567, "step": 4290 }, { "epoch": 2.7856563361999025, "grad_norm": 0.9082318544387817, "learning_rate": 1.371704207889718e-07, "loss": 0.261, "step": 4292 }, { "epoch": 2.786954405322083, "grad_norm": 1.0102852582931519, "learning_rate": 1.3551028527712896e-07, "loss": 0.2804, "step": 4294 }, { "epoch": 2.7882524744442643, "grad_norm": 0.8558845520019531, "learning_rate": 1.338601190911154e-07, "loss": 0.2452, "step": 4296 }, { "epoch": 2.789550543566445, "grad_norm": 0.7977086901664734, "learning_rate": 1.322199256127943e-07, "loss": 0.2555, "step": 4298 }, { "epoch": 2.7908486126886256, "grad_norm": 0.8875210285186768, "learning_rate": 1.3058970820359285e-07, "loss": 0.2673, "step": 4300 }, { "epoch": 2.7921466818108063, "grad_norm": 0.7724189162254333, "learning_rate": 1.2896947020449192e-07, "loss": 0.2477, "step": 4302 }, { "epoch": 2.793444750932987, "grad_norm": 1.1928030252456665, "learning_rate": 1.2735921493602033e-07, "loss": 0.2806, "step": 4304 }, { "epoch": 2.794742820055168, "grad_norm": 1.0334746837615967, "learning_rate": 1.2575894569824943e-07, "loss": 0.362, "step": 4306 }, { "epoch": 2.7960408891773487, "grad_norm": 0.8398367762565613, "learning_rate": 1.2416866577078358e-07, "loss": 0.2438, "step": 4308 }, { "epoch": 2.7973389582995294, "grad_norm": 0.8887611031532288, "learning_rate": 1.2258837841275683e-07, "loss": 0.2744, "step": 4310 }, { "epoch": 2.79863702742171, "grad_norm": 0.8362759947776794, "learning_rate": 1.210180868628219e-07, "loss": 0.2633, "step": 4312 }, { "epoch": 2.7999350965438907, "grad_norm": 1.21372652053833, "learning_rate": 1.194577943391484e-07, "loss": 0.2933, "step": 4314 }, { "epoch": 2.801233165666072, "grad_norm": 0.6957341432571411, "learning_rate": 1.1790750403941231e-07, "loss": 0.2626, "step": 4316 }, { "epoch": 2.8025312347882525, "grad_norm": 0.7891921997070312, "learning_rate": 1.1636721914079274e-07, "loss": 0.2533, "step": 4318 }, { "epoch": 2.803829303910433, "grad_norm": 0.8788295388221741, "learning_rate": 1.1483694279996016e-07, "loss": 0.2846, "step": 4320 }, { "epoch": 2.805127373032614, "grad_norm": 0.6822298169136047, "learning_rate": 1.1331667815307756e-07, "loss": 0.2478, "step": 4322 }, { "epoch": 2.8064254421547945, "grad_norm": 0.7573204040527344, "learning_rate": 1.1180642831578658e-07, "loss": 0.252, "step": 4324 }, { "epoch": 2.8077235112769756, "grad_norm": 0.7564897537231445, "learning_rate": 1.1030619638320805e-07, "loss": 0.2574, "step": 4326 }, { "epoch": 2.8090215803991563, "grad_norm": 0.9118068218231201, "learning_rate": 1.0881598542992755e-07, "loss": 0.2678, "step": 4328 }, { "epoch": 2.810319649521337, "grad_norm": 1.1704000234603882, "learning_rate": 1.0733579850999654e-07, "loss": 0.2989, "step": 4330 }, { "epoch": 2.8116177186435176, "grad_norm": 0.78289395570755, "learning_rate": 1.058656386569229e-07, "loss": 0.2461, "step": 4332 }, { "epoch": 2.8129157877656983, "grad_norm": 0.9127869606018066, "learning_rate": 1.0440550888366485e-07, "loss": 0.2618, "step": 4334 }, { "epoch": 2.8142138568878794, "grad_norm": 1.0992515087127686, "learning_rate": 1.0295541218262317e-07, "loss": 0.2463, "step": 4336 }, { "epoch": 2.81551192601006, "grad_norm": 0.7859074473381042, "learning_rate": 1.015153515256384e-07, "loss": 0.2637, "step": 4338 }, { "epoch": 2.8168099951322407, "grad_norm": 1.039808988571167, "learning_rate": 1.0008532986398422e-07, "loss": 0.3072, "step": 4340 }, { "epoch": 2.8181080642544214, "grad_norm": 0.9069875478744507, "learning_rate": 9.866535012835799e-08, "loss": 0.262, "step": 4342 }, { "epoch": 2.819406133376602, "grad_norm": 0.8937183618545532, "learning_rate": 9.725541522887794e-08, "loss": 0.312, "step": 4344 }, { "epoch": 2.820704202498783, "grad_norm": 0.8297768235206604, "learning_rate": 9.585552805507659e-08, "loss": 0.2742, "step": 4346 }, { "epoch": 2.822002271620964, "grad_norm": 1.0624281167984009, "learning_rate": 9.446569147589457e-08, "loss": 0.2973, "step": 4348 }, { "epoch": 2.8233003407431445, "grad_norm": 0.8189355731010437, "learning_rate": 9.308590833967457e-08, "loss": 0.2729, "step": 4350 }, { "epoch": 2.824598409865325, "grad_norm": 0.862359881401062, "learning_rate": 9.171618147415518e-08, "loss": 0.2476, "step": 4352 }, { "epoch": 2.825896478987506, "grad_norm": 1.1735388040542603, "learning_rate": 9.035651368646647e-08, "loss": 0.2906, "step": 4354 }, { "epoch": 2.827194548109687, "grad_norm": 0.7190595865249634, "learning_rate": 8.900690776312282e-08, "loss": 0.2632, "step": 4356 }, { "epoch": 2.8284926172318676, "grad_norm": 0.7659333944320679, "learning_rate": 8.766736647001839e-08, "loss": 0.3414, "step": 4358 }, { "epoch": 2.8297906863540483, "grad_norm": 0.7426614761352539, "learning_rate": 8.633789255241998e-08, "loss": 0.2651, "step": 4360 }, { "epoch": 2.831088755476229, "grad_norm": 1.0267746448516846, "learning_rate": 8.501848873496199e-08, "loss": 0.2752, "step": 4362 }, { "epoch": 2.8323868245984096, "grad_norm": 0.8870016932487488, "learning_rate": 8.370915772164201e-08, "loss": 0.244, "step": 4364 }, { "epoch": 2.8336848937205907, "grad_norm": 0.9300747513771057, "learning_rate": 8.240990219581414e-08, "loss": 0.2544, "step": 4366 }, { "epoch": 2.8349829628427714, "grad_norm": 0.7370412349700928, "learning_rate": 8.11207248201834e-08, "loss": 0.2379, "step": 4368 }, { "epoch": 2.836281031964952, "grad_norm": 0.9741052985191345, "learning_rate": 7.984162823680031e-08, "loss": 0.2686, "step": 4370 }, { "epoch": 2.8375791010871327, "grad_norm": 1.4013510942459106, "learning_rate": 7.857261506705627e-08, "loss": 0.291, "step": 4372 }, { "epoch": 2.8388771702093134, "grad_norm": 0.9494476318359375, "learning_rate": 7.731368791167815e-08, "loss": 0.2933, "step": 4374 }, { "epoch": 2.8401752393314945, "grad_norm": 0.9522914290428162, "learning_rate": 7.6064849350721e-08, "loss": 0.2767, "step": 4376 }, { "epoch": 2.841473308453675, "grad_norm": 1.1752058267593384, "learning_rate": 7.482610194356477e-08, "loss": 0.3146, "step": 4378 }, { "epoch": 2.842771377575856, "grad_norm": 0.9284898638725281, "learning_rate": 7.35974482289098e-08, "loss": 0.2593, "step": 4380 }, { "epoch": 2.8440694466980365, "grad_norm": 0.7672584652900696, "learning_rate": 7.237889072476856e-08, "loss": 0.2583, "step": 4382 }, { "epoch": 2.845367515820217, "grad_norm": 0.7872727513313293, "learning_rate": 7.117043192846284e-08, "loss": 0.2718, "step": 4384 }, { "epoch": 2.8466655849423983, "grad_norm": 0.8427248001098633, "learning_rate": 6.997207431661768e-08, "loss": 0.2619, "step": 4386 }, { "epoch": 2.847963654064579, "grad_norm": 0.9040888547897339, "learning_rate": 6.878382034515907e-08, "loss": 0.2712, "step": 4388 }, { "epoch": 2.8492617231867596, "grad_norm": 0.8873353004455566, "learning_rate": 6.760567244930294e-08, "loss": 0.2676, "step": 4390 }, { "epoch": 2.8505597923089403, "grad_norm": 0.8109408020973206, "learning_rate": 6.643763304355566e-08, "loss": 0.2475, "step": 4392 }, { "epoch": 2.851857861431121, "grad_norm": 0.953660786151886, "learning_rate": 6.527970452170685e-08, "loss": 0.2686, "step": 4394 }, { "epoch": 2.853155930553302, "grad_norm": 0.7933067679405212, "learning_rate": 6.4131889256826e-08, "loss": 0.2569, "step": 4396 }, { "epoch": 2.8544539996754827, "grad_norm": 0.7448694109916687, "learning_rate": 6.299418960125425e-08, "loss": 0.2748, "step": 4398 }, { "epoch": 2.8557520687976634, "grad_norm": 1.4045320749282837, "learning_rate": 6.186660788660315e-08, "loss": 0.2844, "step": 4400 }, { "epoch": 2.857050137919844, "grad_norm": 0.8880423903465271, "learning_rate": 6.074914642374807e-08, "loss": 0.2341, "step": 4402 }, { "epoch": 2.8583482070420247, "grad_norm": 0.7947379350662231, "learning_rate": 5.964180750282323e-08, "loss": 0.2762, "step": 4404 }, { "epoch": 2.859646276164206, "grad_norm": 0.8628886342048645, "learning_rate": 5.854459339321939e-08, "loss": 0.2519, "step": 4406 }, { "epoch": 2.8609443452863865, "grad_norm": 0.8891245126724243, "learning_rate": 5.745750634357505e-08, "loss": 0.2566, "step": 4408 }, { "epoch": 2.862242414408567, "grad_norm": 0.942315936088562, "learning_rate": 5.638054858177644e-08, "loss": 0.2664, "step": 4410 }, { "epoch": 2.863540483530748, "grad_norm": 0.7043654918670654, "learning_rate": 5.531372231494969e-08, "loss": 0.2586, "step": 4412 }, { "epoch": 2.8648385526529285, "grad_norm": 0.7274218797683716, "learning_rate": 5.425702972945701e-08, "loss": 0.2724, "step": 4414 }, { "epoch": 2.8661366217751096, "grad_norm": 1.204500675201416, "learning_rate": 5.3210472990893324e-08, "loss": 0.2699, "step": 4416 }, { "epoch": 2.8674346908972903, "grad_norm": 0.6792069673538208, "learning_rate": 5.217405424408073e-08, "loss": 0.2634, "step": 4418 }, { "epoch": 2.868732760019471, "grad_norm": 0.7910333871841431, "learning_rate": 5.114777561306406e-08, "loss": 0.2558, "step": 4420 }, { "epoch": 2.8700308291416516, "grad_norm": 0.8410026431083679, "learning_rate": 5.013163920110864e-08, "loss": 0.2464, "step": 4422 }, { "epoch": 2.8713288982638323, "grad_norm": 0.729112446308136, "learning_rate": 4.91256470906909e-08, "loss": 0.2414, "step": 4424 }, { "epoch": 2.8726269673860134, "grad_norm": 0.7718569040298462, "learning_rate": 4.8129801343500534e-08, "loss": 0.2703, "step": 4426 }, { "epoch": 2.873925036508194, "grad_norm": 0.814672589302063, "learning_rate": 4.714410400043279e-08, "loss": 0.2763, "step": 4428 }, { "epoch": 2.8752231056303748, "grad_norm": 0.8735334277153015, "learning_rate": 4.6168557081582854e-08, "loss": 0.2776, "step": 4430 }, { "epoch": 2.8765211747525554, "grad_norm": 0.6574318408966064, "learning_rate": 4.520316258624535e-08, "loss": 0.2696, "step": 4432 }, { "epoch": 2.877819243874736, "grad_norm": 0.8373961448669434, "learning_rate": 4.42479224929071e-08, "loss": 0.2542, "step": 4434 }, { "epoch": 2.879117312996917, "grad_norm": 0.9358408451080322, "learning_rate": 4.330283875924601e-08, "loss": 0.2796, "step": 4436 }, { "epoch": 2.880415382119098, "grad_norm": 0.863348126411438, "learning_rate": 4.236791332212498e-08, "loss": 0.2615, "step": 4438 }, { "epoch": 2.8817134512412785, "grad_norm": 0.8536419868469238, "learning_rate": 4.144314809758632e-08, "loss": 0.2531, "step": 4440 }, { "epoch": 2.883011520363459, "grad_norm": 0.8574737906455994, "learning_rate": 4.05285449808529e-08, "loss": 0.256, "step": 4442 }, { "epoch": 2.88430958948564, "grad_norm": 1.057780146598816, "learning_rate": 3.9624105846319813e-08, "loss": 0.2768, "step": 4444 }, { "epoch": 2.885607658607821, "grad_norm": 0.9273356199264526, "learning_rate": 3.872983254755158e-08, "loss": 0.3013, "step": 4446 }, { "epoch": 2.8869057277300016, "grad_norm": 0.7420692443847656, "learning_rate": 3.78457269172805e-08, "loss": 0.2719, "step": 4448 }, { "epoch": 2.8882037968521823, "grad_norm": 0.7258841395378113, "learning_rate": 3.6971790767398874e-08, "loss": 0.266, "step": 4450 }, { "epoch": 2.889501865974363, "grad_norm": 0.8452582955360413, "learning_rate": 3.610802588895845e-08, "loss": 0.2472, "step": 4452 }, { "epoch": 2.8907999350965436, "grad_norm": 1.1384495496749878, "learning_rate": 3.5254434052168215e-08, "loss": 0.2842, "step": 4454 }, { "epoch": 2.8920980042187248, "grad_norm": 0.8942080140113831, "learning_rate": 3.4411017006384383e-08, "loss": 0.2702, "step": 4456 }, { "epoch": 2.8933960733409054, "grad_norm": 1.2836562395095825, "learning_rate": 3.357777648011373e-08, "loss": 0.3061, "step": 4458 }, { "epoch": 2.894694142463086, "grad_norm": 0.7322551608085632, "learning_rate": 3.2754714181005845e-08, "loss": 0.2559, "step": 4460 }, { "epoch": 2.8959922115852668, "grad_norm": 0.6912504434585571, "learning_rate": 3.194183179585253e-08, "loss": 0.2767, "step": 4462 }, { "epoch": 2.8972902807074474, "grad_norm": 0.9261453151702881, "learning_rate": 3.1139130990580633e-08, "loss": 0.2509, "step": 4464 }, { "epoch": 2.8985883498296285, "grad_norm": 0.9264556765556335, "learning_rate": 3.034661341025258e-08, "loss": 0.2354, "step": 4466 }, { "epoch": 2.899886418951809, "grad_norm": 0.7595345377922058, "learning_rate": 2.9564280679060255e-08, "loss": 0.2465, "step": 4468 }, { "epoch": 2.90118448807399, "grad_norm": 0.8120997548103333, "learning_rate": 2.8792134400322803e-08, "loss": 0.283, "step": 4470 }, { "epoch": 2.9024825571961705, "grad_norm": 0.9301403760910034, "learning_rate": 2.8030176156483292e-08, "loss": 0.2815, "step": 4472 }, { "epoch": 2.903780626318351, "grad_norm": 0.9242321252822876, "learning_rate": 2.7278407509105376e-08, "loss": 0.2727, "step": 4474 }, { "epoch": 2.9050786954405323, "grad_norm": 0.9509041905403137, "learning_rate": 2.6536829998869972e-08, "loss": 0.2807, "step": 4476 }, { "epoch": 2.906376764562713, "grad_norm": 0.9169664978981018, "learning_rate": 2.5805445145571927e-08, "loss": 0.2532, "step": 4478 }, { "epoch": 2.9076748336848937, "grad_norm": 0.8621170520782471, "learning_rate": 2.5084254448117794e-08, "loss": 0.2842, "step": 4480 }, { "epoch": 2.9089729028070743, "grad_norm": 1.009739637374878, "learning_rate": 2.437325938452195e-08, "loss": 0.2869, "step": 4482 }, { "epoch": 2.910270971929255, "grad_norm": 0.7607407569885254, "learning_rate": 2.3672461411903268e-08, "loss": 0.2582, "step": 4484 }, { "epoch": 2.911569041051436, "grad_norm": 0.6525920629501343, "learning_rate": 2.298186196648344e-08, "loss": 0.2821, "step": 4486 }, { "epoch": 2.9128671101736168, "grad_norm": 0.7735894322395325, "learning_rate": 2.230146246358256e-08, "loss": 0.2595, "step": 4488 }, { "epoch": 2.9141651792957974, "grad_norm": 0.7965257167816162, "learning_rate": 2.1631264297616307e-08, "loss": 0.2537, "step": 4490 }, { "epoch": 2.915463248417978, "grad_norm": 1.2043837308883667, "learning_rate": 2.097126884209544e-08, "loss": 0.2683, "step": 4492 }, { "epoch": 2.9167613175401588, "grad_norm": 0.9355093836784363, "learning_rate": 2.0321477449619098e-08, "loss": 0.2688, "step": 4494 }, { "epoch": 2.91805938666234, "grad_norm": 1.0026344060897827, "learning_rate": 1.968189145187649e-08, "loss": 0.2635, "step": 4496 }, { "epoch": 2.9193574557845205, "grad_norm": 0.7665977478027344, "learning_rate": 1.9052512159639656e-08, "loss": 0.2601, "step": 4498 }, { "epoch": 2.920655524906701, "grad_norm": 0.7309552431106567, "learning_rate": 1.8433340862763493e-08, "loss": 0.2879, "step": 4500 }, { "epoch": 2.920655524906701, "eval_loss": 0.2755427360534668, "eval_runtime": 397.3532, "eval_samples_per_second": 26.123, "eval_steps_per_second": 3.267, "step": 4500 }, { "epoch": 2.921953594028882, "grad_norm": 1.5020800828933716, "learning_rate": 1.7824378830184065e-08, "loss": 0.2948, "step": 4502 }, { "epoch": 2.9232516631510626, "grad_norm": 0.7444823980331421, "learning_rate": 1.7225627309911953e-08, "loss": 0.2803, "step": 4504 }, { "epoch": 2.9245497322732437, "grad_norm": 0.7420012950897217, "learning_rate": 1.6637087529033925e-08, "loss": 0.2788, "step": 4506 }, { "epoch": 2.9258478013954243, "grad_norm": 0.7887323498725891, "learning_rate": 1.6058760693708487e-08, "loss": 0.2655, "step": 4508 }, { "epoch": 2.927145870517605, "grad_norm": 0.9188719987869263, "learning_rate": 1.549064798916311e-08, "loss": 0.2572, "step": 4510 }, { "epoch": 2.9284439396397857, "grad_norm": 0.6691784858703613, "learning_rate": 1.4932750579693677e-08, "loss": 0.3003, "step": 4512 }, { "epoch": 2.9297420087619663, "grad_norm": 0.7408701181411743, "learning_rate": 1.4385069608658376e-08, "loss": 0.2517, "step": 4514 }, { "epoch": 2.9310400778841474, "grad_norm": 0.9264618158340454, "learning_rate": 1.3847606198480467e-08, "loss": 0.2736, "step": 4516 }, { "epoch": 2.932338147006328, "grad_norm": 0.8773911595344543, "learning_rate": 1.3320361450641085e-08, "loss": 0.2457, "step": 4518 }, { "epoch": 2.933636216128509, "grad_norm": 1.3299638032913208, "learning_rate": 1.2803336445680325e-08, "loss": 0.2909, "step": 4520 }, { "epoch": 2.9349342852506894, "grad_norm": 0.8642644882202148, "learning_rate": 1.2296532243193382e-08, "loss": 0.2898, "step": 4522 }, { "epoch": 2.93623235437287, "grad_norm": 0.7709313631057739, "learning_rate": 1.179994988182942e-08, "loss": 0.2912, "step": 4524 }, { "epoch": 2.9375304234950512, "grad_norm": 0.8457815647125244, "learning_rate": 1.1313590379288252e-08, "loss": 0.2488, "step": 4526 }, { "epoch": 2.938828492617232, "grad_norm": 0.7591208219528198, "learning_rate": 1.0837454732319231e-08, "loss": 0.284, "step": 4528 }, { "epoch": 2.9401265617394126, "grad_norm": 0.7822162508964539, "learning_rate": 1.0371543916718462e-08, "loss": 0.257, "step": 4530 }, { "epoch": 2.9414246308615932, "grad_norm": 0.7250688672065735, "learning_rate": 9.915858887327157e-09, "loss": 0.2687, "step": 4532 }, { "epoch": 2.942722699983774, "grad_norm": 0.9081250429153442, "learning_rate": 9.470400578031057e-09, "loss": 0.2513, "step": 4534 }, { "epoch": 2.944020769105955, "grad_norm": 0.8742815852165222, "learning_rate": 9.035169901754902e-09, "loss": 0.2577, "step": 4536 }, { "epoch": 2.9453188382281357, "grad_norm": 0.8632300496101379, "learning_rate": 8.610167750465192e-09, "loss": 0.2588, "step": 4538 }, { "epoch": 2.9466169073503163, "grad_norm": 1.0938827991485596, "learning_rate": 8.195394995164086e-09, "loss": 0.271, "step": 4540 }, { "epoch": 2.947914976472497, "grad_norm": 0.7120091319084167, "learning_rate": 7.790852485891064e-09, "loss": 0.2592, "step": 4542 }, { "epoch": 2.9492130455946777, "grad_norm": 0.7964645028114319, "learning_rate": 7.3965410517179426e-09, "loss": 0.2596, "step": 4544 }, { "epoch": 2.950511114716859, "grad_norm": 0.8639079928398132, "learning_rate": 7.0124615007505225e-09, "loss": 0.2576, "step": 4546 }, { "epoch": 2.9518091838390395, "grad_norm": 0.9517873525619507, "learning_rate": 6.638614620124717e-09, "loss": 0.2824, "step": 4548 }, { "epoch": 2.95310725296122, "grad_norm": 0.9042484760284424, "learning_rate": 6.2750011760054355e-09, "loss": 0.2408, "step": 4550 }, { "epoch": 2.954405322083401, "grad_norm": 0.9543337821960449, "learning_rate": 5.921621913584363e-09, "loss": 0.2608, "step": 4552 }, { "epoch": 2.9557033912055815, "grad_norm": 0.8704521059989929, "learning_rate": 5.578477557081074e-09, "loss": 0.2361, "step": 4554 }, { "epoch": 2.9570014603277626, "grad_norm": 0.8277267217636108, "learning_rate": 5.245568809737478e-09, "loss": 0.268, "step": 4556 }, { "epoch": 2.9582995294499432, "grad_norm": 0.8721728920936584, "learning_rate": 4.922896353820039e-09, "loss": 0.2412, "step": 4558 }, { "epoch": 2.959597598572124, "grad_norm": 0.9036709666252136, "learning_rate": 4.61046085061645e-09, "loss": 0.2639, "step": 4560 }, { "epoch": 2.9608956676943046, "grad_norm": 0.9520308375358582, "learning_rate": 4.3082629404345155e-09, "loss": 0.2723, "step": 4562 }, { "epoch": 2.9621937368164852, "grad_norm": 0.6926831603050232, "learning_rate": 4.016303242600495e-09, "loss": 0.2416, "step": 4564 }, { "epoch": 2.9634918059386663, "grad_norm": 1.3578779697418213, "learning_rate": 3.7345823554602036e-09, "loss": 0.2851, "step": 4566 }, { "epoch": 2.964789875060847, "grad_norm": 1.3651506900787354, "learning_rate": 3.4631008563740243e-09, "loss": 0.331, "step": 4568 }, { "epoch": 2.9660879441830277, "grad_norm": 0.8616597056388855, "learning_rate": 3.2018593017191234e-09, "loss": 0.2626, "step": 4570 }, { "epoch": 2.9673860133052083, "grad_norm": 0.6796725988388062, "learning_rate": 2.9508582268850116e-09, "loss": 0.2657, "step": 4572 }, { "epoch": 2.968684082427389, "grad_norm": 0.8624830842018127, "learning_rate": 2.7100981462757634e-09, "loss": 0.2908, "step": 4574 }, { "epoch": 2.96998215154957, "grad_norm": 1.1119035482406616, "learning_rate": 2.479579553307798e-09, "loss": 0.2914, "step": 4576 }, { "epoch": 2.971280220671751, "grad_norm": 0.6114272475242615, "learning_rate": 2.2593029204076578e-09, "loss": 0.2455, "step": 4578 }, { "epoch": 2.9725782897939315, "grad_norm": 0.7716228365898132, "learning_rate": 2.049268699010898e-09, "loss": 0.2613, "step": 4580 }, { "epoch": 2.973876358916112, "grad_norm": 0.7678053975105286, "learning_rate": 1.849477319564863e-09, "loss": 0.269, "step": 4582 }, { "epoch": 2.975174428038293, "grad_norm": 0.8480517864227295, "learning_rate": 1.6599291915231352e-09, "loss": 0.251, "step": 4584 }, { "epoch": 2.976472497160474, "grad_norm": 0.8516966104507446, "learning_rate": 1.4806247033471998e-09, "loss": 0.2804, "step": 4586 }, { "epoch": 2.9777705662826546, "grad_norm": 0.8488627076148987, "learning_rate": 1.3115642225053348e-09, "loss": 0.2487, "step": 4588 }, { "epoch": 2.9790686354048352, "grad_norm": 0.8471726179122925, "learning_rate": 1.1527480954715009e-09, "loss": 0.2719, "step": 4590 }, { "epoch": 2.980366704527016, "grad_norm": 0.7898222208023071, "learning_rate": 1.004176647724231e-09, "loss": 0.2728, "step": 4592 }, { "epoch": 2.9816647736491966, "grad_norm": 0.9357900023460388, "learning_rate": 8.658501837477406e-10, "loss": 0.2956, "step": 4594 }, { "epoch": 2.9829628427713777, "grad_norm": 0.7333936095237732, "learning_rate": 7.377689870291527e-10, "loss": 0.261, "step": 4596 }, { "epoch": 2.9842609118935584, "grad_norm": 0.7239441275596619, "learning_rate": 6.199333200590519e-10, "loss": 0.2659, "step": 4598 }, { "epoch": 2.985558981015739, "grad_norm": 0.7403579950332642, "learning_rate": 5.123434243314851e-10, "loss": 0.2574, "step": 4600 }, { "epoch": 2.9868570501379197, "grad_norm": 0.806096076965332, "learning_rate": 4.1499952034118605e-10, "loss": 0.258, "step": 4602 }, { "epoch": 2.9881551192601004, "grad_norm": 0.7120212912559509, "learning_rate": 3.279018075857954e-10, "loss": 0.2539, "step": 4604 }, { "epoch": 2.9894531883822815, "grad_norm": 0.7471323013305664, "learning_rate": 2.5105046456475047e-10, "loss": 0.2469, "step": 4606 }, { "epoch": 2.990751257504462, "grad_norm": 1.0312440395355225, "learning_rate": 1.8444564877706517e-10, "loss": 0.2643, "step": 4608 }, { "epoch": 2.992049326626643, "grad_norm": 0.8243621587753296, "learning_rate": 1.2808749672355015e-10, "loss": 0.2346, "step": 4610 }, { "epoch": 2.9933473957488235, "grad_norm": 0.739955484867096, "learning_rate": 8.197612390514753e-11, "loss": 0.2575, "step": 4612 }, { "epoch": 2.994645464871004, "grad_norm": 0.8210440278053284, "learning_rate": 4.6111624822375853e-11, "loss": 0.2593, "step": 4614 }, { "epoch": 2.9959435339931852, "grad_norm": 0.9462283849716187, "learning_rate": 2.049407297699535e-11, "loss": 0.3198, "step": 4616 }, { "epoch": 2.997241603115366, "grad_norm": 0.9472498297691345, "learning_rate": 5.123520869232401e-12, "loss": 0.2704, "step": 4618 }, { "epoch": 2.9985396722375466, "grad_norm": 0.8899651765823364, "learning_rate": 0.0, "loss": 0.2639, "step": 4620 }, { "epoch": 2.9985396722375466, "step": 4620, "total_flos": 8.62655347699168e+18, "train_loss": 0.29317476286903604, "train_runtime": 46635.6944, "train_samples_per_second": 12.686, "train_steps_per_second": 0.099 } ], "logging_steps": 2, "max_steps": 4620, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.62655347699168e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }