{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9961464354527938, "eval_steps": 500, "global_step": 518, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038535645472061657, "grad_norm": 1432.749671594044, "learning_rate": 7.692307692307694e-07, "loss": 11.5124, "step": 1 }, { "epoch": 0.007707129094412331, "grad_norm": 1357.8414028391733, "learning_rate": 1.5384615384615387e-06, "loss": 11.4455, "step": 2 }, { "epoch": 0.011560693641618497, "grad_norm": 1337.1958013405376, "learning_rate": 2.307692307692308e-06, "loss": 11.2831, "step": 3 }, { "epoch": 0.015414258188824663, "grad_norm": 916.8677401776845, "learning_rate": 3.0769230769230774e-06, "loss": 9.6523, "step": 4 }, { "epoch": 0.019267822736030827, "grad_norm": 668.9521736359394, "learning_rate": 3.846153846153847e-06, "loss": 7.3813, "step": 5 }, { "epoch": 0.023121387283236993, "grad_norm": 409.96263800148665, "learning_rate": 4.615384615384616e-06, "loss": 5.8322, "step": 6 }, { "epoch": 0.02697495183044316, "grad_norm": 226.39376976378787, "learning_rate": 5.384615384615385e-06, "loss": 4.2464, "step": 7 }, { "epoch": 0.030828516377649325, "grad_norm": 165.82496194680013, "learning_rate": 6.153846153846155e-06, "loss": 3.7425, "step": 8 }, { "epoch": 0.03468208092485549, "grad_norm": 173.89210974421533, "learning_rate": 6.923076923076923e-06, "loss": 4.0184, "step": 9 }, { "epoch": 0.038535645472061654, "grad_norm": 168.95311819558103, "learning_rate": 7.692307692307694e-06, "loss": 3.7993, "step": 10 }, { "epoch": 0.04238921001926782, "grad_norm": 76.30995564433688, "learning_rate": 8.461538461538462e-06, "loss": 4.4087, "step": 11 }, { "epoch": 0.046242774566473986, "grad_norm": 74.48219270936819, "learning_rate": 9.230769230769232e-06, "loss": 4.141, "step": 12 }, { "epoch": 0.05009633911368015, "grad_norm": 80.53194275608223, "learning_rate": 1e-05, "loss": 4.1239, "step": 13 }, { "epoch": 0.05394990366088632, "grad_norm": 33.55587129310653, "learning_rate": 1.076923076923077e-05, "loss": 3.6328, "step": 14 }, { "epoch": 0.057803468208092484, "grad_norm": 63.09531505114449, "learning_rate": 1.1538461538461538e-05, "loss": 3.8233, "step": 15 }, { "epoch": 0.06165703275529865, "grad_norm": 67.77031346149326, "learning_rate": 1.230769230769231e-05, "loss": 3.8786, "step": 16 }, { "epoch": 0.06551059730250482, "grad_norm": 82.90135582492816, "learning_rate": 1.3076923076923078e-05, "loss": 4.1657, "step": 17 }, { "epoch": 0.06936416184971098, "grad_norm": 41.832941343610734, "learning_rate": 1.3846153846153847e-05, "loss": 3.408, "step": 18 }, { "epoch": 0.07321772639691715, "grad_norm": 48.26728390692674, "learning_rate": 1.4615384615384615e-05, "loss": 3.3997, "step": 19 }, { "epoch": 0.07707129094412331, "grad_norm": 68.06129545630621, "learning_rate": 1.5384615384615387e-05, "loss": 3.7407, "step": 20 }, { "epoch": 0.08092485549132948, "grad_norm": 22.59279364181322, "learning_rate": 1.6153846153846154e-05, "loss": 3.2956, "step": 21 }, { "epoch": 0.08477842003853564, "grad_norm": 15.9339021622544, "learning_rate": 1.6923076923076924e-05, "loss": 2.8921, "step": 22 }, { "epoch": 0.08863198458574181, "grad_norm": 41.43770891931384, "learning_rate": 1.7692307692307694e-05, "loss": 3.2055, "step": 23 }, { "epoch": 0.09248554913294797, "grad_norm": 42.58954954740387, "learning_rate": 1.8461538461538465e-05, "loss": 3.2822, "step": 24 }, { "epoch": 0.09633911368015415, "grad_norm": 23.174039496957583, "learning_rate": 1.923076923076923e-05, "loss": 2.7902, "step": 25 }, { "epoch": 0.1001926782273603, "grad_norm": 21.687527912513175, "learning_rate": 2e-05, "loss": 2.7725, "step": 26 }, { "epoch": 0.10404624277456648, "grad_norm": 25.192833588413695, "learning_rate": 2.0769230769230772e-05, "loss": 2.6381, "step": 27 }, { "epoch": 0.10789980732177264, "grad_norm": 27.608898717578253, "learning_rate": 2.153846153846154e-05, "loss": 2.633, "step": 28 }, { "epoch": 0.11175337186897881, "grad_norm": 23.800035559481127, "learning_rate": 2.230769230769231e-05, "loss": 2.5231, "step": 29 }, { "epoch": 0.11560693641618497, "grad_norm": 13.963396003674896, "learning_rate": 2.3076923076923076e-05, "loss": 2.1474, "step": 30 }, { "epoch": 0.11946050096339114, "grad_norm": 13.74744780299132, "learning_rate": 2.384615384615385e-05, "loss": 2.0851, "step": 31 }, { "epoch": 0.1233140655105973, "grad_norm": 14.540640312216865, "learning_rate": 2.461538461538462e-05, "loss": 2.0825, "step": 32 }, { "epoch": 0.12716763005780346, "grad_norm": 13.210931532751994, "learning_rate": 2.5384615384615386e-05, "loss": 1.8231, "step": 33 }, { "epoch": 0.13102119460500963, "grad_norm": 14.18049338122164, "learning_rate": 2.6153846153846157e-05, "loss": 2.0161, "step": 34 }, { "epoch": 0.1348747591522158, "grad_norm": 7.365239232178003, "learning_rate": 2.6923076923076927e-05, "loss": 1.636, "step": 35 }, { "epoch": 0.13872832369942195, "grad_norm": 11.42487604515666, "learning_rate": 2.7692307692307694e-05, "loss": 1.7636, "step": 36 }, { "epoch": 0.14258188824662812, "grad_norm": 10.521730704250018, "learning_rate": 2.8461538461538464e-05, "loss": 1.6425, "step": 37 }, { "epoch": 0.1464354527938343, "grad_norm": 15.837868821185545, "learning_rate": 2.923076923076923e-05, "loss": 1.7335, "step": 38 }, { "epoch": 0.15028901734104047, "grad_norm": 10.63875357372187, "learning_rate": 3.0000000000000004e-05, "loss": 1.4748, "step": 39 }, { "epoch": 0.15414258188824662, "grad_norm": 15.842407967828168, "learning_rate": 3.0769230769230774e-05, "loss": 1.4917, "step": 40 }, { "epoch": 0.1579961464354528, "grad_norm": 8.822203607549136, "learning_rate": 3.153846153846154e-05, "loss": 1.4943, "step": 41 }, { "epoch": 0.16184971098265896, "grad_norm": 13.809138343712586, "learning_rate": 3.230769230769231e-05, "loss": 1.5356, "step": 42 }, { "epoch": 0.16570327552986513, "grad_norm": 9.367395780583822, "learning_rate": 3.307692307692308e-05, "loss": 1.4704, "step": 43 }, { "epoch": 0.16955684007707128, "grad_norm": 10.227364292661408, "learning_rate": 3.384615384615385e-05, "loss": 1.3102, "step": 44 }, { "epoch": 0.17341040462427745, "grad_norm": 9.413833051288245, "learning_rate": 3.461538461538462e-05, "loss": 1.255, "step": 45 }, { "epoch": 0.17726396917148363, "grad_norm": 9.94687228320031, "learning_rate": 3.538461538461539e-05, "loss": 1.3163, "step": 46 }, { "epoch": 0.1811175337186898, "grad_norm": 7.092554527238564, "learning_rate": 3.615384615384616e-05, "loss": 1.2354, "step": 47 }, { "epoch": 0.18497109826589594, "grad_norm": 9.149268401400676, "learning_rate": 3.692307692307693e-05, "loss": 1.2504, "step": 48 }, { "epoch": 0.18882466281310212, "grad_norm": 6.289853231412649, "learning_rate": 3.769230769230769e-05, "loss": 1.1468, "step": 49 }, { "epoch": 0.1926782273603083, "grad_norm": 9.45344983093482, "learning_rate": 3.846153846153846e-05, "loss": 1.2058, "step": 50 }, { "epoch": 0.19653179190751446, "grad_norm": 10.360186688388032, "learning_rate": 3.923076923076923e-05, "loss": 1.3109, "step": 51 }, { "epoch": 0.2003853564547206, "grad_norm": 9.072775454723184, "learning_rate": 4e-05, "loss": 1.2695, "step": 52 }, { "epoch": 0.20423892100192678, "grad_norm": 8.438625780983935, "learning_rate": 3.999954550797489e-05, "loss": 1.3253, "step": 53 }, { "epoch": 0.20809248554913296, "grad_norm": 7.900240323931455, "learning_rate": 3.999818205255586e-05, "loss": 1.1021, "step": 54 }, { "epoch": 0.2119460500963391, "grad_norm": 5.82014263071836, "learning_rate": 3.9995909695710856e-05, "loss": 1.0841, "step": 55 }, { "epoch": 0.21579961464354527, "grad_norm": 7.201881900115451, "learning_rate": 3.999272854071669e-05, "loss": 1.133, "step": 56 }, { "epoch": 0.21965317919075145, "grad_norm": 7.6958656122670295, "learning_rate": 3.998863873215434e-05, "loss": 1.0334, "step": 57 }, { "epoch": 0.22350674373795762, "grad_norm": 5.905685554617661, "learning_rate": 3.998364045590232e-05, "loss": 1.017, "step": 58 }, { "epoch": 0.22736030828516376, "grad_norm": 6.204779741056835, "learning_rate": 3.9977733939128304e-05, "loss": 1.0027, "step": 59 }, { "epoch": 0.23121387283236994, "grad_norm": 4.951471635667849, "learning_rate": 3.997091945027878e-05, "loss": 0.9547, "step": 60 }, { "epoch": 0.2350674373795761, "grad_norm": 7.304094138753475, "learning_rate": 3.996319729906682e-05, "loss": 1.1246, "step": 61 }, { "epoch": 0.23892100192678228, "grad_norm": 6.298474879300807, "learning_rate": 3.995456783645805e-05, "loss": 1.0221, "step": 62 }, { "epoch": 0.24277456647398843, "grad_norm": 7.776498792672179, "learning_rate": 3.994503145465464e-05, "loss": 1.097, "step": 63 }, { "epoch": 0.2466281310211946, "grad_norm": 5.607410872804594, "learning_rate": 3.993458858707756e-05, "loss": 1.0371, "step": 64 }, { "epoch": 0.2504816955684008, "grad_norm": 4.77643068735735, "learning_rate": 3.992323970834682e-05, "loss": 1.004, "step": 65 }, { "epoch": 0.2543352601156069, "grad_norm": 7.743150531664644, "learning_rate": 3.991098533425988e-05, "loss": 1.0132, "step": 66 }, { "epoch": 0.2581888246628131, "grad_norm": 5.8368050703956245, "learning_rate": 3.989782602176829e-05, "loss": 0.9401, "step": 67 }, { "epoch": 0.26204238921001927, "grad_norm": 3.2101782475861, "learning_rate": 3.988376236895231e-05, "loss": 0.9025, "step": 68 }, { "epoch": 0.2658959537572254, "grad_norm": 13.049333458920938, "learning_rate": 3.986879501499373e-05, "loss": 1.0269, "step": 69 }, { "epoch": 0.2697495183044316, "grad_norm": 7.880498887675296, "learning_rate": 3.985292464014686e-05, "loss": 1.0398, "step": 70 }, { "epoch": 0.27360308285163776, "grad_norm": 5.313107024369285, "learning_rate": 3.9836151965707585e-05, "loss": 0.958, "step": 71 }, { "epoch": 0.2774566473988439, "grad_norm": 4.395056829854694, "learning_rate": 3.9818477753980566e-05, "loss": 0.9323, "step": 72 }, { "epoch": 0.2813102119460501, "grad_norm": 6.266655645573501, "learning_rate": 3.979990280824465e-05, "loss": 0.9909, "step": 73 }, { "epoch": 0.28516377649325625, "grad_norm": 3.957118496983222, "learning_rate": 3.9780427972716296e-05, "loss": 1.0391, "step": 74 }, { "epoch": 0.28901734104046245, "grad_norm": 3.508018525774183, "learning_rate": 3.976005413251125e-05, "loss": 1.0032, "step": 75 }, { "epoch": 0.2928709055876686, "grad_norm": 10.69825193127433, "learning_rate": 3.9738782213604305e-05, "loss": 1.0472, "step": 76 }, { "epoch": 0.29672447013487474, "grad_norm": 6.501782774298941, "learning_rate": 3.971661318278721e-05, "loss": 1.0126, "step": 77 }, { "epoch": 0.30057803468208094, "grad_norm": 5.11168856069891, "learning_rate": 3.969354804762473e-05, "loss": 0.96, "step": 78 }, { "epoch": 0.3044315992292871, "grad_norm": 5.039914432332962, "learning_rate": 3.966958785640887e-05, "loss": 0.9067, "step": 79 }, { "epoch": 0.30828516377649323, "grad_norm": 6.647066379901366, "learning_rate": 3.9644733698111206e-05, "loss": 0.9367, "step": 80 }, { "epoch": 0.31213872832369943, "grad_norm": 3.5432939083746655, "learning_rate": 3.9618986702333424e-05, "loss": 0.854, "step": 81 }, { "epoch": 0.3159922928709056, "grad_norm": 2.9831981250415525, "learning_rate": 3.959234803925594e-05, "loss": 0.8707, "step": 82 }, { "epoch": 0.3198458574181118, "grad_norm": 4.000857879722409, "learning_rate": 3.956481891958475e-05, "loss": 0.8518, "step": 83 }, { "epoch": 0.3236994219653179, "grad_norm": 3.4582769025055793, "learning_rate": 3.9536400594496386e-05, "loss": 0.8765, "step": 84 }, { "epoch": 0.32755298651252407, "grad_norm": 3.930610795014948, "learning_rate": 3.950709435558106e-05, "loss": 0.9758, "step": 85 }, { "epoch": 0.33140655105973027, "grad_norm": 3.7172875758901154, "learning_rate": 3.947690153478396e-05, "loss": 0.91, "step": 86 }, { "epoch": 0.3352601156069364, "grad_norm": 4.5575804046102455, "learning_rate": 3.9445823504344725e-05, "loss": 0.8402, "step": 87 }, { "epoch": 0.33911368015414256, "grad_norm": 3.8988704845029223, "learning_rate": 3.9413861676735034e-05, "loss": 0.9235, "step": 88 }, { "epoch": 0.34296724470134876, "grad_norm": 3.965085606406442, "learning_rate": 3.938101750459447e-05, "loss": 0.8447, "step": 89 }, { "epoch": 0.3468208092485549, "grad_norm": 3.046774416650305, "learning_rate": 3.9347292480664465e-05, "loss": 0.8227, "step": 90 }, { "epoch": 0.35067437379576105, "grad_norm": 4.720525497029076, "learning_rate": 3.931268813772047e-05, "loss": 0.8303, "step": 91 }, { "epoch": 0.35452793834296725, "grad_norm": 3.961704867922078, "learning_rate": 3.927720604850226e-05, "loss": 0.8955, "step": 92 }, { "epoch": 0.3583815028901734, "grad_norm": 3.3173574867131967, "learning_rate": 3.92408478256425e-05, "loss": 0.8748, "step": 93 }, { "epoch": 0.3622350674373796, "grad_norm": 4.270129625253774, "learning_rate": 3.920361512159343e-05, "loss": 0.8354, "step": 94 }, { "epoch": 0.36608863198458574, "grad_norm": 4.083301813465707, "learning_rate": 3.916550962855174e-05, "loss": 0.8428, "step": 95 }, { "epoch": 0.3699421965317919, "grad_norm": 3.8426066258082745, "learning_rate": 3.912653307838173e-05, "loss": 0.882, "step": 96 }, { "epoch": 0.3737957610789981, "grad_norm": 4.858960264331484, "learning_rate": 3.908668724253649e-05, "loss": 0.883, "step": 97 }, { "epoch": 0.37764932562620424, "grad_norm": 3.529748790118145, "learning_rate": 3.9045973931977495e-05, "loss": 0.8818, "step": 98 }, { "epoch": 0.3815028901734104, "grad_norm": 4.3956107314706685, "learning_rate": 3.900439499709224e-05, "loss": 0.8682, "step": 99 }, { "epoch": 0.3853564547206166, "grad_norm": 4.190396012770089, "learning_rate": 3.896195232761016e-05, "loss": 0.8228, "step": 100 }, { "epoch": 0.3892100192678227, "grad_norm": 3.10753800197593, "learning_rate": 3.891864785251673e-05, "loss": 0.8403, "step": 101 }, { "epoch": 0.3930635838150289, "grad_norm": 2.9288118038221125, "learning_rate": 3.887448353996582e-05, "loss": 0.8621, "step": 102 }, { "epoch": 0.3969171483622351, "grad_norm": 5.08781315771577, "learning_rate": 3.88294613971902e-05, "loss": 0.9015, "step": 103 }, { "epoch": 0.4007707129094412, "grad_norm": 2.825525117803277, "learning_rate": 3.8783583470410365e-05, "loss": 0.746, "step": 104 }, { "epoch": 0.4046242774566474, "grad_norm": 3.7918665790803674, "learning_rate": 3.87368518447415e-05, "loss": 0.7484, "step": 105 }, { "epoch": 0.40847784200385356, "grad_norm": 2.600804604880562, "learning_rate": 3.8689268644098715e-05, "loss": 0.8912, "step": 106 }, { "epoch": 0.4123314065510597, "grad_norm": 3.3941246553611775, "learning_rate": 3.864083603110053e-05, "loss": 0.7785, "step": 107 }, { "epoch": 0.4161849710982659, "grad_norm": 4.397542143481628, "learning_rate": 3.8591556206970594e-05, "loss": 0.8569, "step": 108 }, { "epoch": 0.42003853564547206, "grad_norm": 3.0636662940182124, "learning_rate": 3.8541431411437616e-05, "loss": 0.8718, "step": 109 }, { "epoch": 0.4238921001926782, "grad_norm": 4.18117825185653, "learning_rate": 3.8490463922633564e-05, "loss": 0.7702, "step": 110 }, { "epoch": 0.4277456647398844, "grad_norm": 2.43373547813132, "learning_rate": 3.843865605699017e-05, "loss": 0.7936, "step": 111 }, { "epoch": 0.43159922928709055, "grad_norm": 3.943685910342036, "learning_rate": 3.8386010169133596e-05, "loss": 0.8022, "step": 112 }, { "epoch": 0.43545279383429675, "grad_norm": 3.655514814817605, "learning_rate": 3.833252865177748e-05, "loss": 0.774, "step": 113 }, { "epoch": 0.4393063583815029, "grad_norm": 3.0272004910923487, "learning_rate": 3.8278213935614126e-05, "loss": 0.7132, "step": 114 }, { "epoch": 0.44315992292870904, "grad_norm": 4.609405687465504, "learning_rate": 3.8223068489204064e-05, "loss": 0.9129, "step": 115 }, { "epoch": 0.44701348747591524, "grad_norm": 3.223236986816773, "learning_rate": 3.816709481886386e-05, "loss": 0.766, "step": 116 }, { "epoch": 0.4508670520231214, "grad_norm": 3.4852321075320933, "learning_rate": 3.81102954685522e-05, "loss": 0.7169, "step": 117 }, { "epoch": 0.45472061657032753, "grad_norm": 2.6796073181657745, "learning_rate": 3.805267301975424e-05, "loss": 0.7362, "step": 118 }, { "epoch": 0.45857418111753373, "grad_norm": 2.839001944033485, "learning_rate": 3.799423009136434e-05, "loss": 0.8818, "step": 119 }, { "epoch": 0.4624277456647399, "grad_norm": 2.328668662319875, "learning_rate": 3.793496933956699e-05, "loss": 0.7686, "step": 120 }, { "epoch": 0.4662813102119461, "grad_norm": 3.5890624141236764, "learning_rate": 3.7874893457716086e-05, "loss": 0.7887, "step": 121 }, { "epoch": 0.4701348747591522, "grad_norm": 3.2377955419056494, "learning_rate": 3.7814005176212555e-05, "loss": 0.8295, "step": 122 }, { "epoch": 0.47398843930635837, "grad_norm": 3.1936213076859197, "learning_rate": 3.775230726238023e-05, "loss": 0.7653, "step": 123 }, { "epoch": 0.47784200385356457, "grad_norm": 3.314436046941277, "learning_rate": 3.7689802520340103e-05, "loss": 0.7326, "step": 124 }, { "epoch": 0.4816955684007707, "grad_norm": 1.6311346221294762, "learning_rate": 3.7626493790882846e-05, "loss": 0.7467, "step": 125 }, { "epoch": 0.48554913294797686, "grad_norm": 3.364013140417832, "learning_rate": 3.756238395133972e-05, "loss": 0.7601, "step": 126 }, { "epoch": 0.48940269749518306, "grad_norm": 2.347603254239086, "learning_rate": 3.7497475915451806e-05, "loss": 0.823, "step": 127 }, { "epoch": 0.4932562620423892, "grad_norm": 3.847030422820461, "learning_rate": 3.743177263323758e-05, "loss": 0.7091, "step": 128 }, { "epoch": 0.49710982658959535, "grad_norm": 3.463187618636158, "learning_rate": 3.7365277090858815e-05, "loss": 0.7412, "step": 129 }, { "epoch": 0.5009633911368016, "grad_norm": 2.818735989314964, "learning_rate": 3.729799231048488e-05, "loss": 0.7571, "step": 130 }, { "epoch": 0.5048169556840078, "grad_norm": 3.5601441226127655, "learning_rate": 3.722992135015539e-05, "loss": 0.6498, "step": 131 }, { "epoch": 0.5086705202312138, "grad_norm": 3.3243689525232716, "learning_rate": 3.71610673036412e-05, "loss": 0.7063, "step": 132 }, { "epoch": 0.51252408477842, "grad_norm": 2.79620087521176, "learning_rate": 3.709143330030383e-05, "loss": 0.7519, "step": 133 }, { "epoch": 0.5163776493256262, "grad_norm": 2.6274884881066813, "learning_rate": 3.702102250495318e-05, "loss": 0.6862, "step": 134 }, { "epoch": 0.5202312138728323, "grad_norm": 1.310261156652054, "learning_rate": 3.694983811770375e-05, "loss": 0.5968, "step": 135 }, { "epoch": 0.5240847784200385, "grad_norm": 2.961796401692403, "learning_rate": 3.687788337382918e-05, "loss": 0.6842, "step": 136 }, { "epoch": 0.5279383429672447, "grad_norm": 3.715435487194137, "learning_rate": 3.6805161543615186e-05, "loss": 0.7836, "step": 137 }, { "epoch": 0.5317919075144508, "grad_norm": 1.9101920792437495, "learning_rate": 3.673167593221097e-05, "loss": 0.6828, "step": 138 }, { "epoch": 0.535645472061657, "grad_norm": 4.2524126868327885, "learning_rate": 3.665742987947895e-05, "loss": 0.7685, "step": 139 }, { "epoch": 0.5394990366088632, "grad_norm": 2.3826617460275217, "learning_rate": 3.658242675984302e-05, "loss": 0.6548, "step": 140 }, { "epoch": 0.5433526011560693, "grad_norm": 2.7362234194986823, "learning_rate": 3.6506669982135166e-05, "loss": 0.7131, "step": 141 }, { "epoch": 0.5472061657032755, "grad_norm": 2.839379036151834, "learning_rate": 3.6430162989440495e-05, "loss": 0.6846, "step": 142 }, { "epoch": 0.5510597302504817, "grad_norm": 2.217944430478911, "learning_rate": 3.635290925894083e-05, "loss": 0.7183, "step": 143 }, { "epoch": 0.5549132947976878, "grad_norm": 2.2686355763545203, "learning_rate": 3.627491230175661e-05, "loss": 0.6742, "step": 144 }, { "epoch": 0.558766859344894, "grad_norm": 2.8612073083735075, "learning_rate": 3.6196175662787326e-05, "loss": 0.6969, "step": 145 }, { "epoch": 0.5626204238921002, "grad_norm": 2.379426816508497, "learning_rate": 3.6116702920550445e-05, "loss": 0.7102, "step": 146 }, { "epoch": 0.5664739884393064, "grad_norm": 3.5635505745818774, "learning_rate": 3.6036497687018704e-05, "loss": 0.737, "step": 147 }, { "epoch": 0.5703275529865125, "grad_norm": 2.5565000760425094, "learning_rate": 3.5955563607456025e-05, "loss": 0.6599, "step": 148 }, { "epoch": 0.5741811175337187, "grad_norm": 2.7437675382633326, "learning_rate": 3.5873904360251766e-05, "loss": 0.7464, "step": 149 }, { "epoch": 0.5780346820809249, "grad_norm": 2.3771605731220986, "learning_rate": 3.579152365675359e-05, "loss": 0.6092, "step": 150 }, { "epoch": 0.581888246628131, "grad_norm": 2.393218132886115, "learning_rate": 3.570842524109878e-05, "loss": 0.711, "step": 151 }, { "epoch": 0.5857418111753372, "grad_norm": 2.4377648761435196, "learning_rate": 3.562461289004406e-05, "loss": 0.6765, "step": 152 }, { "epoch": 0.5895953757225434, "grad_norm": 1.726127421558755, "learning_rate": 3.5540090412793926e-05, "loss": 0.6122, "step": 153 }, { "epoch": 0.5934489402697495, "grad_norm": 2.2397924893238628, "learning_rate": 3.545486165082759e-05, "loss": 0.6207, "step": 154 }, { "epoch": 0.5973025048169557, "grad_norm": 2.0431721405303764, "learning_rate": 3.53689304777243e-05, "loss": 0.621, "step": 155 }, { "epoch": 0.6011560693641619, "grad_norm": 1.8373936110598785, "learning_rate": 3.528230079898734e-05, "loss": 0.6832, "step": 156 }, { "epoch": 0.605009633911368, "grad_norm": 3.218900986872675, "learning_rate": 3.5194976551866535e-05, "loss": 0.6867, "step": 157 }, { "epoch": 0.6088631984585742, "grad_norm": 2.7681277459165186, "learning_rate": 3.510696170517927e-05, "loss": 0.6388, "step": 158 }, { "epoch": 0.6127167630057804, "grad_norm": 2.789942125405049, "learning_rate": 3.5018260259130134e-05, "loss": 0.65, "step": 159 }, { "epoch": 0.6165703275529865, "grad_norm": 2.3067452480491193, "learning_rate": 3.492887624512912e-05, "loss": 0.5852, "step": 160 }, { "epoch": 0.6204238921001927, "grad_norm": 1.5893723542941685, "learning_rate": 3.483881372560837e-05, "loss": 0.6286, "step": 161 }, { "epoch": 0.6242774566473989, "grad_norm": 2.8053299102587146, "learning_rate": 3.474807679383758e-05, "loss": 0.6866, "step": 162 }, { "epoch": 0.628131021194605, "grad_norm": 1.9932305081667505, "learning_rate": 3.4656669573737934e-05, "loss": 0.6618, "step": 163 }, { "epoch": 0.6319845857418112, "grad_norm": 2.5322390617691677, "learning_rate": 3.456459621969469e-05, "loss": 0.6699, "step": 164 }, { "epoch": 0.6358381502890174, "grad_norm": 2.714324952697276, "learning_rate": 3.447186091636836e-05, "loss": 0.567, "step": 165 }, { "epoch": 0.6396917148362236, "grad_norm": 2.3260631874283795, "learning_rate": 3.437846787850454e-05, "loss": 0.6787, "step": 166 }, { "epoch": 0.6435452793834296, "grad_norm": 3.244130833526928, "learning_rate": 3.42844213507423e-05, "loss": 0.6283, "step": 167 }, { "epoch": 0.6473988439306358, "grad_norm": 2.2697249458670403, "learning_rate": 3.418972560742133e-05, "loss": 0.6043, "step": 168 }, { "epoch": 0.651252408477842, "grad_norm": 2.261002336792345, "learning_rate": 3.409438495238765e-05, "loss": 0.7039, "step": 169 }, { "epoch": 0.6551059730250481, "grad_norm": 2.9798712765043542, "learning_rate": 3.3998403718798005e-05, "loss": 0.687, "step": 170 }, { "epoch": 0.6589595375722543, "grad_norm": 1.7575260995525863, "learning_rate": 3.390178626892291e-05, "loss": 0.5704, "step": 171 }, { "epoch": 0.6628131021194605, "grad_norm": 1.959592501385033, "learning_rate": 3.38045369939484e-05, "loss": 0.6365, "step": 172 }, { "epoch": 0.6666666666666666, "grad_norm": 2.7459302559428127, "learning_rate": 3.370666031377648e-05, "loss": 0.6081, "step": 173 }, { "epoch": 0.6705202312138728, "grad_norm": 1.868927815180204, "learning_rate": 3.3608160676824216e-05, "loss": 0.5927, "step": 174 }, { "epoch": 0.674373795761079, "grad_norm": 3.4467192352082856, "learning_rate": 3.350904255982154e-05, "loss": 0.6956, "step": 175 }, { "epoch": 0.6782273603082851, "grad_norm": 1.740260469132678, "learning_rate": 3.3409310467607824e-05, "loss": 0.5574, "step": 176 }, { "epoch": 0.6820809248554913, "grad_norm": 2.055938978641809, "learning_rate": 3.330896893292714e-05, "loss": 0.5761, "step": 177 }, { "epoch": 0.6859344894026975, "grad_norm": 2.2106935577605427, "learning_rate": 3.3208022516222195e-05, "loss": 0.5955, "step": 178 }, { "epoch": 0.6897880539499036, "grad_norm": 1.8460288906944335, "learning_rate": 3.310647580542715e-05, "loss": 0.5769, "step": 179 }, { "epoch": 0.6936416184971098, "grad_norm": 2.1583501173847024, "learning_rate": 3.300433341575901e-05, "loss": 0.5949, "step": 180 }, { "epoch": 0.697495183044316, "grad_norm": 2.748430083318313, "learning_rate": 3.2901599989507935e-05, "loss": 0.5629, "step": 181 }, { "epoch": 0.7013487475915221, "grad_norm": 1.942682422315833, "learning_rate": 3.279828019582621e-05, "loss": 0.612, "step": 182 }, { "epoch": 0.7052023121387283, "grad_norm": 3.1214108190867487, "learning_rate": 3.2694378730516074e-05, "loss": 0.5638, "step": 183 }, { "epoch": 0.7090558766859345, "grad_norm": 2.5088639648861912, "learning_rate": 3.2589900315816266e-05, "loss": 0.5643, "step": 184 }, { "epoch": 0.7129094412331407, "grad_norm": 1.6797637949182582, "learning_rate": 3.24848497001874e-05, "loss": 0.5467, "step": 185 }, { "epoch": 0.7167630057803468, "grad_norm": 2.2031923107978004, "learning_rate": 3.237923165809619e-05, "loss": 0.6403, "step": 186 }, { "epoch": 0.720616570327553, "grad_norm": 2.217707978859453, "learning_rate": 3.227305098979842e-05, "loss": 0.5575, "step": 187 }, { "epoch": 0.7244701348747592, "grad_norm": 1.8030994444840251, "learning_rate": 3.2166312521120775e-05, "loss": 0.5964, "step": 188 }, { "epoch": 0.7283236994219653, "grad_norm": 2.7881810961948035, "learning_rate": 3.2059021103241556e-05, "loss": 0.5627, "step": 189 }, { "epoch": 0.7321772639691715, "grad_norm": 1.6025626542354654, "learning_rate": 3.195118161247011e-05, "loss": 0.5366, "step": 190 }, { "epoch": 0.7360308285163777, "grad_norm": 2.7898667044495196, "learning_rate": 3.184279895002533e-05, "loss": 0.6269, "step": 191 }, { "epoch": 0.7398843930635838, "grad_norm": 2.012521948946768, "learning_rate": 3.1733878041812756e-05, "loss": 0.578, "step": 192 }, { "epoch": 0.74373795761079, "grad_norm": 1.7202183467823577, "learning_rate": 3.1624423838200824e-05, "loss": 0.6021, "step": 193 }, { "epoch": 0.7475915221579962, "grad_norm": 2.391244509621585, "learning_rate": 3.151444131379579e-05, "loss": 0.6227, "step": 194 }, { "epoch": 0.7514450867052023, "grad_norm": 1.290881388687675, "learning_rate": 3.140393546721569e-05, "loss": 0.4976, "step": 195 }, { "epoch": 0.7552986512524085, "grad_norm": 2.747140614154264, "learning_rate": 3.1292911320863104e-05, "loss": 0.5795, "step": 196 }, { "epoch": 0.7591522157996147, "grad_norm": 2.1090959982249022, "learning_rate": 3.118137392069696e-05, "loss": 0.5987, "step": 197 }, { "epoch": 0.7630057803468208, "grad_norm": 1.920242941952473, "learning_rate": 3.106932833600314e-05, "loss": 0.5862, "step": 198 }, { "epoch": 0.766859344894027, "grad_norm": 2.4556835621447477, "learning_rate": 3.095677965916411e-05, "loss": 0.5432, "step": 199 }, { "epoch": 0.7707129094412332, "grad_norm": 1.7404959875815949, "learning_rate": 3.084373300542748e-05, "loss": 0.4656, "step": 200 }, { "epoch": 0.7745664739884393, "grad_norm": 2.4156874663926433, "learning_rate": 3.0730193512673515e-05, "loss": 0.6341, "step": 201 }, { "epoch": 0.7784200385356455, "grad_norm": 1.9237119705970884, "learning_rate": 3.06161663411816e-05, "loss": 0.4988, "step": 202 }, { "epoch": 0.7822736030828517, "grad_norm": 2.1640461913240245, "learning_rate": 3.0501656673395756e-05, "loss": 0.5871, "step": 203 }, { "epoch": 0.7861271676300579, "grad_norm": 2.7705536239605704, "learning_rate": 3.0386669713689057e-05, "loss": 0.6961, "step": 204 }, { "epoch": 0.789980732177264, "grad_norm": 2.361536461003985, "learning_rate": 3.0271210688127123e-05, "loss": 0.5827, "step": 205 }, { "epoch": 0.7938342967244701, "grad_norm": 1.724308147378069, "learning_rate": 3.015528484423059e-05, "loss": 0.5157, "step": 206 }, { "epoch": 0.7976878612716763, "grad_norm": 1.8279294004024718, "learning_rate": 3.0038897450736612e-05, "loss": 0.5094, "step": 207 }, { "epoch": 0.8015414258188824, "grad_norm": 1.836316615161725, "learning_rate": 2.9922053797359406e-05, "loss": 0.5172, "step": 208 }, { "epoch": 0.8053949903660886, "grad_norm": 1.6458484857811015, "learning_rate": 2.980475919454984e-05, "loss": 0.5253, "step": 209 }, { "epoch": 0.8092485549132948, "grad_norm": 2.2344584087944535, "learning_rate": 2.9687018973254055e-05, "loss": 0.5379, "step": 210 }, { "epoch": 0.8131021194605009, "grad_norm": 1.8412670292971436, "learning_rate": 2.956883848467123e-05, "loss": 0.5655, "step": 211 }, { "epoch": 0.8169556840077071, "grad_norm": 1.6198673535897559, "learning_rate": 2.945022310001032e-05, "loss": 0.4977, "step": 212 }, { "epoch": 0.8208092485549133, "grad_norm": 1.6875535729885185, "learning_rate": 2.9331178210245962e-05, "loss": 0.4592, "step": 213 }, { "epoch": 0.8246628131021194, "grad_norm": 1.5196518137741946, "learning_rate": 2.921170922587346e-05, "loss": 0.5223, "step": 214 }, { "epoch": 0.8285163776493256, "grad_norm": 1.5091415438145863, "learning_rate": 2.909182157666287e-05, "loss": 0.4706, "step": 215 }, { "epoch": 0.8323699421965318, "grad_norm": 2.1212922277105, "learning_rate": 2.897152071141225e-05, "loss": 0.4369, "step": 216 }, { "epoch": 0.8362235067437379, "grad_norm": 1.6767049591987147, "learning_rate": 2.885081209769998e-05, "loss": 0.5098, "step": 217 }, { "epoch": 0.8400770712909441, "grad_norm": 1.6518844332576945, "learning_rate": 2.8729701221636294e-05, "loss": 0.5113, "step": 218 }, { "epoch": 0.8439306358381503, "grad_norm": 2.0287593345047665, "learning_rate": 2.8608193587613917e-05, "loss": 0.5342, "step": 219 }, { "epoch": 0.8477842003853564, "grad_norm": 1.4891909790069897, "learning_rate": 2.8486294718057936e-05, "loss": 0.4461, "step": 220 }, { "epoch": 0.8516377649325626, "grad_norm": 1.52500498490846, "learning_rate": 2.8364010153174733e-05, "loss": 0.543, "step": 221 }, { "epoch": 0.8554913294797688, "grad_norm": 1.5785692053698055, "learning_rate": 2.8241345450700275e-05, "loss": 0.4329, "step": 222 }, { "epoch": 0.859344894026975, "grad_norm": 2.2507626884600187, "learning_rate": 2.8118306185647458e-05, "loss": 0.5709, "step": 223 }, { "epoch": 0.8631984585741811, "grad_norm": 1.7477813182417148, "learning_rate": 2.7994897950052764e-05, "loss": 0.5607, "step": 224 }, { "epoch": 0.8670520231213873, "grad_norm": 1.5393581266661793, "learning_rate": 2.7871126352722086e-05, "loss": 0.4671, "step": 225 }, { "epoch": 0.8709055876685935, "grad_norm": 1.5948515405066201, "learning_rate": 2.7746997018975804e-05, "loss": 0.5141, "step": 226 }, { "epoch": 0.8747591522157996, "grad_norm": 1.6725721797685633, "learning_rate": 2.7622515590393158e-05, "loss": 0.5134, "step": 227 }, { "epoch": 0.8786127167630058, "grad_norm": 1.9430866061833014, "learning_rate": 2.74976877245558e-05, "loss": 0.4567, "step": 228 }, { "epoch": 0.882466281310212, "grad_norm": 1.7951935855325456, "learning_rate": 2.737251909479068e-05, "loss": 0.5176, "step": 229 }, { "epoch": 0.8863198458574181, "grad_norm": 1.5515427698063549, "learning_rate": 2.7247015389912203e-05, "loss": 0.4987, "step": 230 }, { "epoch": 0.8901734104046243, "grad_norm": 2.423785012348166, "learning_rate": 2.7121182313963666e-05, "loss": 0.4493, "step": 231 }, { "epoch": 0.8940269749518305, "grad_norm": 1.5248030223399374, "learning_rate": 2.6995025585958026e-05, "loss": 0.5154, "step": 232 }, { "epoch": 0.8978805394990366, "grad_norm": 1.9085369464839403, "learning_rate": 2.686855093961795e-05, "loss": 0.4695, "step": 233 }, { "epoch": 0.9017341040462428, "grad_norm": 2.4921294993771657, "learning_rate": 2.674176412311527e-05, "loss": 0.5108, "step": 234 }, { "epoch": 0.905587668593449, "grad_norm": 1.016492898729909, "learning_rate": 2.6614670898809675e-05, "loss": 0.4305, "step": 235 }, { "epoch": 0.9094412331406551, "grad_norm": 1.7973135593288705, "learning_rate": 2.648727704298685e-05, "loss": 0.5085, "step": 236 }, { "epoch": 0.9132947976878613, "grad_norm": 1.298610518247185, "learning_rate": 2.6359588345595956e-05, "loss": 0.4336, "step": 237 }, { "epoch": 0.9171483622350675, "grad_norm": 1.7615802157799905, "learning_rate": 2.6231610609986442e-05, "loss": 0.4425, "step": 238 }, { "epoch": 0.9210019267822736, "grad_norm": 2.0393603639058404, "learning_rate": 2.6103349652644356e-05, "loss": 0.5329, "step": 239 }, { "epoch": 0.9248554913294798, "grad_norm": 1.1646102704038852, "learning_rate": 2.5974811302927907e-05, "loss": 0.4098, "step": 240 }, { "epoch": 0.928709055876686, "grad_norm": 1.6746235496159276, "learning_rate": 2.5846001402802594e-05, "loss": 0.4589, "step": 241 }, { "epoch": 0.9325626204238922, "grad_norm": 1.4114912644400681, "learning_rate": 2.5716925806575628e-05, "loss": 0.4426, "step": 242 }, { "epoch": 0.9364161849710982, "grad_norm": 1.5300374827507783, "learning_rate": 2.5587590380629947e-05, "loss": 0.4389, "step": 243 }, { "epoch": 0.9402697495183044, "grad_norm": 2.084072213353826, "learning_rate": 2.54580010031575e-05, "loss": 0.5058, "step": 244 }, { "epoch": 0.9441233140655106, "grad_norm": 1.3650050749477864, "learning_rate": 2.5328163563892162e-05, "loss": 0.4072, "step": 245 }, { "epoch": 0.9479768786127167, "grad_norm": 1.3576937516102847, "learning_rate": 2.5198083963841988e-05, "loss": 0.4471, "step": 246 }, { "epoch": 0.9518304431599229, "grad_norm": 1.5006654890833853, "learning_rate": 2.5067768115021077e-05, "loss": 0.4013, "step": 247 }, { "epoch": 0.9556840077071291, "grad_norm": 1.3335516949676123, "learning_rate": 2.493722194018082e-05, "loss": 0.4171, "step": 248 }, { "epoch": 0.9595375722543352, "grad_norm": 1.7188267137418751, "learning_rate": 2.4806451372540767e-05, "loss": 0.436, "step": 249 }, { "epoch": 0.9633911368015414, "grad_norm": 1.5038230569609343, "learning_rate": 2.467546235551892e-05, "loss": 0.4305, "step": 250 }, { "epoch": 0.9672447013487476, "grad_norm": 1.2898240292343814, "learning_rate": 2.4544260842461638e-05, "loss": 0.4476, "step": 251 }, { "epoch": 0.9710982658959537, "grad_norm": 1.7360498495478183, "learning_rate": 2.441285279637307e-05, "loss": 0.4324, "step": 252 }, { "epoch": 0.9749518304431599, "grad_norm": 1.8924393082812891, "learning_rate": 2.4281244189644108e-05, "loss": 0.439, "step": 253 }, { "epoch": 0.9788053949903661, "grad_norm": 1.2832735489854135, "learning_rate": 2.414944100378097e-05, "loss": 0.3963, "step": 254 }, { "epoch": 0.9826589595375722, "grad_norm": 1.9408813544452537, "learning_rate": 2.401744922913334e-05, "loss": 0.4501, "step": 255 }, { "epoch": 0.9865125240847784, "grad_norm": 1.2867999932148788, "learning_rate": 2.388527486462212e-05, "loss": 0.3981, "step": 256 }, { "epoch": 0.9903660886319846, "grad_norm": 1.0196210656417135, "learning_rate": 2.3752923917466763e-05, "loss": 0.4396, "step": 257 }, { "epoch": 0.9942196531791907, "grad_norm": 1.2740978791601063, "learning_rate": 2.362040240291227e-05, "loss": 0.3905, "step": 258 }, { "epoch": 0.9980732177263969, "grad_norm": 1.050993597160996, "learning_rate": 2.34877163439558e-05, "loss": 0.3971, "step": 259 }, { "epoch": 0.9980732177263969, "eval_loss": 0.4258769750595093, "eval_runtime": 151.7734, "eval_samples_per_second": 13.164, "eval_steps_per_second": 0.415, "step": 259 }, { "epoch": 1.001926782273603, "grad_norm": 1.3018898740308236, "learning_rate": 2.3354871771072906e-05, "loss": 0.3663, "step": 260 }, { "epoch": 1.0057803468208093, "grad_norm": 1.768289755543873, "learning_rate": 2.3221874721943495e-05, "loss": 0.3706, "step": 261 }, { "epoch": 1.0096339113680155, "grad_norm": 1.3709089238690337, "learning_rate": 2.3088731241177378e-05, "loss": 0.4303, "step": 262 }, { "epoch": 1.0134874759152215, "grad_norm": 1.2747609251876415, "learning_rate": 2.2955447380039576e-05, "loss": 0.3878, "step": 263 }, { "epoch": 1.0173410404624277, "grad_norm": 1.833982622948961, "learning_rate": 2.282202919617529e-05, "loss": 0.4726, "step": 264 }, { "epoch": 1.0211946050096339, "grad_norm": 1.0257892835642075, "learning_rate": 2.2688482753334568e-05, "loss": 0.4311, "step": 265 }, { "epoch": 1.02504816955684, "grad_norm": 1.211081382753947, "learning_rate": 2.2554814121096748e-05, "loss": 0.3797, "step": 266 }, { "epoch": 1.0289017341040463, "grad_norm": 1.603141836779412, "learning_rate": 2.242102937459456e-05, "loss": 0.4334, "step": 267 }, { "epoch": 1.0327552986512525, "grad_norm": 0.9082356976857431, "learning_rate": 2.228713459423804e-05, "loss": 0.3477, "step": 268 }, { "epoch": 1.0366088631984587, "grad_norm": 1.4556959201763473, "learning_rate": 2.215313586543818e-05, "loss": 0.4605, "step": 269 }, { "epoch": 1.0404624277456647, "grad_norm": 1.157918758481043, "learning_rate": 2.2019039278330324e-05, "loss": 0.3749, "step": 270 }, { "epoch": 1.0443159922928709, "grad_norm": 1.2365434381583742, "learning_rate": 2.188485092749744e-05, "loss": 0.3438, "step": 271 }, { "epoch": 1.048169556840077, "grad_norm": 1.3583710499888804, "learning_rate": 2.1750576911693043e-05, "loss": 0.4338, "step": 272 }, { "epoch": 1.0520231213872833, "grad_norm": 1.373322024670892, "learning_rate": 2.161622333356408e-05, "loss": 0.4425, "step": 273 }, { "epoch": 1.0558766859344895, "grad_norm": 1.5222033465026896, "learning_rate": 2.148179629937352e-05, "loss": 0.3746, "step": 274 }, { "epoch": 1.0597302504816957, "grad_norm": 0.9301197570829759, "learning_rate": 2.134730191872288e-05, "loss": 0.3603, "step": 275 }, { "epoch": 1.0635838150289016, "grad_norm": 1.3349261365465133, "learning_rate": 2.1212746304274482e-05, "loss": 0.3896, "step": 276 }, { "epoch": 1.0674373795761078, "grad_norm": 1.6542379771126683, "learning_rate": 2.1078135571473712e-05, "loss": 0.4346, "step": 277 }, { "epoch": 1.071290944123314, "grad_norm": 1.2625424774815728, "learning_rate": 2.094347583827102e-05, "loss": 0.4369, "step": 278 }, { "epoch": 1.0751445086705202, "grad_norm": 1.242814350522191, "learning_rate": 2.0808773224843882e-05, "loss": 0.4306, "step": 279 }, { "epoch": 1.0789980732177264, "grad_norm": 1.441960169277262, "learning_rate": 2.0674033853318666e-05, "loss": 0.3905, "step": 280 }, { "epoch": 1.0828516377649327, "grad_norm": 1.176801789036799, "learning_rate": 2.0539263847492355e-05, "loss": 0.3745, "step": 281 }, { "epoch": 1.0867052023121386, "grad_norm": 1.0478011807250012, "learning_rate": 2.040446933255423e-05, "loss": 0.4304, "step": 282 }, { "epoch": 1.0905587668593448, "grad_norm": 0.991803808778807, "learning_rate": 2.0269656434807504e-05, "loss": 0.3328, "step": 283 }, { "epoch": 1.094412331406551, "grad_norm": 1.536330372789938, "learning_rate": 2.013483128139086e-05, "loss": 0.391, "step": 284 }, { "epoch": 1.0982658959537572, "grad_norm": 1.1995638320125537, "learning_rate": 2e-05, "loss": 0.4014, "step": 285 }, { "epoch": 1.1021194605009634, "grad_norm": 1.0998808179080417, "learning_rate": 1.9865168718609142e-05, "loss": 0.3467, "step": 286 }, { "epoch": 1.1059730250481696, "grad_norm": 1.1533276354863014, "learning_rate": 1.9730343565192506e-05, "loss": 0.3654, "step": 287 }, { "epoch": 1.1098265895953756, "grad_norm": 1.395165430926343, "learning_rate": 1.9595530667445775e-05, "loss": 0.3897, "step": 288 }, { "epoch": 1.1136801541425818, "grad_norm": 1.3829869394831174, "learning_rate": 1.946073615250765e-05, "loss": 0.3664, "step": 289 }, { "epoch": 1.117533718689788, "grad_norm": 1.6694598636029911, "learning_rate": 1.9325966146681337e-05, "loss": 0.3937, "step": 290 }, { "epoch": 1.1213872832369942, "grad_norm": 1.3093898924216434, "learning_rate": 1.919122677515612e-05, "loss": 0.4475, "step": 291 }, { "epoch": 1.1252408477842004, "grad_norm": 1.2830702205218405, "learning_rate": 1.905652416172899e-05, "loss": 0.4055, "step": 292 }, { "epoch": 1.1290944123314066, "grad_norm": 1.422016127363369, "learning_rate": 1.8921864428526295e-05, "loss": 0.4306, "step": 293 }, { "epoch": 1.1329479768786128, "grad_norm": 1.0443832316958135, "learning_rate": 1.8787253695725524e-05, "loss": 0.306, "step": 294 }, { "epoch": 1.1368015414258188, "grad_norm": 1.0396170075258768, "learning_rate": 1.8652698081277127e-05, "loss": 0.3808, "step": 295 }, { "epoch": 1.140655105973025, "grad_norm": 1.4482799449738568, "learning_rate": 1.851820370062648e-05, "loss": 0.3766, "step": 296 }, { "epoch": 1.1445086705202312, "grad_norm": 1.4383150667662055, "learning_rate": 1.8383776666435927e-05, "loss": 0.3807, "step": 297 }, { "epoch": 1.1483622350674374, "grad_norm": 1.052851787398113, "learning_rate": 1.824942308830696e-05, "loss": 0.3193, "step": 298 }, { "epoch": 1.1522157996146436, "grad_norm": 1.1901475083434114, "learning_rate": 1.8115149072502564e-05, "loss": 0.3644, "step": 299 }, { "epoch": 1.1560693641618498, "grad_norm": 1.0719741190000858, "learning_rate": 1.798096072166968e-05, "loss": 0.3265, "step": 300 }, { "epoch": 1.1599229287090558, "grad_norm": 0.9399484435440966, "learning_rate": 1.7846864134561828e-05, "loss": 0.3013, "step": 301 }, { "epoch": 1.163776493256262, "grad_norm": 0.8649780684652991, "learning_rate": 1.7712865405761967e-05, "loss": 0.331, "step": 302 }, { "epoch": 1.1676300578034682, "grad_norm": 1.093013914822741, "learning_rate": 1.757897062540545e-05, "loss": 0.3672, "step": 303 }, { "epoch": 1.1714836223506744, "grad_norm": 1.3242259881261507, "learning_rate": 1.7445185878903252e-05, "loss": 0.3461, "step": 304 }, { "epoch": 1.1753371868978806, "grad_norm": 1.5294924031862211, "learning_rate": 1.7311517246665435e-05, "loss": 0.3381, "step": 305 }, { "epoch": 1.1791907514450868, "grad_norm": 1.4097073214129088, "learning_rate": 1.7177970803824714e-05, "loss": 0.4501, "step": 306 }, { "epoch": 1.183044315992293, "grad_norm": 1.1405636655876248, "learning_rate": 1.7044552619960434e-05, "loss": 0.322, "step": 307 }, { "epoch": 1.186897880539499, "grad_norm": 1.172637190005047, "learning_rate": 1.691126875882263e-05, "loss": 0.3453, "step": 308 }, { "epoch": 1.1907514450867052, "grad_norm": 1.0104423854938296, "learning_rate": 1.677812527805651e-05, "loss": 0.3248, "step": 309 }, { "epoch": 1.1946050096339114, "grad_norm": 0.9585927088271504, "learning_rate": 1.6645128228927104e-05, "loss": 0.3153, "step": 310 }, { "epoch": 1.1984585741811176, "grad_norm": 0.7642180964221266, "learning_rate": 1.6512283656044207e-05, "loss": 0.3576, "step": 311 }, { "epoch": 1.2023121387283238, "grad_norm": 1.064288446414666, "learning_rate": 1.637959759708774e-05, "loss": 0.3842, "step": 312 }, { "epoch": 1.2061657032755297, "grad_norm": 1.1471701151580358, "learning_rate": 1.6247076082533244e-05, "loss": 0.3682, "step": 313 }, { "epoch": 1.210019267822736, "grad_norm": 0.7613384576508663, "learning_rate": 1.6114725135377883e-05, "loss": 0.3671, "step": 314 }, { "epoch": 1.2138728323699421, "grad_norm": 1.1314879376679692, "learning_rate": 1.5982550770866665e-05, "loss": 0.3509, "step": 315 }, { "epoch": 1.2177263969171483, "grad_norm": 1.0721877801971562, "learning_rate": 1.585055899621904e-05, "loss": 0.388, "step": 316 }, { "epoch": 1.2215799614643545, "grad_norm": 1.261955038248264, "learning_rate": 1.5718755810355895e-05, "loss": 0.3453, "step": 317 }, { "epoch": 1.2254335260115607, "grad_norm": 1.2728238431643502, "learning_rate": 1.5587147203626934e-05, "loss": 0.3638, "step": 318 }, { "epoch": 1.229287090558767, "grad_norm": 1.3228194347927702, "learning_rate": 1.5455739157538362e-05, "loss": 0.3922, "step": 319 }, { "epoch": 1.2331406551059731, "grad_norm": 0.759365184137883, "learning_rate": 1.532453764448109e-05, "loss": 0.3769, "step": 320 }, { "epoch": 1.2369942196531791, "grad_norm": 1.0387794799239043, "learning_rate": 1.5193548627459238e-05, "loss": 0.3487, "step": 321 }, { "epoch": 1.2408477842003853, "grad_norm": 1.653174900496495, "learning_rate": 1.5062778059819184e-05, "loss": 0.3325, "step": 322 }, { "epoch": 1.2447013487475915, "grad_norm": 0.9071493490028995, "learning_rate": 1.493223188497893e-05, "loss": 0.3099, "step": 323 }, { "epoch": 1.2485549132947977, "grad_norm": 0.909128453253325, "learning_rate": 1.4801916036158017e-05, "loss": 0.3484, "step": 324 }, { "epoch": 1.252408477842004, "grad_norm": 0.9104702137645303, "learning_rate": 1.4671836436107851e-05, "loss": 0.3151, "step": 325 }, { "epoch": 1.25626204238921, "grad_norm": 1.0657384235342173, "learning_rate": 1.4541998996842503e-05, "loss": 0.3328, "step": 326 }, { "epoch": 1.260115606936416, "grad_norm": 1.0115714194064442, "learning_rate": 1.4412409619370058e-05, "loss": 0.3114, "step": 327 }, { "epoch": 1.2639691714836223, "grad_norm": 0.9001086745125582, "learning_rate": 1.4283074193424379e-05, "loss": 0.3188, "step": 328 }, { "epoch": 1.2678227360308285, "grad_norm": 1.2276345166542122, "learning_rate": 1.4153998597197417e-05, "loss": 0.3498, "step": 329 }, { "epoch": 1.2716763005780347, "grad_norm": 0.7533021116197035, "learning_rate": 1.4025188697072098e-05, "loss": 0.3418, "step": 330 }, { "epoch": 1.275529865125241, "grad_norm": 0.7620164921410112, "learning_rate": 1.3896650347355652e-05, "loss": 0.2843, "step": 331 }, { "epoch": 1.2793834296724471, "grad_norm": 1.085725296514296, "learning_rate": 1.3768389390013558e-05, "loss": 0.3698, "step": 332 }, { "epoch": 1.2832369942196533, "grad_norm": 0.7656462946423266, "learning_rate": 1.3640411654404058e-05, "loss": 0.3526, "step": 333 }, { "epoch": 1.2870905587668593, "grad_norm": 0.8688261241761966, "learning_rate": 1.3512722957013157e-05, "loss": 0.3287, "step": 334 }, { "epoch": 1.2909441233140655, "grad_norm": 0.9102316741848134, "learning_rate": 1.3385329101190338e-05, "loss": 0.3328, "step": 335 }, { "epoch": 1.2947976878612717, "grad_norm": 0.8532527486919501, "learning_rate": 1.3258235876884735e-05, "loss": 0.3231, "step": 336 }, { "epoch": 1.298651252408478, "grad_norm": 0.6725300585239861, "learning_rate": 1.3131449060382053e-05, "loss": 0.2852, "step": 337 }, { "epoch": 1.3025048169556839, "grad_norm": 1.0698268373783526, "learning_rate": 1.3004974414041987e-05, "loss": 0.3087, "step": 338 }, { "epoch": 1.30635838150289, "grad_norm": 1.1722160127956573, "learning_rate": 1.287881768603634e-05, "loss": 0.3365, "step": 339 }, { "epoch": 1.3102119460500963, "grad_norm": 0.6736626507697272, "learning_rate": 1.27529846100878e-05, "loss": 0.2999, "step": 340 }, { "epoch": 1.3140655105973025, "grad_norm": 1.1063913885843315, "learning_rate": 1.2627480905209328e-05, "loss": 0.2894, "step": 341 }, { "epoch": 1.3179190751445087, "grad_norm": 1.109602932151094, "learning_rate": 1.2502312275444205e-05, "loss": 0.3495, "step": 342 }, { "epoch": 1.3217726396917149, "grad_norm": 1.1957687821810759, "learning_rate": 1.2377484409606848e-05, "loss": 0.3601, "step": 343 }, { "epoch": 1.325626204238921, "grad_norm": 1.128237352664405, "learning_rate": 1.22530029810242e-05, "loss": 0.3514, "step": 344 }, { "epoch": 1.3294797687861273, "grad_norm": 1.415323100911558, "learning_rate": 1.2128873647277919e-05, "loss": 0.3287, "step": 345 }, { "epoch": 1.3333333333333333, "grad_norm": 1.236826446216501, "learning_rate": 1.200510204994724e-05, "loss": 0.3062, "step": 346 }, { "epoch": 1.3371868978805395, "grad_norm": 1.0869697615225693, "learning_rate": 1.1881693814352543e-05, "loss": 0.3372, "step": 347 }, { "epoch": 1.3410404624277457, "grad_norm": 0.8579173376211442, "learning_rate": 1.1758654549299735e-05, "loss": 0.3162, "step": 348 }, { "epoch": 1.3448940269749519, "grad_norm": 1.1076519772349536, "learning_rate": 1.1635989846825275e-05, "loss": 0.3595, "step": 349 }, { "epoch": 1.348747591522158, "grad_norm": 1.0968809462254943, "learning_rate": 1.1513705281942072e-05, "loss": 0.3138, "step": 350 }, { "epoch": 1.352601156069364, "grad_norm": 0.8097631352039958, "learning_rate": 1.1391806412386086e-05, "loss": 0.3064, "step": 351 }, { "epoch": 1.3564547206165702, "grad_norm": 0.9302059439245486, "learning_rate": 1.127029877836371e-05, "loss": 0.3361, "step": 352 }, { "epoch": 1.3603082851637764, "grad_norm": 1.1085708625474628, "learning_rate": 1.1149187902300032e-05, "loss": 0.2918, "step": 353 }, { "epoch": 1.3641618497109826, "grad_norm": 1.2982099303179524, "learning_rate": 1.102847928858776e-05, "loss": 0.3243, "step": 354 }, { "epoch": 1.3680154142581888, "grad_norm": 0.8325178281772523, "learning_rate": 1.0908178423337135e-05, "loss": 0.3042, "step": 355 }, { "epoch": 1.371868978805395, "grad_norm": 1.0264691032333937, "learning_rate": 1.0788290774126549e-05, "loss": 0.3674, "step": 356 }, { "epoch": 1.3757225433526012, "grad_norm": 1.5611140030270088, "learning_rate": 1.0668821789754041e-05, "loss": 0.3395, "step": 357 }, { "epoch": 1.3795761078998074, "grad_norm": 0.6905356535042156, "learning_rate": 1.0549776899989686e-05, "loss": 0.326, "step": 358 }, { "epoch": 1.3834296724470134, "grad_norm": 0.813432697566474, "learning_rate": 1.043116151532877e-05, "loss": 0.3308, "step": 359 }, { "epoch": 1.3872832369942196, "grad_norm": 0.947125986929793, "learning_rate": 1.0312981026745952e-05, "loss": 0.3079, "step": 360 }, { "epoch": 1.3911368015414258, "grad_norm": 0.9541134784519973, "learning_rate": 1.019524080545017e-05, "loss": 0.3527, "step": 361 }, { "epoch": 1.394990366088632, "grad_norm": 0.650621794786191, "learning_rate": 1.0077946202640603e-05, "loss": 0.2879, "step": 362 }, { "epoch": 1.3988439306358382, "grad_norm": 0.7999542405054426, "learning_rate": 9.961102549263393e-06, "loss": 0.293, "step": 363 }, { "epoch": 1.4026974951830442, "grad_norm": 1.0074632544627469, "learning_rate": 9.844715155769418e-06, "loss": 0.3679, "step": 364 }, { "epoch": 1.4065510597302504, "grad_norm": 0.7914057577416254, "learning_rate": 9.72878931187288e-06, "loss": 0.2646, "step": 365 }, { "epoch": 1.4104046242774566, "grad_norm": 0.8463237752662545, "learning_rate": 9.613330286310952e-06, "loss": 0.2932, "step": 366 }, { "epoch": 1.4142581888246628, "grad_norm": 0.932880991782926, "learning_rate": 9.498343326604249e-06, "loss": 0.3013, "step": 367 }, { "epoch": 1.418111753371869, "grad_norm": 0.9821817374697017, "learning_rate": 9.3838336588184e-06, "loss": 0.3028, "step": 368 }, { "epoch": 1.4219653179190752, "grad_norm": 0.8469145630358891, "learning_rate": 9.269806487326491e-06, "loss": 0.3345, "step": 369 }, { "epoch": 1.4258188824662814, "grad_norm": 1.037169772414413, "learning_rate": 9.156266994572518e-06, "loss": 0.355, "step": 370 }, { "epoch": 1.4296724470134876, "grad_norm": 0.7213160294660059, "learning_rate": 9.043220340835895e-06, "loss": 0.2769, "step": 371 }, { "epoch": 1.4335260115606936, "grad_norm": 0.8831900688609602, "learning_rate": 8.930671663996864e-06, "loss": 0.2811, "step": 372 }, { "epoch": 1.4373795761078998, "grad_norm": 0.9088605343264572, "learning_rate": 8.818626079303038e-06, "loss": 0.3326, "step": 373 }, { "epoch": 1.441233140655106, "grad_norm": 1.0151812161616465, "learning_rate": 8.707088679136898e-06, "loss": 0.3964, "step": 374 }, { "epoch": 1.4450867052023122, "grad_norm": 0.721796968080708, "learning_rate": 8.59606453278432e-06, "loss": 0.2616, "step": 375 }, { "epoch": 1.4489402697495182, "grad_norm": 0.7436852548835157, "learning_rate": 8.485558686204215e-06, "loss": 0.3289, "step": 376 }, { "epoch": 1.4527938342967244, "grad_norm": 0.9085710653664684, "learning_rate": 8.37557616179918e-06, "loss": 0.3263, "step": 377 }, { "epoch": 1.4566473988439306, "grad_norm": 0.7242098860139589, "learning_rate": 8.266121958187246e-06, "loss": 0.3063, "step": 378 }, { "epoch": 1.4605009633911368, "grad_norm": 0.8037417491106659, "learning_rate": 8.15720104997468e-06, "loss": 0.2835, "step": 379 }, { "epoch": 1.464354527938343, "grad_norm": 0.6693841689504254, "learning_rate": 8.048818387529888e-06, "loss": 0.3096, "step": 380 }, { "epoch": 1.4682080924855492, "grad_norm": 0.6386913396187637, "learning_rate": 7.940978896758449e-06, "loss": 0.2871, "step": 381 }, { "epoch": 1.4720616570327554, "grad_norm": 0.7334763653626704, "learning_rate": 7.833687478879228e-06, "loss": 0.32, "step": 382 }, { "epoch": 1.4759152215799616, "grad_norm": 0.7171482705229991, "learning_rate": 7.726949010201585e-06, "loss": 0.3153, "step": 383 }, { "epoch": 1.4797687861271676, "grad_norm": 0.786019322707822, "learning_rate": 7.620768341903817e-06, "loss": 0.2843, "step": 384 }, { "epoch": 1.4836223506743738, "grad_norm": 0.9217824425126812, "learning_rate": 7.5151502998126035e-06, "loss": 0.342, "step": 385 }, { "epoch": 1.48747591522158, "grad_norm": 0.9974009492954946, "learning_rate": 7.410099684183738e-06, "loss": 0.2469, "step": 386 }, { "epoch": 1.4913294797687862, "grad_norm": 0.7613952875838962, "learning_rate": 7.305621269483927e-06, "loss": 0.311, "step": 387 }, { "epoch": 1.4951830443159924, "grad_norm": 0.8389199365953313, "learning_rate": 7.201719804173797e-06, "loss": 0.321, "step": 388 }, { "epoch": 1.4990366088631983, "grad_norm": 0.8436343238052573, "learning_rate": 7.098400010492079e-06, "loss": 0.3375, "step": 389 }, { "epoch": 1.5028901734104045, "grad_norm": 0.7057291557928584, "learning_rate": 6.995666584240998e-06, "loss": 0.2924, "step": 390 }, { "epoch": 1.5067437379576107, "grad_norm": 0.7636872711492277, "learning_rate": 6.893524194572856e-06, "loss": 0.293, "step": 391 }, { "epoch": 1.510597302504817, "grad_norm": 0.926804984511261, "learning_rate": 6.791977483777808e-06, "loss": 0.304, "step": 392 }, { "epoch": 1.5144508670520231, "grad_norm": 0.8113244891141366, "learning_rate": 6.691031067072866e-06, "loss": 0.3202, "step": 393 }, { "epoch": 1.5183044315992293, "grad_norm": 0.6667861770618242, "learning_rate": 6.5906895323921805e-06, "loss": 0.2816, "step": 394 }, { "epoch": 1.5221579961464355, "grad_norm": 0.8795394551278755, "learning_rate": 6.490957440178467e-06, "loss": 0.2892, "step": 395 }, { "epoch": 1.5260115606936417, "grad_norm": 0.9108462944261728, "learning_rate": 6.391839323175788e-06, "loss": 0.3503, "step": 396 }, { "epoch": 1.529865125240848, "grad_norm": 0.8980380594575577, "learning_rate": 6.293339686223521e-06, "loss": 0.3322, "step": 397 }, { "epoch": 1.533718689788054, "grad_norm": 0.6868397537542229, "learning_rate": 6.1954630060516005e-06, "loss": 0.2826, "step": 398 }, { "epoch": 1.5375722543352601, "grad_norm": 0.7791737364861953, "learning_rate": 6.098213731077101e-06, "loss": 0.3134, "step": 399 }, { "epoch": 1.5414258188824663, "grad_norm": 0.6867571528771647, "learning_rate": 6.001596281201998e-06, "loss": 0.2865, "step": 400 }, { "epoch": 1.5452793834296723, "grad_norm": 0.9804075340302699, "learning_rate": 5.905615047612352e-06, "loss": 0.3528, "step": 401 }, { "epoch": 1.5491329479768785, "grad_norm": 0.6720710667167724, "learning_rate": 5.810274392578672e-06, "loss": 0.2754, "step": 402 }, { "epoch": 1.5529865125240847, "grad_norm": 0.6802925027816661, "learning_rate": 5.715578649257709e-06, "loss": 0.2516, "step": 403 }, { "epoch": 1.556840077071291, "grad_norm": 0.7494941138748478, "learning_rate": 5.621532121495468e-06, "loss": 0.2928, "step": 404 }, { "epoch": 1.560693641618497, "grad_norm": 0.5855579359311635, "learning_rate": 5.528139083631641e-06, "loss": 0.2755, "step": 405 }, { "epoch": 1.5645472061657033, "grad_norm": 0.8765684000753372, "learning_rate": 5.4354037803053124e-06, "loss": 0.2721, "step": 406 }, { "epoch": 1.5684007707129095, "grad_norm": 0.8693102037593315, "learning_rate": 5.343330426262075e-06, "loss": 0.2847, "step": 407 }, { "epoch": 1.5722543352601157, "grad_norm": 1.0221938673812996, "learning_rate": 5.2519232061624255e-06, "loss": 0.3375, "step": 408 }, { "epoch": 1.576107899807322, "grad_norm": 0.6916381894839094, "learning_rate": 5.161186274391632e-06, "loss": 0.2952, "step": 409 }, { "epoch": 1.579961464354528, "grad_norm": 0.6607350250192912, "learning_rate": 5.071123754870888e-06, "loss": 0.2892, "step": 410 }, { "epoch": 1.583815028901734, "grad_norm": 0.709997113458217, "learning_rate": 4.981739740869866e-06, "loss": 0.3006, "step": 411 }, { "epoch": 1.5876685934489403, "grad_norm": 0.8317299871071954, "learning_rate": 4.893038294820736e-06, "loss": 0.2681, "step": 412 }, { "epoch": 1.5915221579961463, "grad_norm": 0.6514943944174995, "learning_rate": 4.805023448133468e-06, "loss": 0.3109, "step": 413 }, { "epoch": 1.5953757225433525, "grad_norm": 0.7060506760393167, "learning_rate": 4.717699201012658e-06, "loss": 0.2561, "step": 414 }, { "epoch": 1.5992292870905587, "grad_norm": 0.6560682680927723, "learning_rate": 4.6310695222757065e-06, "loss": 0.351, "step": 415 }, { "epoch": 1.6030828516377649, "grad_norm": 0.7519900581847846, "learning_rate": 4.545138349172418e-06, "loss": 0.2967, "step": 416 }, { "epoch": 1.606936416184971, "grad_norm": 0.7734834025884938, "learning_rate": 4.459909587206082e-06, "loss": 0.2859, "step": 417 }, { "epoch": 1.6107899807321773, "grad_norm": 0.6021137879538668, "learning_rate": 4.375387109955953e-06, "loss": 0.2766, "step": 418 }, { "epoch": 1.6146435452793835, "grad_norm": 0.617528729879879, "learning_rate": 4.291574758901224e-06, "loss": 0.2882, "step": 419 }, { "epoch": 1.6184971098265897, "grad_norm": 0.6024787320260311, "learning_rate": 4.208476343246417e-06, "loss": 0.3055, "step": 420 }, { "epoch": 1.6223506743737959, "grad_norm": 0.7809194181983308, "learning_rate": 4.12609563974824e-06, "loss": 0.3413, "step": 421 }, { "epoch": 1.626204238921002, "grad_norm": 0.7428919044347968, "learning_rate": 4.0444363925439845e-06, "loss": 0.2822, "step": 422 }, { "epoch": 1.630057803468208, "grad_norm": 0.8313087879713988, "learning_rate": 3.963502312981298e-06, "loss": 0.3037, "step": 423 }, { "epoch": 1.6339113680154143, "grad_norm": 0.7485349105988456, "learning_rate": 3.883297079449559e-06, "loss": 0.2736, "step": 424 }, { "epoch": 1.6377649325626205, "grad_norm": 0.6278539139461561, "learning_rate": 3.803824337212678e-06, "loss": 0.2834, "step": 425 }, { "epoch": 1.6416184971098264, "grad_norm": 0.6237587967099586, "learning_rate": 3.7250876982433947e-06, "loss": 0.275, "step": 426 }, { "epoch": 1.6454720616570326, "grad_norm": 0.6345631909381717, "learning_rate": 3.6470907410591695e-06, "loss": 0.2961, "step": 427 }, { "epoch": 1.6493256262042388, "grad_norm": 0.6867481603368362, "learning_rate": 3.569837010559505e-06, "loss": 0.3018, "step": 428 }, { "epoch": 1.653179190751445, "grad_norm": 0.5911022328472578, "learning_rate": 3.4933300178648423e-06, "loss": 0.257, "step": 429 }, { "epoch": 1.6570327552986512, "grad_norm": 0.6804693404362794, "learning_rate": 3.417573240156984e-06, "loss": 0.296, "step": 430 }, { "epoch": 1.6608863198458574, "grad_norm": 0.72735026642116, "learning_rate": 3.3425701205210557e-06, "loss": 0.3169, "step": 431 }, { "epoch": 1.6647398843930636, "grad_norm": 0.6635536624459238, "learning_rate": 3.2683240677890373e-06, "loss": 0.2652, "step": 432 }, { "epoch": 1.6685934489402698, "grad_norm": 0.607762952833936, "learning_rate": 3.194838456384819e-06, "loss": 0.2476, "step": 433 }, { "epoch": 1.672447013487476, "grad_norm": 0.5435405578589255, "learning_rate": 3.122116626170826e-06, "loss": 0.2365, "step": 434 }, { "epoch": 1.6763005780346822, "grad_norm": 0.6817577487553824, "learning_rate": 3.0501618822962566e-06, "loss": 0.3021, "step": 435 }, { "epoch": 1.6801541425818882, "grad_norm": 0.7502351560290864, "learning_rate": 2.9789774950468265e-06, "loss": 0.3289, "step": 436 }, { "epoch": 1.6840077071290944, "grad_norm": 0.6694176059458067, "learning_rate": 2.908566699696174e-06, "loss": 0.2915, "step": 437 }, { "epoch": 1.6878612716763006, "grad_norm": 0.6629312059099088, "learning_rate": 2.838932696358798e-06, "loss": 0.3103, "step": 438 }, { "epoch": 1.6917148362235066, "grad_norm": 0.8349480949645633, "learning_rate": 2.77007864984461e-06, "loss": 0.2938, "step": 439 }, { "epoch": 1.6955684007707128, "grad_norm": 0.7489107344916376, "learning_rate": 2.7020076895151226e-06, "loss": 0.3284, "step": 440 }, { "epoch": 1.699421965317919, "grad_norm": 0.6851569068577846, "learning_rate": 2.6347229091411876e-06, "loss": 0.3074, "step": 441 }, { "epoch": 1.7032755298651252, "grad_norm": 0.4757754476528696, "learning_rate": 2.5682273667624235e-06, "loss": 0.2425, "step": 442 }, { "epoch": 1.7071290944123314, "grad_norm": 0.6049269606790216, "learning_rate": 2.5025240845481945e-06, "loss": 0.2709, "step": 443 }, { "epoch": 1.7109826589595376, "grad_norm": 0.6082751240069765, "learning_rate": 2.4376160486602875e-06, "loss": 0.3106, "step": 444 }, { "epoch": 1.7148362235067438, "grad_norm": 0.6511104858946544, "learning_rate": 2.37350620911716e-06, "loss": 0.2983, "step": 445 }, { "epoch": 1.71868978805395, "grad_norm": 0.6008366910511428, "learning_rate": 2.3101974796599015e-06, "loss": 0.2836, "step": 446 }, { "epoch": 1.7225433526011562, "grad_norm": 0.6288330836855687, "learning_rate": 2.247692737619769e-06, "loss": 0.2594, "step": 447 }, { "epoch": 1.7263969171483622, "grad_norm": 0.6073567738874573, "learning_rate": 2.1859948237874517e-06, "loss": 0.2655, "step": 448 }, { "epoch": 1.7302504816955684, "grad_norm": 0.5540646853483305, "learning_rate": 2.1251065422839212e-06, "loss": 0.2721, "step": 449 }, { "epoch": 1.7341040462427746, "grad_norm": 0.6179650224299514, "learning_rate": 2.0650306604330163e-06, "loss": 0.2871, "step": 450 }, { "epoch": 1.7379576107899806, "grad_norm": 0.784590739172643, "learning_rate": 2.005769908635662e-06, "loss": 0.3636, "step": 451 }, { "epoch": 1.7418111753371868, "grad_norm": 0.48504899559732223, "learning_rate": 1.947326980245763e-06, "loss": 0.2493, "step": 452 }, { "epoch": 1.745664739884393, "grad_norm": 0.8987552328931523, "learning_rate": 1.889704531447809e-06, "loss": 0.2573, "step": 453 }, { "epoch": 1.7495183044315992, "grad_norm": 0.5767751215303322, "learning_rate": 1.832905181136142e-06, "loss": 0.2385, "step": 454 }, { "epoch": 1.7533718689788054, "grad_norm": 0.6027911760471953, "learning_rate": 1.7769315107959385e-06, "loss": 0.2577, "step": 455 }, { "epoch": 1.7572254335260116, "grad_norm": 0.5310368311114125, "learning_rate": 1.7217860643858797e-06, "loss": 0.2632, "step": 456 }, { "epoch": 1.7610789980732178, "grad_norm": 0.907578227845979, "learning_rate": 1.6674713482225246e-06, "loss": 0.2815, "step": 457 }, { "epoch": 1.764932562620424, "grad_norm": 0.6172664735840684, "learning_rate": 1.6139898308664093e-06, "loss": 0.2779, "step": 458 }, { "epoch": 1.7687861271676302, "grad_norm": 0.6470799229179937, "learning_rate": 1.5613439430098388e-06, "loss": 0.3075, "step": 459 }, { "epoch": 1.7726396917148364, "grad_norm": 0.6173914059389243, "learning_rate": 1.5095360773664402e-06, "loss": 0.2604, "step": 460 }, { "epoch": 1.7764932562620424, "grad_norm": 0.6555070912013644, "learning_rate": 1.4585685885623901e-06, "loss": 0.3061, "step": 461 }, { "epoch": 1.7803468208092486, "grad_norm": 0.5423585126994777, "learning_rate": 1.4084437930294059e-06, "loss": 0.2416, "step": 462 }, { "epoch": 1.7842003853564548, "grad_norm": 0.5128625019538622, "learning_rate": 1.359163968899473e-06, "loss": 0.2296, "step": 463 }, { "epoch": 1.7880539499036607, "grad_norm": 0.8828336205245328, "learning_rate": 1.3107313559012936e-06, "loss": 0.2975, "step": 464 }, { "epoch": 1.791907514450867, "grad_norm": 0.6056697475279614, "learning_rate": 1.2631481552585067e-06, "loss": 0.2689, "step": 465 }, { "epoch": 1.7957610789980731, "grad_norm": 0.7746740424893049, "learning_rate": 1.2164165295896392e-06, "loss": 0.2669, "step": 466 }, { "epoch": 1.7996146435452793, "grad_norm": 0.6006232506779448, "learning_rate": 1.1705386028098009e-06, "loss": 0.3039, "step": 467 }, { "epoch": 1.8034682080924855, "grad_norm": 0.5403526649265826, "learning_rate": 1.1255164600341816e-06, "loss": 0.2754, "step": 468 }, { "epoch": 1.8073217726396917, "grad_norm": 0.5862736861808353, "learning_rate": 1.08135214748327e-06, "loss": 0.287, "step": 469 }, { "epoch": 1.811175337186898, "grad_norm": 0.7474202062547819, "learning_rate": 1.0380476723898458e-06, "loss": 0.3006, "step": 470 }, { "epoch": 1.8150289017341041, "grad_norm": 0.5942504408819616, "learning_rate": 9.956050029077646e-07, "loss": 0.2911, "step": 471 }, { "epoch": 1.8188824662813103, "grad_norm": 0.5958984883417842, "learning_rate": 9.540260680225133e-07, "loss": 0.3008, "step": 472 }, { "epoch": 1.8227360308285165, "grad_norm": 0.5546378435963755, "learning_rate": 9.133127574635181e-07, "loss": 0.2325, "step": 473 }, { "epoch": 1.8265895953757225, "grad_norm": 0.6588191099653756, "learning_rate": 8.734669216182779e-07, "loss": 0.2822, "step": 474 }, { "epoch": 1.8304431599229287, "grad_norm": 0.5229342432978052, "learning_rate": 8.344903714482555e-07, "loss": 0.248, "step": 475 }, { "epoch": 1.834296724470135, "grad_norm": 0.5268589130032248, "learning_rate": 7.963848784065753e-07, "loss": 0.3033, "step": 476 }, { "epoch": 1.838150289017341, "grad_norm": 0.6011796025596555, "learning_rate": 7.591521743575003e-07, "loss": 0.289, "step": 477 }, { "epoch": 1.842003853564547, "grad_norm": 0.6073026645436564, "learning_rate": 7.227939514977422e-07, "loss": 0.3053, "step": 478 }, { "epoch": 1.8458574181117533, "grad_norm": 0.5341540712274524, "learning_rate": 6.87311862279536e-07, "loss": 0.2535, "step": 479 }, { "epoch": 1.8497109826589595, "grad_norm": 0.6384475776837815, "learning_rate": 6.527075193355337e-07, "loss": 0.3018, "step": 480 }, { "epoch": 1.8535645472061657, "grad_norm": 0.6168993774079564, "learning_rate": 6.189824954055335e-07, "loss": 0.3332, "step": 481 }, { "epoch": 1.857418111753372, "grad_norm": 0.6689061938191962, "learning_rate": 5.861383232649708e-07, "loss": 0.3018, "step": 482 }, { "epoch": 1.861271676300578, "grad_norm": 0.5773513436403104, "learning_rate": 5.541764956552831e-07, "loss": 0.3086, "step": 483 }, { "epoch": 1.8651252408477843, "grad_norm": 0.5674684683333645, "learning_rate": 5.230984652160387e-07, "loss": 0.2904, "step": 484 }, { "epoch": 1.8689788053949905, "grad_norm": 0.6136551285882403, "learning_rate": 4.92905644418944e-07, "loss": 0.2868, "step": 485 }, { "epoch": 1.8728323699421965, "grad_norm": 0.5814884659334725, "learning_rate": 4.635994055036208e-07, "loss": 0.2837, "step": 486 }, { "epoch": 1.8766859344894027, "grad_norm": 0.5724578775003472, "learning_rate": 4.3518108041525675e-07, "loss": 0.266, "step": 487 }, { "epoch": 1.8805394990366089, "grad_norm": 0.5758555299044388, "learning_rate": 4.0765196074406433e-07, "loss": 0.2733, "step": 488 }, { "epoch": 1.8843930635838149, "grad_norm": 1.583650395260831, "learning_rate": 3.8101329766657924e-07, "loss": 0.2487, "step": 489 }, { "epoch": 1.888246628131021, "grad_norm": 0.5001989878794011, "learning_rate": 3.5526630188879475e-07, "loss": 0.2928, "step": 490 }, { "epoch": 1.8921001926782273, "grad_norm": 0.5835273327136795, "learning_rate": 3.304121435911345e-07, "loss": 0.2951, "step": 491 }, { "epoch": 1.8959537572254335, "grad_norm": 0.6058658063907555, "learning_rate": 3.064519523752751e-07, "loss": 0.3092, "step": 492 }, { "epoch": 1.8998073217726397, "grad_norm": 0.48508060199041647, "learning_rate": 2.8338681721279627e-07, "loss": 0.2587, "step": 493 }, { "epoch": 1.9036608863198459, "grad_norm": 0.6173948412532628, "learning_rate": 2.612177863956977e-07, "loss": 0.3553, "step": 494 }, { "epoch": 1.907514450867052, "grad_norm": 0.5583615292360055, "learning_rate": 2.3994586748875116e-07, "loss": 0.2629, "step": 495 }, { "epoch": 1.9113680154142583, "grad_norm": 0.6358847375735043, "learning_rate": 2.1957202728370542e-07, "loss": 0.3183, "step": 496 }, { "epoch": 1.9152215799614645, "grad_norm": 0.5168808695590897, "learning_rate": 2.000971917553529e-07, "loss": 0.2417, "step": 497 }, { "epoch": 1.9190751445086707, "grad_norm": 0.6311552147683044, "learning_rate": 1.8152224601943435e-07, "loss": 0.297, "step": 498 }, { "epoch": 1.9229287090558767, "grad_norm": 0.6268922330585508, "learning_rate": 1.6384803429242202e-07, "loss": 0.3107, "step": 499 }, { "epoch": 1.9267822736030829, "grad_norm": 0.4990465421997129, "learning_rate": 1.4707535985314158e-07, "loss": 0.2362, "step": 500 }, { "epoch": 1.930635838150289, "grad_norm": 0.5842077868087064, "learning_rate": 1.3120498500627243e-07, "loss": 0.2877, "step": 501 }, { "epoch": 1.934489402697495, "grad_norm": 0.6539135809078528, "learning_rate": 1.1623763104769536e-07, "loss": 0.2189, "step": 502 }, { "epoch": 1.9383429672447012, "grad_norm": 0.5738597010304898, "learning_rate": 1.0217397823170771e-07, "loss": 0.3352, "step": 503 }, { "epoch": 1.9421965317919074, "grad_norm": 0.5459102574082527, "learning_rate": 8.901466574011919e-08, "loss": 0.2929, "step": 504 }, { "epoch": 1.9460500963391136, "grad_norm": 0.5914038878762246, "learning_rate": 7.676029165318622e-08, "loss": 0.2649, "step": 505 }, { "epoch": 1.9499036608863198, "grad_norm": 0.48711166138829143, "learning_rate": 6.541141292243814e-08, "loss": 0.2734, "step": 506 }, { "epoch": 1.953757225433526, "grad_norm": 0.5110998668531642, "learning_rate": 5.496854534536189e-08, "loss": 0.2689, "step": 507 }, { "epoch": 1.9576107899807322, "grad_norm": 0.580604888687543, "learning_rate": 4.5432163541960785e-08, "loss": 0.2636, "step": 508 }, { "epoch": 1.9614643545279384, "grad_norm": 0.5478455993832356, "learning_rate": 3.680270093318505e-08, "loss": 0.2662, "step": 509 }, { "epoch": 1.9653179190751446, "grad_norm": 0.5860745443181511, "learning_rate": 2.9080549721225426e-08, "loss": 0.2812, "step": 510 }, { "epoch": 1.9691714836223508, "grad_norm": 0.5859204655633964, "learning_rate": 2.226606087169847e-08, "loss": 0.2682, "step": 511 }, { "epoch": 1.9730250481695568, "grad_norm": 0.552238848617358, "learning_rate": 1.6359544097686033e-08, "loss": 0.2589, "step": 512 }, { "epoch": 1.976878612716763, "grad_norm": 0.6319565374117269, "learning_rate": 1.136126784566649e-08, "loss": 0.28, "step": 513 }, { "epoch": 1.9807321772639692, "grad_norm": 0.6410471677739641, "learning_rate": 7.271459283308968e-09, "loss": 0.3306, "step": 514 }, { "epoch": 1.9845857418111752, "grad_norm": 0.5612641252616729, "learning_rate": 4.090304289150471e-09, "loss": 0.2771, "step": 515 }, { "epoch": 1.9884393063583814, "grad_norm": 0.49679899268137545, "learning_rate": 1.817947444149315e-09, "loss": 0.2396, "step": 516 }, { "epoch": 1.9922928709055876, "grad_norm": 0.559672250420135, "learning_rate": 4.544920251126073e-10, "loss": 0.2364, "step": 517 }, { "epoch": 1.9961464354527938, "grad_norm": 0.5094885850047902, "learning_rate": 0.0, "loss": 0.2353, "step": 518 }, { "epoch": 1.9961464354527938, "eval_loss": 0.27941030263900757, "eval_runtime": 147.0149, "eval_samples_per_second": 13.59, "eval_steps_per_second": 0.429, "step": 518 }, { "epoch": 1.9961464354527938, "step": 518, "total_flos": 2.6836433607911014e+17, "train_loss": 0.7686227933723033, "train_runtime": 16387.3806, "train_samples_per_second": 4.053, "train_steps_per_second": 0.032 } ], "logging_steps": 1, "max_steps": 518, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6836433607911014e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }