diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5414 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999975471559273, + "eval_steps": 500, + "global_step": 7644, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003924550516323677, + "grad_norm": 17.851987080758406, + "learning_rate": 8.695652173913044e-07, + "loss": 1.6604, + "step": 10 + }, + { + "epoch": 0.007849101032647354, + "grad_norm": 4.080985759495849, + "learning_rate": 1.7391304347826088e-06, + "loss": 1.6397, + "step": 20 + }, + { + "epoch": 0.011773651548971031, + "grad_norm": 2.9426600727037857, + "learning_rate": 2.6086956521739132e-06, + "loss": 1.5109, + "step": 30 + }, + { + "epoch": 0.01569820206529471, + "grad_norm": 2.083885999160059, + "learning_rate": 3.4782608695652175e-06, + "loss": 1.4323, + "step": 40 + }, + { + "epoch": 0.019622752581618386, + "grad_norm": 5.06773100741999, + "learning_rate": 4.347826086956522e-06, + "loss": 1.4517, + "step": 50 + }, + { + "epoch": 0.023547303097942063, + "grad_norm": 3.058219108148106, + "learning_rate": 5.2173913043478265e-06, + "loss": 1.3684, + "step": 60 + }, + { + "epoch": 0.02747185361426574, + "grad_norm": 2.512374629159717, + "learning_rate": 6.086956521739132e-06, + "loss": 1.2584, + "step": 70 + }, + { + "epoch": 0.03139640413058942, + "grad_norm": 2.083781423381198, + "learning_rate": 6.956521739130435e-06, + "loss": 1.2888, + "step": 80 + }, + { + "epoch": 0.035320954646913094, + "grad_norm": 1.33754132363256, + "learning_rate": 7.82608695652174e-06, + "loss": 1.3061, + "step": 90 + }, + { + "epoch": 0.03924550516323677, + "grad_norm": 5.056402397307596, + "learning_rate": 8.695652173913044e-06, + "loss": 1.3281, + "step": 100 + }, + { + "epoch": 0.04317005567956045, + "grad_norm": 2.924060457678278, + "learning_rate": 9.565217391304349e-06, + "loss": 1.3235, + "step": 110 + }, + { + "epoch": 0.047094606195884126, + "grad_norm": 2.2704861146708177, + "learning_rate": 1.0434782608695653e-05, + "loss": 1.2073, + "step": 120 + }, + { + "epoch": 0.0510191567122078, + "grad_norm": 2.137074925911332, + "learning_rate": 1.1304347826086957e-05, + "loss": 1.2304, + "step": 130 + }, + { + "epoch": 0.05494370722853148, + "grad_norm": 1.7728719097748167, + "learning_rate": 1.2173913043478263e-05, + "loss": 1.2688, + "step": 140 + }, + { + "epoch": 0.05886825774485516, + "grad_norm": 4.131355974368673, + "learning_rate": 1.3043478260869566e-05, + "loss": 1.2972, + "step": 150 + }, + { + "epoch": 0.06279280826117883, + "grad_norm": 2.799659453356947, + "learning_rate": 1.391304347826087e-05, + "loss": 1.2953, + "step": 160 + }, + { + "epoch": 0.06671735877750251, + "grad_norm": 2.111913097871217, + "learning_rate": 1.4782608695652174e-05, + "loss": 1.2443, + "step": 170 + }, + { + "epoch": 0.07064190929382619, + "grad_norm": 2.1912441872440005, + "learning_rate": 1.565217391304348e-05, + "loss": 1.236, + "step": 180 + }, + { + "epoch": 0.07456645981014987, + "grad_norm": 2.234098380713297, + "learning_rate": 1.6521739130434785e-05, + "loss": 1.259, + "step": 190 + }, + { + "epoch": 0.07849101032647354, + "grad_norm": 4.198102404688399, + "learning_rate": 1.739130434782609e-05, + "loss": 1.2817, + "step": 200 + }, + { + "epoch": 0.08241556084279722, + "grad_norm": 3.019782787557433, + "learning_rate": 1.8260869565217393e-05, + "loss": 1.2664, + "step": 210 + }, + { + "epoch": 
0.0863401113591209, + "grad_norm": 2.0685750522996167, + "learning_rate": 1.9130434782608697e-05, + "loss": 1.2373, + "step": 220 + }, + { + "epoch": 0.09026466187544457, + "grad_norm": 4.300018110272735, + "learning_rate": 2e-05, + "loss": 1.2275, + "step": 230 + }, + { + "epoch": 0.09418921239176825, + "grad_norm": 1.7354497070996773, + "learning_rate": 1.9999910223238215e-05, + "loss": 1.2408, + "step": 240 + }, + { + "epoch": 0.09811376290809193, + "grad_norm": 3.4204284789804036, + "learning_rate": 1.999964089456483e-05, + "loss": 1.2965, + "step": 250 + }, + { + "epoch": 0.1020383134244156, + "grad_norm": 2.4941748811333215, + "learning_rate": 1.9999192018815737e-05, + "loss": 1.2779, + "step": 260 + }, + { + "epoch": 0.10596286394073928, + "grad_norm": 1.9457868113217196, + "learning_rate": 1.999856360405066e-05, + "loss": 1.2323, + "step": 270 + }, + { + "epoch": 0.10988741445706296, + "grad_norm": 1.6849461858955408, + "learning_rate": 1.9997755661553007e-05, + "loss": 1.2056, + "step": 280 + }, + { + "epoch": 0.11381196497338664, + "grad_norm": 1.867640144655699, + "learning_rate": 1.9996768205829667e-05, + "loss": 1.2369, + "step": 290 + }, + { + "epoch": 0.11773651548971031, + "grad_norm": 3.3865066094085785, + "learning_rate": 1.9995601254610757e-05, + "loss": 1.2683, + "step": 300 + }, + { + "epoch": 0.12166106600603399, + "grad_norm": 2.397528528715023, + "learning_rate": 1.99942548288493e-05, + "loss": 1.2799, + "step": 310 + }, + { + "epoch": 0.12558561652235767, + "grad_norm": 2.5820404632260487, + "learning_rate": 1.9992728952720842e-05, + "loss": 1.2019, + "step": 320 + }, + { + "epoch": 0.12951016703868135, + "grad_norm": 1.7151124312301378, + "learning_rate": 1.9991023653623028e-05, + "loss": 1.2133, + "step": 330 + }, + { + "epoch": 0.13343471755500502, + "grad_norm": 2.0107220916427164, + "learning_rate": 1.9989138962175105e-05, + "loss": 1.2405, + "step": 340 + }, + { + "epoch": 0.1373592680713287, + "grad_norm": 2.8574288343315897, + "learning_rate": 1.998707491221737e-05, + "loss": 1.2588, + "step": 350 + }, + { + "epoch": 0.14128381858765238, + "grad_norm": 2.4054121950102756, + "learning_rate": 1.9984831540810567e-05, + "loss": 1.2581, + "step": 360 + }, + { + "epoch": 0.14520836910397605, + "grad_norm": 1.8485304425199756, + "learning_rate": 1.9982408888235224e-05, + "loss": 1.1938, + "step": 370 + }, + { + "epoch": 0.14913291962029973, + "grad_norm": 1.5828252143457966, + "learning_rate": 1.997980699799092e-05, + "loss": 1.1862, + "step": 380 + }, + { + "epoch": 0.1530574701366234, + "grad_norm": 1.2627808446236994, + "learning_rate": 1.9977025916795503e-05, + "loss": 1.2135, + "step": 390 + }, + { + "epoch": 0.15698202065294709, + "grad_norm": 3.2564505072901664, + "learning_rate": 1.997406569458428e-05, + "loss": 1.2506, + "step": 400 + }, + { + "epoch": 0.16090657116927076, + "grad_norm": 2.3862130817749825, + "learning_rate": 1.997092638450907e-05, + "loss": 1.2391, + "step": 410 + }, + { + "epoch": 0.16483112168559444, + "grad_norm": 1.8315079964245025, + "learning_rate": 1.9967608042937303e-05, + "loss": 1.1829, + "step": 420 + }, + { + "epoch": 0.16875567220191812, + "grad_norm": 1.8430052290908208, + "learning_rate": 1.9964110729450966e-05, + "loss": 1.205, + "step": 430 + }, + { + "epoch": 0.1726802227182418, + "grad_norm": 1.5980747844446883, + "learning_rate": 1.9960434506845555e-05, + "loss": 1.2007, + "step": 440 + }, + { + "epoch": 0.17660477323456547, + "grad_norm": 2.942407149873092, + "learning_rate": 1.9956579441128942e-05, + "loss": 1.2365, 
+ "step": 450 + }, + { + "epoch": 0.18052932375088915, + "grad_norm": 2.379751902649777, + "learning_rate": 1.995254560152019e-05, + "loss": 1.2276, + "step": 460 + }, + { + "epoch": 0.18445387426721283, + "grad_norm": 1.7688369917233489, + "learning_rate": 1.9948333060448314e-05, + "loss": 1.1968, + "step": 470 + }, + { + "epoch": 0.1883784247835365, + "grad_norm": 1.589083721261629, + "learning_rate": 1.994394189355097e-05, + "loss": 1.195, + "step": 480 + }, + { + "epoch": 0.19230297529986018, + "grad_norm": 1.5496967150579632, + "learning_rate": 1.9939372179673104e-05, + "loss": 1.1999, + "step": 490 + }, + { + "epoch": 0.19622752581618386, + "grad_norm": 2.718076034623976, + "learning_rate": 1.9934624000865542e-05, + "loss": 1.2349, + "step": 500 + }, + { + "epoch": 0.20015207633250753, + "grad_norm": 2.3890997580290567, + "learning_rate": 1.9929697442383514e-05, + "loss": 1.2326, + "step": 510 + }, + { + "epoch": 0.2040766268488312, + "grad_norm": 1.9579908821335397, + "learning_rate": 1.9924592592685105e-05, + "loss": 1.1975, + "step": 520 + }, + { + "epoch": 0.2080011773651549, + "grad_norm": 1.5776501334782151, + "learning_rate": 1.991930954342969e-05, + "loss": 1.1816, + "step": 530 + }, + { + "epoch": 0.21192572788147856, + "grad_norm": 1.1161863427171768, + "learning_rate": 1.9913848389476283e-05, + "loss": 1.1491, + "step": 540 + }, + { + "epoch": 0.21585027839780224, + "grad_norm": 2.648197492260032, + "learning_rate": 1.9908209228881826e-05, + "loss": 1.2435, + "step": 550 + }, + { + "epoch": 0.21977482891412592, + "grad_norm": 2.408343139002311, + "learning_rate": 1.990239216289944e-05, + "loss": 1.2316, + "step": 560 + }, + { + "epoch": 0.2236993794304496, + "grad_norm": 1.8673008806345932, + "learning_rate": 1.9896397295976585e-05, + "loss": 1.1606, + "step": 570 + }, + { + "epoch": 0.22762392994677327, + "grad_norm": 1.5191789990510323, + "learning_rate": 1.9890224735753215e-05, + "loss": 1.1647, + "step": 580 + }, + { + "epoch": 0.23154848046309695, + "grad_norm": 1.2422189101767849, + "learning_rate": 1.9883874593059825e-05, + "loss": 1.1751, + "step": 590 + }, + { + "epoch": 0.23547303097942063, + "grad_norm": 2.942648489159468, + "learning_rate": 1.987734698191546e-05, + "loss": 1.2334, + "step": 600 + }, + { + "epoch": 0.2393975814957443, + "grad_norm": 2.4573102563716294, + "learning_rate": 1.987064201952568e-05, + "loss": 1.237, + "step": 610 + }, + { + "epoch": 0.24332213201206798, + "grad_norm": 1.7727471642184118, + "learning_rate": 1.9863759826280446e-05, + "loss": 1.1665, + "step": 620 + }, + { + "epoch": 0.24724668252839166, + "grad_norm": 1.5235132834345957, + "learning_rate": 1.9856700525751967e-05, + "loss": 1.1724, + "step": 630 + }, + { + "epoch": 0.25117123304471534, + "grad_norm": 1.2376130762267767, + "learning_rate": 1.9849464244692465e-05, + "loss": 1.1712, + "step": 640 + }, + { + "epoch": 0.255095783561039, + "grad_norm": 3.040275337730361, + "learning_rate": 1.9842051113031922e-05, + "loss": 1.2033, + "step": 650 + }, + { + "epoch": 0.2590203340773627, + "grad_norm": 2.3310773148731445, + "learning_rate": 1.9834461263875728e-05, + "loss": 1.2291, + "step": 660 + }, + { + "epoch": 0.26294488459368637, + "grad_norm": 1.7613401699558136, + "learning_rate": 1.9826694833502295e-05, + "loss": 1.2002, + "step": 670 + }, + { + "epoch": 0.26686943511001004, + "grad_norm": 1.4212023760208006, + "learning_rate": 1.9818751961360623e-05, + "loss": 1.1806, + "step": 680 + }, + { + "epoch": 0.2707939856263337, + "grad_norm": 1.5547355362596973, + 
"learning_rate": 1.9810632790067773e-05, + "loss": 1.203, + "step": 690 + }, + { + "epoch": 0.2747185361426574, + "grad_norm": 2.864900900676303, + "learning_rate": 1.9802337465406332e-05, + "loss": 1.2255, + "step": 700 + }, + { + "epoch": 0.2786430866589811, + "grad_norm": 2.273750000509486, + "learning_rate": 1.9793866136321775e-05, + "loss": 1.2176, + "step": 710 + }, + { + "epoch": 0.28256763717530475, + "grad_norm": 1.8215540424542227, + "learning_rate": 1.97852189549198e-05, + "loss": 1.159, + "step": 720 + }, + { + "epoch": 0.28649218769162843, + "grad_norm": 1.4139436194360697, + "learning_rate": 1.9776396076463597e-05, + "loss": 1.1552, + "step": 730 + }, + { + "epoch": 0.2904167382079521, + "grad_norm": 1.2805284263458057, + "learning_rate": 1.9767397659371058e-05, + "loss": 1.1857, + "step": 740 + }, + { + "epoch": 0.2943412887242758, + "grad_norm": 2.6772231556409025, + "learning_rate": 1.975822386521193e-05, + "loss": 1.2332, + "step": 750 + }, + { + "epoch": 0.29826583924059946, + "grad_norm": 2.283135985364224, + "learning_rate": 1.974887485870492e-05, + "loss": 1.203, + "step": 760 + }, + { + "epoch": 0.30219038975692314, + "grad_norm": 1.6219262272093848, + "learning_rate": 1.973935080771474e-05, + "loss": 1.1696, + "step": 770 + }, + { + "epoch": 0.3061149402732468, + "grad_norm": 1.492581338030124, + "learning_rate": 1.9729651883249075e-05, + "loss": 1.1732, + "step": 780 + }, + { + "epoch": 0.3100394907895705, + "grad_norm": 1.0942797906848452, + "learning_rate": 1.9719778259455533e-05, + "loss": 1.1665, + "step": 790 + }, + { + "epoch": 0.31396404130589417, + "grad_norm": 2.7812656388019694, + "learning_rate": 1.9709730113618507e-05, + "loss": 1.224, + "step": 800 + }, + { + "epoch": 0.31788859182221785, + "grad_norm": 2.2668037248648494, + "learning_rate": 1.9699507626156e-05, + "loss": 1.2128, + "step": 810 + }, + { + "epoch": 0.3218131423385415, + "grad_norm": 1.653446544884032, + "learning_rate": 1.9689110980616374e-05, + "loss": 1.1697, + "step": 820 + }, + { + "epoch": 0.3257376928548652, + "grad_norm": 1.6562721449500364, + "learning_rate": 1.967854036367506e-05, + "loss": 1.1786, + "step": 830 + }, + { + "epoch": 0.3296622433711889, + "grad_norm": 1.3990148712849042, + "learning_rate": 1.9667795965131215e-05, + "loss": 1.1814, + "step": 840 + }, + { + "epoch": 0.33358679388751256, + "grad_norm": 2.709511506933081, + "learning_rate": 1.96568779779043e-05, + "loss": 1.2135, + "step": 850 + }, + { + "epoch": 0.33751134440383623, + "grad_norm": 2.1617540677777303, + "learning_rate": 1.9645786598030617e-05, + "loss": 1.1925, + "step": 860 + }, + { + "epoch": 0.3414358949201599, + "grad_norm": 1.6647622938579412, + "learning_rate": 1.9634522024659802e-05, + "loss": 1.1654, + "step": 870 + }, + { + "epoch": 0.3453604454364836, + "grad_norm": 1.445420557740109, + "learning_rate": 1.9623084460051246e-05, + "loss": 1.1318, + "step": 880 + }, + { + "epoch": 0.34928499595280726, + "grad_norm": 1.2317074284184588, + "learning_rate": 1.9611474109570446e-05, + "loss": 1.1489, + "step": 890 + }, + { + "epoch": 0.35320954646913094, + "grad_norm": 2.5433820603816497, + "learning_rate": 1.9599691181685335e-05, + "loss": 1.2242, + "step": 900 + }, + { + "epoch": 0.3571340969854546, + "grad_norm": 2.147149465279269, + "learning_rate": 1.9587735887962533e-05, + "loss": 1.2087, + "step": 910 + }, + { + "epoch": 0.3610586475017783, + "grad_norm": 1.6551085595896637, + "learning_rate": 1.957560844306356e-05, + "loss": 1.1401, + "step": 920 + }, + { + "epoch": 0.364983198018102, + 
"grad_norm": 1.511777154462307, + "learning_rate": 1.9563309064740955e-05, + "loss": 1.1597, + "step": 930 + }, + { + "epoch": 0.36890774853442565, + "grad_norm": 1.1094232746787396, + "learning_rate": 1.955083797383439e-05, + "loss": 1.1569, + "step": 940 + }, + { + "epoch": 0.3728322990507493, + "grad_norm": 2.572815297222484, + "learning_rate": 1.95381953942667e-05, + "loss": 1.2189, + "step": 950 + }, + { + "epoch": 0.376756849567073, + "grad_norm": 2.333811966623515, + "learning_rate": 1.9525381553039852e-05, + "loss": 1.2014, + "step": 960 + }, + { + "epoch": 0.3806814000833967, + "grad_norm": 1.6717294454527916, + "learning_rate": 1.951239668023088e-05, + "loss": 1.1645, + "step": 970 + }, + { + "epoch": 0.38460595059972036, + "grad_norm": 1.5026637953972823, + "learning_rate": 1.9499241008987758e-05, + "loss": 1.1632, + "step": 980 + }, + { + "epoch": 0.38853050111604404, + "grad_norm": 1.2986707472970862, + "learning_rate": 1.9485914775525193e-05, + "loss": 1.1644, + "step": 990 + }, + { + "epoch": 0.3924550516323677, + "grad_norm": 2.9685551602402573, + "learning_rate": 1.9472418219120403e-05, + "loss": 1.1866, + "step": 1000 + }, + { + "epoch": 0.3963796021486914, + "grad_norm": 2.234634727478329, + "learning_rate": 1.945875158210881e-05, + "loss": 1.2102, + "step": 1010 + }, + { + "epoch": 0.40030415266501507, + "grad_norm": 1.724781393052345, + "learning_rate": 1.9444915109879704e-05, + "loss": 1.1415, + "step": 1020 + }, + { + "epoch": 0.40422870318133874, + "grad_norm": 1.4455353570141956, + "learning_rate": 1.9430909050871815e-05, + "loss": 1.1638, + "step": 1030 + }, + { + "epoch": 0.4081532536976624, + "grad_norm": 1.1969939749246539, + "learning_rate": 1.9416733656568868e-05, + "loss": 1.1527, + "step": 1040 + }, + { + "epoch": 0.4120778042139861, + "grad_norm": 2.5537155941782026, + "learning_rate": 1.9402389181495063e-05, + "loss": 1.2141, + "step": 1050 + }, + { + "epoch": 0.4160023547303098, + "grad_norm": 2.08351625708691, + "learning_rate": 1.9387875883210507e-05, + "loss": 1.1907, + "step": 1060 + }, + { + "epoch": 0.41992690524663345, + "grad_norm": 1.683716206941947, + "learning_rate": 1.937319402230658e-05, + "loss": 1.1538, + "step": 1070 + }, + { + "epoch": 0.42385145576295713, + "grad_norm": 1.4654454547439344, + "learning_rate": 1.935834386240127e-05, + "loss": 1.1601, + "step": 1080 + }, + { + "epoch": 0.4277760062792808, + "grad_norm": 1.1890264813491744, + "learning_rate": 1.934332567013443e-05, + "loss": 1.1569, + "step": 1090 + }, + { + "epoch": 0.4317005567956045, + "grad_norm": 2.491064388570012, + "learning_rate": 1.9328139715162994e-05, + "loss": 1.2189, + "step": 1100 + }, + { + "epoch": 0.43562510731192816, + "grad_norm": 2.1439066913442444, + "learning_rate": 1.9312786270156135e-05, + "loss": 1.1932, + "step": 1110 + }, + { + "epoch": 0.43954965782825184, + "grad_norm": 1.669653473033775, + "learning_rate": 1.9297265610790373e-05, + "loss": 1.1387, + "step": 1120 + }, + { + "epoch": 0.4434742083445755, + "grad_norm": 1.3843825017667384, + "learning_rate": 1.9281578015744603e-05, + "loss": 1.1376, + "step": 1130 + }, + { + "epoch": 0.4473987588608992, + "grad_norm": 1.1753467269379858, + "learning_rate": 1.9265723766695135e-05, + "loss": 1.1481, + "step": 1140 + }, + { + "epoch": 0.45132330937722287, + "grad_norm": 2.6835572127286818, + "learning_rate": 1.9249703148310588e-05, + "loss": 1.1767, + "step": 1150 + }, + { + "epoch": 0.45524785989354655, + "grad_norm": 2.3542813633364106, + "learning_rate": 1.9233516448246815e-05, + "loss": 1.2115, + 
"step": 1160 + }, + { + "epoch": 0.4591724104098702, + "grad_norm": 1.7788410116492113, + "learning_rate": 1.9217163957141716e-05, + "loss": 1.1595, + "step": 1170 + }, + { + "epoch": 0.4630969609261939, + "grad_norm": 1.589344468558335, + "learning_rate": 1.9200645968610036e-05, + "loss": 1.1469, + "step": 1180 + }, + { + "epoch": 0.4670215114425176, + "grad_norm": 1.3723348105531945, + "learning_rate": 1.918396277923807e-05, + "loss": 1.1257, + "step": 1190 + }, + { + "epoch": 0.47094606195884126, + "grad_norm": 2.3205173632902674, + "learning_rate": 1.9167114688578368e-05, + "loss": 1.1712, + "step": 1200 + }, + { + "epoch": 0.47487061247516493, + "grad_norm": 2.20379995717616, + "learning_rate": 1.9150101999144338e-05, + "loss": 1.198, + "step": 1210 + }, + { + "epoch": 0.4787951629914886, + "grad_norm": 1.6571237734382616, + "learning_rate": 1.9132925016404805e-05, + "loss": 1.1346, + "step": 1220 + }, + { + "epoch": 0.4827197135078123, + "grad_norm": 1.5174786415787016, + "learning_rate": 1.911558404877855e-05, + "loss": 1.1382, + "step": 1230 + }, + { + "epoch": 0.48664426402413596, + "grad_norm": 1.0610840707954994, + "learning_rate": 1.909807940762876e-05, + "loss": 1.1223, + "step": 1240 + }, + { + "epoch": 0.49056881454045964, + "grad_norm": 2.628300567508133, + "learning_rate": 1.908041140725743e-05, + "loss": 1.1758, + "step": 1250 + }, + { + "epoch": 0.4944933650567833, + "grad_norm": 2.1593460670180655, + "learning_rate": 1.9062580364899735e-05, + "loss": 1.2182, + "step": 1260 + }, + { + "epoch": 0.498417915573107, + "grad_norm": 1.6850637902364638, + "learning_rate": 1.9044586600718323e-05, + "loss": 1.1582, + "step": 1270 + }, + { + "epoch": 0.5023424660894307, + "grad_norm": 1.6079103318853967, + "learning_rate": 1.9026430437797568e-05, + "loss": 1.1213, + "step": 1280 + }, + { + "epoch": 0.5062670166057543, + "grad_norm": 1.2518743683340756, + "learning_rate": 1.9008112202137777e-05, + "loss": 1.1546, + "step": 1290 + }, + { + "epoch": 0.510191567122078, + "grad_norm": 2.4116262224448057, + "learning_rate": 1.898963222264932e-05, + "loss": 1.1807, + "step": 1300 + }, + { + "epoch": 0.5141161176384017, + "grad_norm": 2.301017308903272, + "learning_rate": 1.8970990831146744e-05, + "loss": 1.1837, + "step": 1310 + }, + { + "epoch": 0.5180406681547254, + "grad_norm": 1.69892059072323, + "learning_rate": 1.8952188362342804e-05, + "loss": 1.1347, + "step": 1320 + }, + { + "epoch": 0.5219652186710491, + "grad_norm": 1.399901320658771, + "learning_rate": 1.8933225153842446e-05, + "loss": 1.1464, + "step": 1330 + }, + { + "epoch": 0.5258897691873727, + "grad_norm": 1.0557641784260816, + "learning_rate": 1.8914101546136766e-05, + "loss": 1.1349, + "step": 1340 + }, + { + "epoch": 0.5298143197036964, + "grad_norm": 2.5362209062888295, + "learning_rate": 1.889481788259688e-05, + "loss": 1.1834, + "step": 1350 + }, + { + "epoch": 0.5337388702200201, + "grad_norm": 2.1100281299468278, + "learning_rate": 1.8875374509467757e-05, + "loss": 1.2085, + "step": 1360 + }, + { + "epoch": 0.5376634207363438, + "grad_norm": 1.656879350518556, + "learning_rate": 1.8855771775862014e-05, + "loss": 1.145, + "step": 1370 + }, + { + "epoch": 0.5415879712526674, + "grad_norm": 1.3835758185671234, + "learning_rate": 1.8836010033753637e-05, + "loss": 1.1332, + "step": 1380 + }, + { + "epoch": 0.5455125217689911, + "grad_norm": 1.3074526959475135, + "learning_rate": 1.8816089637971674e-05, + "loss": 1.1337, + "step": 1390 + }, + { + "epoch": 0.5494370722853148, + "grad_norm": 2.471754592942074, + 
"learning_rate": 1.879601094619385e-05, + "loss": 1.177, + "step": 1400 + }, + { + "epoch": 0.5533616228016385, + "grad_norm": 2.0297870153949953, + "learning_rate": 1.877577431894015e-05, + "loss": 1.2251, + "step": 1410 + }, + { + "epoch": 0.5572861733179622, + "grad_norm": 1.6193945226941358, + "learning_rate": 1.8755380119566343e-05, + "loss": 1.0928, + "step": 1420 + }, + { + "epoch": 0.5612107238342858, + "grad_norm": 1.4634472002355838, + "learning_rate": 1.873482871425747e-05, + "loss": 1.143, + "step": 1430 + }, + { + "epoch": 0.5651352743506095, + "grad_norm": 1.2878805152891477, + "learning_rate": 1.8714120472021252e-05, + "loss": 1.1712, + "step": 1440 + }, + { + "epoch": 0.5690598248669332, + "grad_norm": 2.6107789403264965, + "learning_rate": 1.8693255764681476e-05, + "loss": 1.1793, + "step": 1450 + }, + { + "epoch": 0.5729843753832569, + "grad_norm": 2.101138870961313, + "learning_rate": 1.867223496687131e-05, + "loss": 1.1724, + "step": 1460 + }, + { + "epoch": 0.5769089258995805, + "grad_norm": 1.559869838184, + "learning_rate": 1.865105845602659e-05, + "loss": 1.1569, + "step": 1470 + }, + { + "epoch": 0.5808334764159042, + "grad_norm": 1.4484698696907943, + "learning_rate": 1.8629726612379034e-05, + "loss": 1.1461, + "step": 1480 + }, + { + "epoch": 0.5847580269322279, + "grad_norm": 1.1551387246395677, + "learning_rate": 1.86082398189494e-05, + "loss": 1.1276, + "step": 1490 + }, + { + "epoch": 0.5886825774485516, + "grad_norm": 2.3243966610365208, + "learning_rate": 1.8586598461540647e-05, + "loss": 1.1865, + "step": 1500 + }, + { + "epoch": 0.5926071279648752, + "grad_norm": 2.045560797585921, + "learning_rate": 1.8564802928730963e-05, + "loss": 1.1981, + "step": 1510 + }, + { + "epoch": 0.5965316784811989, + "grad_norm": 1.5774145920172018, + "learning_rate": 1.8542853611866826e-05, + "loss": 1.1475, + "step": 1520 + }, + { + "epoch": 0.6004562289975226, + "grad_norm": 1.401143146614057, + "learning_rate": 1.8520750905055948e-05, + "loss": 1.1113, + "step": 1530 + }, + { + "epoch": 0.6043807795138463, + "grad_norm": 1.0993576375496286, + "learning_rate": 1.849849520516023e-05, + "loss": 1.1196, + "step": 1540 + }, + { + "epoch": 0.60830533003017, + "grad_norm": 2.6837789697900694, + "learning_rate": 1.8476086911788588e-05, + "loss": 1.1731, + "step": 1550 + }, + { + "epoch": 0.6122298805464936, + "grad_norm": 2.2269178857118166, + "learning_rate": 1.8453526427289836e-05, + "loss": 1.1673, + "step": 1560 + }, + { + "epoch": 0.6161544310628173, + "grad_norm": 1.6753938116918217, + "learning_rate": 1.8430814156745424e-05, + "loss": 1.1212, + "step": 1570 + }, + { + "epoch": 0.620078981579141, + "grad_norm": 1.2926994229597162, + "learning_rate": 1.8407950507962166e-05, + "loss": 1.12, + "step": 1580 + }, + { + "epoch": 0.6240035320954647, + "grad_norm": 1.1434774002781025, + "learning_rate": 1.8384935891464938e-05, + "loss": 1.1059, + "step": 1590 + }, + { + "epoch": 0.6279280826117883, + "grad_norm": 2.5701272662383623, + "learning_rate": 1.8361770720489287e-05, + "loss": 1.1667, + "step": 1600 + }, + { + "epoch": 0.631852633128112, + "grad_norm": 2.099590587250419, + "learning_rate": 1.8338455410974017e-05, + "loss": 1.1811, + "step": 1610 + }, + { + "epoch": 0.6357771836444357, + "grad_norm": 1.5881595852499024, + "learning_rate": 1.831499038155373e-05, + "loss": 1.1198, + "step": 1620 + }, + { + "epoch": 0.6397017341607594, + "grad_norm": 1.364474265009956, + "learning_rate": 1.8291376053551293e-05, + "loss": 1.1348, + "step": 1630 + }, + { + "epoch": 
0.643626284677083, + "grad_norm": 1.0132232392193459, + "learning_rate": 1.8267612850970292e-05, + "loss": 1.1341, + "step": 1640 + }, + { + "epoch": 0.6475508351934067, + "grad_norm": 2.375671216469934, + "learning_rate": 1.824370120048739e-05, + "loss": 1.1971, + "step": 1650 + }, + { + "epoch": 0.6514753857097304, + "grad_norm": 2.0072312065319142, + "learning_rate": 1.8219641531444713e-05, + "loss": 1.1696, + "step": 1660 + }, + { + "epoch": 0.6553999362260541, + "grad_norm": 1.621521304969733, + "learning_rate": 1.8195434275842088e-05, + "loss": 1.1116, + "step": 1670 + }, + { + "epoch": 0.6593244867423778, + "grad_norm": 1.289974630938439, + "learning_rate": 1.817107986832932e-05, + "loss": 1.1427, + "step": 1680 + }, + { + "epoch": 0.6632490372587014, + "grad_norm": 1.2226882453760828, + "learning_rate": 1.8146578746198374e-05, + "loss": 1.1324, + "step": 1690 + }, + { + "epoch": 0.6671735877750251, + "grad_norm": 2.6497361969234836, + "learning_rate": 1.812193134937554e-05, + "loss": 1.1518, + "step": 1700 + }, + { + "epoch": 0.6710981382913488, + "grad_norm": 1.934779659069536, + "learning_rate": 1.8097138120413503e-05, + "loss": 1.1667, + "step": 1710 + }, + { + "epoch": 0.6750226888076725, + "grad_norm": 1.682274234607041, + "learning_rate": 1.8072199504483428e-05, + "loss": 1.1094, + "step": 1720 + }, + { + "epoch": 0.6789472393239961, + "grad_norm": 1.3189385241228773, + "learning_rate": 1.8047115949366955e-05, + "loss": 1.1485, + "step": 1730 + }, + { + "epoch": 0.6828717898403198, + "grad_norm": 1.2380330513347648, + "learning_rate": 1.8021887905448146e-05, + "loss": 1.1228, + "step": 1740 + }, + { + "epoch": 0.6867963403566435, + "grad_norm": 2.503615625647334, + "learning_rate": 1.799651582570543e-05, + "loss": 1.1545, + "step": 1750 + }, + { + "epoch": 0.6907208908729672, + "grad_norm": 1.9970401432155471, + "learning_rate": 1.7971000165703434e-05, + "loss": 1.1698, + "step": 1760 + }, + { + "epoch": 0.6946454413892909, + "grad_norm": 1.585682831800493, + "learning_rate": 1.7945341383584818e-05, + "loss": 1.12, + "step": 1770 + }, + { + "epoch": 0.6985699919056145, + "grad_norm": 1.4103033699462193, + "learning_rate": 1.7919539940062068e-05, + "loss": 1.1375, + "step": 1780 + }, + { + "epoch": 0.7024945424219382, + "grad_norm": 1.1741968139844532, + "learning_rate": 1.7893596298409182e-05, + "loss": 1.1045, + "step": 1790 + }, + { + "epoch": 0.7064190929382619, + "grad_norm": 2.5435233808457265, + "learning_rate": 1.7867510924453394e-05, + "loss": 1.1561, + "step": 1800 + }, + { + "epoch": 0.7103436434545856, + "grad_norm": 2.0058708638995744, + "learning_rate": 1.784128428656678e-05, + "loss": 1.1905, + "step": 1810 + }, + { + "epoch": 0.7142681939709092, + "grad_norm": 1.5513764227014477, + "learning_rate": 1.7814916855657872e-05, + "loss": 1.116, + "step": 1820 + }, + { + "epoch": 0.7181927444872329, + "grad_norm": 1.3841452634663314, + "learning_rate": 1.7788409105163178e-05, + "loss": 1.1359, + "step": 1830 + }, + { + "epoch": 0.7221172950035566, + "grad_norm": 1.0616071904873385, + "learning_rate": 1.7761761511038694e-05, + "loss": 1.0973, + "step": 1840 + }, + { + "epoch": 0.7260418455198803, + "grad_norm": 2.4472932789694726, + "learning_rate": 1.773497455175137e-05, + "loss": 1.1611, + "step": 1850 + }, + { + "epoch": 0.729966396036204, + "grad_norm": 2.1478318961127325, + "learning_rate": 1.7708048708270497e-05, + "loss": 1.1637, + "step": 1860 + }, + { + "epoch": 0.7338909465525276, + "grad_norm": 1.5986355699917554, + "learning_rate": 1.7680984464059077e-05, + 
"loss": 1.1179, + "step": 1870 + }, + { + "epoch": 0.7378154970688513, + "grad_norm": 1.2845396362764854, + "learning_rate": 1.7653782305065158e-05, + "loss": 1.1407, + "step": 1880 + }, + { + "epoch": 0.741740047585175, + "grad_norm": 1.1067586294132603, + "learning_rate": 1.7626442719713083e-05, + "loss": 1.1255, + "step": 1890 + }, + { + "epoch": 0.7456645981014987, + "grad_norm": 2.2098441511746705, + "learning_rate": 1.7598966198894746e-05, + "loss": 1.1756, + "step": 1900 + }, + { + "epoch": 0.7495891486178223, + "grad_norm": 1.9535681972911683, + "learning_rate": 1.7571353235960754e-05, + "loss": 1.1813, + "step": 1910 + }, + { + "epoch": 0.753513699134146, + "grad_norm": 1.5198953597892402, + "learning_rate": 1.7543604326711592e-05, + "loss": 1.1157, + "step": 1920 + }, + { + "epoch": 0.7574382496504697, + "grad_norm": 1.282264664888777, + "learning_rate": 1.7515719969388697e-05, + "loss": 1.1325, + "step": 1930 + }, + { + "epoch": 0.7613628001667934, + "grad_norm": 1.1529117257538906, + "learning_rate": 1.7487700664665536e-05, + "loss": 1.1579, + "step": 1940 + }, + { + "epoch": 0.765287350683117, + "grad_norm": 2.4479153069810877, + "learning_rate": 1.7459546915638595e-05, + "loss": 1.1548, + "step": 1950 + }, + { + "epoch": 0.7692119011994407, + "grad_norm": 1.9930358021042167, + "learning_rate": 1.743125922781836e-05, + "loss": 1.1702, + "step": 1960 + }, + { + "epoch": 0.7731364517157644, + "grad_norm": 1.5561233301314203, + "learning_rate": 1.740283810912023e-05, + "loss": 1.1098, + "step": 1970 + }, + { + "epoch": 0.7770610022320881, + "grad_norm": 1.363877169879588, + "learning_rate": 1.737428406985541e-05, + "loss": 1.1276, + "step": 1980 + }, + { + "epoch": 0.7809855527484117, + "grad_norm": 1.1118223017408846, + "learning_rate": 1.7345597622721727e-05, + "loss": 1.1143, + "step": 1990 + }, + { + "epoch": 0.7849101032647354, + "grad_norm": 2.2103639445489707, + "learning_rate": 1.7316779282794458e-05, + "loss": 1.1436, + "step": 2000 + }, + { + "epoch": 0.7888346537810591, + "grad_norm": 1.9866216610680039, + "learning_rate": 1.728782956751705e-05, + "loss": 1.1366, + "step": 2010 + }, + { + "epoch": 0.7927592042973828, + "grad_norm": 1.557276150793299, + "learning_rate": 1.725874899669183e-05, + "loss": 1.1028, + "step": 2020 + }, + { + "epoch": 0.7966837548137065, + "grad_norm": 1.3407353791664398, + "learning_rate": 1.7229538092470708e-05, + "loss": 1.121, + "step": 2030 + }, + { + "epoch": 0.8006083053300301, + "grad_norm": 1.0452399263705143, + "learning_rate": 1.7200197379345752e-05, + "loss": 1.1052, + "step": 2040 + }, + { + "epoch": 0.8045328558463538, + "grad_norm": 2.255129030276742, + "learning_rate": 1.7170727384139808e-05, + "loss": 1.1534, + "step": 2050 + }, + { + "epoch": 0.8084574063626775, + "grad_norm": 2.023922420597953, + "learning_rate": 1.7141128635997027e-05, + "loss": 1.1536, + "step": 2060 + }, + { + "epoch": 0.8123819568790012, + "grad_norm": 1.6459699923272906, + "learning_rate": 1.711140166637336e-05, + "loss": 1.1237, + "step": 2070 + }, + { + "epoch": 0.8163065073953248, + "grad_norm": 1.3995835514158206, + "learning_rate": 1.7081547009027014e-05, + "loss": 1.1364, + "step": 2080 + }, + { + "epoch": 0.8202310579116485, + "grad_norm": 1.1468866803278337, + "learning_rate": 1.705156520000889e-05, + "loss": 1.1055, + "step": 2090 + }, + { + "epoch": 0.8241556084279722, + "grad_norm": 2.4534252010588626, + "learning_rate": 1.702145677765293e-05, + "loss": 1.1233, + "step": 2100 + }, + { + "epoch": 0.8280801589442959, + "grad_norm": 
2.069228109517972, + "learning_rate": 1.6991222282566465e-05, + "loss": 1.1368, + "step": 2110 + }, + { + "epoch": 0.8320047094606196, + "grad_norm": 1.5353360725588796, + "learning_rate": 1.696086225762051e-05, + "loss": 1.0936, + "step": 2120 + }, + { + "epoch": 0.8359292599769432, + "grad_norm": 1.31441567730565, + "learning_rate": 1.6930377247940005e-05, + "loss": 1.103, + "step": 2130 + }, + { + "epoch": 0.8398538104932669, + "grad_norm": 0.9169234319686077, + "learning_rate": 1.689976780089405e-05, + "loss": 1.0933, + "step": 2140 + }, + { + "epoch": 0.8437783610095906, + "grad_norm": 2.321894947275435, + "learning_rate": 1.6869034466086046e-05, + "loss": 1.1397, + "step": 2150 + }, + { + "epoch": 0.8477029115259143, + "grad_norm": 2.0305512220158386, + "learning_rate": 1.6838177795343847e-05, + "loss": 1.1704, + "step": 2160 + }, + { + "epoch": 0.8516274620422379, + "grad_norm": 1.5512604200141975, + "learning_rate": 1.6807198342709858e-05, + "loss": 1.1113, + "step": 2170 + }, + { + "epoch": 0.8555520125585616, + "grad_norm": 1.3624069523785742, + "learning_rate": 1.677609666443105e-05, + "loss": 1.1355, + "step": 2180 + }, + { + "epoch": 0.8594765630748853, + "grad_norm": 1.162420454215084, + "learning_rate": 1.6744873318949032e-05, + "loss": 1.1217, + "step": 2190 + }, + { + "epoch": 0.863401113591209, + "grad_norm": 2.59296418698937, + "learning_rate": 1.6713528866889966e-05, + "loss": 1.1753, + "step": 2200 + }, + { + "epoch": 0.8673256641075326, + "grad_norm": 2.005406504075907, + "learning_rate": 1.6682063871054534e-05, + "loss": 1.1596, + "step": 2210 + }, + { + "epoch": 0.8712502146238563, + "grad_norm": 1.5322297264828286, + "learning_rate": 1.6650478896407825e-05, + "loss": 1.1093, + "step": 2220 + }, + { + "epoch": 0.87517476514018, + "grad_norm": 1.3164373320619593, + "learning_rate": 1.6618774510069187e-05, + "loss": 1.109, + "step": 2230 + }, + { + "epoch": 0.8790993156565037, + "grad_norm": 0.979869187444681, + "learning_rate": 1.6586951281302046e-05, + "loss": 1.1212, + "step": 2240 + }, + { + "epoch": 0.8830238661728274, + "grad_norm": 2.4909588716430617, + "learning_rate": 1.655500978150369e-05, + "loss": 1.1286, + "step": 2250 + }, + { + "epoch": 0.886948416689151, + "grad_norm": 2.022377055680929, + "learning_rate": 1.6522950584195003e-05, + "loss": 1.149, + "step": 2260 + }, + { + "epoch": 0.8908729672054747, + "grad_norm": 1.522868856655693, + "learning_rate": 1.649077426501017e-05, + "loss": 1.0971, + "step": 2270 + }, + { + "epoch": 0.8947975177217984, + "grad_norm": 1.3697259871875316, + "learning_rate": 1.6458481401686334e-05, + "loss": 1.0805, + "step": 2280 + }, + { + "epoch": 0.8987220682381221, + "grad_norm": 1.018922203182855, + "learning_rate": 1.6426072574053238e-05, + "loss": 1.1299, + "step": 2290 + }, + { + "epoch": 0.9026466187544457, + "grad_norm": 2.375176209683022, + "learning_rate": 1.6393548364022803e-05, + "loss": 1.1244, + "step": 2300 + }, + { + "epoch": 0.9065711692707694, + "grad_norm": 1.9892422302580408, + "learning_rate": 1.636090935557868e-05, + "loss": 1.1825, + "step": 2310 + }, + { + "epoch": 0.9104957197870931, + "grad_norm": 1.5724093274080302, + "learning_rate": 1.632815613476576e-05, + "loss": 1.1016, + "step": 2320 + }, + { + "epoch": 0.9144202703034168, + "grad_norm": 1.2589150941461418, + "learning_rate": 1.6295289289679674e-05, + "loss": 1.1056, + "step": 2330 + }, + { + "epoch": 0.9183448208197404, + "grad_norm": 0.946685256837567, + "learning_rate": 1.62623094104562e-05, + "loss": 1.0829, + "step": 2340 + }, + { + 
"epoch": 0.9222693713360641, + "grad_norm": 2.352958803717975, + "learning_rate": 1.6229217089260695e-05, + "loss": 1.1514, + "step": 2350 + }, + { + "epoch": 0.9261939218523878, + "grad_norm": 1.9500631427530646, + "learning_rate": 1.6196012920277436e-05, + "loss": 1.1563, + "step": 2360 + }, + { + "epoch": 0.9301184723687115, + "grad_norm": 1.5860068083635046, + "learning_rate": 1.616269749969899e-05, + "loss": 1.0999, + "step": 2370 + }, + { + "epoch": 0.9340430228850352, + "grad_norm": 1.2393963203713174, + "learning_rate": 1.6129271425715458e-05, + "loss": 1.1056, + "step": 2380 + }, + { + "epoch": 0.9379675734013588, + "grad_norm": 1.0108366563366444, + "learning_rate": 1.609573529850379e-05, + "loss": 1.0886, + "step": 2390 + }, + { + "epoch": 0.9418921239176825, + "grad_norm": 2.307306825085365, + "learning_rate": 1.6062089720216956e-05, + "loss": 1.125, + "step": 2400 + }, + { + "epoch": 0.9458166744340062, + "grad_norm": 2.08922761238031, + "learning_rate": 1.6028335294973182e-05, + "loss": 1.1676, + "step": 2410 + }, + { + "epoch": 0.9497412249503299, + "grad_norm": 1.5072970519469342, + "learning_rate": 1.5994472628845054e-05, + "loss": 1.0805, + "step": 2420 + }, + { + "epoch": 0.9536657754666535, + "grad_norm": 1.208673658456983, + "learning_rate": 1.5960502329848683e-05, + "loss": 1.1023, + "step": 2430 + }, + { + "epoch": 0.9575903259829772, + "grad_norm": 1.0701544169565054, + "learning_rate": 1.5926425007932747e-05, + "loss": 1.0802, + "step": 2440 + }, + { + "epoch": 0.9615148764993009, + "grad_norm": 2.2259127710545745, + "learning_rate": 1.5892241274967578e-05, + "loss": 1.1306, + "step": 2450 + }, + { + "epoch": 0.9654394270156246, + "grad_norm": 1.9613490972509378, + "learning_rate": 1.5857951744734145e-05, + "loss": 1.1527, + "step": 2460 + }, + { + "epoch": 0.9693639775319483, + "grad_norm": 1.6339352423393527, + "learning_rate": 1.5823557032913045e-05, + "loss": 1.1173, + "step": 2470 + }, + { + "epoch": 0.9732885280482719, + "grad_norm": 1.2845543184685153, + "learning_rate": 1.5789057757073444e-05, + "loss": 1.0858, + "step": 2480 + }, + { + "epoch": 0.9772130785645956, + "grad_norm": 1.0634701364462926, + "learning_rate": 1.5754454536662e-05, + "loss": 1.0772, + "step": 2490 + }, + { + "epoch": 0.9811376290809193, + "grad_norm": 2.5457603854360618, + "learning_rate": 1.5719747992991723e-05, + "loss": 1.1572, + "step": 2500 + }, + { + "epoch": 0.985062179597243, + "grad_norm": 1.9734906496943037, + "learning_rate": 1.568493874923084e-05, + "loss": 1.1277, + "step": 2510 + }, + { + "epoch": 0.9889867301135666, + "grad_norm": 1.5783237318057073, + "learning_rate": 1.5650027430391584e-05, + "loss": 1.0856, + "step": 2520 + }, + { + "epoch": 0.9929112806298903, + "grad_norm": 1.4264738203605272, + "learning_rate": 1.5615014663318993e-05, + "loss": 1.1078, + "step": 2530 + }, + { + "epoch": 0.996835831146214, + "grad_norm": 1.1620086010183999, + "learning_rate": 1.5579901076679625e-05, + "loss": 1.1097, + "step": 2540 + }, + { + "epoch": 0.999975471559273, + "eval_loss": 0.9125259518623352, + "eval_runtime": 1520.6591, + "eval_samples_per_second": 16.44, + "eval_steps_per_second": 4.11, + "step": 2548 + }, + { + "epoch": 1.0007849101032646, + "grad_norm": 1.0252573802297382, + "learning_rate": 1.5544687300950306e-05, + "loss": 0.9338, + "step": 2550 + }, + { + "epoch": 1.0047094606195883, + "grad_norm": 2.201134957778082, + "learning_rate": 1.5509373968406792e-05, + "loss": 0.9016, + "step": 2560 + }, + { + "epoch": 1.008634011135912, + "grad_norm": 
1.7915224952217608, + "learning_rate": 1.5473961713112405e-05, + "loss": 0.991, + "step": 2570 + }, + { + "epoch": 1.0125585616522357, + "grad_norm": 1.7104141742083112, + "learning_rate": 1.5438451170906672e-05, + "loss": 0.9134, + "step": 2580 + }, + { + "epoch": 1.0164831121685594, + "grad_norm": 1.488033671149185, + "learning_rate": 1.5402842979393882e-05, + "loss": 0.8688, + "step": 2590 + }, + { + "epoch": 1.020407662684883, + "grad_norm": 1.1133098743882353, + "learning_rate": 1.5367137777931673e-05, + "loss": 0.8432, + "step": 2600 + }, + { + "epoch": 1.0243322132012067, + "grad_norm": 2.0585351825127143, + "learning_rate": 1.5331336207619507e-05, + "loss": 0.8874, + "step": 2610 + }, + { + "epoch": 1.0282567637175304, + "grad_norm": 1.867901103345647, + "learning_rate": 1.5295438911287203e-05, + "loss": 0.9336, + "step": 2620 + }, + { + "epoch": 1.032181314233854, + "grad_norm": 1.6563402725505043, + "learning_rate": 1.5259446533483357e-05, + "loss": 0.8879, + "step": 2630 + }, + { + "epoch": 1.0361058647501777, + "grad_norm": 1.4856230129943386, + "learning_rate": 1.5223359720463796e-05, + "loss": 0.859, + "step": 2640 + }, + { + "epoch": 1.0400304152665014, + "grad_norm": 0.9711013069770768, + "learning_rate": 1.5187179120179969e-05, + "loss": 0.8288, + "step": 2650 + }, + { + "epoch": 1.043954965782825, + "grad_norm": 1.9828694802282283, + "learning_rate": 1.5150905382267299e-05, + "loss": 0.8955, + "step": 2660 + }, + { + "epoch": 1.0478795162991488, + "grad_norm": 2.0020449073070283, + "learning_rate": 1.511453915803353e-05, + "loss": 0.9694, + "step": 2670 + }, + { + "epoch": 1.0518040668154724, + "grad_norm": 1.6466177405453537, + "learning_rate": 1.5078081100447035e-05, + "loss": 0.9115, + "step": 2680 + }, + { + "epoch": 1.0557286173317961, + "grad_norm": 1.7037544484554887, + "learning_rate": 1.5041531864125082e-05, + "loss": 0.8493, + "step": 2690 + }, + { + "epoch": 1.0596531678481198, + "grad_norm": 0.9649533896621292, + "learning_rate": 1.5004892105322092e-05, + "loss": 0.8204, + "step": 2700 + }, + { + "epoch": 1.0635777183644435, + "grad_norm": 2.104909863577972, + "learning_rate": 1.4968162481917836e-05, + "loss": 0.9002, + "step": 2710 + }, + { + "epoch": 1.0675022688807672, + "grad_norm": 2.0341252133554146, + "learning_rate": 1.4931343653405652e-05, + "loss": 0.9456, + "step": 2720 + }, + { + "epoch": 1.0714268193970908, + "grad_norm": 1.5915526204280668, + "learning_rate": 1.4894436280880578e-05, + "loss": 0.8801, + "step": 2730 + }, + { + "epoch": 1.0753513699134145, + "grad_norm": 1.5184799515062875, + "learning_rate": 1.4857441027027486e-05, + "loss": 0.8608, + "step": 2740 + }, + { + "epoch": 1.0792759204297382, + "grad_norm": 1.0739540966168113, + "learning_rate": 1.4820358556109202e-05, + "loss": 0.8383, + "step": 2750 + }, + { + "epoch": 1.0832004709460619, + "grad_norm": 2.3527635362598787, + "learning_rate": 1.4783189533954555e-05, + "loss": 0.8989, + "step": 2760 + }, + { + "epoch": 1.0871250214623855, + "grad_norm": 1.8879516919488128, + "learning_rate": 1.4745934627946432e-05, + "loss": 0.9203, + "step": 2770 + }, + { + "epoch": 1.0910495719787092, + "grad_norm": 1.6794025897082823, + "learning_rate": 1.4708594507009806e-05, + "loss": 0.8939, + "step": 2780 + }, + { + "epoch": 1.094974122495033, + "grad_norm": 1.263872598631979, + "learning_rate": 1.4671169841599695e-05, + "loss": 0.8435, + "step": 2790 + }, + { + "epoch": 1.0988986730113566, + "grad_norm": 0.9984868253029158, + "learning_rate": 1.4633661303689157e-05, + "loss": 0.8568, + "step": 
2800 + }, + { + "epoch": 1.1028232235276803, + "grad_norm": 2.060249688323629, + "learning_rate": 1.4596069566757207e-05, + "loss": 0.8882, + "step": 2810 + }, + { + "epoch": 1.106747774044004, + "grad_norm": 1.920918977602427, + "learning_rate": 1.4558395305776731e-05, + "loss": 0.9299, + "step": 2820 + }, + { + "epoch": 1.1106723245603276, + "grad_norm": 1.6211510195551506, + "learning_rate": 1.4520639197202355e-05, + "loss": 0.8898, + "step": 2830 + }, + { + "epoch": 1.1145968750766513, + "grad_norm": 1.229775633396699, + "learning_rate": 1.4482801918958312e-05, + "loss": 0.8609, + "step": 2840 + }, + { + "epoch": 1.118521425592975, + "grad_norm": 1.1229141852110762, + "learning_rate": 1.4444884150426267e-05, + "loss": 0.8456, + "step": 2850 + }, + { + "epoch": 1.1224459761092986, + "grad_norm": 2.120911251605639, + "learning_rate": 1.4406886572433113e-05, + "loss": 0.8918, + "step": 2860 + }, + { + "epoch": 1.1263705266256223, + "grad_norm": 2.0769262910789767, + "learning_rate": 1.4368809867238754e-05, + "loss": 0.9531, + "step": 2870 + }, + { + "epoch": 1.130295077141946, + "grad_norm": 1.6225603088474205, + "learning_rate": 1.4330654718523847e-05, + "loss": 0.8979, + "step": 2880 + }, + { + "epoch": 1.1342196276582697, + "grad_norm": 1.1891920263768887, + "learning_rate": 1.4292421811377532e-05, + "loss": 0.8697, + "step": 2890 + }, + { + "epoch": 1.1381441781745933, + "grad_norm": 1.0159234075717296, + "learning_rate": 1.4254111832285128e-05, + "loss": 0.8353, + "step": 2900 + }, + { + "epoch": 1.142068728690917, + "grad_norm": 2.0861820849148383, + "learning_rate": 1.4215725469115806e-05, + "loss": 0.8676, + "step": 2910 + }, + { + "epoch": 1.1459932792072407, + "grad_norm": 1.9828582401916874, + "learning_rate": 1.4177263411110249e-05, + "loss": 0.9457, + "step": 2920 + }, + { + "epoch": 1.1499178297235644, + "grad_norm": 1.6363978446581915, + "learning_rate": 1.413872634886825e-05, + "loss": 0.8682, + "step": 2930 + }, + { + "epoch": 1.153842380239888, + "grad_norm": 1.414190445025758, + "learning_rate": 1.4100114974336352e-05, + "loss": 0.8663, + "step": 2940 + }, + { + "epoch": 1.1577669307562117, + "grad_norm": 0.9877954021628765, + "learning_rate": 1.4061429980795382e-05, + "loss": 0.8439, + "step": 2950 + }, + { + "epoch": 1.1616914812725354, + "grad_norm": 2.2358539386859726, + "learning_rate": 1.4022672062848034e-05, + "loss": 0.8819, + "step": 2960 + }, + { + "epoch": 1.165616031788859, + "grad_norm": 1.8768050513117698, + "learning_rate": 1.3983841916406383e-05, + "loss": 0.9261, + "step": 2970 + }, + { + "epoch": 1.1695405823051828, + "grad_norm": 1.6017361554290517, + "learning_rate": 1.3944940238679384e-05, + "loss": 0.898, + "step": 2980 + }, + { + "epoch": 1.1734651328215064, + "grad_norm": 1.2526189890877126, + "learning_rate": 1.390596772816037e-05, + "loss": 0.8496, + "step": 2990 + }, + { + "epoch": 1.1773896833378301, + "grad_norm": 1.0365383728001132, + "learning_rate": 1.3866925084614501e-05, + "loss": 0.8468, + "step": 3000 + }, + { + "epoch": 1.1813142338541538, + "grad_norm": 2.224248057811622, + "learning_rate": 1.3827813009066202e-05, + "loss": 0.8759, + "step": 3010 + }, + { + "epoch": 1.1852387843704775, + "grad_norm": 1.8752890091264613, + "learning_rate": 1.3788632203786567e-05, + "loss": 0.9297, + "step": 3020 + }, + { + "epoch": 1.1891633348868011, + "grad_norm": 1.6600876268313813, + "learning_rate": 1.374938337228076e-05, + "loss": 0.87, + "step": 3030 + }, + { + "epoch": 1.1930878854031248, + "grad_norm": 1.5276898634308227, + "learning_rate": 
1.3710067219275382e-05, + "loss": 0.8693, + "step": 3040 + }, + { + "epoch": 1.1970124359194485, + "grad_norm": 0.9561545850483195, + "learning_rate": 1.3670684450705813e-05, + "loss": 0.8369, + "step": 3050 + }, + { + "epoch": 1.2009369864357722, + "grad_norm": 2.3827528867634307, + "learning_rate": 1.3631235773703535e-05, + "loss": 0.8932, + "step": 3060 + }, + { + "epoch": 1.2048615369520959, + "grad_norm": 2.0991202817563828, + "learning_rate": 1.3591721896583455e-05, + "loss": 0.9404, + "step": 3070 + }, + { + "epoch": 1.2087860874684195, + "grad_norm": 1.6251446131482838, + "learning_rate": 1.3552143528831149e-05, + "loss": 0.8804, + "step": 3080 + }, + { + "epoch": 1.2127106379847432, + "grad_norm": 1.3451655629852488, + "learning_rate": 1.3512501381090158e-05, + "loss": 0.8529, + "step": 3090 + }, + { + "epoch": 1.2166351885010669, + "grad_norm": 0.8385047694025927, + "learning_rate": 1.3472796165149217e-05, + "loss": 0.818, + "step": 3100 + }, + { + "epoch": 1.2205597390173906, + "grad_norm": 2.267246006997812, + "learning_rate": 1.3433028593929467e-05, + "loss": 0.8779, + "step": 3110 + }, + { + "epoch": 1.2244842895337142, + "grad_norm": 2.0703651009730035, + "learning_rate": 1.3393199381471657e-05, + "loss": 0.9371, + "step": 3120 + }, + { + "epoch": 1.228408840050038, + "grad_norm": 1.6728269310666124, + "learning_rate": 1.3353309242923336e-05, + "loss": 0.862, + "step": 3130 + }, + { + "epoch": 1.2323333905663616, + "grad_norm": 1.3943213550602511, + "learning_rate": 1.3313358894525997e-05, + "loss": 0.8734, + "step": 3140 + }, + { + "epoch": 1.2362579410826853, + "grad_norm": 0.9417267831082166, + "learning_rate": 1.327334905360222e-05, + "loss": 0.819, + "step": 3150 + }, + { + "epoch": 1.240182491599009, + "grad_norm": 2.03618285439807, + "learning_rate": 1.3233280438542795e-05, + "loss": 0.8671, + "step": 3160 + }, + { + "epoch": 1.2441070421153326, + "grad_norm": 1.8648736136084698, + "learning_rate": 1.319315376879383e-05, + "loss": 0.945, + "step": 3170 + }, + { + "epoch": 1.2480315926316563, + "grad_norm": 1.5679297693032792, + "learning_rate": 1.3152969764843812e-05, + "loss": 0.8778, + "step": 3180 + }, + { + "epoch": 1.25195614314798, + "grad_norm": 1.375990035837938, + "learning_rate": 1.3112729148210694e-05, + "loss": 0.8501, + "step": 3190 + }, + { + "epoch": 1.2558806936643037, + "grad_norm": 0.8945069929861201, + "learning_rate": 1.3072432641428931e-05, + "loss": 0.8555, + "step": 3200 + }, + { + "epoch": 1.2598052441806273, + "grad_norm": 1.9973442354473332, + "learning_rate": 1.3032080968036498e-05, + "loss": 0.8922, + "step": 3210 + }, + { + "epoch": 1.263729794696951, + "grad_norm": 2.1062119639954444, + "learning_rate": 1.2991674852561904e-05, + "loss": 0.929, + "step": 3220 + }, + { + "epoch": 1.2676543452132747, + "grad_norm": 1.6116944823889532, + "learning_rate": 1.2951215020511196e-05, + "loss": 0.8672, + "step": 3230 + }, + { + "epoch": 1.2715788957295984, + "grad_norm": 1.330101731052534, + "learning_rate": 1.2910702198354915e-05, + "loss": 0.853, + "step": 3240 + }, + { + "epoch": 1.275503446245922, + "grad_norm": 1.0644575098102675, + "learning_rate": 1.2870137113515053e-05, + "loss": 0.8281, + "step": 3250 + }, + { + "epoch": 1.2794279967622457, + "grad_norm": 2.4521439306633748, + "learning_rate": 1.2829520494352004e-05, + "loss": 0.8696, + "step": 3260 + }, + { + "epoch": 1.2833525472785694, + "grad_norm": 2.0163802307404466, + "learning_rate": 1.2788853070151477e-05, + "loss": 0.9172, + "step": 3270 + }, + { + "epoch": 1.287277097794893, + 
"grad_norm": 1.6281065777711845, + "learning_rate": 1.2748135571111404e-05, + "loss": 0.8644, + "step": 3280 + }, + { + "epoch": 1.2912016483112168, + "grad_norm": 1.5009876264848987, + "learning_rate": 1.2707368728328826e-05, + "loss": 0.8654, + "step": 3290 + }, + { + "epoch": 1.2951261988275404, + "grad_norm": 0.7853723853615091, + "learning_rate": 1.2666553273786771e-05, + "loss": 0.8031, + "step": 3300 + }, + { + "epoch": 1.299050749343864, + "grad_norm": 2.189696126410242, + "learning_rate": 1.2625689940341102e-05, + "loss": 0.8795, + "step": 3310 + }, + { + "epoch": 1.3029752998601878, + "grad_norm": 1.9334298716685878, + "learning_rate": 1.2584779461707374e-05, + "loss": 0.926, + "step": 3320 + }, + { + "epoch": 1.3068998503765115, + "grad_norm": 1.5481872721801182, + "learning_rate": 1.254382257244765e-05, + "loss": 0.8565, + "step": 3330 + }, + { + "epoch": 1.3108244008928351, + "grad_norm": 1.2607713095759323, + "learning_rate": 1.2502820007957302e-05, + "loss": 0.8505, + "step": 3340 + }, + { + "epoch": 1.3147489514091588, + "grad_norm": 0.8930555325270236, + "learning_rate": 1.2461772504451822e-05, + "loss": 0.8323, + "step": 3350 + }, + { + "epoch": 1.3186735019254825, + "grad_norm": 1.9338117374514918, + "learning_rate": 1.2420680798953604e-05, + "loss": 0.8754, + "step": 3360 + }, + { + "epoch": 1.3225980524418062, + "grad_norm": 2.061421402861005, + "learning_rate": 1.2379545629278693e-05, + "loss": 0.9426, + "step": 3370 + }, + { + "epoch": 1.3265226029581298, + "grad_norm": 1.588986245387613, + "learning_rate": 1.233836773402356e-05, + "loss": 0.867, + "step": 3380 + }, + { + "epoch": 1.3304471534744535, + "grad_norm": 1.2886545052157778, + "learning_rate": 1.229714785255182e-05, + "loss": 0.868, + "step": 3390 + }, + { + "epoch": 1.3343717039907772, + "grad_norm": 1.219703452074695, + "learning_rate": 1.2255886724980974e-05, + "loss": 0.8168, + "step": 3400 + }, + { + "epoch": 1.3382962545071009, + "grad_norm": 2.236888625402866, + "learning_rate": 1.2214585092169103e-05, + "loss": 0.8794, + "step": 3410 + }, + { + "epoch": 1.3422208050234246, + "grad_norm": 2.13039765109174, + "learning_rate": 1.2173243695701575e-05, + "loss": 0.9218, + "step": 3420 + }, + { + "epoch": 1.3461453555397482, + "grad_norm": 1.652775626275893, + "learning_rate": 1.213186327787773e-05, + "loss": 0.8937, + "step": 3430 + }, + { + "epoch": 1.350069906056072, + "grad_norm": 1.469984852272106, + "learning_rate": 1.209044458169756e-05, + "loss": 0.8582, + "step": 3440 + }, + { + "epoch": 1.3539944565723956, + "grad_norm": 0.8843940147907386, + "learning_rate": 1.2048988350848338e-05, + "loss": 0.8537, + "step": 3450 + }, + { + "epoch": 1.3579190070887193, + "grad_norm": 2.286808087504479, + "learning_rate": 1.2007495329691301e-05, + "loss": 0.8915, + "step": 3460 + }, + { + "epoch": 1.361843557605043, + "grad_norm": 2.1637511011528585, + "learning_rate": 1.1965966263248267e-05, + "loss": 0.9545, + "step": 3470 + }, + { + "epoch": 1.3657681081213666, + "grad_norm": 1.615967978820237, + "learning_rate": 1.192440189718825e-05, + "loss": 0.889, + "step": 3480 + }, + { + "epoch": 1.3696926586376903, + "grad_norm": 1.3561598955089822, + "learning_rate": 1.1882802977814092e-05, + "loss": 0.8568, + "step": 3490 + }, + { + "epoch": 1.373617209154014, + "grad_norm": 1.031049725329889, + "learning_rate": 1.184117025204905e-05, + "loss": 0.8065, + "step": 3500 + }, + { + "epoch": 1.3775417596703377, + "grad_norm": 2.130942563304923, + "learning_rate": 1.1799504467423382e-05, + "loss": 0.8781, + "step": 3510 
+ }, + { + "epoch": 1.3814663101866613, + "grad_norm": 1.914562387879776, + "learning_rate": 1.1757806372060934e-05, + "loss": 0.9244, + "step": 3520 + }, + { + "epoch": 1.385390860702985, + "grad_norm": 1.6890782058344322, + "learning_rate": 1.1716076714665701e-05, + "loss": 0.8621, + "step": 3530 + }, + { + "epoch": 1.3893154112193087, + "grad_norm": 1.2006002638188844, + "learning_rate": 1.1674316244508381e-05, + "loss": 0.8393, + "step": 3540 + }, + { + "epoch": 1.3932399617356324, + "grad_norm": 1.168530668598062, + "learning_rate": 1.1632525711412936e-05, + "loss": 0.8402, + "step": 3550 + }, + { + "epoch": 1.397164512251956, + "grad_norm": 2.214073914981144, + "learning_rate": 1.1590705865743108e-05, + "loss": 0.8794, + "step": 3560 + }, + { + "epoch": 1.4010890627682797, + "grad_norm": 1.9175558977832043, + "learning_rate": 1.1548857458388967e-05, + "loss": 0.9226, + "step": 3570 + }, + { + "epoch": 1.4050136132846034, + "grad_norm": 1.7516445773007234, + "learning_rate": 1.1506981240753406e-05, + "loss": 0.8688, + "step": 3580 + }, + { + "epoch": 1.408938163800927, + "grad_norm": 1.236950976064499, + "learning_rate": 1.1465077964738674e-05, + "loss": 0.8604, + "step": 3590 + }, + { + "epoch": 1.4128627143172507, + "grad_norm": 0.9946584575727137, + "learning_rate": 1.1423148382732854e-05, + "loss": 0.8198, + "step": 3600 + }, + { + "epoch": 1.4167872648335744, + "grad_norm": 2.357981326812003, + "learning_rate": 1.1381193247596365e-05, + "loss": 0.8815, + "step": 3610 + }, + { + "epoch": 1.420711815349898, + "grad_norm": 2.068839516965018, + "learning_rate": 1.133921331264844e-05, + "loss": 0.9218, + "step": 3620 + }, + { + "epoch": 1.4246363658662218, + "grad_norm": 1.7728344994302516, + "learning_rate": 1.1297209331653606e-05, + "loss": 0.8679, + "step": 3630 + }, + { + "epoch": 1.4285609163825455, + "grad_norm": 1.2048121431631056, + "learning_rate": 1.1255182058808143e-05, + "loss": 0.8544, + "step": 3640 + }, + { + "epoch": 1.4324854668988691, + "grad_norm": 1.333275738578586, + "learning_rate": 1.1213132248726541e-05, + "loss": 0.8409, + "step": 3650 + }, + { + "epoch": 1.4364100174151928, + "grad_norm": 2.206900104214621, + "learning_rate": 1.1171060656427957e-05, + "loss": 0.881, + "step": 3660 + }, + { + "epoch": 1.4403345679315165, + "grad_norm": 2.0032350194617923, + "learning_rate": 1.1128968037322654e-05, + "loss": 0.9304, + "step": 3670 + }, + { + "epoch": 1.4442591184478402, + "grad_norm": 1.899875816582961, + "learning_rate": 1.1086855147198442e-05, + "loss": 0.8756, + "step": 3680 + }, + { + "epoch": 1.4481836689641638, + "grad_norm": 1.1750450632734881, + "learning_rate": 1.1044722742207102e-05, + "loss": 0.8516, + "step": 3690 + }, + { + "epoch": 1.4521082194804875, + "grad_norm": 1.0654206477147208, + "learning_rate": 1.1002571578850808e-05, + "loss": 0.8287, + "step": 3700 + }, + { + "epoch": 1.4560327699968112, + "grad_norm": 2.1156979827088698, + "learning_rate": 1.0960402413968552e-05, + "loss": 0.8747, + "step": 3710 + }, + { + "epoch": 1.4599573205131349, + "grad_norm": 1.9613993823327838, + "learning_rate": 1.0918216004722551e-05, + "loss": 0.9248, + "step": 3720 + }, + { + "epoch": 1.4638818710294585, + "grad_norm": 1.5896298448753268, + "learning_rate": 1.0876013108584644e-05, + "loss": 0.862, + "step": 3730 + }, + { + "epoch": 1.4678064215457822, + "grad_norm": 1.2663265403929282, + "learning_rate": 1.08337944833227e-05, + "loss": 0.8671, + "step": 3740 + }, + { + "epoch": 1.471730972062106, + "grad_norm": 0.8402022117082641, + "learning_rate": 
1.0791560886987016e-05, + "loss": 0.8089, + "step": 3750 + }, + { + "epoch": 1.4756555225784296, + "grad_norm": 2.2895246842794346, + "learning_rate": 1.0749313077896697e-05, + "loss": 0.8865, + "step": 3760 + }, + { + "epoch": 1.4795800730947533, + "grad_norm": 2.005690344314872, + "learning_rate": 1.0707051814626035e-05, + "loss": 0.9195, + "step": 3770 + }, + { + "epoch": 1.483504623611077, + "grad_norm": 1.6528392426188365, + "learning_rate": 1.0664777855990909e-05, + "loss": 0.8482, + "step": 3780 + }, + { + "epoch": 1.4874291741274006, + "grad_norm": 1.3252339862500955, + "learning_rate": 1.062249196103514e-05, + "loss": 0.8633, + "step": 3790 + }, + { + "epoch": 1.4913537246437243, + "grad_norm": 0.8384617708635065, + "learning_rate": 1.0580194889016866e-05, + "loss": 0.8424, + "step": 3800 + }, + { + "epoch": 1.495278275160048, + "grad_norm": 2.2376729601071013, + "learning_rate": 1.0537887399394926e-05, + "loss": 0.8698, + "step": 3810 + }, + { + "epoch": 1.4992028256763716, + "grad_norm": 2.0054017581982158, + "learning_rate": 1.0495570251815204e-05, + "loss": 0.9146, + "step": 3820 + }, + { + "epoch": 1.5031273761926953, + "grad_norm": 1.620117177752491, + "learning_rate": 1.0453244206096993e-05, + "loss": 0.86, + "step": 3830 + }, + { + "epoch": 1.507051926709019, + "grad_norm": 1.2414222987018593, + "learning_rate": 1.0410910022219356e-05, + "loss": 0.8462, + "step": 3840 + }, + { + "epoch": 1.5109764772253427, + "grad_norm": 0.985434847255323, + "learning_rate": 1.0368568460307482e-05, + "loss": 0.8374, + "step": 3850 + }, + { + "epoch": 1.5149010277416664, + "grad_norm": 2.414664203183688, + "learning_rate": 1.0326220280619036e-05, + "loss": 0.8643, + "step": 3860 + }, + { + "epoch": 1.51882557825799, + "grad_norm": 1.959661011938626, + "learning_rate": 1.0283866243530506e-05, + "loss": 0.9216, + "step": 3870 + }, + { + "epoch": 1.5227501287743137, + "grad_norm": 1.8216460410768873, + "learning_rate": 1.0241507109523551e-05, + "loss": 0.8557, + "step": 3880 + }, + { + "epoch": 1.5266746792906374, + "grad_norm": 1.2379402019260293, + "learning_rate": 1.019914363917135e-05, + "loss": 0.8528, + "step": 3890 + }, + { + "epoch": 1.530599229806961, + "grad_norm": 1.1791841499933475, + "learning_rate": 1.0156776593124933e-05, + "loss": 0.8409, + "step": 3900 + }, + { + "epoch": 1.5345237803232847, + "grad_norm": 2.2744852259625294, + "learning_rate": 1.0114406732099549e-05, + "loss": 0.877, + "step": 3910 + }, + { + "epoch": 1.5384483308396084, + "grad_norm": 1.9131218031571517, + "learning_rate": 1.0072034816860979e-05, + "loss": 0.9287, + "step": 3920 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 1.7840885206492576, + "learning_rate": 1.0029661608211884e-05, + "loss": 0.8511, + "step": 3930 + }, + { + "epoch": 1.5462974318722558, + "grad_norm": 1.227268938632673, + "learning_rate": 9.987287866978169e-06, + "loss": 0.8535, + "step": 3940 + }, + { + "epoch": 1.5502219823885794, + "grad_norm": 1.1621128799647606, + "learning_rate": 9.944914353995277e-06, + "loss": 0.8447, + "step": 3950 + }, + { + "epoch": 1.5541465329049031, + "grad_norm": 2.2074040426072483, + "learning_rate": 9.90254183009457e-06, + "loss": 0.8529, + "step": 3960 + }, + { + "epoch": 1.5580710834212268, + "grad_norm": 1.8956680374875223, + "learning_rate": 9.860171056089646e-06, + "loss": 0.9103, + "step": 3970 + }, + { + "epoch": 1.5619956339375505, + "grad_norm": 1.8417783539473633, + "learning_rate": 9.817802792762675e-06, + "loss": 0.8619, + "step": 3980 + }, + { + "epoch": 1.5659201844538742, + 
"grad_norm": 1.1239027585518444, + "learning_rate": 9.775437800850764e-06, + "loss": 0.8405, + "step": 3990 + }, + { + "epoch": 1.5698447349701978, + "grad_norm": 1.093899462684469, + "learning_rate": 9.73307684103226e-06, + "loss": 0.8409, + "step": 4000 + }, + { + "epoch": 1.5737692854865215, + "grad_norm": 2.2458835890160698, + "learning_rate": 9.690720673913135e-06, + "loss": 0.8331, + "step": 4010 + }, + { + "epoch": 1.5776938360028452, + "grad_norm": 1.9788980494060489, + "learning_rate": 9.648370060013279e-06, + "loss": 0.9097, + "step": 4020 + }, + { + "epoch": 1.5816183865191689, + "grad_norm": 1.627824694857222, + "learning_rate": 9.606025759752895e-06, + "loss": 0.8831, + "step": 4030 + }, + { + "epoch": 1.5855429370354925, + "grad_norm": 1.2178062700728904, + "learning_rate": 9.56368853343882e-06, + "loss": 0.8462, + "step": 4040 + }, + { + "epoch": 1.5894674875518162, + "grad_norm": 0.8688900774056482, + "learning_rate": 9.52135914125086e-06, + "loss": 0.8132, + "step": 4050 + }, + { + "epoch": 1.59339203806814, + "grad_norm": 2.223343280091266, + "learning_rate": 9.479038343228173e-06, + "loss": 0.8987, + "step": 4060 + }, + { + "epoch": 1.5973165885844636, + "grad_norm": 1.9082308916293935, + "learning_rate": 9.436726899255596e-06, + "loss": 0.9305, + "step": 4070 + }, + { + "epoch": 1.6012411391007872, + "grad_norm": 1.6281376379032095, + "learning_rate": 9.394425569050018e-06, + "loss": 0.8806, + "step": 4080 + }, + { + "epoch": 1.605165689617111, + "grad_norm": 1.3465856506420029, + "learning_rate": 9.352135112146726e-06, + "loss": 0.8553, + "step": 4090 + }, + { + "epoch": 1.6090902401334346, + "grad_norm": 0.8680202506295652, + "learning_rate": 9.309856287885775e-06, + "loss": 0.8224, + "step": 4100 + }, + { + "epoch": 1.6130147906497583, + "grad_norm": 2.0551986939006266, + "learning_rate": 9.267589855398356e-06, + "loss": 0.866, + "step": 4110 + }, + { + "epoch": 1.616939341166082, + "grad_norm": 2.009568214502001, + "learning_rate": 9.22533657359315e-06, + "loss": 0.9291, + "step": 4120 + }, + { + "epoch": 1.6208638916824056, + "grad_norm": 1.63900979800811, + "learning_rate": 9.183097201142722e-06, + "loss": 0.8596, + "step": 4130 + }, + { + "epoch": 1.6247884421987293, + "grad_norm": 1.270695193750196, + "learning_rate": 9.140872496469891e-06, + "loss": 0.8496, + "step": 4140 + }, + { + "epoch": 1.628712992715053, + "grad_norm": 0.800400571374905, + "learning_rate": 9.098663217734102e-06, + "loss": 0.8171, + "step": 4150 + }, + { + "epoch": 1.6326375432313767, + "grad_norm": 2.239858328605921, + "learning_rate": 9.056470122817836e-06, + "loss": 0.8696, + "step": 4160 + }, + { + "epoch": 1.6365620937477003, + "grad_norm": 2.0001741452838444, + "learning_rate": 9.01429396931297e-06, + "loss": 0.9174, + "step": 4170 + }, + { + "epoch": 1.640486644264024, + "grad_norm": 1.6873988098563508, + "learning_rate": 8.972135514507212e-06, + "loss": 0.8725, + "step": 4180 + }, + { + "epoch": 1.6444111947803477, + "grad_norm": 1.1871894670080978, + "learning_rate": 8.92999551537046e-06, + "loss": 0.819, + "step": 4190 + }, + { + "epoch": 1.6483357452966714, + "grad_norm": 0.8255765266436247, + "learning_rate": 8.88787472854126e-06, + "loss": 0.8178, + "step": 4200 + }, + { + "epoch": 1.652260295812995, + "grad_norm": 2.1797960381842927, + "learning_rate": 8.845773910313168e-06, + "loss": 0.8486, + "step": 4210 + }, + { + "epoch": 1.6561848463293187, + "grad_norm": 1.9916857527591452, + "learning_rate": 8.803693816621218e-06, + "loss": 0.8947, + "step": 4220 + }, + { + "epoch": 
1.6601093968456424, + "grad_norm": 1.7685848462081732, + "learning_rate": 8.761635203028319e-06, + "loss": 0.8766, + "step": 4230 + }, + { + "epoch": 1.664033947361966, + "grad_norm": 1.245584977919593, + "learning_rate": 8.719598824711694e-06, + "loss": 0.8337, + "step": 4240 + }, + { + "epoch": 1.6679584978782898, + "grad_norm": 1.0048598136091462, + "learning_rate": 8.677585436449332e-06, + "loss": 0.8163, + "step": 4250 + }, + { + "epoch": 1.6718830483946134, + "grad_norm": 2.398297940398292, + "learning_rate": 8.635595792606419e-06, + "loss": 0.8559, + "step": 4260 + }, + { + "epoch": 1.6758075989109371, + "grad_norm": 1.9730845143153721, + "learning_rate": 8.593630647121809e-06, + "loss": 0.895, + "step": 4270 + }, + { + "epoch": 1.6797321494272608, + "grad_norm": 1.6696729781305142, + "learning_rate": 8.551690753494476e-06, + "loss": 0.8508, + "step": 4280 + }, + { + "epoch": 1.6836566999435845, + "grad_norm": 1.3530608242464415, + "learning_rate": 8.509776864769982e-06, + "loss": 0.8295, + "step": 4290 + }, + { + "epoch": 1.6875812504599081, + "grad_norm": 0.8499210708063567, + "learning_rate": 8.467889733526977e-06, + "loss": 0.8245, + "step": 4300 + }, + { + "epoch": 1.6915058009762318, + "grad_norm": 2.3773533821815067, + "learning_rate": 8.426030111863654e-06, + "loss": 0.8521, + "step": 4310 + }, + { + "epoch": 1.6954303514925555, + "grad_norm": 2.077394013322105, + "learning_rate": 8.384198751384272e-06, + "loss": 0.9227, + "step": 4320 + }, + { + "epoch": 1.6993549020088792, + "grad_norm": 1.6718741898436833, + "learning_rate": 8.342396403185649e-06, + "loss": 0.8448, + "step": 4330 + }, + { + "epoch": 1.7032794525252029, + "grad_norm": 1.3584330125549005, + "learning_rate": 8.300623817843673e-06, + "loss": 0.8385, + "step": 4340 + }, + { + "epoch": 1.7072040030415265, + "grad_norm": 0.9817563909977678, + "learning_rate": 8.258881745399837e-06, + "loss": 0.8062, + "step": 4350 + }, + { + "epoch": 1.7111285535578502, + "grad_norm": 2.3439029636827127, + "learning_rate": 8.217170935347756e-06, + "loss": 0.8164, + "step": 4360 + }, + { + "epoch": 1.7150531040741739, + "grad_norm": 2.016656786265483, + "learning_rate": 8.17549213661973e-06, + "loss": 0.8954, + "step": 4370 + }, + { + "epoch": 1.7189776545904976, + "grad_norm": 1.7641936297392027, + "learning_rate": 8.133846097573263e-06, + "loss": 0.8658, + "step": 4380 + }, + { + "epoch": 1.7229022051068212, + "grad_norm": 1.289046077866149, + "learning_rate": 8.09223356597767e-06, + "loss": 0.8291, + "step": 4390 + }, + { + "epoch": 1.726826755623145, + "grad_norm": 0.8325407065873541, + "learning_rate": 8.050655289000612e-06, + "loss": 0.8168, + "step": 4400 + }, + { + "epoch": 1.7307513061394686, + "grad_norm": 2.6948112568790124, + "learning_rate": 8.009112013194707e-06, + "loss": 0.8495, + "step": 4410 + }, + { + "epoch": 1.7346758566557923, + "grad_norm": 1.9659980108114699, + "learning_rate": 7.96760448448411e-06, + "loss": 0.8947, + "step": 4420 + }, + { + "epoch": 1.738600407172116, + "grad_norm": 1.8751083414206937, + "learning_rate": 7.926133448151121e-06, + "loss": 0.8493, + "step": 4430 + }, + { + "epoch": 1.7425249576884396, + "grad_norm": 1.2387882054830557, + "learning_rate": 7.884699648822816e-06, + "loss": 0.8267, + "step": 4440 + }, + { + "epoch": 1.7464495082047633, + "grad_norm": 1.1399825715598682, + "learning_rate": 7.843303830457654e-06, + "loss": 0.791, + "step": 4450 + }, + { + "epoch": 1.750374058721087, + "grad_norm": 2.411702115321597, + "learning_rate": 7.801946736332144e-06, + "loss": 0.8578, + 
"step": 4460 + }, + { + "epoch": 1.7542986092374107, + "grad_norm": 2.125672710751084, + "learning_rate": 7.760629109027488e-06, + "loss": 0.8945, + "step": 4470 + }, + { + "epoch": 1.7582231597537343, + "grad_norm": 1.6589788634772225, + "learning_rate": 7.719351690416234e-06, + "loss": 0.8528, + "step": 4480 + }, + { + "epoch": 1.762147710270058, + "grad_norm": 1.2339299024543553, + "learning_rate": 7.678115221648983e-06, + "loss": 0.8264, + "step": 4490 + }, + { + "epoch": 1.7660722607863817, + "grad_norm": 0.8818646846150126, + "learning_rate": 7.636920443141057e-06, + "loss": 0.7858, + "step": 4500 + }, + { + "epoch": 1.7699968113027054, + "grad_norm": 2.227068159454661, + "learning_rate": 7.595768094559226e-06, + "loss": 0.8546, + "step": 4510 + }, + { + "epoch": 1.773921361819029, + "grad_norm": 2.0038773703981527, + "learning_rate": 7.554658914808404e-06, + "loss": 0.8974, + "step": 4520 + }, + { + "epoch": 1.7778459123353527, + "grad_norm": 1.559249536864396, + "learning_rate": 7.513593642018398e-06, + "loss": 0.8488, + "step": 4530 + }, + { + "epoch": 1.7817704628516764, + "grad_norm": 1.2969189118652353, + "learning_rate": 7.472573013530657e-06, + "loss": 0.8509, + "step": 4540 + }, + { + "epoch": 1.785695013368, + "grad_norm": 0.8958343326615786, + "learning_rate": 7.431597765885013e-06, + "loss": 0.7997, + "step": 4550 + }, + { + "epoch": 1.7896195638843237, + "grad_norm": 2.217098339251589, + "learning_rate": 7.39066863480648e-06, + "loss": 0.8348, + "step": 4560 + }, + { + "epoch": 1.7935441144006474, + "grad_norm": 2.1265537658801117, + "learning_rate": 7.349786355192023e-06, + "loss": 0.8944, + "step": 4570 + }, + { + "epoch": 1.797468664916971, + "grad_norm": 1.6797170624498905, + "learning_rate": 7.308951661097379e-06, + "loss": 0.8448, + "step": 4580 + }, + { + "epoch": 1.8013932154332948, + "grad_norm": 1.1786118411152537, + "learning_rate": 7.268165285723875e-06, + "loss": 0.8474, + "step": 4590 + }, + { + "epoch": 1.8053177659496185, + "grad_norm": 0.8503022381368873, + "learning_rate": 7.227427961405245e-06, + "loss": 0.7908, + "step": 4600 + }, + { + "epoch": 1.8092423164659421, + "grad_norm": 2.3703275938509525, + "learning_rate": 7.186740419594505e-06, + "loss": 0.845, + "step": 4610 + }, + { + "epoch": 1.8131668669822658, + "grad_norm": 2.0576172422877073, + "learning_rate": 7.1461033908508004e-06, + "loss": 0.9065, + "step": 4620 + }, + { + "epoch": 1.8170914174985895, + "grad_norm": 1.6463925064837197, + "learning_rate": 7.1055176048263085e-06, + "loss": 0.842, + "step": 4630 + }, + { + "epoch": 1.8210159680149132, + "grad_norm": 1.3242222800874879, + "learning_rate": 7.0649837902531095e-06, + "loss": 0.8499, + "step": 4640 + }, + { + "epoch": 1.8249405185312368, + "grad_norm": 0.8538674555412706, + "learning_rate": 7.0245026749301315e-06, + "loss": 0.8046, + "step": 4650 + }, + { + "epoch": 1.8288650690475605, + "grad_norm": 2.4490816400507516, + "learning_rate": 6.984074985710068e-06, + "loss": 0.8529, + "step": 4660 + }, + { + "epoch": 1.8327896195638842, + "grad_norm": 2.0294960511407822, + "learning_rate": 6.943701448486313e-06, + "loss": 0.8992, + "step": 4670 + }, + { + "epoch": 1.8367141700802079, + "grad_norm": 1.680488532778669, + "learning_rate": 6.903382788179962e-06, + "loss": 0.8566, + "step": 4680 + }, + { + "epoch": 1.8406387205965316, + "grad_norm": 1.2281066012509496, + "learning_rate": 6.8631197287267636e-06, + "loss": 0.8376, + "step": 4690 + }, + { + "epoch": 1.8445632711128552, + "grad_norm": 1.2188089446996344, + "learning_rate": 
6.82291299306414e-06, + "loss": 0.8058, + "step": 4700 + }, + { + "epoch": 1.848487821629179, + "grad_norm": 2.1748995033926497, + "learning_rate": 6.782763303118194e-06, + "loss": 0.8464, + "step": 4710 + }, + { + "epoch": 1.8524123721455026, + "grad_norm": 2.0279495030924046, + "learning_rate": 6.742671379790756e-06, + "loss": 0.8782, + "step": 4720 + }, + { + "epoch": 1.8563369226618263, + "grad_norm": 1.6914708882579899, + "learning_rate": 6.702637942946441e-06, + "loss": 0.8422, + "step": 4730 + }, + { + "epoch": 1.86026147317815, + "grad_norm": 1.3001032946364874, + "learning_rate": 6.662663711399705e-06, + "loss": 0.8189, + "step": 4740 + }, + { + "epoch": 1.8641860236944736, + "grad_norm": 0.9695568949982307, + "learning_rate": 6.622749402901971e-06, + "loss": 0.7972, + "step": 4750 + }, + { + "epoch": 1.8681105742107973, + "grad_norm": 2.3580357832714696, + "learning_rate": 6.5828957341287025e-06, + "loss": 0.8602, + "step": 4760 + }, + { + "epoch": 1.872035124727121, + "grad_norm": 2.053651822379713, + "learning_rate": 6.5431034206665686e-06, + "loss": 0.8946, + "step": 4770 + }, + { + "epoch": 1.8759596752434446, + "grad_norm": 1.6781331457753144, + "learning_rate": 6.503373177000582e-06, + "loss": 0.8479, + "step": 4780 + }, + { + "epoch": 1.8798842257597683, + "grad_norm": 1.3439461982630885, + "learning_rate": 6.463705716501261e-06, + "loss": 0.8108, + "step": 4790 + }, + { + "epoch": 1.883808776276092, + "grad_norm": 0.8215956260267698, + "learning_rate": 6.424101751411842e-06, + "loss": 0.8124, + "step": 4800 + }, + { + "epoch": 1.8877333267924157, + "grad_norm": 2.2909082972168275, + "learning_rate": 6.3845619928354676e-06, + "loss": 0.8253, + "step": 4810 + }, + { + "epoch": 1.8916578773087394, + "grad_norm": 2.1043172809575057, + "learning_rate": 6.345087150722441e-06, + "loss": 0.8767, + "step": 4820 + }, + { + "epoch": 1.895582427825063, + "grad_norm": 1.7239356576641465, + "learning_rate": 6.305677933857455e-06, + "loss": 0.8217, + "step": 4830 + }, + { + "epoch": 1.8995069783413867, + "grad_norm": 1.4857325973939928, + "learning_rate": 6.266335049846886e-06, + "loss": 0.8415, + "step": 4840 + }, + { + "epoch": 1.9034315288577104, + "grad_norm": 0.9334678918114299, + "learning_rate": 6.227059205106085e-06, + "loss": 0.7717, + "step": 4850 + }, + { + "epoch": 1.907356079374034, + "grad_norm": 2.2849232645875297, + "learning_rate": 6.187851104846676e-06, + "loss": 0.846, + "step": 4860 + }, + { + "epoch": 1.9112806298903577, + "grad_norm": 1.945016141615289, + "learning_rate": 6.1487114530639205e-06, + "loss": 0.8882, + "step": 4870 + }, + { + "epoch": 1.9152051804066814, + "grad_norm": 1.7795684379972176, + "learning_rate": 6.109640952524052e-06, + "loss": 0.8329, + "step": 4880 + }, + { + "epoch": 1.919129730923005, + "grad_norm": 1.2407252843374668, + "learning_rate": 6.070640304751677e-06, + "loss": 0.8251, + "step": 4890 + }, + { + "epoch": 1.9230542814393288, + "grad_norm": 0.8194399568120128, + "learning_rate": 6.031710210017171e-06, + "loss": 0.7867, + "step": 4900 + }, + { + "epoch": 1.9269788319556524, + "grad_norm": 2.3956208299915955, + "learning_rate": 5.992851367324097e-06, + "loss": 0.8433, + "step": 4910 + }, + { + "epoch": 1.9309033824719761, + "grad_norm": 2.0827483565619915, + "learning_rate": 5.954064474396675e-06, + "loss": 0.885, + "step": 4920 + }, + { + "epoch": 1.9348279329882998, + "grad_norm": 1.70357287888689, + "learning_rate": 5.915350227667225e-06, + "loss": 0.8385, + "step": 4930 + }, + { + "epoch": 1.9387524835046235, + "grad_norm": 
1.2786111455423548, + "learning_rate": 5.876709322263696e-06, + "loss": 0.8207, + "step": 4940 + }, + { + "epoch": 1.9426770340209472, + "grad_norm": 0.7671237740151083, + "learning_rate": 5.838142451997155e-06, + "loss": 0.8048, + "step": 4950 + }, + { + "epoch": 1.9466015845372708, + "grad_norm": 2.3292976163560546, + "learning_rate": 5.799650309349348e-06, + "loss": 0.8462, + "step": 4960 + }, + { + "epoch": 1.9505261350535945, + "grad_norm": 2.077691017439424, + "learning_rate": 5.761233585460265e-06, + "loss": 0.9123, + "step": 4970 + }, + { + "epoch": 1.9544506855699182, + "grad_norm": 1.6563795333206879, + "learning_rate": 5.722892970115712e-06, + "loss": 0.8154, + "step": 4980 + }, + { + "epoch": 1.9583752360862419, + "grad_norm": 1.3593453278887475, + "learning_rate": 5.684629151734949e-06, + "loss": 0.8108, + "step": 4990 + }, + { + "epoch": 1.9622997866025655, + "grad_norm": 0.8477008150249254, + "learning_rate": 5.6464428173583174e-06, + "loss": 0.7722, + "step": 5000 + }, + { + "epoch": 1.9662243371188892, + "grad_norm": 2.3204148587243103, + "learning_rate": 5.608334652634914e-06, + "loss": 0.8387, + "step": 5010 + }, + { + "epoch": 1.970148887635213, + "grad_norm": 2.08224549041347, + "learning_rate": 5.570305341810252e-06, + "loss": 0.8696, + "step": 5020 + }, + { + "epoch": 1.9740734381515366, + "grad_norm": 1.6687767550667691, + "learning_rate": 5.532355567714013e-06, + "loss": 0.8536, + "step": 5030 + }, + { + "epoch": 1.9779979886678603, + "grad_norm": 1.2616375376723235, + "learning_rate": 5.494486011747761e-06, + "loss": 0.8102, + "step": 5040 + }, + { + "epoch": 1.981922539184184, + "grad_norm": 0.7574755635671572, + "learning_rate": 5.4566973538727216e-06, + "loss": 0.7677, + "step": 5050 + }, + { + "epoch": 1.9858470897005076, + "grad_norm": 2.309973402310073, + "learning_rate": 5.418990272597561e-06, + "loss": 0.839, + "step": 5060 + }, + { + "epoch": 1.9897716402168313, + "grad_norm": 2.196528214038538, + "learning_rate": 5.381365444966205e-06, + "loss": 0.8893, + "step": 5070 + }, + { + "epoch": 1.993696190733155, + "grad_norm": 1.6515133028295628, + "learning_rate": 5.3438235465456926e-06, + "loss": 0.8053, + "step": 5080 + }, + { + "epoch": 1.9976207412494786, + "grad_norm": 1.4943879150775243, + "learning_rate": 5.306365251414043e-06, + "loss": 0.798, + "step": 5090 + }, + { + "epoch": 1.999975471559273, + "eval_loss": 0.6450071930885315, + "eval_runtime": 1529.4328, + "eval_samples_per_second": 16.346, + "eval_steps_per_second": 4.086, + "step": 5096 + }, + { + "epoch": 2.0015698202065293, + "grad_norm": 1.4784398750719419, + "learning_rate": 5.268991232148137e-06, + "loss": 0.8525, + "step": 5100 + }, + { + "epoch": 2.005494370722853, + "grad_norm": 9.338133108180491, + "learning_rate": 5.2317021598116635e-06, + "loss": 0.5966, + "step": 5110 + }, + { + "epoch": 2.0094189212391766, + "grad_norm": 1.8570667185487677, + "learning_rate": 5.1944987039430535e-06, + "loss": 0.6489, + "step": 5120 + }, + { + "epoch": 2.0133434717555003, + "grad_norm": 2.1202392786052697, + "learning_rate": 5.157381532543473e-06, + "loss": 0.6784, + "step": 5130 + }, + { + "epoch": 2.017268022271824, + "grad_norm": 1.700823660700944, + "learning_rate": 5.120351312064802e-06, + "loss": 0.6222, + "step": 5140 + }, + { + "epoch": 2.0211925727881477, + "grad_norm": 1.3821129005652262, + "learning_rate": 5.083408707397704e-06, + "loss": 0.6147, + "step": 5150 + }, + { + "epoch": 2.0251171233044714, + "grad_norm": 1.0951660096886453, + "learning_rate": 5.046554381859663e-06, + "loss": 
0.5845, + "step": 5160 + }, + { + "epoch": 2.029041673820795, + "grad_norm": 2.138790453625462, + "learning_rate": 5.009788997183074e-06, + "loss": 0.6237, + "step": 5170 + }, + { + "epoch": 2.0329662243371187, + "grad_norm": 2.0579134568589597, + "learning_rate": 4.973113213503379e-06, + "loss": 0.705, + "step": 5180 + }, + { + "epoch": 2.0368907748534424, + "grad_norm": 1.75667330374861, + "learning_rate": 4.936527689347195e-06, + "loss": 0.6389, + "step": 5190 + }, + { + "epoch": 2.040815325369766, + "grad_norm": 1.308594348694505, + "learning_rate": 4.9000330816205e-06, + "loss": 0.6035, + "step": 5200 + }, + { + "epoch": 2.0447398758860897, + "grad_norm": 0.9410941640978612, + "learning_rate": 4.863630045596838e-06, + "loss": 0.5541, + "step": 5210 + }, + { + "epoch": 2.0486644264024134, + "grad_norm": 2.1522254372307357, + "learning_rate": 4.8273192349055405e-06, + "loss": 0.5952, + "step": 5220 + }, + { + "epoch": 2.052588976918737, + "grad_norm": 2.036592335343225, + "learning_rate": 4.791101301520016e-06, + "loss": 0.6809, + "step": 5230 + }, + { + "epoch": 2.0565135274350608, + "grad_norm": 1.686946791293678, + "learning_rate": 4.754976895746007e-06, + "loss": 0.6342, + "step": 5240 + }, + { + "epoch": 2.0604380779513845, + "grad_norm": 1.3407168000815266, + "learning_rate": 4.718946666209966e-06, + "loss": 0.6237, + "step": 5250 + }, + { + "epoch": 2.064362628467708, + "grad_norm": 0.8377432352577971, + "learning_rate": 4.683011259847346e-06, + "loss": 0.5427, + "step": 5260 + }, + { + "epoch": 2.068287178984032, + "grad_norm": 2.1455140854622496, + "learning_rate": 4.647171321891034e-06, + "loss": 0.6384, + "step": 5270 + }, + { + "epoch": 2.0722117295003555, + "grad_norm": 1.99774007410406, + "learning_rate": 4.61142749585975e-06, + "loss": 0.667, + "step": 5280 + }, + { + "epoch": 2.076136280016679, + "grad_norm": 1.7474636373639787, + "learning_rate": 4.575780423546476e-06, + "loss": 0.6309, + "step": 5290 + }, + { + "epoch": 2.080060830533003, + "grad_norm": 1.3359269678333927, + "learning_rate": 4.540230745006962e-06, + "loss": 0.5829, + "step": 5300 + }, + { + "epoch": 2.0839853810493265, + "grad_norm": 0.7162710672069984, + "learning_rate": 4.504779098548209e-06, + "loss": 0.5332, + "step": 5310 + }, + { + "epoch": 2.08790993156565, + "grad_norm": 2.0989757061484893, + "learning_rate": 4.469426120717025e-06, + "loss": 0.624, + "step": 5320 + }, + { + "epoch": 2.091834482081974, + "grad_norm": 2.1426189322836975, + "learning_rate": 4.434172446288579e-06, + "loss": 0.6681, + "step": 5330 + }, + { + "epoch": 2.0957590325982975, + "grad_norm": 1.776402952977803, + "learning_rate": 4.399018708255018e-06, + "loss": 0.6193, + "step": 5340 + }, + { + "epoch": 2.099683583114621, + "grad_norm": 1.3008240753278815, + "learning_rate": 4.363965537814102e-06, + "loss": 0.6082, + "step": 5350 + }, + { + "epoch": 2.103608133630945, + "grad_norm": 0.8996256226429636, + "learning_rate": 4.329013564357848e-06, + "loss": 0.5629, + "step": 5360 + }, + { + "epoch": 2.1075326841472686, + "grad_norm": 2.0984085235237004, + "learning_rate": 4.294163415461258e-06, + "loss": 0.6169, + "step": 5370 + }, + { + "epoch": 2.1114572346635923, + "grad_norm": 2.1055348326355667, + "learning_rate": 4.259415716871037e-06, + "loss": 0.6725, + "step": 5380 + }, + { + "epoch": 2.115381785179916, + "grad_norm": 1.7564083013798226, + "learning_rate": 4.224771092494355e-06, + "loss": 0.6177, + "step": 5390 + }, + { + "epoch": 2.1193063356962396, + "grad_norm": 1.3081730861592322, + "learning_rate": 
4.1902301643876555e-06, + "loss": 0.5994, + "step": 5400 + }, + { + "epoch": 2.1232308862125633, + "grad_norm": 0.7914811793114824, + "learning_rate": 4.155793552745465e-06, + "loss": 0.5642, + "step": 5410 + }, + { + "epoch": 2.127155436728887, + "grad_norm": 2.4378415532004287, + "learning_rate": 4.1214618758892865e-06, + "loss": 0.6125, + "step": 5420 + }, + { + "epoch": 2.1310799872452106, + "grad_norm": 2.0526253778498154, + "learning_rate": 4.087235750256469e-06, + "loss": 0.666, + "step": 5430 + }, + { + "epoch": 2.1350045377615343, + "grad_norm": 1.8656396637088302, + "learning_rate": 4.053115790389159e-06, + "loss": 0.6394, + "step": 5440 + }, + { + "epoch": 2.138929088277858, + "grad_norm": 1.209266875293654, + "learning_rate": 4.019102608923262e-06, + "loss": 0.6132, + "step": 5450 + }, + { + "epoch": 2.1428536387941817, + "grad_norm": 1.1832551572175067, + "learning_rate": 3.985196816577433e-06, + "loss": 0.5475, + "step": 5460 + }, + { + "epoch": 2.1467781893105053, + "grad_norm": 2.2143801889042276, + "learning_rate": 3.951399022142127e-06, + "loss": 0.608, + "step": 5470 + }, + { + "epoch": 2.150702739826829, + "grad_norm": 2.128346152883008, + "learning_rate": 3.917709832468641e-06, + "loss": 0.6848, + "step": 5480 + }, + { + "epoch": 2.1546272903431527, + "grad_norm": 1.6896984103464467, + "learning_rate": 3.884129852458253e-06, + "loss": 0.6284, + "step": 5490 + }, + { + "epoch": 2.1585518408594764, + "grad_norm": 1.222694494025062, + "learning_rate": 3.850659685051336e-06, + "loss": 0.5898, + "step": 5500 + }, + { + "epoch": 2.1624763913758, + "grad_norm": 0.8066312626727323, + "learning_rate": 3.817299931216537e-06, + "loss": 0.546, + "step": 5510 + }, + { + "epoch": 2.1664009418921237, + "grad_norm": 2.05281043982966, + "learning_rate": 3.784051189939996e-06, + "loss": 0.6217, + "step": 5520 + }, + { + "epoch": 2.1703254924084474, + "grad_norm": 2.0329973993617556, + "learning_rate": 3.7509140582145707e-06, + "loss": 0.6679, + "step": 5530 + }, + { + "epoch": 2.174250042924771, + "grad_norm": 1.7930592907009166, + "learning_rate": 3.7178891310291444e-06, + "loss": 0.6302, + "step": 5540 + }, + { + "epoch": 2.1781745934410948, + "grad_norm": 1.3304481186905044, + "learning_rate": 3.6849770013579135e-06, + "loss": 0.5972, + "step": 5550 + }, + { + "epoch": 2.1820991439574184, + "grad_norm": 0.8862316368226267, + "learning_rate": 3.652178260149768e-06, + "loss": 0.5508, + "step": 5560 + }, + { + "epoch": 2.186023694473742, + "grad_norm": 2.16438233599458, + "learning_rate": 3.619493496317662e-06, + "loss": 0.6113, + "step": 5570 + }, + { + "epoch": 2.189948244990066, + "grad_norm": 2.097362156115183, + "learning_rate": 3.5869232967280466e-06, + "loss": 0.678, + "step": 5580 + }, + { + "epoch": 2.1938727955063895, + "grad_norm": 1.7293983928389607, + "learning_rate": 3.554468246190337e-06, + "loss": 0.6255, + "step": 5590 + }, + { + "epoch": 2.197797346022713, + "grad_norm": 1.2526910240559423, + "learning_rate": 3.522128927446392e-06, + "loss": 0.6191, + "step": 5600 + }, + { + "epoch": 2.201721896539037, + "grad_norm": 0.779154938656075, + "learning_rate": 3.489905921160083e-06, + "loss": 0.5403, + "step": 5610 + }, + { + "epoch": 2.2056464470553605, + "grad_norm": 2.0546436977432094, + "learning_rate": 3.4577998059068354e-06, + "loss": 0.6159, + "step": 5620 + }, + { + "epoch": 2.209570997571684, + "grad_norm": 2.141525722727545, + "learning_rate": 3.4258111581632634e-06, + "loss": 0.6876, + "step": 5630 + }, + { + "epoch": 2.213495548088008, + "grad_norm": 
1.7486151559652525, + "learning_rate": 3.3939405522968105e-06, + "loss": 0.6232, + "step": 5640 + }, + { + "epoch": 2.2174200986043315, + "grad_norm": 1.2463505933135222, + "learning_rate": 3.362188560555434e-06, + "loss": 0.603, + "step": 5650 + }, + { + "epoch": 2.221344649120655, + "grad_norm": 0.7557156986288721, + "learning_rate": 3.3305557530573363e-06, + "loss": 0.5734, + "step": 5660 + }, + { + "epoch": 2.225269199636979, + "grad_norm": 2.6186065252503994, + "learning_rate": 3.2990426977807156e-06, + "loss": 0.6169, + "step": 5670 + }, + { + "epoch": 2.2291937501533026, + "grad_norm": 2.1713884725106314, + "learning_rate": 3.2676499605535918e-06, + "loss": 0.6557, + "step": 5680 + }, + { + "epoch": 2.2331183006696262, + "grad_norm": 1.8496259928018195, + "learning_rate": 3.2363781050436105e-06, + "loss": 0.6224, + "step": 5690 + }, + { + "epoch": 2.23704285118595, + "grad_norm": 1.3375119127996462, + "learning_rate": 3.2052276927479677e-06, + "loss": 0.6029, + "step": 5700 + }, + { + "epoch": 2.2409674017022736, + "grad_norm": 0.847985655236947, + "learning_rate": 3.1741992829832924e-06, + "loss": 0.5552, + "step": 5710 + }, + { + "epoch": 2.2448919522185973, + "grad_norm": 2.1771913040167212, + "learning_rate": 3.143293432875607e-06, + "loss": 0.6089, + "step": 5720 + }, + { + "epoch": 2.248816502734921, + "grad_norm": 2.0426822342569837, + "learning_rate": 3.112510697350348e-06, + "loss": 0.6927, + "step": 5730 + }, + { + "epoch": 2.2527410532512446, + "grad_norm": 1.8247017082924184, + "learning_rate": 3.081851629122372e-06, + "loss": 0.6389, + "step": 5740 + }, + { + "epoch": 2.2566656037675683, + "grad_norm": 1.3403447493161933, + "learning_rate": 3.051316778686055e-06, + "loss": 0.5947, + "step": 5750 + }, + { + "epoch": 2.260590154283892, + "grad_norm": 0.8089857773862223, + "learning_rate": 3.0209066943053944e-06, + "loss": 0.5622, + "step": 5760 + }, + { + "epoch": 2.2645147048002157, + "grad_norm": 2.3577656985489477, + "learning_rate": 2.990621922004172e-06, + "loss": 0.5892, + "step": 5770 + }, + { + "epoch": 2.2684392553165393, + "grad_norm": 2.1740666869738323, + "learning_rate": 2.960463005556149e-06, + "loss": 0.672, + "step": 5780 + }, + { + "epoch": 2.272363805832863, + "grad_norm": 1.829981750443929, + "learning_rate": 2.9304304864752886e-06, + "loss": 0.6373, + "step": 5790 + }, + { + "epoch": 2.2762883563491867, + "grad_norm": 1.3281957095626744, + "learning_rate": 2.900524904006061e-06, + "loss": 0.5975, + "step": 5800 + }, + { + "epoch": 2.2802129068655104, + "grad_norm": 0.7087417039119808, + "learning_rate": 2.87074679511373e-06, + "loss": 0.5296, + "step": 5810 + }, + { + "epoch": 2.284137457381834, + "grad_norm": 2.109191274545063, + "learning_rate": 2.8410966944747377e-06, + "loss": 0.5962, + "step": 5820 + }, + { + "epoch": 2.2880620078981577, + "grad_norm": 2.241584819214679, + "learning_rate": 2.8115751344670863e-06, + "loss": 0.6636, + "step": 5830 + }, + { + "epoch": 2.2919865584144814, + "grad_norm": 1.7605810701006008, + "learning_rate": 2.782182645160789e-06, + "loss": 0.6265, + "step": 5840 + }, + { + "epoch": 2.295911108930805, + "grad_norm": 1.2836328256236162, + "learning_rate": 2.7529197543083507e-06, + "loss": 0.5931, + "step": 5850 + }, + { + "epoch": 2.2998356594471288, + "grad_norm": 0.9519727219083821, + "learning_rate": 2.7237869873352827e-06, + "loss": 0.5509, + "step": 5860 + }, + { + "epoch": 2.3037602099634524, + "grad_norm": 2.1895645275891704, + "learning_rate": 2.6947848673306853e-06, + "loss": 0.6199, + "step": 5870 + }, + { + 
"epoch": 2.307684760479776, + "grad_norm": 2.0598817109009904, + "learning_rate": 2.6659139150378377e-06, + "loss": 0.6591, + "step": 5880 + }, + { + "epoch": 2.3116093109961, + "grad_norm": 1.8143316778974414, + "learning_rate": 2.6371746488448614e-06, + "loss": 0.6347, + "step": 5890 + }, + { + "epoch": 2.3155338615124235, + "grad_norm": 1.3541045070964877, + "learning_rate": 2.6085675847754155e-06, + "loss": 0.586, + "step": 5900 + }, + { + "epoch": 2.319458412028747, + "grad_norm": 0.7923721169078987, + "learning_rate": 2.5800932364794064e-06, + "loss": 0.5212, + "step": 5910 + }, + { + "epoch": 2.323382962545071, + "grad_norm": 2.794475468134139, + "learning_rate": 2.5517521152237966e-06, + "loss": 0.5974, + "step": 5920 + }, + { + "epoch": 2.3273075130613945, + "grad_norm": 2.2143050890712255, + "learning_rate": 2.5235447298834003e-06, + "loss": 0.6684, + "step": 5930 + }, + { + "epoch": 2.331232063577718, + "grad_norm": 1.7900709742899215, + "learning_rate": 2.49547158693176e-06, + "loss": 0.6278, + "step": 5940 + }, + { + "epoch": 2.335156614094042, + "grad_norm": 1.271494366484403, + "learning_rate": 2.4675331904320533e-06, + "loss": 0.5929, + "step": 5950 + }, + { + "epoch": 2.3390811646103655, + "grad_norm": 0.8556427431737861, + "learning_rate": 2.43973004202803e-06, + "loss": 0.5524, + "step": 5960 + }, + { + "epoch": 2.343005715126689, + "grad_norm": 2.117180082843371, + "learning_rate": 2.412062640935021e-06, + "loss": 0.6013, + "step": 5970 + }, + { + "epoch": 2.346930265643013, + "grad_norm": 2.1484003642018457, + "learning_rate": 2.3845314839309563e-06, + "loss": 0.6632, + "step": 5980 + }, + { + "epoch": 2.3508548161593366, + "grad_norm": 1.8030873162593484, + "learning_rate": 2.3571370653474656e-06, + "loss": 0.6168, + "step": 5990 + }, + { + "epoch": 2.3547793666756602, + "grad_norm": 1.2601737288351715, + "learning_rate": 2.329879877060981e-06, + "loss": 0.5886, + "step": 6000 + }, + { + "epoch": 2.358703917191984, + "grad_norm": 0.9165582307879456, + "learning_rate": 2.302760408483926e-06, + "loss": 0.5428, + "step": 6010 + }, + { + "epoch": 2.3626284677083076, + "grad_norm": 2.0984461367902605, + "learning_rate": 2.275779146555915e-06, + "loss": 0.6007, + "step": 6020 + }, + { + "epoch": 2.3665530182246313, + "grad_norm": 2.1846524601461894, + "learning_rate": 2.2489365757350132e-06, + "loss": 0.664, + "step": 6030 + }, + { + "epoch": 2.370477568740955, + "grad_norm": 1.7647491805175937, + "learning_rate": 2.2222331779890393e-06, + "loss": 0.6257, + "step": 6040 + }, + { + "epoch": 2.3744021192572786, + "grad_norm": 1.3444624902761966, + "learning_rate": 2.1956694327869043e-06, + "loss": 0.6041, + "step": 6050 + }, + { + "epoch": 2.3783266697736023, + "grad_norm": 0.8924579097538136, + "learning_rate": 2.16924581709002e-06, + "loss": 0.5369, + "step": 6060 + }, + { + "epoch": 2.382251220289926, + "grad_norm": 2.1816842093278526, + "learning_rate": 2.142962805343708e-06, + "loss": 0.5806, + "step": 6070 + }, + { + "epoch": 2.3861757708062497, + "grad_norm": 2.1315338386325138, + "learning_rate": 2.1168208694687108e-06, + "loss": 0.6934, + "step": 6080 + }, + { + "epoch": 2.3901003213225733, + "grad_norm": 1.8401419512172745, + "learning_rate": 2.0908204788526965e-06, + "loss": 0.6473, + "step": 6090 + }, + { + "epoch": 2.394024871838897, + "grad_norm": 1.256508614588367, + "learning_rate": 2.064962100341842e-06, + "loss": 0.6, + "step": 6100 + }, + { + "epoch": 2.3979494223552207, + "grad_norm": 0.7725755529070791, + "learning_rate": 2.039246198232446e-06, + "loss": 
0.5488, + "step": 6110 + }, + { + "epoch": 2.4018739728715444, + "grad_norm": 2.1092659651397474, + "learning_rate": 2.0136732342625874e-06, + "loss": 0.5748, + "step": 6120 + }, + { + "epoch": 2.405798523387868, + "grad_norm": 2.202730059861675, + "learning_rate": 1.9882436676038477e-06, + "loss": 0.6778, + "step": 6130 + }, + { + "epoch": 2.4097230739041917, + "grad_norm": 1.7146488474319233, + "learning_rate": 1.962957954853055e-06, + "loss": 0.642, + "step": 6140 + }, + { + "epoch": 2.4136476244205154, + "grad_norm": 1.2875382567695426, + "learning_rate": 1.9378165500240943e-06, + "loss": 0.5935, + "step": 6150 + }, + { + "epoch": 2.417572174936839, + "grad_norm": 0.8159718677862676, + "learning_rate": 1.912819904539749e-06, + "loss": 0.556, + "step": 6160 + }, + { + "epoch": 2.4214967254531627, + "grad_norm": 2.147420171464766, + "learning_rate": 1.887968467223591e-06, + "loss": 0.6084, + "step": 6170 + }, + { + "epoch": 2.4254212759694864, + "grad_norm": 2.352418026966586, + "learning_rate": 1.8632626842919398e-06, + "loss": 0.6647, + "step": 6180 + }, + { + "epoch": 2.42934582648581, + "grad_norm": 1.843956419568753, + "learning_rate": 1.8387029993458273e-06, + "loss": 0.6224, + "step": 6190 + }, + { + "epoch": 2.4332703770021338, + "grad_norm": 1.215534258708622, + "learning_rate": 1.8142898533630536e-06, + "loss": 0.6116, + "step": 6200 + }, + { + "epoch": 2.4371949275184575, + "grad_norm": 0.7790773570638417, + "learning_rate": 1.7900236846902575e-06, + "loss": 0.5395, + "step": 6210 + }, + { + "epoch": 2.441119478034781, + "grad_norm": 2.274459819686287, + "learning_rate": 1.765904929035046e-06, + "loss": 0.6089, + "step": 6220 + }, + { + "epoch": 2.445044028551105, + "grad_norm": 2.156030969614435, + "learning_rate": 1.7419340194581803e-06, + "loss": 0.6517, + "step": 6230 + }, + { + "epoch": 2.4489685790674285, + "grad_norm": 1.8023260247461752, + "learning_rate": 1.7181113863657805e-06, + "loss": 0.6312, + "step": 6240 + }, + { + "epoch": 2.452893129583752, + "grad_norm": 1.6128467278404588, + "learning_rate": 1.6944374575016253e-06, + "loss": 0.6097, + "step": 6250 + }, + { + "epoch": 2.456817680100076, + "grad_norm": 0.7816281842546718, + "learning_rate": 1.670912657939443e-06, + "loss": 0.5411, + "step": 6260 + }, + { + "epoch": 2.4607422306163995, + "grad_norm": 2.2598007072156028, + "learning_rate": 1.6475374100753017e-06, + "loss": 0.6139, + "step": 6270 + }, + { + "epoch": 2.464666781132723, + "grad_norm": 2.1326684182909936, + "learning_rate": 1.624312133620013e-06, + "loss": 0.6849, + "step": 6280 + }, + { + "epoch": 2.468591331649047, + "grad_norm": 1.793136525038962, + "learning_rate": 1.6012372455915993e-06, + "loss": 0.6165, + "step": 6290 + }, + { + "epoch": 2.4725158821653705, + "grad_norm": 1.2436395590561673, + "learning_rate": 1.5783131603078083e-06, + "loss": 0.5958, + "step": 6300 + }, + { + "epoch": 2.4764404326816942, + "grad_norm": 0.8178284156434679, + "learning_rate": 1.555540289378663e-06, + "loss": 0.542, + "step": 6310 + }, + { + "epoch": 2.480364983198018, + "grad_norm": 2.219348641209819, + "learning_rate": 1.532919041699089e-06, + "loss": 0.6146, + "step": 6320 + }, + { + "epoch": 2.4842895337143416, + "grad_norm": 2.1421139951368375, + "learning_rate": 1.510449823441561e-06, + "loss": 0.669, + "step": 6330 + }, + { + "epoch": 2.4882140842306653, + "grad_norm": 1.7736069610642504, + "learning_rate": 1.4881330380488014e-06, + "loss": 0.6325, + "step": 6340 + }, + { + "epoch": 2.492138634746989, + "grad_norm": 1.2940938084497826, + 
"learning_rate": 1.4659690862265675e-06, + "loss": 0.5918, + "step": 6350 + }, + { + "epoch": 2.4960631852633126, + "grad_norm": 0.7888355624944882, + "learning_rate": 1.4439583659364154e-06, + "loss": 0.5432, + "step": 6360 + }, + { + "epoch": 2.4999877357796363, + "grad_norm": 2.2684459134167003, + "learning_rate": 1.4221012723885874e-06, + "loss": 0.6068, + "step": 6370 + }, + { + "epoch": 2.50391228629596, + "grad_norm": 2.303631890847662, + "learning_rate": 1.400398198034897e-06, + "loss": 0.6815, + "step": 6380 + }, + { + "epoch": 2.5078368368122836, + "grad_norm": 1.725928594909339, + "learning_rate": 1.3788495325616912e-06, + "loss": 0.629, + "step": 6390 + }, + { + "epoch": 2.5117613873286073, + "grad_norm": 1.2724712580718072, + "learning_rate": 1.357455662882855e-06, + "loss": 0.5858, + "step": 6400 + }, + { + "epoch": 2.515685937844931, + "grad_norm": 0.8196671527768702, + "learning_rate": 1.3362169731328534e-06, + "loss": 0.543, + "step": 6410 + }, + { + "epoch": 2.5196104883612547, + "grad_norm": 2.2121857648933614, + "learning_rate": 1.3151338446598483e-06, + "loss": 0.5918, + "step": 6420 + }, + { + "epoch": 2.5235350388775784, + "grad_norm": 2.07329199306329, + "learning_rate": 1.2942066560188349e-06, + "loss": 0.65, + "step": 6430 + }, + { + "epoch": 2.527459589393902, + "grad_norm": 1.7691554410042842, + "learning_rate": 1.2734357829648624e-06, + "loss": 0.6245, + "step": 6440 + }, + { + "epoch": 2.5313841399102257, + "grad_norm": 1.2397356761301885, + "learning_rate": 1.2528215984462766e-06, + "loss": 0.5757, + "step": 6450 + }, + { + "epoch": 2.5353086904265494, + "grad_norm": 0.8552163717265764, + "learning_rate": 1.23236447259802e-06, + "loss": 0.5636, + "step": 6460 + }, + { + "epoch": 2.539233240942873, + "grad_norm": 2.019030277835733, + "learning_rate": 1.2120647727349977e-06, + "loss": 0.5962, + "step": 6470 + }, + { + "epoch": 2.5431577914591967, + "grad_norm": 2.1298373917453928, + "learning_rate": 1.1919228633454738e-06, + "loss": 0.6936, + "step": 6480 + }, + { + "epoch": 2.5470823419755204, + "grad_norm": 1.8011050211359714, + "learning_rate": 1.1719391060845298e-06, + "loss": 0.6272, + "step": 6490 + }, + { + "epoch": 2.551006892491844, + "grad_norm": 1.221657027890025, + "learning_rate": 1.152113859767565e-06, + "loss": 0.6286, + "step": 6500 + }, + { + "epoch": 2.5549314430081678, + "grad_norm": 0.8061467765642856, + "learning_rate": 1.1324474803638653e-06, + "loss": 0.5501, + "step": 6510 + }, + { + "epoch": 2.5588559935244914, + "grad_norm": 2.051549539190188, + "learning_rate": 1.1129403209902034e-06, + "loss": 0.6067, + "step": 6520 + }, + { + "epoch": 2.562780544040815, + "grad_norm": 2.3142399536514073, + "learning_rate": 1.0935927319044959e-06, + "loss": 0.6484, + "step": 6530 + }, + { + "epoch": 2.566705094557139, + "grad_norm": 1.7767173325015122, + "learning_rate": 1.0744050604995237e-06, + "loss": 0.6047, + "step": 6540 + }, + { + "epoch": 2.5706296450734625, + "grad_norm": 1.3099949970790892, + "learning_rate": 1.0553776512966886e-06, + "loss": 0.5826, + "step": 6550 + }, + { + "epoch": 2.574554195589786, + "grad_norm": 0.8416372830973359, + "learning_rate": 1.0365108459398277e-06, + "loss": 0.5418, + "step": 6560 + }, + { + "epoch": 2.57847874610611, + "grad_norm": 2.127337984518528, + "learning_rate": 1.0178049831890768e-06, + "loss": 0.6093, + "step": 6570 + }, + { + "epoch": 2.5824032966224335, + "grad_norm": 2.252528047531405, + "learning_rate": 9.992603989147941e-07, + "loss": 0.6867, + "step": 6580 + }, + { + "epoch": 
2.586327847138757, + "grad_norm": 1.7132261228204282, + "learning_rate": 9.808774260915243e-07, + "loss": 0.6564, + "step": 6590 + }, + { + "epoch": 2.590252397655081, + "grad_norm": 1.2934365792962297, + "learning_rate": 9.626563947920231e-07, + "loss": 0.5691, + "step": 6600 + }, + { + "epoch": 2.5941769481714045, + "grad_norm": 0.8786324009918022, + "learning_rate": 9.445976321813277e-07, + "loss": 0.5383, + "step": 6610 + }, + { + "epoch": 2.598101498687728, + "grad_norm": 2.2501075917295212, + "learning_rate": 9.267014625108806e-07, + "loss": 0.5817, + "step": 6620 + }, + { + "epoch": 2.602026049204052, + "grad_norm": 2.2174253075426167, + "learning_rate": 9.089682071127171e-07, + "loss": 0.6744, + "step": 6630 + }, + { + "epoch": 2.6059505997203756, + "grad_norm": 1.7616659355193223, + "learning_rate": 8.91398184393687e-07, + "loss": 0.6359, + "step": 6640 + }, + { + "epoch": 2.6098751502366992, + "grad_norm": 1.3089314769046627, + "learning_rate": 8.739917098297357e-07, + "loss": 0.6045, + "step": 6650 + }, + { + "epoch": 2.613799700753023, + "grad_norm": 0.7444796805297569, + "learning_rate": 8.567490959602509e-07, + "loss": 0.5295, + "step": 6660 + }, + { + "epoch": 2.6177242512693466, + "grad_norm": 2.2630343377076287, + "learning_rate": 8.396706523824372e-07, + "loss": 0.6244, + "step": 6670 + }, + { + "epoch": 2.6216488017856703, + "grad_norm": 2.0443361827637836, + "learning_rate": 8.227566857457702e-07, + "loss": 0.6894, + "step": 6680 + }, + { + "epoch": 2.625573352301994, + "grad_norm": 1.7688362533725879, + "learning_rate": 8.060074997464773e-07, + "loss": 0.6192, + "step": 6690 + }, + { + "epoch": 2.6294979028183176, + "grad_norm": 1.2229083921293311, + "learning_rate": 7.894233951220953e-07, + "loss": 0.5856, + "step": 6700 + }, + { + "epoch": 2.6334224533346413, + "grad_norm": 0.76533431687556, + "learning_rate": 7.730046696460691e-07, + "loss": 0.53, + "step": 6710 + }, + { + "epoch": 2.637347003850965, + "grad_norm": 2.264231653926727, + "learning_rate": 7.567516181223966e-07, + "loss": 0.5991, + "step": 6720 + }, + { + "epoch": 2.6412715543672887, + "grad_norm": 2.1991521419444267, + "learning_rate": 7.406645323803463e-07, + "loss": 0.6315, + "step": 6730 + }, + { + "epoch": 2.6451961048836123, + "grad_norm": 1.7239735369036862, + "learning_rate": 7.247437012692104e-07, + "loss": 0.6427, + "step": 6740 + }, + { + "epoch": 2.649120655399936, + "grad_norm": 1.2790232035397695, + "learning_rate": 7.089894106531214e-07, + "loss": 0.594, + "step": 6750 + }, + { + "epoch": 2.6530452059162597, + "grad_norm": 0.955171760712246, + "learning_rate": 6.934019434059213e-07, + "loss": 0.5533, + "step": 6760 + }, + { + "epoch": 2.6569697564325834, + "grad_norm": 2.1955164062676644, + "learning_rate": 6.779815794060718e-07, + "loss": 0.5936, + "step": 6770 + }, + { + "epoch": 2.660894306948907, + "grad_norm": 2.2404548484277664, + "learning_rate": 6.627285955316476e-07, + "loss": 0.6513, + "step": 6780 + }, + { + "epoch": 2.6648188574652307, + "grad_norm": 1.8828883729369554, + "learning_rate": 6.476432656553411e-07, + "loss": 0.6286, + "step": 6790 + }, + { + "epoch": 2.6687434079815544, + "grad_norm": 1.2443150219704873, + "learning_rate": 6.327258606395736e-07, + "loss": 0.5939, + "step": 6800 + }, + { + "epoch": 2.672667958497878, + "grad_norm": 0.9014377738542545, + "learning_rate": 6.179766483316041e-07, + "loss": 0.5334, + "step": 6810 + }, + { + "epoch": 2.6765925090142018, + "grad_norm": 2.183823820105367, + "learning_rate": 6.03395893558737e-07, + "loss": 0.5913, + "step": 
6820 + }, + { + "epoch": 2.6805170595305254, + "grad_norm": 2.1912679703089313, + "learning_rate": 5.889838581235641e-07, + "loss": 0.6719, + "step": 6830 + }, + { + "epoch": 2.684441610046849, + "grad_norm": 1.7923155245248856, + "learning_rate": 5.747408007992572e-07, + "loss": 0.6208, + "step": 6840 + }, + { + "epoch": 2.688366160563173, + "grad_norm": 1.2494561675837905, + "learning_rate": 5.606669773249296e-07, + "loss": 0.596, + "step": 6850 + }, + { + "epoch": 2.6922907110794965, + "grad_norm": 0.907833148614821, + "learning_rate": 5.467626404010407e-07, + "loss": 0.5372, + "step": 6860 + }, + { + "epoch": 2.69621526159582, + "grad_norm": 2.2934267800117856, + "learning_rate": 5.330280396848619e-07, + "loss": 0.609, + "step": 6870 + }, + { + "epoch": 2.700139812112144, + "grad_norm": 2.114541613337409, + "learning_rate": 5.194634217859851e-07, + "loss": 0.6611, + "step": 6880 + }, + { + "epoch": 2.7040643626284675, + "grad_norm": 1.754501104961409, + "learning_rate": 5.060690302619053e-07, + "loss": 0.6157, + "step": 6890 + }, + { + "epoch": 2.707988913144791, + "grad_norm": 1.364912899052101, + "learning_rate": 4.92845105613644e-07, + "loss": 0.5929, + "step": 6900 + }, + { + "epoch": 2.711913463661115, + "grad_norm": 0.7372286271827898, + "learning_rate": 4.797918852814254e-07, + "loss": 0.5314, + "step": 6910 + }, + { + "epoch": 2.7158380141774385, + "grad_norm": 2.073475468780587, + "learning_rate": 4.6690960364041973e-07, + "loss": 0.5976, + "step": 6920 + }, + { + "epoch": 2.719762564693762, + "grad_norm": 2.2358152929450963, + "learning_rate": 4.5419849199653364e-07, + "loss": 0.6623, + "step": 6930 + }, + { + "epoch": 2.723687115210086, + "grad_norm": 1.815422184822613, + "learning_rate": 4.416587785822568e-07, + "loss": 0.6073, + "step": 6940 + }, + { + "epoch": 2.7276116657264096, + "grad_norm": 1.350125241744432, + "learning_rate": 4.2929068855256275e-07, + "loss": 0.5984, + "step": 6950 + }, + { + "epoch": 2.7315362162427332, + "grad_norm": 0.7223399634960578, + "learning_rate": 4.170944439808622e-07, + "loss": 0.5491, + "step": 6960 + }, + { + "epoch": 2.735460766759057, + "grad_norm": 2.1836663497672455, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.5941, + "step": 6970 + }, + { + "epoch": 2.7393853172753806, + "grad_norm": 2.154640259731243, + "learning_rate": 3.932183640734466e-07, + "loss": 0.6781, + "step": 6980 + }, + { + "epoch": 2.7433098677917043, + "grad_norm": 1.908154353492756, + "learning_rate": 3.8153895744115767e-07, + "loss": 0.6178, + "step": 6990 + }, + { + "epoch": 2.747234418308028, + "grad_norm": 1.2339098485081115, + "learning_rate": 3.700322536660228e-07, + "loss": 0.5819, + "step": 7000 + }, + { + "epoch": 2.7511589688243516, + "grad_norm": 0.9319742069385385, + "learning_rate": 3.586984593549614e-07, + "loss": 0.5296, + "step": 7010 + }, + { + "epoch": 2.7550835193406753, + "grad_norm": 2.2839758727554424, + "learning_rate": 3.475377780102451e-07, + "loss": 0.5919, + "step": 7020 + }, + { + "epoch": 2.759008069856999, + "grad_norm": 2.184552450870539, + "learning_rate": 3.365504100258399e-07, + "loss": 0.6341, + "step": 7030 + }, + { + "epoch": 2.7629326203733227, + "grad_norm": 1.7194768639357731, + "learning_rate": 3.2573655268380746e-07, + "loss": 0.6252, + "step": 7040 + }, + { + "epoch": 2.7668571708896463, + "grad_norm": 1.3255796782916087, + "learning_rate": 3.1509640015076946e-07, + "loss": 0.5879, + "step": 7050 + }, + { + "epoch": 2.77078172140597, + "grad_norm": 0.8088254429550722, + "learning_rate": 3.0463014347441255e-07, + 
"loss": 0.5519, + "step": 7060 + }, + { + "epoch": 2.7747062719222937, + "grad_norm": 2.202949542374798, + "learning_rate": 2.9433797058006195e-07, + "loss": 0.598, + "step": 7070 + }, + { + "epoch": 2.7786308224386174, + "grad_norm": 2.192327750139598, + "learning_rate": 2.842200662673111e-07, + "loss": 0.6815, + "step": 7080 + }, + { + "epoch": 2.782555372954941, + "grad_norm": 1.7866089200056756, + "learning_rate": 2.7427661220669535e-07, + "loss": 0.603, + "step": 7090 + }, + { + "epoch": 2.7864799234712647, + "grad_norm": 1.2795477884768611, + "learning_rate": 2.645077869364354e-07, + "loss": 0.5773, + "step": 7100 + }, + { + "epoch": 2.7904044739875884, + "grad_norm": 0.7183394998611272, + "learning_rate": 2.5491376585923265e-07, + "loss": 0.5313, + "step": 7110 + }, + { + "epoch": 2.794329024503912, + "grad_norm": 2.308178999869327, + "learning_rate": 2.4549472123911564e-07, + "loss": 0.5919, + "step": 7120 + }, + { + "epoch": 2.7982535750202358, + "grad_norm": 2.2719235337571813, + "learning_rate": 2.362508221983484e-07, + "loss": 0.6633, + "step": 7130 + }, + { + "epoch": 2.8021781255365594, + "grad_norm": 1.8358853033909601, + "learning_rate": 2.2718223471439815e-07, + "loss": 0.6084, + "step": 7140 + }, + { + "epoch": 2.806102676052883, + "grad_norm": 1.3561609455618762, + "learning_rate": 2.182891216169447e-07, + "loss": 0.5946, + "step": 7150 + }, + { + "epoch": 2.810027226569207, + "grad_norm": 0.801043920678816, + "learning_rate": 2.0957164258497031e-07, + "loss": 0.5394, + "step": 7160 + }, + { + "epoch": 2.8139517770855305, + "grad_norm": 2.3157120536637685, + "learning_rate": 2.0102995414387983e-07, + "loss": 0.5861, + "step": 7170 + }, + { + "epoch": 2.817876327601854, + "grad_norm": 2.0452627880092886, + "learning_rate": 1.9266420966270182e-07, + "loss": 0.6566, + "step": 7180 + }, + { + "epoch": 2.821800878118178, + "grad_norm": 1.883254183997997, + "learning_rate": 1.8447455935132418e-07, + "loss": 0.6011, + "step": 7190 + }, + { + "epoch": 2.8257254286345015, + "grad_norm": 1.277403091223367, + "learning_rate": 1.764611502578051e-07, + "loss": 0.573, + "step": 7200 + }, + { + "epoch": 2.829649979150825, + "grad_norm": 0.8701579907748704, + "learning_rate": 1.6862412626572845e-07, + "loss": 0.5748, + "step": 7210 + }, + { + "epoch": 2.833574529667149, + "grad_norm": 2.2395045060776115, + "learning_rate": 1.6096362809162047e-07, + "loss": 0.5897, + "step": 7220 + }, + { + "epoch": 2.8374990801834725, + "grad_norm": 2.1921791669212864, + "learning_rate": 1.5347979328242613e-07, + "loss": 0.6472, + "step": 7230 + }, + { + "epoch": 2.841423630699796, + "grad_norm": 1.7683013517405388, + "learning_rate": 1.461727562130344e-07, + "loss": 0.6176, + "step": 7240 + }, + { + "epoch": 2.84534818121612, + "grad_norm": 1.2599894592930116, + "learning_rate": 1.3904264808387246e-07, + "loss": 0.583, + "step": 7250 + }, + { + "epoch": 2.8492727317324436, + "grad_norm": 0.8602212034932214, + "learning_rate": 1.320895969185454e-07, + "loss": 0.5383, + "step": 7260 + }, + { + "epoch": 2.8531972822487672, + "grad_norm": 2.01097097204698, + "learning_rate": 1.2531372756153458e-07, + "loss": 0.5882, + "step": 7270 + }, + { + "epoch": 2.857121832765091, + "grad_norm": 2.29115443827529, + "learning_rate": 1.1871516167596186e-07, + "loss": 0.659, + "step": 7280 + }, + { + "epoch": 2.8610463832814146, + "grad_norm": 1.798405094313227, + "learning_rate": 1.1229401774140447e-07, + "loss": 0.6425, + "step": 7290 + }, + { + "epoch": 2.8649709337977383, + "grad_norm": 1.2675180226420815, + 
"learning_rate": 1.0605041105176128e-07, + "loss": 0.5757, + "step": 7300 + }, + { + "epoch": 2.868895484314062, + "grad_norm": 0.778167866423488, + "learning_rate": 9.998445371319332e-08, + "loss": 0.5289, + "step": 7310 + }, + { + "epoch": 2.8728200348303856, + "grad_norm": 2.1530650160440845, + "learning_rate": 9.409625464210093e-08, + "loss": 0.582, + "step": 7320 + }, + { + "epoch": 2.8767445853467093, + "grad_norm": 2.039982400238999, + "learning_rate": 8.83859195631731e-08, + "loss": 0.6649, + "step": 7330 + }, + { + "epoch": 2.880669135863033, + "grad_norm": 1.8192255183888113, + "learning_rate": 8.285355100748904e-08, + "loss": 0.6083, + "step": 7340 + }, + { + "epoch": 2.8845936863793566, + "grad_norm": 1.3511200742987195, + "learning_rate": 7.749924831067401e-08, + "loss": 0.5947, + "step": 7350 + }, + { + "epoch": 2.8885182368956803, + "grad_norm": 0.8078794663885482, + "learning_rate": 7.232310761112082e-08, + "loss": 0.5189, + "step": 7360 + }, + { + "epoch": 2.892442787412004, + "grad_norm": 2.086447914981744, + "learning_rate": 6.732522184825896e-08, + "loss": 0.5929, + "step": 7370 + }, + { + "epoch": 2.8963673379283277, + "grad_norm": 2.2301352619297097, + "learning_rate": 6.250568076088814e-08, + "loss": 0.6598, + "step": 7380 + }, + { + "epoch": 2.9002918884446514, + "grad_norm": 1.8389039532268878, + "learning_rate": 5.7864570885567405e-08, + "loss": 0.6154, + "step": 7390 + }, + { + "epoch": 2.904216438960975, + "grad_norm": 1.2792199660600578, + "learning_rate": 5.340197555505966e-08, + "loss": 0.6012, + "step": 7400 + }, + { + "epoch": 2.9081409894772987, + "grad_norm": 0.7325366163854787, + "learning_rate": 4.911797489683734e-08, + "loss": 0.5215, + "step": 7410 + }, + { + "epoch": 2.9120655399936224, + "grad_norm": 1.8968115785034971, + "learning_rate": 4.5012645831640225e-08, + "loss": 0.5973, + "step": 7420 + }, + { + "epoch": 2.915990090509946, + "grad_norm": 2.115373675723123, + "learning_rate": 4.108606207209875e-08, + "loss": 0.6792, + "step": 7430 + }, + { + "epoch": 2.9199146410262697, + "grad_norm": 1.8029011243698139, + "learning_rate": 3.7338294121407324e-08, + "loss": 0.62, + "step": 7440 + }, + { + "epoch": 2.9238391915425934, + "grad_norm": 1.288742701315245, + "learning_rate": 3.376940927206196e-08, + "loss": 0.5808, + "step": 7450 + }, + { + "epoch": 2.927763742058917, + "grad_norm": 0.798534719587082, + "learning_rate": 3.037947160464572e-08, + "loss": 0.5186, + "step": 7460 + }, + { + "epoch": 2.9316882925752408, + "grad_norm": 2.5141632319612954, + "learning_rate": 2.716854198668517e-08, + "loss": 0.577, + "step": 7470 + }, + { + "epoch": 2.9356128430915645, + "grad_norm": 2.35827100419327, + "learning_rate": 2.41366780715524e-08, + "loss": 0.6523, + "step": 7480 + }, + { + "epoch": 2.939537393607888, + "grad_norm": 1.8102877923296363, + "learning_rate": 2.1283934297432472e-08, + "loss": 0.6021, + "step": 7490 + }, + { + "epoch": 2.943461944124212, + "grad_norm": 1.3570094551959164, + "learning_rate": 1.861036188634424e-08, + "loss": 0.5859, + "step": 7500 + }, + { + "epoch": 2.9473864946405355, + "grad_norm": 0.8112341118577404, + "learning_rate": 1.6116008843224395e-08, + "loss": 0.5446, + "step": 7510 + }, + { + "epoch": 2.951311045156859, + "grad_norm": 2.0316666239859726, + "learning_rate": 1.3800919955058167e-08, + "loss": 0.5938, + "step": 7520 + }, + { + "epoch": 2.955235595673183, + "grad_norm": 2.24783418646791, + "learning_rate": 1.1665136790084408e-08, + "loss": 0.6663, + "step": 7530 + }, + { + "epoch": 2.9591601461895065, + 
"grad_norm": 1.7761857680646513, + "learning_rate": 9.708697697040636e-09, + "loss": 0.6199, + "step": 7540 + }, + { + "epoch": 2.96308469670583, + "grad_norm": 1.2633693500353296, + "learning_rate": 7.931637804481362e-09, + "loss": 0.5774, + "step": 7550 + }, + { + "epoch": 2.967009247222154, + "grad_norm": 0.8065989220041421, + "learning_rate": 6.333989020143039e-09, + "loss": 0.5492, + "step": 7560 + }, + { + "epoch": 2.9709337977384775, + "grad_norm": 2.3130844134472537, + "learning_rate": 4.915780030372297e-09, + "loss": 0.5792, + "step": 7570 + }, + { + "epoch": 2.974858348254801, + "grad_norm": 2.1600268864719765, + "learning_rate": 3.6770362996108033e-09, + "loss": 0.6643, + "step": 7580 + }, + { + "epoch": 2.978782898771125, + "grad_norm": 1.78858714847675, + "learning_rate": 2.617780069940068e-09, + "loss": 0.6227, + "step": 7590 + }, + { + "epoch": 2.9827074492874486, + "grad_norm": 1.2711261056787222, + "learning_rate": 1.738030360677323e-09, + "loss": 0.5816, + "step": 7600 + }, + { + "epoch": 2.9866319998037723, + "grad_norm": 0.8400903589420956, + "learning_rate": 1.0378029680391254e-09, + "loss": 0.5403, + "step": 7610 + }, + { + "epoch": 2.990556550320096, + "grad_norm": 2.242115052909609, + "learning_rate": 5.171104648549196e-10, + "loss": 0.6179, + "step": 7620 + }, + { + "epoch": 2.9944811008364196, + "grad_norm": 2.174676570322404, + "learning_rate": 1.759622003427719e-10, + "loss": 0.658, + "step": 7630 + }, + { + "epoch": 2.9984056513527433, + "grad_norm": 1.9961793276061766, + "learning_rate": 1.436429993950661e-11, + "loss": 0.5866, + "step": 7640 + }, + { + "epoch": 2.999975471559273, + "eval_loss": 0.5420118570327759, + "eval_runtime": 1735.1416, + "eval_samples_per_second": 14.408, + "eval_steps_per_second": 3.602, + "step": 7644 + }, + { + "epoch": 2.999975471559273, + "step": 7644, + "total_flos": 1249712503734272.0, + "train_loss": 0.20236909012093338, + "train_runtime": 73063.4732, + "train_samples_per_second": 13.392, + "train_steps_per_second": 0.105 + } + ], + "logging_steps": 10, + "max_steps": 7644, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1249712503734272.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}