{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995258416311048, "eval_steps": 500, "global_step": 1054, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000948316737790422, "grad_norm": 12.246086019020522, "learning_rate": 3.1545741324921134e-08, "loss": 0.3675, "step": 1 }, { "epoch": 0.001896633475580844, "grad_norm": 11.694026905196086, "learning_rate": 6.309148264984227e-08, "loss": 0.3513, "step": 2 }, { "epoch": 0.002844950213371266, "grad_norm": 10.688899348005608, "learning_rate": 9.463722397476342e-08, "loss": 0.3177, "step": 3 }, { "epoch": 0.003793266951161688, "grad_norm": 11.921298417211082, "learning_rate": 1.2618296529968454e-07, "loss": 0.3556, "step": 4 }, { "epoch": 0.00474158368895211, "grad_norm": 11.282902382225787, "learning_rate": 1.5772870662460568e-07, "loss": 0.3662, "step": 5 }, { "epoch": 0.005689900426742532, "grad_norm": 12.449826286939778, "learning_rate": 1.8927444794952683e-07, "loss": 0.3546, "step": 6 }, { "epoch": 0.006638217164532954, "grad_norm": 11.727097480206721, "learning_rate": 2.2082018927444798e-07, "loss": 0.3635, "step": 7 }, { "epoch": 0.007586533902323376, "grad_norm": 11.441751312661113, "learning_rate": 2.5236593059936907e-07, "loss": 0.3636, "step": 8 }, { "epoch": 0.008534850640113799, "grad_norm": 10.632930884848795, "learning_rate": 2.8391167192429027e-07, "loss": 0.2923, "step": 9 }, { "epoch": 0.00948316737790422, "grad_norm": 11.025857208188647, "learning_rate": 3.1545741324921137e-07, "loss": 0.3449, "step": 10 }, { "epoch": 0.010431484115694643, "grad_norm": 11.86359857266447, "learning_rate": 3.470031545741325e-07, "loss": 0.3354, "step": 11 }, { "epoch": 0.011379800853485065, "grad_norm": 11.01751351812872, "learning_rate": 3.7854889589905366e-07, "loss": 0.3369, "step": 12 }, { "epoch": 0.012328117591275486, "grad_norm": 9.502190495628849, "learning_rate": 4.100946372239748e-07, "loss": 0.3179, "step": 13 }, { "epoch": 0.013276434329065908, "grad_norm": 7.858408977040966, "learning_rate": 4.4164037854889596e-07, "loss": 0.2565, "step": 14 }, { "epoch": 0.01422475106685633, "grad_norm": 8.154333698814211, "learning_rate": 4.7318611987381705e-07, "loss": 0.2589, "step": 15 }, { "epoch": 0.015173067804646752, "grad_norm": 8.475444781638856, "learning_rate": 5.047318611987381e-07, "loss": 0.3001, "step": 16 }, { "epoch": 0.016121384542437174, "grad_norm": 7.13737669899092, "learning_rate": 5.362776025236594e-07, "loss": 0.2641, "step": 17 }, { "epoch": 0.017069701280227598, "grad_norm": 5.554684184392653, "learning_rate": 5.678233438485805e-07, "loss": 0.1902, "step": 18 }, { "epoch": 0.018018018018018018, "grad_norm": 4.568211300813283, "learning_rate": 5.993690851735017e-07, "loss": 0.208, "step": 19 }, { "epoch": 0.01896633475580844, "grad_norm": 4.7579569152913646, "learning_rate": 6.309148264984227e-07, "loss": 0.1994, "step": 20 }, { "epoch": 0.01991465149359886, "grad_norm": 4.7128465676673486, "learning_rate": 6.62460567823344e-07, "loss": 0.229, "step": 21 }, { "epoch": 0.020862968231389285, "grad_norm": 4.005405411985473, "learning_rate": 6.94006309148265e-07, "loss": 0.2095, "step": 22 }, { "epoch": 0.021811284969179705, "grad_norm": 4.676075338959145, "learning_rate": 7.255520504731863e-07, "loss": 0.2178, "step": 23 }, { "epoch": 0.02275960170697013, "grad_norm": 2.3652635706654435, "learning_rate": 7.570977917981073e-07, "loss": 0.1524, "step": 24 }, { "epoch": 0.02370791844476055, "grad_norm": 2.685337556789167, "learning_rate": 7.886435331230284e-07, "loss": 0.1672, "step": 25 }, { "epoch": 0.024656235182550973, "grad_norm": 2.430942973848189, "learning_rate": 8.201892744479496e-07, "loss": 0.1526, "step": 26 }, { "epoch": 0.025604551920341393, "grad_norm": 2.6467399445694286, "learning_rate": 8.517350157728707e-07, "loss": 0.1605, "step": 27 }, { "epoch": 0.026552868658131817, "grad_norm": 1.9805248826128374, "learning_rate": 8.832807570977919e-07, "loss": 0.1223, "step": 28 }, { "epoch": 0.027501185395922237, "grad_norm": 2.2695664454959785, "learning_rate": 9.148264984227131e-07, "loss": 0.1991, "step": 29 }, { "epoch": 0.02844950213371266, "grad_norm": 2.221333597086963, "learning_rate": 9.463722397476341e-07, "loss": 0.1466, "step": 30 }, { "epoch": 0.02939781887150308, "grad_norm": 2.4412316593782633, "learning_rate": 9.779179810725552e-07, "loss": 0.1757, "step": 31 }, { "epoch": 0.030346135609293504, "grad_norm": 2.3894901293863198, "learning_rate": 1.0094637223974763e-06, "loss": 0.1467, "step": 32 }, { "epoch": 0.031294452347083924, "grad_norm": 2.2254181707911003, "learning_rate": 1.0410094637223975e-06, "loss": 0.1403, "step": 33 }, { "epoch": 0.03224276908487435, "grad_norm": 2.0835670435267573, "learning_rate": 1.0725552050473188e-06, "loss": 0.1295, "step": 34 }, { "epoch": 0.03319108582266477, "grad_norm": 2.490534255553767, "learning_rate": 1.1041009463722398e-06, "loss": 0.1781, "step": 35 }, { "epoch": 0.034139402560455195, "grad_norm": 2.4852979753797526, "learning_rate": 1.135646687697161e-06, "loss": 0.1635, "step": 36 }, { "epoch": 0.03508771929824561, "grad_norm": 2.1821267836376093, "learning_rate": 1.1671924290220821e-06, "loss": 0.1536, "step": 37 }, { "epoch": 0.036036036036036036, "grad_norm": 2.0995981360286193, "learning_rate": 1.1987381703470034e-06, "loss": 0.1892, "step": 38 }, { "epoch": 0.03698435277382646, "grad_norm": 1.7671849396147046, "learning_rate": 1.2302839116719244e-06, "loss": 0.1441, "step": 39 }, { "epoch": 0.03793266951161688, "grad_norm": 1.9145592423222202, "learning_rate": 1.2618296529968455e-06, "loss": 0.1515, "step": 40 }, { "epoch": 0.0388809862494073, "grad_norm": 2.909485299628588, "learning_rate": 1.2933753943217667e-06, "loss": 0.176, "step": 41 }, { "epoch": 0.03982930298719772, "grad_norm": 1.6871888183428478, "learning_rate": 1.324921135646688e-06, "loss": 0.1215, "step": 42 }, { "epoch": 0.04077761972498815, "grad_norm": 1.5396564532901138, "learning_rate": 1.3564668769716088e-06, "loss": 0.1334, "step": 43 }, { "epoch": 0.04172593646277857, "grad_norm": 2.205033481070129, "learning_rate": 1.38801261829653e-06, "loss": 0.1397, "step": 44 }, { "epoch": 0.04267425320056899, "grad_norm": 1.8497757762613358, "learning_rate": 1.4195583596214513e-06, "loss": 0.1274, "step": 45 }, { "epoch": 0.04362256993835941, "grad_norm": 1.9376215434540043, "learning_rate": 1.4511041009463726e-06, "loss": 0.1228, "step": 46 }, { "epoch": 0.044570886676149835, "grad_norm": 1.594889970345864, "learning_rate": 1.4826498422712934e-06, "loss": 0.1137, "step": 47 }, { "epoch": 0.04551920341394026, "grad_norm": 1.7592570423176281, "learning_rate": 1.5141955835962146e-06, "loss": 0.141, "step": 48 }, { "epoch": 0.046467520151730675, "grad_norm": 1.6146283602515956, "learning_rate": 1.545741324921136e-06, "loss": 0.1428, "step": 49 }, { "epoch": 0.0474158368895211, "grad_norm": 1.503278573378982, "learning_rate": 1.5772870662460567e-06, "loss": 0.1318, "step": 50 }, { "epoch": 0.04836415362731152, "grad_norm": 1.37572777178569, "learning_rate": 1.608832807570978e-06, "loss": 0.1315, "step": 51 }, { "epoch": 0.049312470365101946, "grad_norm": 1.6002275154635794, "learning_rate": 1.6403785488958992e-06, "loss": 0.0935, "step": 52 }, { "epoch": 0.05026078710289237, "grad_norm": 1.9567696662008847, "learning_rate": 1.6719242902208203e-06, "loss": 0.1271, "step": 53 }, { "epoch": 0.051209103840682786, "grad_norm": 1.601626063178932, "learning_rate": 1.7034700315457413e-06, "loss": 0.0959, "step": 54 }, { "epoch": 0.05215742057847321, "grad_norm": 1.886431535590579, "learning_rate": 1.7350157728706626e-06, "loss": 0.1218, "step": 55 }, { "epoch": 0.05310573731626363, "grad_norm": 1.5354494166136305, "learning_rate": 1.7665615141955838e-06, "loss": 0.1139, "step": 56 }, { "epoch": 0.05405405405405406, "grad_norm": 2.311230053300576, "learning_rate": 1.7981072555205049e-06, "loss": 0.1426, "step": 57 }, { "epoch": 0.055002370791844474, "grad_norm": 1.6253071180005185, "learning_rate": 1.8296529968454261e-06, "loss": 0.1175, "step": 58 }, { "epoch": 0.0559506875296349, "grad_norm": 1.3821063491809322, "learning_rate": 1.8611987381703472e-06, "loss": 0.132, "step": 59 }, { "epoch": 0.05689900426742532, "grad_norm": 1.7624392868013044, "learning_rate": 1.8927444794952682e-06, "loss": 0.1221, "step": 60 }, { "epoch": 0.057847321005215745, "grad_norm": 1.3398437874784876, "learning_rate": 1.9242902208201892e-06, "loss": 0.125, "step": 61 }, { "epoch": 0.05879563774300616, "grad_norm": 1.562570182505017, "learning_rate": 1.9558359621451105e-06, "loss": 0.1413, "step": 62 }, { "epoch": 0.059743954480796585, "grad_norm": 1.6769755616188486, "learning_rate": 1.9873817034700317e-06, "loss": 0.1559, "step": 63 }, { "epoch": 0.06069227121858701, "grad_norm": 1.3917364499829268, "learning_rate": 2.0189274447949526e-06, "loss": 0.1377, "step": 64 }, { "epoch": 0.06164058795637743, "grad_norm": 1.8502674559797263, "learning_rate": 2.050473186119874e-06, "loss": 0.1487, "step": 65 }, { "epoch": 0.06258890469416785, "grad_norm": 3.158783977874437, "learning_rate": 2.082018927444795e-06, "loss": 0.119, "step": 66 }, { "epoch": 0.06353722143195828, "grad_norm": 1.811584236109641, "learning_rate": 2.1135646687697163e-06, "loss": 0.122, "step": 67 }, { "epoch": 0.0644855381697487, "grad_norm": 2.917344328319794, "learning_rate": 2.1451104100946376e-06, "loss": 0.1313, "step": 68 }, { "epoch": 0.06543385490753911, "grad_norm": 1.8029019845335916, "learning_rate": 2.1766561514195584e-06, "loss": 0.1138, "step": 69 }, { "epoch": 0.06638217164532954, "grad_norm": 1.6898543330406532, "learning_rate": 2.2082018927444797e-06, "loss": 0.1191, "step": 70 }, { "epoch": 0.06733048838311996, "grad_norm": 2.2925732127308214, "learning_rate": 2.239747634069401e-06, "loss": 0.1306, "step": 71 }, { "epoch": 0.06827880512091039, "grad_norm": 1.4433490292568716, "learning_rate": 2.271293375394322e-06, "loss": 0.1055, "step": 72 }, { "epoch": 0.06922712185870081, "grad_norm": 1.3862506183642664, "learning_rate": 2.302839116719243e-06, "loss": 0.1075, "step": 73 }, { "epoch": 0.07017543859649122, "grad_norm": 1.2816575561632197, "learning_rate": 2.3343848580441643e-06, "loss": 0.1028, "step": 74 }, { "epoch": 0.07112375533428165, "grad_norm": 1.893923472034316, "learning_rate": 2.3659305993690855e-06, "loss": 0.1011, "step": 75 }, { "epoch": 0.07207207207207207, "grad_norm": 1.6025824634915868, "learning_rate": 2.3974763406940068e-06, "loss": 0.1317, "step": 76 }, { "epoch": 0.07302038880986249, "grad_norm": 1.7176261068301808, "learning_rate": 2.4290220820189276e-06, "loss": 0.1447, "step": 77 }, { "epoch": 0.07396870554765292, "grad_norm": 2.4231160050612863, "learning_rate": 2.460567823343849e-06, "loss": 0.1384, "step": 78 }, { "epoch": 0.07491702228544334, "grad_norm": 1.2193411546548798, "learning_rate": 2.49211356466877e-06, "loss": 0.0992, "step": 79 }, { "epoch": 0.07586533902323377, "grad_norm": 1.5164983059809367, "learning_rate": 2.523659305993691e-06, "loss": 0.1001, "step": 80 }, { "epoch": 0.07681365576102418, "grad_norm": 1.6017905795769134, "learning_rate": 2.5552050473186126e-06, "loss": 0.1048, "step": 81 }, { "epoch": 0.0777619724988146, "grad_norm": 1.1836761079302904, "learning_rate": 2.5867507886435334e-06, "loss": 0.0982, "step": 82 }, { "epoch": 0.07871028923660503, "grad_norm": 3.3493572839513566, "learning_rate": 2.6182965299684543e-06, "loss": 0.1184, "step": 83 }, { "epoch": 0.07965860597439545, "grad_norm": 2.0313461386241722, "learning_rate": 2.649842271293376e-06, "loss": 0.1174, "step": 84 }, { "epoch": 0.08060692271218586, "grad_norm": 1.7152579326543271, "learning_rate": 2.6813880126182968e-06, "loss": 0.117, "step": 85 }, { "epoch": 0.0815552394499763, "grad_norm": 1.9082671591126898, "learning_rate": 2.7129337539432176e-06, "loss": 0.1538, "step": 86 }, { "epoch": 0.08250355618776671, "grad_norm": 1.1544236926306861, "learning_rate": 2.7444794952681393e-06, "loss": 0.0906, "step": 87 }, { "epoch": 0.08345187292555714, "grad_norm": 1.2516823902614436, "learning_rate": 2.77602523659306e-06, "loss": 0.1452, "step": 88 }, { "epoch": 0.08440018966334756, "grad_norm": 1.0339206815219761, "learning_rate": 2.807570977917981e-06, "loss": 0.0836, "step": 89 }, { "epoch": 0.08534850640113797, "grad_norm": 1.668394516826565, "learning_rate": 2.8391167192429026e-06, "loss": 0.1129, "step": 90 }, { "epoch": 0.0862968231389284, "grad_norm": 1.497152077632149, "learning_rate": 2.8706624605678234e-06, "loss": 0.1062, "step": 91 }, { "epoch": 0.08724513987671882, "grad_norm": 1.197731872894548, "learning_rate": 2.902208201892745e-06, "loss": 0.1102, "step": 92 }, { "epoch": 0.08819345661450925, "grad_norm": 1.4271367098608596, "learning_rate": 2.933753943217666e-06, "loss": 0.1175, "step": 93 }, { "epoch": 0.08914177335229967, "grad_norm": 1.6337936419255448, "learning_rate": 2.9652996845425868e-06, "loss": 0.1148, "step": 94 }, { "epoch": 0.09009009009009009, "grad_norm": 1.7427850789821318, "learning_rate": 2.9968454258675085e-06, "loss": 0.1246, "step": 95 }, { "epoch": 0.09103840682788052, "grad_norm": 1.2870967429199511, "learning_rate": 3.0283911671924293e-06, "loss": 0.087, "step": 96 }, { "epoch": 0.09198672356567093, "grad_norm": 1.1311991406490958, "learning_rate": 3.05993690851735e-06, "loss": 0.0996, "step": 97 }, { "epoch": 0.09293504030346135, "grad_norm": 1.3742928108454626, "learning_rate": 3.091482649842272e-06, "loss": 0.1293, "step": 98 }, { "epoch": 0.09388335704125178, "grad_norm": 1.516028333204866, "learning_rate": 3.1230283911671926e-06, "loss": 0.1078, "step": 99 }, { "epoch": 0.0948316737790422, "grad_norm": 1.1404699421620854, "learning_rate": 3.1545741324921135e-06, "loss": 0.1142, "step": 100 }, { "epoch": 0.09577999051683263, "grad_norm": 1.7924070029612504, "learning_rate": 3.186119873817035e-06, "loss": 0.1419, "step": 101 }, { "epoch": 0.09672830725462304, "grad_norm": 1.7297435544466835, "learning_rate": 3.217665615141956e-06, "loss": 0.117, "step": 102 }, { "epoch": 0.09767662399241346, "grad_norm": 1.2304625316537265, "learning_rate": 3.2492113564668772e-06, "loss": 0.0834, "step": 103 }, { "epoch": 0.09862494073020389, "grad_norm": 1.6554297434059837, "learning_rate": 3.2807570977917985e-06, "loss": 0.1251, "step": 104 }, { "epoch": 0.09957325746799431, "grad_norm": 1.9749022078409877, "learning_rate": 3.3123028391167193e-06, "loss": 0.1485, "step": 105 }, { "epoch": 0.10052157420578474, "grad_norm": 1.7816458729316766, "learning_rate": 3.3438485804416405e-06, "loss": 0.1168, "step": 106 }, { "epoch": 0.10146989094357516, "grad_norm": 1.6366795934026652, "learning_rate": 3.375394321766562e-06, "loss": 0.1221, "step": 107 }, { "epoch": 0.10241820768136557, "grad_norm": 1.0846701931516913, "learning_rate": 3.4069400630914826e-06, "loss": 0.086, "step": 108 }, { "epoch": 0.103366524419156, "grad_norm": 1.374499027535318, "learning_rate": 3.4384858044164043e-06, "loss": 0.1097, "step": 109 }, { "epoch": 0.10431484115694642, "grad_norm": 1.7733166976712489, "learning_rate": 3.470031545741325e-06, "loss": 0.1098, "step": 110 }, { "epoch": 0.10526315789473684, "grad_norm": 2.980678296553409, "learning_rate": 3.5015772870662464e-06, "loss": 0.0917, "step": 111 }, { "epoch": 0.10621147463252727, "grad_norm": 0.9904577188744437, "learning_rate": 3.5331230283911676e-06, "loss": 0.0777, "step": 112 }, { "epoch": 0.10715979137031768, "grad_norm": 1.4107631975145143, "learning_rate": 3.5646687697160885e-06, "loss": 0.0902, "step": 113 }, { "epoch": 0.10810810810810811, "grad_norm": 1.786967710835369, "learning_rate": 3.5962145110410097e-06, "loss": 0.0974, "step": 114 }, { "epoch": 0.10905642484589853, "grad_norm": 1.6278373703409408, "learning_rate": 3.627760252365931e-06, "loss": 0.1031, "step": 115 }, { "epoch": 0.11000474158368895, "grad_norm": 1.0856958566494381, "learning_rate": 3.6593059936908522e-06, "loss": 0.0872, "step": 116 }, { "epoch": 0.11095305832147938, "grad_norm": 1.0746142572780863, "learning_rate": 3.690851735015773e-06, "loss": 0.0753, "step": 117 }, { "epoch": 0.1119013750592698, "grad_norm": 1.794687772916648, "learning_rate": 3.7223974763406943e-06, "loss": 0.094, "step": 118 }, { "epoch": 0.11284969179706021, "grad_norm": 2.0574961246450543, "learning_rate": 3.7539432176656156e-06, "loss": 0.1032, "step": 119 }, { "epoch": 0.11379800853485064, "grad_norm": 1.0887603543641189, "learning_rate": 3.7854889589905364e-06, "loss": 0.0869, "step": 120 }, { "epoch": 0.11474632527264106, "grad_norm": 1.4381581196511768, "learning_rate": 3.817034700315458e-06, "loss": 0.105, "step": 121 }, { "epoch": 0.11569464201043149, "grad_norm": 2.0884869908112984, "learning_rate": 3.8485804416403785e-06, "loss": 0.1072, "step": 122 }, { "epoch": 0.1166429587482219, "grad_norm": 1.3918016525882038, "learning_rate": 3.8801261829653e-06, "loss": 0.0995, "step": 123 }, { "epoch": 0.11759127548601232, "grad_norm": 1.1199618265144746, "learning_rate": 3.911671924290221e-06, "loss": 0.0693, "step": 124 }, { "epoch": 0.11853959222380275, "grad_norm": 2.913976726787567, "learning_rate": 3.943217665615142e-06, "loss": 0.1203, "step": 125 }, { "epoch": 0.11948790896159317, "grad_norm": 1.4548880466216083, "learning_rate": 3.9747634069400635e-06, "loss": 0.0891, "step": 126 }, { "epoch": 0.1204362256993836, "grad_norm": 3.1711394720986235, "learning_rate": 4.006309148264985e-06, "loss": 0.1223, "step": 127 }, { "epoch": 0.12138454243717402, "grad_norm": 1.888765811166245, "learning_rate": 4.037854889589905e-06, "loss": 0.13, "step": 128 }, { "epoch": 0.12233285917496443, "grad_norm": 1.2398551211997078, "learning_rate": 4.069400630914827e-06, "loss": 0.1103, "step": 129 }, { "epoch": 0.12328117591275486, "grad_norm": 1.7438322556304724, "learning_rate": 4.100946372239748e-06, "loss": 0.1147, "step": 130 }, { "epoch": 0.12422949265054528, "grad_norm": 0.9363387889716617, "learning_rate": 4.132492113564669e-06, "loss": 0.0995, "step": 131 }, { "epoch": 0.1251778093883357, "grad_norm": 1.446859084810851, "learning_rate": 4.16403785488959e-06, "loss": 0.0994, "step": 132 }, { "epoch": 0.12612612612612611, "grad_norm": 1.1856203072681963, "learning_rate": 4.195583596214511e-06, "loss": 0.0927, "step": 133 }, { "epoch": 0.12707444286391656, "grad_norm": 1.103336827372462, "learning_rate": 4.227129337539433e-06, "loss": 0.0815, "step": 134 }, { "epoch": 0.12802275960170698, "grad_norm": 1.897384655096208, "learning_rate": 4.258675078864354e-06, "loss": 0.1248, "step": 135 }, { "epoch": 0.1289710763394974, "grad_norm": 1.6223901695891558, "learning_rate": 4.290220820189275e-06, "loss": 0.1456, "step": 136 }, { "epoch": 0.1299193930772878, "grad_norm": 1.93689861193564, "learning_rate": 4.321766561514196e-06, "loss": 0.1236, "step": 137 }, { "epoch": 0.13086770981507823, "grad_norm": 1.7202395942479507, "learning_rate": 4.353312302839117e-06, "loss": 0.0994, "step": 138 }, { "epoch": 0.13181602655286867, "grad_norm": 2.1336251410837717, "learning_rate": 4.384858044164038e-06, "loss": 0.0963, "step": 139 }, { "epoch": 0.1327643432906591, "grad_norm": 2.086908025505944, "learning_rate": 4.416403785488959e-06, "loss": 0.1397, "step": 140 }, { "epoch": 0.1337126600284495, "grad_norm": 1.903049841336412, "learning_rate": 4.447949526813881e-06, "loss": 0.1188, "step": 141 }, { "epoch": 0.13466097676623992, "grad_norm": 1.237639055790405, "learning_rate": 4.479495268138802e-06, "loss": 0.0864, "step": 142 }, { "epoch": 0.13560929350403034, "grad_norm": 1.533833989919448, "learning_rate": 4.511041009463723e-06, "loss": 0.1188, "step": 143 }, { "epoch": 0.13655761024182078, "grad_norm": 1.7546010414420699, "learning_rate": 4.542586750788644e-06, "loss": 0.1364, "step": 144 }, { "epoch": 0.1375059269796112, "grad_norm": 2.9799276151902645, "learning_rate": 4.574132492113565e-06, "loss": 0.1226, "step": 145 }, { "epoch": 0.13845424371740161, "grad_norm": 1.1723876001348499, "learning_rate": 4.605678233438486e-06, "loss": 0.086, "step": 146 }, { "epoch": 0.13940256045519203, "grad_norm": 2.069220754870492, "learning_rate": 4.637223974763407e-06, "loss": 0.1196, "step": 147 }, { "epoch": 0.14035087719298245, "grad_norm": 3.9795001087139124, "learning_rate": 4.6687697160883285e-06, "loss": 0.1152, "step": 148 }, { "epoch": 0.14129919393077287, "grad_norm": 1.4634422746453415, "learning_rate": 4.70031545741325e-06, "loss": 0.0916, "step": 149 }, { "epoch": 0.1422475106685633, "grad_norm": 1.3185726560010742, "learning_rate": 4.731861198738171e-06, "loss": 0.0904, "step": 150 }, { "epoch": 0.14319582740635373, "grad_norm": 1.5552910898557228, "learning_rate": 4.7634069400630914e-06, "loss": 0.0899, "step": 151 }, { "epoch": 0.14414414414414414, "grad_norm": 1.0997805514097108, "learning_rate": 4.7949526813880135e-06, "loss": 0.0795, "step": 152 }, { "epoch": 0.14509246088193456, "grad_norm": 1.7076641753438397, "learning_rate": 4.826498422712934e-06, "loss": 0.1081, "step": 153 }, { "epoch": 0.14604077761972498, "grad_norm": 1.6735518250841006, "learning_rate": 4.858044164037855e-06, "loss": 0.1068, "step": 154 }, { "epoch": 0.14698909435751542, "grad_norm": 1.2033878521779449, "learning_rate": 4.8895899053627764e-06, "loss": 0.0934, "step": 155 }, { "epoch": 0.14793741109530584, "grad_norm": 1.4908718795559122, "learning_rate": 4.921135646687698e-06, "loss": 0.1072, "step": 156 }, { "epoch": 0.14888572783309625, "grad_norm": 1.3234990953707453, "learning_rate": 4.952681388012618e-06, "loss": 0.104, "step": 157 }, { "epoch": 0.14983404457088667, "grad_norm": 1.3354249814975963, "learning_rate": 4.98422712933754e-06, "loss": 0.1189, "step": 158 }, { "epoch": 0.1507823613086771, "grad_norm": 1.224445144859879, "learning_rate": 5.015772870662461e-06, "loss": 0.1114, "step": 159 }, { "epoch": 0.15173067804646753, "grad_norm": 1.3554440133569026, "learning_rate": 5.047318611987382e-06, "loss": 0.1278, "step": 160 }, { "epoch": 0.15267899478425795, "grad_norm": 1.4393478098545054, "learning_rate": 5.078864353312303e-06, "loss": 0.1201, "step": 161 }, { "epoch": 0.15362731152204837, "grad_norm": 1.032684740456474, "learning_rate": 5.110410094637225e-06, "loss": 0.0841, "step": 162 }, { "epoch": 0.15457562825983878, "grad_norm": 1.2508286920209446, "learning_rate": 5.141955835962146e-06, "loss": 0.0863, "step": 163 }, { "epoch": 0.1555239449976292, "grad_norm": 1.899093372512286, "learning_rate": 5.173501577287067e-06, "loss": 0.1662, "step": 164 }, { "epoch": 0.15647226173541964, "grad_norm": 1.4000014551423334, "learning_rate": 5.205047318611987e-06, "loss": 0.0909, "step": 165 }, { "epoch": 0.15742057847321006, "grad_norm": 1.9418542456678585, "learning_rate": 5.2365930599369085e-06, "loss": 0.1013, "step": 166 }, { "epoch": 0.15836889521100048, "grad_norm": 1.5538903766146939, "learning_rate": 5.268138801261831e-06, "loss": 0.1177, "step": 167 }, { "epoch": 0.1593172119487909, "grad_norm": 1.3035129364423688, "learning_rate": 5.299684542586752e-06, "loss": 0.0961, "step": 168 }, { "epoch": 0.1602655286865813, "grad_norm": 1.273421849890499, "learning_rate": 5.331230283911672e-06, "loss": 0.1252, "step": 169 }, { "epoch": 0.16121384542437173, "grad_norm": 1.123016604976548, "learning_rate": 5.3627760252365935e-06, "loss": 0.0999, "step": 170 }, { "epoch": 0.16216216216216217, "grad_norm": 1.2409364166994, "learning_rate": 5.394321766561515e-06, "loss": 0.1095, "step": 171 }, { "epoch": 0.1631104788999526, "grad_norm": 1.1082140455460585, "learning_rate": 5.425867507886435e-06, "loss": 0.0736, "step": 172 }, { "epoch": 0.164058795637743, "grad_norm": 1.2872459579560394, "learning_rate": 5.457413249211357e-06, "loss": 0.0928, "step": 173 }, { "epoch": 0.16500711237553342, "grad_norm": 1.3830237110418746, "learning_rate": 5.4889589905362786e-06, "loss": 0.0973, "step": 174 }, { "epoch": 0.16595542911332384, "grad_norm": 1.2546887092347754, "learning_rate": 5.520504731861199e-06, "loss": 0.0832, "step": 175 }, { "epoch": 0.16690374585111428, "grad_norm": 1.1708284069676944, "learning_rate": 5.55205047318612e-06, "loss": 0.1075, "step": 176 }, { "epoch": 0.1678520625889047, "grad_norm": 1.101853335061695, "learning_rate": 5.5835962145110415e-06, "loss": 0.0897, "step": 177 }, { "epoch": 0.16880037932669512, "grad_norm": 1.015907357215909, "learning_rate": 5.615141955835962e-06, "loss": 0.0819, "step": 178 }, { "epoch": 0.16974869606448553, "grad_norm": 1.8752154604515816, "learning_rate": 5.646687697160884e-06, "loss": 0.1021, "step": 179 }, { "epoch": 0.17069701280227595, "grad_norm": 1.6971011710183759, "learning_rate": 5.678233438485805e-06, "loss": 0.0996, "step": 180 }, { "epoch": 0.1716453295400664, "grad_norm": 1.2212507178791898, "learning_rate": 5.709779179810726e-06, "loss": 0.1079, "step": 181 }, { "epoch": 0.1725936462778568, "grad_norm": 1.7343284525300247, "learning_rate": 5.741324921135647e-06, "loss": 0.1292, "step": 182 }, { "epoch": 0.17354196301564723, "grad_norm": 1.4376592014404461, "learning_rate": 5.772870662460568e-06, "loss": 0.1312, "step": 183 }, { "epoch": 0.17449027975343764, "grad_norm": 1.2528619821880524, "learning_rate": 5.80441640378549e-06, "loss": 0.0762, "step": 184 }, { "epoch": 0.17543859649122806, "grad_norm": 1.9247297159171304, "learning_rate": 5.835962145110411e-06, "loss": 0.1403, "step": 185 }, { "epoch": 0.1763869132290185, "grad_norm": 1.5028101353474104, "learning_rate": 5.867507886435332e-06, "loss": 0.1147, "step": 186 }, { "epoch": 0.17733522996680892, "grad_norm": 2.4179600186213714, "learning_rate": 5.899053627760253e-06, "loss": 0.0913, "step": 187 }, { "epoch": 0.17828354670459934, "grad_norm": 1.518835105924909, "learning_rate": 5.9305993690851736e-06, "loss": 0.0918, "step": 188 }, { "epoch": 0.17923186344238975, "grad_norm": 1.6543687104918372, "learning_rate": 5.962145110410095e-06, "loss": 0.122, "step": 189 }, { "epoch": 0.18018018018018017, "grad_norm": 1.4531807393638785, "learning_rate": 5.993690851735017e-06, "loss": 0.1228, "step": 190 }, { "epoch": 0.1811284969179706, "grad_norm": 1.4665808153812976, "learning_rate": 6.025236593059937e-06, "loss": 0.1014, "step": 191 }, { "epoch": 0.18207681365576103, "grad_norm": 1.2889682170490027, "learning_rate": 6.056782334384859e-06, "loss": 0.1055, "step": 192 }, { "epoch": 0.18302513039355145, "grad_norm": 1.3310497561635966, "learning_rate": 6.08832807570978e-06, "loss": 0.119, "step": 193 }, { "epoch": 0.18397344713134187, "grad_norm": 1.3246051325093873, "learning_rate": 6.1198738170347e-06, "loss": 0.1288, "step": 194 }, { "epoch": 0.18492176386913228, "grad_norm": 1.1979924093987135, "learning_rate": 6.1514195583596215e-06, "loss": 0.0877, "step": 195 }, { "epoch": 0.1858700806069227, "grad_norm": 1.1280419900810446, "learning_rate": 6.182965299684544e-06, "loss": 0.1085, "step": 196 }, { "epoch": 0.18681839734471314, "grad_norm": 1.3307017446168579, "learning_rate": 6.214511041009465e-06, "loss": 0.0853, "step": 197 }, { "epoch": 0.18776671408250356, "grad_norm": 1.1814823672365349, "learning_rate": 6.246056782334385e-06, "loss": 0.1066, "step": 198 }, { "epoch": 0.18871503082029398, "grad_norm": 0.7829348670836794, "learning_rate": 6.2776025236593065e-06, "loss": 0.0662, "step": 199 }, { "epoch": 0.1896633475580844, "grad_norm": 1.2435224715978643, "learning_rate": 6.309148264984227e-06, "loss": 0.088, "step": 200 }, { "epoch": 0.1906116642958748, "grad_norm": 1.0014149948809556, "learning_rate": 6.340694006309149e-06, "loss": 0.0975, "step": 201 }, { "epoch": 0.19155998103366526, "grad_norm": 0.9250673471848995, "learning_rate": 6.37223974763407e-06, "loss": 0.0877, "step": 202 }, { "epoch": 0.19250829777145567, "grad_norm": 1.056412139362465, "learning_rate": 6.4037854889589915e-06, "loss": 0.0763, "step": 203 }, { "epoch": 0.1934566145092461, "grad_norm": 0.9891782097788515, "learning_rate": 6.435331230283912e-06, "loss": 0.0834, "step": 204 }, { "epoch": 0.1944049312470365, "grad_norm": 1.0792725374885792, "learning_rate": 6.466876971608833e-06, "loss": 0.0885, "step": 205 }, { "epoch": 0.19535324798482692, "grad_norm": 1.2366811021393578, "learning_rate": 6.4984227129337544e-06, "loss": 0.0954, "step": 206 }, { "epoch": 0.19630156472261737, "grad_norm": 1.024115365006771, "learning_rate": 6.529968454258676e-06, "loss": 0.1215, "step": 207 }, { "epoch": 0.19724988146040778, "grad_norm": 1.2203185957532192, "learning_rate": 6.561514195583597e-06, "loss": 0.1202, "step": 208 }, { "epoch": 0.1981981981981982, "grad_norm": 0.9501403270885721, "learning_rate": 6.593059936908518e-06, "loss": 0.0715, "step": 209 }, { "epoch": 0.19914651493598862, "grad_norm": 1.5511308370546482, "learning_rate": 6.624605678233439e-06, "loss": 0.1089, "step": 210 }, { "epoch": 0.20009483167377903, "grad_norm": 0.9433860573102355, "learning_rate": 6.65615141955836e-06, "loss": 0.0648, "step": 211 }, { "epoch": 0.20104314841156948, "grad_norm": 1.0981902231687461, "learning_rate": 6.687697160883281e-06, "loss": 0.0663, "step": 212 }, { "epoch": 0.2019914651493599, "grad_norm": 1.064443363672458, "learning_rate": 6.719242902208203e-06, "loss": 0.077, "step": 213 }, { "epoch": 0.2029397818871503, "grad_norm": 1.3753290546304533, "learning_rate": 6.750788643533124e-06, "loss": 0.1093, "step": 214 }, { "epoch": 0.20388809862494073, "grad_norm": 1.2200081175269764, "learning_rate": 6.782334384858045e-06, "loss": 0.1094, "step": 215 }, { "epoch": 0.20483641536273114, "grad_norm": 0.9141258918864384, "learning_rate": 6.813880126182965e-06, "loss": 0.0911, "step": 216 }, { "epoch": 0.20578473210052156, "grad_norm": 2.528170753397052, "learning_rate": 6.8454258675078865e-06, "loss": 0.1079, "step": 217 }, { "epoch": 0.206733048838312, "grad_norm": 1.4430688823297448, "learning_rate": 6.876971608832809e-06, "loss": 0.1053, "step": 218 }, { "epoch": 0.20768136557610242, "grad_norm": 1.0186932336289805, "learning_rate": 6.90851735015773e-06, "loss": 0.0861, "step": 219 }, { "epoch": 0.20862968231389284, "grad_norm": 1.1420742589304766, "learning_rate": 6.94006309148265e-06, "loss": 0.094, "step": 220 }, { "epoch": 0.20957799905168326, "grad_norm": 1.2741420533987797, "learning_rate": 6.9716088328075715e-06, "loss": 0.0951, "step": 221 }, { "epoch": 0.21052631578947367, "grad_norm": 0.9075216722351295, "learning_rate": 7.003154574132493e-06, "loss": 0.0866, "step": 222 }, { "epoch": 0.21147463252726412, "grad_norm": 1.1980754719122302, "learning_rate": 7.034700315457413e-06, "loss": 0.0914, "step": 223 }, { "epoch": 0.21242294926505453, "grad_norm": 1.1939921471415105, "learning_rate": 7.066246056782335e-06, "loss": 0.1047, "step": 224 }, { "epoch": 0.21337126600284495, "grad_norm": 0.8519438677271276, "learning_rate": 7.0977917981072565e-06, "loss": 0.0941, "step": 225 }, { "epoch": 0.21431958274063537, "grad_norm": 0.789532854502906, "learning_rate": 7.129337539432177e-06, "loss": 0.0819, "step": 226 }, { "epoch": 0.21526789947842578, "grad_norm": 1.2111156014392817, "learning_rate": 7.160883280757098e-06, "loss": 0.1027, "step": 227 }, { "epoch": 0.21621621621621623, "grad_norm": 1.0588737043402552, "learning_rate": 7.1924290220820195e-06, "loss": 0.0952, "step": 228 }, { "epoch": 0.21716453295400664, "grad_norm": 0.933483217055125, "learning_rate": 7.22397476340694e-06, "loss": 0.0763, "step": 229 }, { "epoch": 0.21811284969179706, "grad_norm": 1.049586247769339, "learning_rate": 7.255520504731862e-06, "loss": 0.0789, "step": 230 }, { "epoch": 0.21906116642958748, "grad_norm": 1.1220808424289264, "learning_rate": 7.287066246056783e-06, "loss": 0.074, "step": 231 }, { "epoch": 0.2200094831673779, "grad_norm": 1.254391611101815, "learning_rate": 7.3186119873817045e-06, "loss": 0.093, "step": 232 }, { "epoch": 0.22095779990516834, "grad_norm": 1.274839766592392, "learning_rate": 7.350157728706625e-06, "loss": 0.0938, "step": 233 }, { "epoch": 0.22190611664295876, "grad_norm": 1.2629251738997191, "learning_rate": 7.381703470031546e-06, "loss": 0.1129, "step": 234 }, { "epoch": 0.22285443338074917, "grad_norm": 1.3595829605121952, "learning_rate": 7.413249211356468e-06, "loss": 0.1062, "step": 235 }, { "epoch": 0.2238027501185396, "grad_norm": 1.353026352957774, "learning_rate": 7.444794952681389e-06, "loss": 0.117, "step": 236 }, { "epoch": 0.22475106685633, "grad_norm": 1.3472351125895725, "learning_rate": 7.47634069400631e-06, "loss": 0.0827, "step": 237 }, { "epoch": 0.22569938359412042, "grad_norm": 0.9510770172761661, "learning_rate": 7.507886435331231e-06, "loss": 0.0759, "step": 238 }, { "epoch": 0.22664770033191087, "grad_norm": 1.2025915899822757, "learning_rate": 7.5394321766561515e-06, "loss": 0.0807, "step": 239 }, { "epoch": 0.22759601706970128, "grad_norm": 1.1640028047547857, "learning_rate": 7.570977917981073e-06, "loss": 0.0709, "step": 240 }, { "epoch": 0.2285443338074917, "grad_norm": 1.5223127858935517, "learning_rate": 7.602523659305995e-06, "loss": 0.1018, "step": 241 }, { "epoch": 0.22949265054528212, "grad_norm": 1.8495916864800697, "learning_rate": 7.634069400630916e-06, "loss": 0.0968, "step": 242 }, { "epoch": 0.23044096728307253, "grad_norm": 1.8476848640745251, "learning_rate": 7.665615141955837e-06, "loss": 0.086, "step": 243 }, { "epoch": 0.23138928402086298, "grad_norm": 1.4644626825262619, "learning_rate": 7.697160883280757e-06, "loss": 0.0974, "step": 244 }, { "epoch": 0.2323376007586534, "grad_norm": 1.8857810882326624, "learning_rate": 7.728706624605679e-06, "loss": 0.1036, "step": 245 }, { "epoch": 0.2332859174964438, "grad_norm": 1.7638762752182895, "learning_rate": 7.7602523659306e-06, "loss": 0.1097, "step": 246 }, { "epoch": 0.23423423423423423, "grad_norm": 1.2348758426158113, "learning_rate": 7.791798107255522e-06, "loss": 0.0866, "step": 247 }, { "epoch": 0.23518255097202465, "grad_norm": 1.1223471436540764, "learning_rate": 7.823343848580442e-06, "loss": 0.0564, "step": 248 }, { "epoch": 0.2361308677098151, "grad_norm": 0.8821001750676984, "learning_rate": 7.854889589905364e-06, "loss": 0.0696, "step": 249 }, { "epoch": 0.2370791844476055, "grad_norm": 0.9899264223411232, "learning_rate": 7.886435331230284e-06, "loss": 0.0702, "step": 250 }, { "epoch": 0.23802750118539592, "grad_norm": 0.9289219027994224, "learning_rate": 7.917981072555205e-06, "loss": 0.0843, "step": 251 }, { "epoch": 0.23897581792318634, "grad_norm": 1.0579670590751298, "learning_rate": 7.949526813880127e-06, "loss": 0.0921, "step": 252 }, { "epoch": 0.23992413466097676, "grad_norm": 1.4593486745973783, "learning_rate": 7.981072555205049e-06, "loss": 0.1229, "step": 253 }, { "epoch": 0.2408724513987672, "grad_norm": 0.9496576247693762, "learning_rate": 8.01261829652997e-06, "loss": 0.0861, "step": 254 }, { "epoch": 0.24182076813655762, "grad_norm": 1.1030565317688061, "learning_rate": 8.04416403785489e-06, "loss": 0.0893, "step": 255 }, { "epoch": 0.24276908487434803, "grad_norm": 0.9907604990146169, "learning_rate": 8.07570977917981e-06, "loss": 0.0928, "step": 256 }, { "epoch": 0.24371740161213845, "grad_norm": 0.9460810229319789, "learning_rate": 8.107255520504732e-06, "loss": 0.0974, "step": 257 }, { "epoch": 0.24466571834992887, "grad_norm": 0.8329291976282354, "learning_rate": 8.138801261829655e-06, "loss": 0.077, "step": 258 }, { "epoch": 0.24561403508771928, "grad_norm": 0.8587085474520708, "learning_rate": 8.170347003154575e-06, "loss": 0.0837, "step": 259 }, { "epoch": 0.24656235182550973, "grad_norm": 0.9113223159844124, "learning_rate": 8.201892744479495e-06, "loss": 0.088, "step": 260 }, { "epoch": 0.24751066856330015, "grad_norm": 0.8328940868524983, "learning_rate": 8.233438485804417e-06, "loss": 0.091, "step": 261 }, { "epoch": 0.24845898530109056, "grad_norm": 1.4264090310082065, "learning_rate": 8.264984227129338e-06, "loss": 0.1354, "step": 262 }, { "epoch": 0.24940730203888098, "grad_norm": 1.0550225951223755, "learning_rate": 8.296529968454258e-06, "loss": 0.0972, "step": 263 }, { "epoch": 0.2503556187766714, "grad_norm": 1.053508559451355, "learning_rate": 8.32807570977918e-06, "loss": 0.1035, "step": 264 }, { "epoch": 0.25130393551446184, "grad_norm": 1.4971087544821369, "learning_rate": 8.359621451104102e-06, "loss": 0.1001, "step": 265 }, { "epoch": 0.25225225225225223, "grad_norm": 1.075521297085326, "learning_rate": 8.391167192429023e-06, "loss": 0.0923, "step": 266 }, { "epoch": 0.2532005689900427, "grad_norm": 1.6910075728505873, "learning_rate": 8.422712933753943e-06, "loss": 0.1212, "step": 267 }, { "epoch": 0.2541488857278331, "grad_norm": 1.5073460991202734, "learning_rate": 8.454258675078865e-06, "loss": 0.087, "step": 268 }, { "epoch": 0.2550972024656235, "grad_norm": 1.0201575671512444, "learning_rate": 8.485804416403787e-06, "loss": 0.0871, "step": 269 }, { "epoch": 0.25604551920341395, "grad_norm": 1.1193230353064818, "learning_rate": 8.517350157728708e-06, "loss": 0.1031, "step": 270 }, { "epoch": 0.25699383594120434, "grad_norm": 1.3593779355277376, "learning_rate": 8.548895899053628e-06, "loss": 0.0861, "step": 271 }, { "epoch": 0.2579421526789948, "grad_norm": 1.5824627519870196, "learning_rate": 8.58044164037855e-06, "loss": 0.0998, "step": 272 }, { "epoch": 0.25889046941678523, "grad_norm": 2.316620691088296, "learning_rate": 8.61198738170347e-06, "loss": 0.1237, "step": 273 }, { "epoch": 0.2598387861545756, "grad_norm": 1.3708391836342668, "learning_rate": 8.643533123028391e-06, "loss": 0.0806, "step": 274 }, { "epoch": 0.26078710289236606, "grad_norm": 1.259879695037933, "learning_rate": 8.675078864353313e-06, "loss": 0.088, "step": 275 }, { "epoch": 0.26173541963015645, "grad_norm": 1.236718933875791, "learning_rate": 8.706624605678234e-06, "loss": 0.0842, "step": 276 }, { "epoch": 0.2626837363679469, "grad_norm": 1.438488419989871, "learning_rate": 8.738170347003156e-06, "loss": 0.0955, "step": 277 }, { "epoch": 0.26363205310573734, "grad_norm": 0.9563516338397714, "learning_rate": 8.769716088328076e-06, "loss": 0.0761, "step": 278 }, { "epoch": 0.26458036984352773, "grad_norm": 1.2728124128011007, "learning_rate": 8.801261829652997e-06, "loss": 0.0805, "step": 279 }, { "epoch": 0.2655286865813182, "grad_norm": 1.2205595373118223, "learning_rate": 8.832807570977919e-06, "loss": 0.0879, "step": 280 }, { "epoch": 0.26647700331910856, "grad_norm": 0.959493141925286, "learning_rate": 8.86435331230284e-06, "loss": 0.0728, "step": 281 }, { "epoch": 0.267425320056899, "grad_norm": 1.4340945839201555, "learning_rate": 8.895899053627761e-06, "loss": 0.0897, "step": 282 }, { "epoch": 0.26837363679468945, "grad_norm": 1.0061297486879381, "learning_rate": 8.927444794952682e-06, "loss": 0.0857, "step": 283 }, { "epoch": 0.26932195353247984, "grad_norm": 1.5459293734675696, "learning_rate": 8.958990536277604e-06, "loss": 0.1029, "step": 284 }, { "epoch": 0.2702702702702703, "grad_norm": 1.3222303946698841, "learning_rate": 8.990536277602524e-06, "loss": 0.084, "step": 285 }, { "epoch": 0.2712185870080607, "grad_norm": 1.185863549947665, "learning_rate": 9.022082018927446e-06, "loss": 0.1311, "step": 286 }, { "epoch": 0.2721669037458511, "grad_norm": 0.8959238307125761, "learning_rate": 9.053627760252367e-06, "loss": 0.067, "step": 287 }, { "epoch": 0.27311522048364156, "grad_norm": 1.369443136318961, "learning_rate": 9.085173501577289e-06, "loss": 0.1093, "step": 288 }, { "epoch": 0.27406353722143195, "grad_norm": 1.1052390238476015, "learning_rate": 9.116719242902209e-06, "loss": 0.103, "step": 289 }, { "epoch": 0.2750118539592224, "grad_norm": 1.325059650748033, "learning_rate": 9.14826498422713e-06, "loss": 0.1111, "step": 290 }, { "epoch": 0.2759601706970128, "grad_norm": 1.3248936963910136, "learning_rate": 9.17981072555205e-06, "loss": 0.0933, "step": 291 }, { "epoch": 0.27690848743480323, "grad_norm": 1.127118183479871, "learning_rate": 9.211356466876972e-06, "loss": 0.0891, "step": 292 }, { "epoch": 0.2778568041725936, "grad_norm": 1.3108916887707827, "learning_rate": 9.242902208201894e-06, "loss": 0.0939, "step": 293 }, { "epoch": 0.27880512091038406, "grad_norm": 1.0013886049046197, "learning_rate": 9.274447949526815e-06, "loss": 0.0692, "step": 294 }, { "epoch": 0.2797534376481745, "grad_norm": 1.1156101698361054, "learning_rate": 9.305993690851735e-06, "loss": 0.0868, "step": 295 }, { "epoch": 0.2807017543859649, "grad_norm": 1.2522202479933553, "learning_rate": 9.337539432176657e-06, "loss": 0.0914, "step": 296 }, { "epoch": 0.28165007112375534, "grad_norm": 1.3755827124206237, "learning_rate": 9.369085173501577e-06, "loss": 0.0936, "step": 297 }, { "epoch": 0.28259838786154573, "grad_norm": 1.4694162511089293, "learning_rate": 9.4006309148265e-06, "loss": 0.1071, "step": 298 }, { "epoch": 0.2835467045993362, "grad_norm": 1.255879045911956, "learning_rate": 9.43217665615142e-06, "loss": 0.0815, "step": 299 }, { "epoch": 0.2844950213371266, "grad_norm": 1.560204819302283, "learning_rate": 9.463722397476342e-06, "loss": 0.1234, "step": 300 }, { "epoch": 0.285443338074917, "grad_norm": 1.0121817898281276, "learning_rate": 9.495268138801262e-06, "loss": 0.0595, "step": 301 }, { "epoch": 0.28639165481270745, "grad_norm": 1.0711466156341418, "learning_rate": 9.526813880126183e-06, "loss": 0.0641, "step": 302 }, { "epoch": 0.28733997155049784, "grad_norm": 1.1496695710149105, "learning_rate": 9.558359621451105e-06, "loss": 0.0761, "step": 303 }, { "epoch": 0.2882882882882883, "grad_norm": 1.2059272704315518, "learning_rate": 9.589905362776027e-06, "loss": 0.0756, "step": 304 }, { "epoch": 0.28923660502607873, "grad_norm": 1.0424292745296735, "learning_rate": 9.621451104100947e-06, "loss": 0.0855, "step": 305 }, { "epoch": 0.2901849217638691, "grad_norm": 1.1497786768197902, "learning_rate": 9.652996845425868e-06, "loss": 0.071, "step": 306 }, { "epoch": 0.29113323850165956, "grad_norm": 1.3472444992692172, "learning_rate": 9.68454258675079e-06, "loss": 0.0934, "step": 307 }, { "epoch": 0.29208155523944995, "grad_norm": 1.3345310370843513, "learning_rate": 9.71608832807571e-06, "loss": 0.0998, "step": 308 }, { "epoch": 0.2930298719772404, "grad_norm": 1.01109508034154, "learning_rate": 9.747634069400632e-06, "loss": 0.0762, "step": 309 }, { "epoch": 0.29397818871503084, "grad_norm": 0.9249973635125475, "learning_rate": 9.779179810725553e-06, "loss": 0.074, "step": 310 }, { "epoch": 0.29492650545282123, "grad_norm": 0.804446344253587, "learning_rate": 9.810725552050473e-06, "loss": 0.0517, "step": 311 }, { "epoch": 0.2958748221906117, "grad_norm": 0.965596925556689, "learning_rate": 9.842271293375395e-06, "loss": 0.098, "step": 312 }, { "epoch": 0.29682313892840206, "grad_norm": 2.012807451707843, "learning_rate": 9.873817034700316e-06, "loss": 0.1038, "step": 313 }, { "epoch": 0.2977714556661925, "grad_norm": 1.2864066063043205, "learning_rate": 9.905362776025236e-06, "loss": 0.1102, "step": 314 }, { "epoch": 0.29871977240398295, "grad_norm": 0.8775284858258785, "learning_rate": 9.936908517350158e-06, "loss": 0.0913, "step": 315 }, { "epoch": 0.29966808914177334, "grad_norm": 0.9395466275555749, "learning_rate": 9.96845425867508e-06, "loss": 0.1156, "step": 316 }, { "epoch": 0.3006164058795638, "grad_norm": 1.031977177693936, "learning_rate": 1e-05, "loss": 0.0772, "step": 317 }, { "epoch": 0.3015647226173542, "grad_norm": 0.906696222035988, "learning_rate": 9.999996951577431e-06, "loss": 0.0745, "step": 318 }, { "epoch": 0.3025130393551446, "grad_norm": 1.6486632782552955, "learning_rate": 9.999987806313436e-06, "loss": 0.1295, "step": 319 }, { "epoch": 0.30346135609293506, "grad_norm": 1.0682004904191784, "learning_rate": 9.999972564219169e-06, "loss": 0.089, "step": 320 }, { "epoch": 0.30440967283072545, "grad_norm": 1.0160084965418597, "learning_rate": 9.999951225313217e-06, "loss": 0.0795, "step": 321 }, { "epoch": 0.3053579895685159, "grad_norm": 1.1229797355618714, "learning_rate": 9.999923789621598e-06, "loss": 0.0924, "step": 322 }, { "epoch": 0.3063063063063063, "grad_norm": 0.9925832526069106, "learning_rate": 9.999890257177766e-06, "loss": 0.0803, "step": 323 }, { "epoch": 0.30725462304409673, "grad_norm": 1.1785860516178814, "learning_rate": 9.999850628022611e-06, "loss": 0.0797, "step": 324 }, { "epoch": 0.3082029397818872, "grad_norm": 1.1520304204509717, "learning_rate": 9.999804902204455e-06, "loss": 0.0775, "step": 325 }, { "epoch": 0.30915125651967756, "grad_norm": 1.0880132191910508, "learning_rate": 9.999753079779054e-06, "loss": 0.0906, "step": 326 }, { "epoch": 0.310099573257468, "grad_norm": 1.5767657455822397, "learning_rate": 9.999695160809598e-06, "loss": 0.0956, "step": 327 }, { "epoch": 0.3110478899952584, "grad_norm": 0.7125012678361342, "learning_rate": 9.999631145366713e-06, "loss": 0.0661, "step": 328 }, { "epoch": 0.31199620673304884, "grad_norm": 1.088584252037159, "learning_rate": 9.999561033528457e-06, "loss": 0.1149, "step": 329 }, { "epoch": 0.3129445234708393, "grad_norm": 0.8523222222870042, "learning_rate": 9.999484825380323e-06, "loss": 0.0913, "step": 330 }, { "epoch": 0.3138928402086297, "grad_norm": 1.0164571883774136, "learning_rate": 9.999402521015236e-06, "loss": 0.0878, "step": 331 }, { "epoch": 0.3148411569464201, "grad_norm": 0.7164573705993513, "learning_rate": 9.999314120533557e-06, "loss": 0.0866, "step": 332 }, { "epoch": 0.3157894736842105, "grad_norm": 0.7954216406429697, "learning_rate": 9.999219624043075e-06, "loss": 0.0702, "step": 333 }, { "epoch": 0.31673779042200095, "grad_norm": 0.7996263107367133, "learning_rate": 9.99911903165902e-06, "loss": 0.0758, "step": 334 }, { "epoch": 0.3176861071597914, "grad_norm": 1.101451187378474, "learning_rate": 9.999012343504049e-06, "loss": 0.0957, "step": 335 }, { "epoch": 0.3186344238975818, "grad_norm": 0.7265535166036453, "learning_rate": 9.998899559708254e-06, "loss": 0.0743, "step": 336 }, { "epoch": 0.31958274063537223, "grad_norm": 1.272801256055057, "learning_rate": 9.998780680409161e-06, "loss": 0.0952, "step": 337 }, { "epoch": 0.3205310573731626, "grad_norm": 0.8770881337944402, "learning_rate": 9.99865570575173e-06, "loss": 0.066, "step": 338 }, { "epoch": 0.32147937411095306, "grad_norm": 1.0607119132841634, "learning_rate": 9.998524635888347e-06, "loss": 0.0913, "step": 339 }, { "epoch": 0.32242769084874345, "grad_norm": 0.9189346974278031, "learning_rate": 9.998387470978837e-06, "loss": 0.0881, "step": 340 }, { "epoch": 0.3233760075865339, "grad_norm": 0.7272168469454553, "learning_rate": 9.998244211190454e-06, "loss": 0.0713, "step": 341 }, { "epoch": 0.32432432432432434, "grad_norm": 0.9819255696828616, "learning_rate": 9.998094856697885e-06, "loss": 0.0834, "step": 342 }, { "epoch": 0.32527264106211473, "grad_norm": 0.6857773270509248, "learning_rate": 9.997939407683249e-06, "loss": 0.0524, "step": 343 }, { "epoch": 0.3262209577999052, "grad_norm": 1.0324591704355464, "learning_rate": 9.99777786433609e-06, "loss": 0.1108, "step": 344 }, { "epoch": 0.32716927453769556, "grad_norm": 1.1264206703681527, "learning_rate": 9.997610226853399e-06, "loss": 0.0987, "step": 345 }, { "epoch": 0.328117591275486, "grad_norm": 0.95789066514891, "learning_rate": 9.997436495439581e-06, "loss": 0.093, "step": 346 }, { "epoch": 0.32906590801327645, "grad_norm": 1.0448222803112024, "learning_rate": 9.997256670306478e-06, "loss": 0.0983, "step": 347 }, { "epoch": 0.33001422475106684, "grad_norm": 0.7737283316563024, "learning_rate": 9.997070751673367e-06, "loss": 0.0706, "step": 348 }, { "epoch": 0.3309625414888573, "grad_norm": 0.9596984880180834, "learning_rate": 9.99687873976695e-06, "loss": 0.0991, "step": 349 }, { "epoch": 0.3319108582266477, "grad_norm": 0.8411109119380658, "learning_rate": 9.99668063482136e-06, "loss": 0.0678, "step": 350 }, { "epoch": 0.3328591749644381, "grad_norm": 1.136491883808786, "learning_rate": 9.996476437078162e-06, "loss": 0.0986, "step": 351 }, { "epoch": 0.33380749170222856, "grad_norm": 3.03438587624818, "learning_rate": 9.996266146786344e-06, "loss": 0.0969, "step": 352 }, { "epoch": 0.33475580844001895, "grad_norm": 1.2333568047254937, "learning_rate": 9.996049764202332e-06, "loss": 0.0832, "step": 353 }, { "epoch": 0.3357041251778094, "grad_norm": 1.1301139087376384, "learning_rate": 9.995827289589974e-06, "loss": 0.0994, "step": 354 }, { "epoch": 0.3366524419155998, "grad_norm": 1.0303329732235522, "learning_rate": 9.995598723220548e-06, "loss": 0.0757, "step": 355 }, { "epoch": 0.33760075865339023, "grad_norm": 1.0605991674508604, "learning_rate": 9.995364065372762e-06, "loss": 0.0815, "step": 356 }, { "epoch": 0.3385490753911807, "grad_norm": 0.7941030771981634, "learning_rate": 9.995123316332752e-06, "loss": 0.0747, "step": 357 }, { "epoch": 0.33949739212897106, "grad_norm": 1.2313896272302265, "learning_rate": 9.994876476394075e-06, "loss": 0.0769, "step": 358 }, { "epoch": 0.3404457088667615, "grad_norm": 1.1944743493159886, "learning_rate": 9.994623545857727e-06, "loss": 0.0979, "step": 359 }, { "epoch": 0.3413940256045519, "grad_norm": 0.8285281294809631, "learning_rate": 9.994364525032116e-06, "loss": 0.0793, "step": 360 }, { "epoch": 0.34234234234234234, "grad_norm": 1.4761389910370195, "learning_rate": 9.994099414233091e-06, "loss": 0.0913, "step": 361 }, { "epoch": 0.3432906590801328, "grad_norm": 1.5408966458771916, "learning_rate": 9.993828213783915e-06, "loss": 0.0973, "step": 362 }, { "epoch": 0.3442389758179232, "grad_norm": 1.4559933930399096, "learning_rate": 9.993550924015283e-06, "loss": 0.0999, "step": 363 }, { "epoch": 0.3451872925557136, "grad_norm": 0.8454336561992738, "learning_rate": 9.993267545265314e-06, "loss": 0.0655, "step": 364 }, { "epoch": 0.346135609293504, "grad_norm": 0.796992439441769, "learning_rate": 9.992978077879552e-06, "loss": 0.0696, "step": 365 }, { "epoch": 0.34708392603129445, "grad_norm": 1.0553149426590827, "learning_rate": 9.992682522210963e-06, "loss": 0.0787, "step": 366 }, { "epoch": 0.3480322427690849, "grad_norm": 1.4860431297237584, "learning_rate": 9.992380878619939e-06, "loss": 0.106, "step": 367 }, { "epoch": 0.3489805595068753, "grad_norm": 1.3032907057151817, "learning_rate": 9.992073147474292e-06, "loss": 0.1021, "step": 368 }, { "epoch": 0.34992887624466573, "grad_norm": 1.0894704335759804, "learning_rate": 9.991759329149266e-06, "loss": 0.0905, "step": 369 }, { "epoch": 0.3508771929824561, "grad_norm": 1.1130576081628205, "learning_rate": 9.991439424027518e-06, "loss": 0.0846, "step": 370 }, { "epoch": 0.35182550972024657, "grad_norm": 0.9253664091514998, "learning_rate": 9.991113432499128e-06, "loss": 0.0882, "step": 371 }, { "epoch": 0.352773826458037, "grad_norm": 0.841899923853967, "learning_rate": 9.990781354961605e-06, "loss": 0.0806, "step": 372 }, { "epoch": 0.3537221431958274, "grad_norm": 0.9407729946270026, "learning_rate": 9.99044319181987e-06, "loss": 0.0939, "step": 373 }, { "epoch": 0.35467045993361784, "grad_norm": 0.9090058769044609, "learning_rate": 9.99009894348627e-06, "loss": 0.0891, "step": 374 }, { "epoch": 0.35561877667140823, "grad_norm": 0.6294083333837054, "learning_rate": 9.989748610380571e-06, "loss": 0.0706, "step": 375 }, { "epoch": 0.3565670934091987, "grad_norm": 0.9163781177038506, "learning_rate": 9.98939219292996e-06, "loss": 0.0697, "step": 376 }, { "epoch": 0.3575154101469891, "grad_norm": 1.1693511630739546, "learning_rate": 9.989029691569037e-06, "loss": 0.1056, "step": 377 }, { "epoch": 0.3584637268847795, "grad_norm": 1.0414233510818562, "learning_rate": 9.988661106739827e-06, "loss": 0.0988, "step": 378 }, { "epoch": 0.35941204362256995, "grad_norm": 1.2822153621266594, "learning_rate": 9.988286438891774e-06, "loss": 0.1189, "step": 379 }, { "epoch": 0.36036036036036034, "grad_norm": 0.63669429794073, "learning_rate": 9.987905688481732e-06, "loss": 0.0828, "step": 380 }, { "epoch": 0.3613086770981508, "grad_norm": 0.826754093590745, "learning_rate": 9.98751885597398e-06, "loss": 0.0848, "step": 381 }, { "epoch": 0.3622569938359412, "grad_norm": 0.8825949393702691, "learning_rate": 9.987125941840205e-06, "loss": 0.092, "step": 382 }, { "epoch": 0.3632053105737316, "grad_norm": 0.6103241173744877, "learning_rate": 9.986726946559517e-06, "loss": 0.08, "step": 383 }, { "epoch": 0.36415362731152207, "grad_norm": 0.7105367439957658, "learning_rate": 9.986321870618441e-06, "loss": 0.0685, "step": 384 }, { "epoch": 0.36510194404931245, "grad_norm": 1.802287343988455, "learning_rate": 9.985910714510908e-06, "loss": 0.0818, "step": 385 }, { "epoch": 0.3660502607871029, "grad_norm": 0.7732813708584271, "learning_rate": 9.985493478738275e-06, "loss": 0.07, "step": 386 }, { "epoch": 0.3669985775248933, "grad_norm": 0.8451643375246307, "learning_rate": 9.985070163809306e-06, "loss": 0.0744, "step": 387 }, { "epoch": 0.36794689426268373, "grad_norm": 1.126067442650852, "learning_rate": 9.984640770240173e-06, "loss": 0.1101, "step": 388 }, { "epoch": 0.3688952110004742, "grad_norm": 0.6652401258855057, "learning_rate": 9.984205298554467e-06, "loss": 0.0663, "step": 389 }, { "epoch": 0.36984352773826457, "grad_norm": 1.0802552975196003, "learning_rate": 9.983763749283193e-06, "loss": 0.0975, "step": 390 }, { "epoch": 0.370791844476055, "grad_norm": 0.7496808510910429, "learning_rate": 9.983316122964757e-06, "loss": 0.0701, "step": 391 }, { "epoch": 0.3717401612138454, "grad_norm": 0.6248602765248035, "learning_rate": 9.982862420144986e-06, "loss": 0.0643, "step": 392 }, { "epoch": 0.37268847795163584, "grad_norm": 1.7058022738803864, "learning_rate": 9.982402641377105e-06, "loss": 0.0936, "step": 393 }, { "epoch": 0.3736367946894263, "grad_norm": 1.205579756742393, "learning_rate": 9.98193678722176e-06, "loss": 0.0811, "step": 394 }, { "epoch": 0.3745851114272167, "grad_norm": 0.8021701752607538, "learning_rate": 9.981464858246993e-06, "loss": 0.0719, "step": 395 }, { "epoch": 0.3755334281650071, "grad_norm": 0.9210208736552777, "learning_rate": 9.980986855028267e-06, "loss": 0.0589, "step": 396 }, { "epoch": 0.3764817449027975, "grad_norm": 1.0458476195224804, "learning_rate": 9.980502778148438e-06, "loss": 0.0696, "step": 397 }, { "epoch": 0.37743006164058795, "grad_norm": 1.5095103680379303, "learning_rate": 9.980012628197778e-06, "loss": 0.0909, "step": 398 }, { "epoch": 0.3783783783783784, "grad_norm": 0.9521689001456719, "learning_rate": 9.979516405773956e-06, "loss": 0.0844, "step": 399 }, { "epoch": 0.3793266951161688, "grad_norm": 0.9909335290642662, "learning_rate": 9.979014111482057e-06, "loss": 0.079, "step": 400 }, { "epoch": 0.38027501185395923, "grad_norm": 1.300023515267878, "learning_rate": 9.978505745934559e-06, "loss": 0.1087, "step": 401 }, { "epoch": 0.3812233285917496, "grad_norm": 0.8905160216053487, "learning_rate": 9.977991309751347e-06, "loss": 0.0654, "step": 402 }, { "epoch": 0.38217164532954007, "grad_norm": 0.7908744916198801, "learning_rate": 9.97747080355971e-06, "loss": 0.0697, "step": 403 }, { "epoch": 0.3831199620673305, "grad_norm": 1.0819522254088034, "learning_rate": 9.976944227994337e-06, "loss": 0.0729, "step": 404 }, { "epoch": 0.3840682788051209, "grad_norm": 0.9319836261266163, "learning_rate": 9.976411583697316e-06, "loss": 0.077, "step": 405 }, { "epoch": 0.38501659554291134, "grad_norm": 0.7209233770781128, "learning_rate": 9.97587287131814e-06, "loss": 0.0708, "step": 406 }, { "epoch": 0.38596491228070173, "grad_norm": 0.8430932582390814, "learning_rate": 9.975328091513696e-06, "loss": 0.07, "step": 407 }, { "epoch": 0.3869132290184922, "grad_norm": 0.7932090811238357, "learning_rate": 9.974777244948271e-06, "loss": 0.0648, "step": 408 }, { "epoch": 0.3878615457562826, "grad_norm": 0.9213278429313838, "learning_rate": 9.974220332293554e-06, "loss": 0.0737, "step": 409 }, { "epoch": 0.388809862494073, "grad_norm": 0.4369389269684112, "learning_rate": 9.973657354228623e-06, "loss": 0.0509, "step": 410 }, { "epoch": 0.38975817923186346, "grad_norm": 0.7988805293653696, "learning_rate": 9.973088311439957e-06, "loss": 0.0684, "step": 411 }, { "epoch": 0.39070649596965384, "grad_norm": 0.9648310793568026, "learning_rate": 9.97251320462143e-06, "loss": 0.0849, "step": 412 }, { "epoch": 0.3916548127074443, "grad_norm": 0.7585613690692753, "learning_rate": 9.97193203447431e-06, "loss": 0.077, "step": 413 }, { "epoch": 0.39260312944523473, "grad_norm": 0.9380377046145346, "learning_rate": 9.971344801707256e-06, "loss": 0.0771, "step": 414 }, { "epoch": 0.3935514461830251, "grad_norm": 0.9822247506181627, "learning_rate": 9.970751507036323e-06, "loss": 0.1123, "step": 415 }, { "epoch": 0.39449976292081557, "grad_norm": 0.7156423865364446, "learning_rate": 9.970152151184956e-06, "loss": 0.0801, "step": 416 }, { "epoch": 0.39544807965860596, "grad_norm": 1.05912629502688, "learning_rate": 9.96954673488399e-06, "loss": 0.0804, "step": 417 }, { "epoch": 0.3963963963963964, "grad_norm": 1.1230479850270394, "learning_rate": 9.968935258871652e-06, "loss": 0.0799, "step": 418 }, { "epoch": 0.39734471313418684, "grad_norm": 1.0054642393242061, "learning_rate": 9.968317723893556e-06, "loss": 0.082, "step": 419 }, { "epoch": 0.39829302987197723, "grad_norm": 1.227859524837509, "learning_rate": 9.967694130702706e-06, "loss": 0.1069, "step": 420 }, { "epoch": 0.3992413466097677, "grad_norm": 1.2136272659300074, "learning_rate": 9.96706448005949e-06, "loss": 0.1112, "step": 421 }, { "epoch": 0.40018966334755807, "grad_norm": 0.9692912194018656, "learning_rate": 9.96642877273169e-06, "loss": 0.0837, "step": 422 }, { "epoch": 0.4011379800853485, "grad_norm": 0.7181203670103851, "learning_rate": 9.965787009494458e-06, "loss": 0.0648, "step": 423 }, { "epoch": 0.40208629682313896, "grad_norm": 0.9389223502528147, "learning_rate": 9.96513919113035e-06, "loss": 0.0846, "step": 424 }, { "epoch": 0.40303461356092934, "grad_norm": 0.6566856036851983, "learning_rate": 9.964485318429292e-06, "loss": 0.0776, "step": 425 }, { "epoch": 0.4039829302987198, "grad_norm": 1.0028156563396406, "learning_rate": 9.963825392188595e-06, "loss": 0.0719, "step": 426 }, { "epoch": 0.4049312470365102, "grad_norm": 0.9682157984093804, "learning_rate": 9.963159413212952e-06, "loss": 0.1058, "step": 427 }, { "epoch": 0.4058795637743006, "grad_norm": 1.1561667939356075, "learning_rate": 9.96248738231444e-06, "loss": 0.0982, "step": 428 }, { "epoch": 0.406827880512091, "grad_norm": 0.7960344078481167, "learning_rate": 9.961809300312512e-06, "loss": 0.0643, "step": 429 }, { "epoch": 0.40777619724988146, "grad_norm": 0.914323773268032, "learning_rate": 9.961125168034e-06, "loss": 0.0835, "step": 430 }, { "epoch": 0.4087245139876719, "grad_norm": 0.7441869330920762, "learning_rate": 9.960434986313113e-06, "loss": 0.0559, "step": 431 }, { "epoch": 0.4096728307254623, "grad_norm": 2.4732017252552367, "learning_rate": 9.959738755991437e-06, "loss": 0.1445, "step": 432 }, { "epoch": 0.41062114746325273, "grad_norm": 0.8533585342555405, "learning_rate": 9.959036477917935e-06, "loss": 0.0575, "step": 433 }, { "epoch": 0.4115694642010431, "grad_norm": 0.8190438451317316, "learning_rate": 9.95832815294894e-06, "loss": 0.0794, "step": 434 }, { "epoch": 0.41251778093883357, "grad_norm": 1.0046620676404385, "learning_rate": 9.957613781948164e-06, "loss": 0.0686, "step": 435 }, { "epoch": 0.413466097676624, "grad_norm": 0.9887051267008984, "learning_rate": 9.956893365786691e-06, "loss": 0.0618, "step": 436 }, { "epoch": 0.4144144144144144, "grad_norm": 0.6105909207601089, "learning_rate": 9.95616690534297e-06, "loss": 0.0572, "step": 437 }, { "epoch": 0.41536273115220484, "grad_norm": 1.5234824479103468, "learning_rate": 9.955434401502825e-06, "loss": 0.0994, "step": 438 }, { "epoch": 0.41631104788999523, "grad_norm": 1.1295839815001452, "learning_rate": 9.954695855159454e-06, "loss": 0.073, "step": 439 }, { "epoch": 0.4172593646277857, "grad_norm": 0.6583329952843571, "learning_rate": 9.95395126721341e-06, "loss": 0.0699, "step": 440 }, { "epoch": 0.4182076813655761, "grad_norm": 0.955937586299997, "learning_rate": 9.953200638572625e-06, "loss": 0.0815, "step": 441 }, { "epoch": 0.4191559981033665, "grad_norm": 1.5323108400108396, "learning_rate": 9.95244397015239e-06, "loss": 0.0732, "step": 442 }, { "epoch": 0.42010431484115696, "grad_norm": 1.677920724371183, "learning_rate": 9.951681262875365e-06, "loss": 0.0944, "step": 443 }, { "epoch": 0.42105263157894735, "grad_norm": 0.8926328574943209, "learning_rate": 9.95091251767157e-06, "loss": 0.0731, "step": 444 }, { "epoch": 0.4220009483167378, "grad_norm": 1.2692898943255297, "learning_rate": 9.950137735478389e-06, "loss": 0.1029, "step": 445 }, { "epoch": 0.42294926505452823, "grad_norm": 0.7345506207483801, "learning_rate": 9.949356917240569e-06, "loss": 0.0748, "step": 446 }, { "epoch": 0.4238975817923186, "grad_norm": 1.2435473519034808, "learning_rate": 9.948570063910216e-06, "loss": 0.1009, "step": 447 }, { "epoch": 0.42484589853010907, "grad_norm": 0.7650866909769807, "learning_rate": 9.947777176446792e-06, "loss": 0.0746, "step": 448 }, { "epoch": 0.42579421526789946, "grad_norm": 1.3807429981979404, "learning_rate": 9.946978255817121e-06, "loss": 0.0701, "step": 449 }, { "epoch": 0.4267425320056899, "grad_norm": 0.5315623424461096, "learning_rate": 9.946173302995382e-06, "loss": 0.0574, "step": 450 }, { "epoch": 0.42769084874348035, "grad_norm": 0.8562951763201797, "learning_rate": 9.94536231896311e-06, "loss": 0.0951, "step": 451 }, { "epoch": 0.42863916548127073, "grad_norm": 1.1965590998104225, "learning_rate": 9.944545304709192e-06, "loss": 0.0877, "step": 452 }, { "epoch": 0.4295874822190612, "grad_norm": 1.2735339749816497, "learning_rate": 9.943722261229872e-06, "loss": 0.0768, "step": 453 }, { "epoch": 0.43053579895685157, "grad_norm": 0.9370658659046329, "learning_rate": 9.942893189528743e-06, "loss": 0.0782, "step": 454 }, { "epoch": 0.431484115694642, "grad_norm": 1.5520551397042521, "learning_rate": 9.942058090616748e-06, "loss": 0.1039, "step": 455 }, { "epoch": 0.43243243243243246, "grad_norm": 1.3529615602541014, "learning_rate": 9.941216965512183e-06, "loss": 0.0867, "step": 456 }, { "epoch": 0.43338074917022285, "grad_norm": 1.192234505990805, "learning_rate": 9.940369815240688e-06, "loss": 0.0809, "step": 457 }, { "epoch": 0.4343290659080133, "grad_norm": 0.9763205758532367, "learning_rate": 9.939516640835254e-06, "loss": 0.0652, "step": 458 }, { "epoch": 0.4352773826458037, "grad_norm": 1.3415645605638937, "learning_rate": 9.938657443336212e-06, "loss": 0.109, "step": 459 }, { "epoch": 0.4362256993835941, "grad_norm": 1.1595154129634277, "learning_rate": 9.937792223791244e-06, "loss": 0.1002, "step": 460 }, { "epoch": 0.43717401612138457, "grad_norm": 1.33436975844217, "learning_rate": 9.936920983255372e-06, "loss": 0.114, "step": 461 }, { "epoch": 0.43812233285917496, "grad_norm": 1.0009653043703806, "learning_rate": 9.936043722790956e-06, "loss": 0.0827, "step": 462 }, { "epoch": 0.4390706495969654, "grad_norm": 1.1900315382859075, "learning_rate": 9.935160443467704e-06, "loss": 0.0991, "step": 463 }, { "epoch": 0.4400189663347558, "grad_norm": 0.7796648666540394, "learning_rate": 9.934271146362658e-06, "loss": 0.0729, "step": 464 }, { "epoch": 0.44096728307254623, "grad_norm": 0.7692033539386839, "learning_rate": 9.933375832560199e-06, "loss": 0.0752, "step": 465 }, { "epoch": 0.4419155998103367, "grad_norm": 0.7898679053377281, "learning_rate": 9.932474503152047e-06, "loss": 0.0557, "step": 466 }, { "epoch": 0.44286391654812707, "grad_norm": 1.308054442070126, "learning_rate": 9.931567159237252e-06, "loss": 0.1, "step": 467 }, { "epoch": 0.4438122332859175, "grad_norm": 0.8281027248286734, "learning_rate": 9.930653801922205e-06, "loss": 0.1066, "step": 468 }, { "epoch": 0.4447605500237079, "grad_norm": 0.6589498594732086, "learning_rate": 9.929734432320621e-06, "loss": 0.061, "step": 469 }, { "epoch": 0.44570886676149835, "grad_norm": 1.0105820136512023, "learning_rate": 9.928809051553554e-06, "loss": 0.0771, "step": 470 }, { "epoch": 0.4466571834992888, "grad_norm": 1.174475732403723, "learning_rate": 9.927877660749385e-06, "loss": 0.1029, "step": 471 }, { "epoch": 0.4476055002370792, "grad_norm": 0.7007588523937572, "learning_rate": 9.92694026104382e-06, "loss": 0.0548, "step": 472 }, { "epoch": 0.4485538169748696, "grad_norm": 0.7548622992450297, "learning_rate": 9.925996853579897e-06, "loss": 0.071, "step": 473 }, { "epoch": 0.44950213371266, "grad_norm": 0.9151211373906433, "learning_rate": 9.92504743950798e-06, "loss": 0.0728, "step": 474 }, { "epoch": 0.45045045045045046, "grad_norm": 1.3188113799099948, "learning_rate": 9.924092019985751e-06, "loss": 0.071, "step": 475 }, { "epoch": 0.45139876718824085, "grad_norm": 0.834826643366671, "learning_rate": 9.923130596178221e-06, "loss": 0.0827, "step": 476 }, { "epoch": 0.4523470839260313, "grad_norm": 0.8853088211117691, "learning_rate": 9.922163169257722e-06, "loss": 0.0714, "step": 477 }, { "epoch": 0.45329540066382173, "grad_norm": 0.9773650061711494, "learning_rate": 9.921189740403902e-06, "loss": 0.0902, "step": 478 }, { "epoch": 0.4542437174016121, "grad_norm": 0.8530429782086267, "learning_rate": 9.92021031080373e-06, "loss": 0.0896, "step": 479 }, { "epoch": 0.45519203413940257, "grad_norm": 0.6841245724165017, "learning_rate": 9.919224881651494e-06, "loss": 0.0574, "step": 480 }, { "epoch": 0.45614035087719296, "grad_norm": 0.8751901827667304, "learning_rate": 9.918233454148795e-06, "loss": 0.0712, "step": 481 }, { "epoch": 0.4570886676149834, "grad_norm": 0.8605318101074332, "learning_rate": 9.917236029504549e-06, "loss": 0.0758, "step": 482 }, { "epoch": 0.45803698435277385, "grad_norm": 0.6297402738230038, "learning_rate": 9.916232608934982e-06, "loss": 0.0835, "step": 483 }, { "epoch": 0.45898530109056423, "grad_norm": 1.2633792305334934, "learning_rate": 9.915223193663639e-06, "loss": 0.097, "step": 484 }, { "epoch": 0.4599336178283547, "grad_norm": 0.9453282561376489, "learning_rate": 9.914207784921366e-06, "loss": 0.0813, "step": 485 }, { "epoch": 0.46088193456614507, "grad_norm": 1.0981998450683066, "learning_rate": 9.913186383946322e-06, "loss": 0.0831, "step": 486 }, { "epoch": 0.4618302513039355, "grad_norm": 0.9453607555522517, "learning_rate": 9.91215899198397e-06, "loss": 0.0668, "step": 487 }, { "epoch": 0.46277856804172596, "grad_norm": 0.8480655824160724, "learning_rate": 9.911125610287085e-06, "loss": 0.0803, "step": 488 }, { "epoch": 0.46372688477951635, "grad_norm": 0.7365032755805906, "learning_rate": 9.910086240115738e-06, "loss": 0.0503, "step": 489 }, { "epoch": 0.4646752015173068, "grad_norm": 0.9926545138390478, "learning_rate": 9.909040882737301e-06, "loss": 0.0785, "step": 490 }, { "epoch": 0.4656235182550972, "grad_norm": 1.078153469225969, "learning_rate": 9.907989539426455e-06, "loss": 0.0942, "step": 491 }, { "epoch": 0.4665718349928876, "grad_norm": 0.891582918999742, "learning_rate": 9.906932211465173e-06, "loss": 0.0713, "step": 492 }, { "epoch": 0.46752015173067807, "grad_norm": 0.8352029023952229, "learning_rate": 9.90586890014273e-06, "loss": 0.0871, "step": 493 }, { "epoch": 0.46846846846846846, "grad_norm": 1.4543230270611818, "learning_rate": 9.904799606755695e-06, "loss": 0.1049, "step": 494 }, { "epoch": 0.4694167852062589, "grad_norm": 0.9571877161884975, "learning_rate": 9.90372433260793e-06, "loss": 0.0856, "step": 495 }, { "epoch": 0.4703651019440493, "grad_norm": 0.6657483404024113, "learning_rate": 9.90264307901059e-06, "loss": 0.0631, "step": 496 }, { "epoch": 0.47131341868183974, "grad_norm": 1.2493973473928695, "learning_rate": 9.901555847282123e-06, "loss": 0.0973, "step": 497 }, { "epoch": 0.4722617354196302, "grad_norm": 0.6689914382563446, "learning_rate": 9.900462638748266e-06, "loss": 0.0582, "step": 498 }, { "epoch": 0.47321005215742057, "grad_norm": 0.8246501895880392, "learning_rate": 9.899363454742044e-06, "loss": 0.0727, "step": 499 }, { "epoch": 0.474158368895211, "grad_norm": 1.442170890658491, "learning_rate": 9.898258296603769e-06, "loss": 0.0931, "step": 500 }, { "epoch": 0.4751066856330014, "grad_norm": 0.7582565389247256, "learning_rate": 9.897147165681034e-06, "loss": 0.0722, "step": 501 }, { "epoch": 0.47605500237079185, "grad_norm": 0.627525129279453, "learning_rate": 9.896030063328718e-06, "loss": 0.0597, "step": 502 }, { "epoch": 0.4770033191085823, "grad_norm": 0.6342149242840518, "learning_rate": 9.894906990908982e-06, "loss": 0.0725, "step": 503 }, { "epoch": 0.4779516358463727, "grad_norm": 0.8212079234115165, "learning_rate": 9.893777949791266e-06, "loss": 0.0649, "step": 504 }, { "epoch": 0.4788999525841631, "grad_norm": 0.8923951454231676, "learning_rate": 9.89264294135229e-06, "loss": 0.0595, "step": 505 }, { "epoch": 0.4798482693219535, "grad_norm": 1.0318440665130484, "learning_rate": 9.891501966976041e-06, "loss": 0.0842, "step": 506 }, { "epoch": 0.48079658605974396, "grad_norm": 0.6944537972828242, "learning_rate": 9.890355028053793e-06, "loss": 0.0752, "step": 507 }, { "epoch": 0.4817449027975344, "grad_norm": 1.0705584030604105, "learning_rate": 9.889202125984088e-06, "loss": 0.0647, "step": 508 }, { "epoch": 0.4826932195353248, "grad_norm": 0.9754252622446561, "learning_rate": 9.88804326217274e-06, "loss": 0.0687, "step": 509 }, { "epoch": 0.48364153627311524, "grad_norm": 0.9660762094606946, "learning_rate": 9.886878438032828e-06, "loss": 0.0789, "step": 510 }, { "epoch": 0.4845898530109056, "grad_norm": 0.5832722133461282, "learning_rate": 9.885707654984703e-06, "loss": 0.0636, "step": 511 }, { "epoch": 0.48553816974869607, "grad_norm": 0.7052006552554221, "learning_rate": 9.884530914455984e-06, "loss": 0.0586, "step": 512 }, { "epoch": 0.4864864864864865, "grad_norm": 0.9822072228951928, "learning_rate": 9.88334821788155e-06, "loss": 0.0645, "step": 513 }, { "epoch": 0.4874348032242769, "grad_norm": 0.9641946540266126, "learning_rate": 9.882159566703547e-06, "loss": 0.0885, "step": 514 }, { "epoch": 0.48838311996206735, "grad_norm": 0.6403136140606015, "learning_rate": 9.880964962371378e-06, "loss": 0.0678, "step": 515 }, { "epoch": 0.48933143669985774, "grad_norm": 0.7486541793123711, "learning_rate": 9.879764406341705e-06, "loss": 0.0741, "step": 516 }, { "epoch": 0.4902797534376482, "grad_norm": 0.5779229700891555, "learning_rate": 9.87855790007845e-06, "loss": 0.0646, "step": 517 }, { "epoch": 0.49122807017543857, "grad_norm": 0.7611283230447122, "learning_rate": 9.87734544505279e-06, "loss": 0.0768, "step": 518 }, { "epoch": 0.492176386913229, "grad_norm": 0.5823535883100547, "learning_rate": 9.876127042743155e-06, "loss": 0.0703, "step": 519 }, { "epoch": 0.49312470365101946, "grad_norm": 0.6827829977739827, "learning_rate": 9.874902694635226e-06, "loss": 0.0772, "step": 520 }, { "epoch": 0.49407302038880985, "grad_norm": 0.7254200544564426, "learning_rate": 9.873672402221937e-06, "loss": 0.0634, "step": 521 }, { "epoch": 0.4950213371266003, "grad_norm": 0.6425214796651868, "learning_rate": 9.872436167003468e-06, "loss": 0.064, "step": 522 }, { "epoch": 0.4959696538643907, "grad_norm": 0.623192525545158, "learning_rate": 9.871193990487242e-06, "loss": 0.077, "step": 523 }, { "epoch": 0.4969179706021811, "grad_norm": 0.7225947749173619, "learning_rate": 9.869945874187936e-06, "loss": 0.075, "step": 524 }, { "epoch": 0.49786628733997157, "grad_norm": 2.0516616577595435, "learning_rate": 9.868691819627462e-06, "loss": 0.0867, "step": 525 }, { "epoch": 0.49881460407776196, "grad_norm": 1.0257158284306434, "learning_rate": 9.867431828334974e-06, "loss": 0.0588, "step": 526 }, { "epoch": 0.4997629208155524, "grad_norm": 0.8403229438927825, "learning_rate": 9.86616590184687e-06, "loss": 0.0823, "step": 527 }, { "epoch": 0.5007112375533428, "grad_norm": 0.6449240492145598, "learning_rate": 9.864894041706779e-06, "loss": 0.0567, "step": 528 }, { "epoch": 0.5016595542911333, "grad_norm": 0.8789018684523284, "learning_rate": 9.863616249465567e-06, "loss": 0.0713, "step": 529 }, { "epoch": 0.5026078710289237, "grad_norm": 0.9524887983478211, "learning_rate": 9.862332526681336e-06, "loss": 0.0835, "step": 530 }, { "epoch": 0.5035561877667141, "grad_norm": 0.6422268170348604, "learning_rate": 9.861042874919417e-06, "loss": 0.0606, "step": 531 }, { "epoch": 0.5045045045045045, "grad_norm": 0.9032374038451735, "learning_rate": 9.859747295752374e-06, "loss": 0.0773, "step": 532 }, { "epoch": 0.505452821242295, "grad_norm": 0.9269404822199643, "learning_rate": 9.858445790759992e-06, "loss": 0.0822, "step": 533 }, { "epoch": 0.5064011379800853, "grad_norm": 0.7043514434980399, "learning_rate": 9.857138361529288e-06, "loss": 0.0688, "step": 534 }, { "epoch": 0.5073494547178757, "grad_norm": 0.8239211698855243, "learning_rate": 9.8558250096545e-06, "loss": 0.0542, "step": 535 }, { "epoch": 0.5082977714556662, "grad_norm": 0.8633975590563754, "learning_rate": 9.85450573673709e-06, "loss": 0.0744, "step": 536 }, { "epoch": 0.5092460881934566, "grad_norm": 0.6985004021466871, "learning_rate": 9.853180544385737e-06, "loss": 0.047, "step": 537 }, { "epoch": 0.510194404931247, "grad_norm": 0.5889042803503781, "learning_rate": 9.851849434216338e-06, "loss": 0.0557, "step": 538 }, { "epoch": 0.5111427216690374, "grad_norm": 0.7765705663935071, "learning_rate": 9.850512407852012e-06, "loss": 0.0669, "step": 539 }, { "epoch": 0.5120910384068279, "grad_norm": 0.8204550382112847, "learning_rate": 9.849169466923086e-06, "loss": 0.0685, "step": 540 }, { "epoch": 0.5130393551446183, "grad_norm": 0.5256883407913393, "learning_rate": 9.847820613067098e-06, "loss": 0.0537, "step": 541 }, { "epoch": 0.5139876718824087, "grad_norm": 0.6838576750776693, "learning_rate": 9.8464658479288e-06, "loss": 0.0704, "step": 542 }, { "epoch": 0.5149359886201992, "grad_norm": 0.8974806559813661, "learning_rate": 9.845105173160152e-06, "loss": 0.0899, "step": 543 }, { "epoch": 0.5158843053579896, "grad_norm": 0.7219053990698988, "learning_rate": 9.843738590420317e-06, "loss": 0.0468, "step": 544 }, { "epoch": 0.51683262209578, "grad_norm": 1.032987889739876, "learning_rate": 9.842366101375664e-06, "loss": 0.0562, "step": 545 }, { "epoch": 0.5177809388335705, "grad_norm": 0.7651951768284668, "learning_rate": 9.840987707699765e-06, "loss": 0.0669, "step": 546 }, { "epoch": 0.5187292555713608, "grad_norm": 0.6813496832389402, "learning_rate": 9.839603411073388e-06, "loss": 0.0706, "step": 547 }, { "epoch": 0.5196775723091512, "grad_norm": 0.7229692269198181, "learning_rate": 9.838213213184505e-06, "loss": 0.0771, "step": 548 }, { "epoch": 0.5206258890469416, "grad_norm": 1.157471128375012, "learning_rate": 9.836817115728277e-06, "loss": 0.0932, "step": 549 }, { "epoch": 0.5215742057847321, "grad_norm": 0.8058138449457062, "learning_rate": 9.835415120407063e-06, "loss": 0.0539, "step": 550 }, { "epoch": 0.5225225225225225, "grad_norm": 0.6915528599019737, "learning_rate": 9.834007228930414e-06, "loss": 0.0688, "step": 551 }, { "epoch": 0.5234708392603129, "grad_norm": 0.8835152385091712, "learning_rate": 9.832593443015068e-06, "loss": 0.0605, "step": 552 }, { "epoch": 0.5244191559981034, "grad_norm": 0.6896706794263241, "learning_rate": 9.83117376438495e-06, "loss": 0.0668, "step": 553 }, { "epoch": 0.5253674727358938, "grad_norm": 0.7651857964351815, "learning_rate": 9.829748194771175e-06, "loss": 0.064, "step": 554 }, { "epoch": 0.5263157894736842, "grad_norm": 0.6216741056003758, "learning_rate": 9.828316735912037e-06, "loss": 0.0541, "step": 555 }, { "epoch": 0.5272641062114747, "grad_norm": 0.6813673301708452, "learning_rate": 9.826879389553014e-06, "loss": 0.0574, "step": 556 }, { "epoch": 0.5282124229492651, "grad_norm": 0.7147998418504048, "learning_rate": 9.825436157446761e-06, "loss": 0.0576, "step": 557 }, { "epoch": 0.5291607396870555, "grad_norm": 0.6352148290105686, "learning_rate": 9.82398704135311e-06, "loss": 0.066, "step": 558 }, { "epoch": 0.5301090564248458, "grad_norm": 0.8511240887028577, "learning_rate": 9.822532043039068e-06, "loss": 0.0687, "step": 559 }, { "epoch": 0.5310573731626363, "grad_norm": 0.6876408977841421, "learning_rate": 9.821071164278815e-06, "loss": 0.0838, "step": 560 }, { "epoch": 0.5320056899004267, "grad_norm": 0.7354217835184531, "learning_rate": 9.819604406853703e-06, "loss": 0.0552, "step": 561 }, { "epoch": 0.5329540066382171, "grad_norm": 0.9572067784227991, "learning_rate": 9.818131772552249e-06, "loss": 0.1099, "step": 562 }, { "epoch": 0.5339023233760076, "grad_norm": 0.7931127239607592, "learning_rate": 9.816653263170137e-06, "loss": 0.0706, "step": 563 }, { "epoch": 0.534850640113798, "grad_norm": 0.8242420526129728, "learning_rate": 9.815168880510218e-06, "loss": 0.0946, "step": 564 }, { "epoch": 0.5357989568515884, "grad_norm": 1.0330372476146157, "learning_rate": 9.8136786263825e-06, "loss": 0.0951, "step": 565 }, { "epoch": 0.5367472735893789, "grad_norm": 0.7553297432270302, "learning_rate": 9.812182502604151e-06, "loss": 0.0663, "step": 566 }, { "epoch": 0.5376955903271693, "grad_norm": 0.8446853429895546, "learning_rate": 9.810680510999505e-06, "loss": 0.0728, "step": 567 }, { "epoch": 0.5386439070649597, "grad_norm": 0.5089680701907852, "learning_rate": 9.809172653400036e-06, "loss": 0.0501, "step": 568 }, { "epoch": 0.5395922238027501, "grad_norm": 0.7258180066288827, "learning_rate": 9.807658931644382e-06, "loss": 0.0752, "step": 569 }, { "epoch": 0.5405405405405406, "grad_norm": 0.7028402619162881, "learning_rate": 9.806139347578331e-06, "loss": 0.059, "step": 570 }, { "epoch": 0.541488857278331, "grad_norm": 0.7248854010393692, "learning_rate": 9.804613903054813e-06, "loss": 0.0851, "step": 571 }, { "epoch": 0.5424371740161213, "grad_norm": 0.7176555652391681, "learning_rate": 9.803082599933911e-06, "loss": 0.0697, "step": 572 }, { "epoch": 0.5433854907539118, "grad_norm": 0.4808404612456389, "learning_rate": 9.801545440082845e-06, "loss": 0.0569, "step": 573 }, { "epoch": 0.5443338074917022, "grad_norm": 0.8731137568130377, "learning_rate": 9.800002425375984e-06, "loss": 0.0657, "step": 574 }, { "epoch": 0.5452821242294926, "grad_norm": 0.7816194292982013, "learning_rate": 9.798453557694828e-06, "loss": 0.0724, "step": 575 }, { "epoch": 0.5462304409672831, "grad_norm": 0.9042436959378762, "learning_rate": 9.796898838928022e-06, "loss": 0.0784, "step": 576 }, { "epoch": 0.5471787577050735, "grad_norm": 1.0293154765529384, "learning_rate": 9.79533827097134e-06, "loss": 0.098, "step": 577 }, { "epoch": 0.5481270744428639, "grad_norm": 0.8678391414260259, "learning_rate": 9.793771855727691e-06, "loss": 0.0635, "step": 578 }, { "epoch": 0.5490753911806543, "grad_norm": 0.6041409950077287, "learning_rate": 9.792199595107115e-06, "loss": 0.0524, "step": 579 }, { "epoch": 0.5500237079184448, "grad_norm": 1.0292476772898875, "learning_rate": 9.790621491026773e-06, "loss": 0.0829, "step": 580 }, { "epoch": 0.5509720246562352, "grad_norm": 0.7074515600768486, "learning_rate": 9.78903754541096e-06, "loss": 0.0704, "step": 581 }, { "epoch": 0.5519203413940256, "grad_norm": 0.7603340975922476, "learning_rate": 9.787447760191092e-06, "loss": 0.0788, "step": 582 }, { "epoch": 0.5528686581318161, "grad_norm": 1.0766706695954442, "learning_rate": 9.785852137305699e-06, "loss": 0.079, "step": 583 }, { "epoch": 0.5538169748696065, "grad_norm": 0.7555731931730972, "learning_rate": 9.784250678700435e-06, "loss": 0.0705, "step": 584 }, { "epoch": 0.5547652916073968, "grad_norm": 0.7010961175305198, "learning_rate": 9.782643386328073e-06, "loss": 0.0713, "step": 585 }, { "epoch": 0.5557136083451872, "grad_norm": 1.0580272254821363, "learning_rate": 9.781030262148492e-06, "loss": 0.0671, "step": 586 }, { "epoch": 0.5566619250829777, "grad_norm": 0.6594876081209583, "learning_rate": 9.779411308128685e-06, "loss": 0.0867, "step": 587 }, { "epoch": 0.5576102418207681, "grad_norm": 1.3649847896410103, "learning_rate": 9.777786526242759e-06, "loss": 0.0847, "step": 588 }, { "epoch": 0.5585585585585585, "grad_norm": 0.6223880228627037, "learning_rate": 9.776155918471916e-06, "loss": 0.0579, "step": 589 }, { "epoch": 0.559506875296349, "grad_norm": 0.6862572922646061, "learning_rate": 9.774519486804476e-06, "loss": 0.053, "step": 590 }, { "epoch": 0.5604551920341394, "grad_norm": 0.6562455064809456, "learning_rate": 9.772877233235848e-06, "loss": 0.0651, "step": 591 }, { "epoch": 0.5614035087719298, "grad_norm": 0.7150505236504866, "learning_rate": 9.771229159768547e-06, "loss": 0.0697, "step": 592 }, { "epoch": 0.5623518255097203, "grad_norm": 0.7505406859172821, "learning_rate": 9.769575268412182e-06, "loss": 0.0691, "step": 593 }, { "epoch": 0.5633001422475107, "grad_norm": 0.7340490905887499, "learning_rate": 9.767915561183456e-06, "loss": 0.0748, "step": 594 }, { "epoch": 0.5642484589853011, "grad_norm": 0.7987611706335997, "learning_rate": 9.766250040106166e-06, "loss": 0.0682, "step": 595 }, { "epoch": 0.5651967757230915, "grad_norm": 1.2974449597341617, "learning_rate": 9.764578707211199e-06, "loss": 0.0751, "step": 596 }, { "epoch": 0.566145092460882, "grad_norm": 0.6191420122018653, "learning_rate": 9.762901564536523e-06, "loss": 0.0667, "step": 597 }, { "epoch": 0.5670934091986723, "grad_norm": 0.6903639931399153, "learning_rate": 9.761218614127193e-06, "loss": 0.0653, "step": 598 }, { "epoch": 0.5680417259364627, "grad_norm": 0.7974449669867185, "learning_rate": 9.759529858035351e-06, "loss": 0.0662, "step": 599 }, { "epoch": 0.5689900426742532, "grad_norm": 1.6445977802603875, "learning_rate": 9.75783529832021e-06, "loss": 0.0781, "step": 600 }, { "epoch": 0.5699383594120436, "grad_norm": 0.7682344601188886, "learning_rate": 9.756134937048066e-06, "loss": 0.0516, "step": 601 }, { "epoch": 0.570886676149834, "grad_norm": 0.6505039594954853, "learning_rate": 9.754428776292287e-06, "loss": 0.0522, "step": 602 }, { "epoch": 0.5718349928876245, "grad_norm": 1.0748139183671632, "learning_rate": 9.752716818133309e-06, "loss": 0.0787, "step": 603 }, { "epoch": 0.5727833096254149, "grad_norm": 0.7575374337239762, "learning_rate": 9.750999064658644e-06, "loss": 0.0618, "step": 604 }, { "epoch": 0.5737316263632053, "grad_norm": 0.5005741056916544, "learning_rate": 9.749275517962868e-06, "loss": 0.0579, "step": 605 }, { "epoch": 0.5746799431009957, "grad_norm": 0.9747236186565804, "learning_rate": 9.747546180147618e-06, "loss": 0.1137, "step": 606 }, { "epoch": 0.5756282598387862, "grad_norm": 0.5945741852680105, "learning_rate": 9.745811053321597e-06, "loss": 0.0528, "step": 607 }, { "epoch": 0.5765765765765766, "grad_norm": 0.8767385416979725, "learning_rate": 9.744070139600564e-06, "loss": 0.0756, "step": 608 }, { "epoch": 0.577524893314367, "grad_norm": 0.805183732938404, "learning_rate": 9.742323441107335e-06, "loss": 0.0796, "step": 609 }, { "epoch": 0.5784732100521575, "grad_norm": 0.4622182813428181, "learning_rate": 9.74057095997178e-06, "loss": 0.0466, "step": 610 }, { "epoch": 0.5794215267899478, "grad_norm": 1.323185570736391, "learning_rate": 9.738812698330821e-06, "loss": 0.0803, "step": 611 }, { "epoch": 0.5803698435277382, "grad_norm": 0.6017510939556475, "learning_rate": 9.737048658328428e-06, "loss": 0.0473, "step": 612 }, { "epoch": 0.5813181602655287, "grad_norm": 0.9340483579893749, "learning_rate": 9.735278842115616e-06, "loss": 0.0726, "step": 613 }, { "epoch": 0.5822664770033191, "grad_norm": 0.8017302866486061, "learning_rate": 9.733503251850443e-06, "loss": 0.0508, "step": 614 }, { "epoch": 0.5832147937411095, "grad_norm": 0.4915103436956615, "learning_rate": 9.73172188969801e-06, "loss": 0.0511, "step": 615 }, { "epoch": 0.5841631104788999, "grad_norm": 0.5454251857464146, "learning_rate": 9.729934757830455e-06, "loss": 0.043, "step": 616 }, { "epoch": 0.5851114272166904, "grad_norm": 0.45382702737394764, "learning_rate": 9.728141858426953e-06, "loss": 0.046, "step": 617 }, { "epoch": 0.5860597439544808, "grad_norm": 0.5609546349379012, "learning_rate": 9.726343193673707e-06, "loss": 0.0528, "step": 618 }, { "epoch": 0.5870080606922712, "grad_norm": 0.600673482298699, "learning_rate": 9.724538765763953e-06, "loss": 0.0539, "step": 619 }, { "epoch": 0.5879563774300617, "grad_norm": 0.9417089865736203, "learning_rate": 9.722728576897956e-06, "loss": 0.0583, "step": 620 }, { "epoch": 0.5889046941678521, "grad_norm": 0.4653439643190733, "learning_rate": 9.720912629283004e-06, "loss": 0.05, "step": 621 }, { "epoch": 0.5898530109056425, "grad_norm": 1.026549188147293, "learning_rate": 9.719090925133408e-06, "loss": 0.0643, "step": 622 }, { "epoch": 0.590801327643433, "grad_norm": 0.7947545630855374, "learning_rate": 9.717263466670496e-06, "loss": 0.0827, "step": 623 }, { "epoch": 0.5917496443812233, "grad_norm": 0.5505357789361721, "learning_rate": 9.715430256122616e-06, "loss": 0.057, "step": 624 }, { "epoch": 0.5926979611190137, "grad_norm": 0.6227650085275758, "learning_rate": 9.713591295725126e-06, "loss": 0.0613, "step": 625 }, { "epoch": 0.5936462778568041, "grad_norm": 0.8089764410308476, "learning_rate": 9.711746587720398e-06, "loss": 0.0575, "step": 626 }, { "epoch": 0.5945945945945946, "grad_norm": 0.8681782262186932, "learning_rate": 9.709896134357815e-06, "loss": 0.0664, "step": 627 }, { "epoch": 0.595542911332385, "grad_norm": 0.682165737662686, "learning_rate": 9.708039937893759e-06, "loss": 0.0558, "step": 628 }, { "epoch": 0.5964912280701754, "grad_norm": 0.6331915650172267, "learning_rate": 9.706178000591617e-06, "loss": 0.0628, "step": 629 }, { "epoch": 0.5974395448079659, "grad_norm": 0.5712611189361939, "learning_rate": 9.704310324721782e-06, "loss": 0.0741, "step": 630 }, { "epoch": 0.5983878615457563, "grad_norm": 0.4974903145873453, "learning_rate": 9.70243691256164e-06, "loss": 0.0569, "step": 631 }, { "epoch": 0.5993361782835467, "grad_norm": 0.8755421451427193, "learning_rate": 9.700557766395567e-06, "loss": 0.0884, "step": 632 }, { "epoch": 0.6002844950213371, "grad_norm": 0.5236784076286586, "learning_rate": 9.698672888514938e-06, "loss": 0.0493, "step": 633 }, { "epoch": 0.6012328117591276, "grad_norm": 0.6525012362182552, "learning_rate": 9.696782281218117e-06, "loss": 0.0683, "step": 634 }, { "epoch": 0.602181128496918, "grad_norm": 0.5119217968942416, "learning_rate": 9.69488594681045e-06, "loss": 0.0449, "step": 635 }, { "epoch": 0.6031294452347084, "grad_norm": 0.6576021927278618, "learning_rate": 9.692983887604269e-06, "loss": 0.0674, "step": 636 }, { "epoch": 0.6040777619724989, "grad_norm": 0.7157400695119305, "learning_rate": 9.691076105918885e-06, "loss": 0.0692, "step": 637 }, { "epoch": 0.6050260787102892, "grad_norm": 0.873028935018846, "learning_rate": 9.689162604080589e-06, "loss": 0.0999, "step": 638 }, { "epoch": 0.6059743954480796, "grad_norm": 0.8384167589559871, "learning_rate": 9.687243384422646e-06, "loss": 0.0771, "step": 639 }, { "epoch": 0.6069227121858701, "grad_norm": 0.5020655439555515, "learning_rate": 9.685318449285292e-06, "loss": 0.0512, "step": 640 }, { "epoch": 0.6078710289236605, "grad_norm": 0.36608001502573706, "learning_rate": 9.683387801015733e-06, "loss": 0.0377, "step": 641 }, { "epoch": 0.6088193456614509, "grad_norm": 0.7919506442179929, "learning_rate": 9.681451441968144e-06, "loss": 0.0775, "step": 642 }, { "epoch": 0.6097676623992413, "grad_norm": 0.6274619623629013, "learning_rate": 9.67950937450366e-06, "loss": 0.0645, "step": 643 }, { "epoch": 0.6107159791370318, "grad_norm": 0.5896565427831529, "learning_rate": 9.677561600990378e-06, "loss": 0.0595, "step": 644 }, { "epoch": 0.6116642958748222, "grad_norm": 0.5142338666265971, "learning_rate": 9.67560812380335e-06, "loss": 0.0597, "step": 645 }, { "epoch": 0.6126126126126126, "grad_norm": 0.6109668570207277, "learning_rate": 9.67364894532459e-06, "loss": 0.07, "step": 646 }, { "epoch": 0.6135609293504031, "grad_norm": 0.6756478515313759, "learning_rate": 9.671684067943056e-06, "loss": 0.0612, "step": 647 }, { "epoch": 0.6145092460881935, "grad_norm": 0.6142876685386528, "learning_rate": 9.669713494054662e-06, "loss": 0.06, "step": 648 }, { "epoch": 0.6154575628259839, "grad_norm": 0.8252522199066464, "learning_rate": 9.667737226062262e-06, "loss": 0.118, "step": 649 }, { "epoch": 0.6164058795637744, "grad_norm": 0.48924053020562824, "learning_rate": 9.665755266375657e-06, "loss": 0.0542, "step": 650 }, { "epoch": 0.6173541963015647, "grad_norm": 0.9087121397095356, "learning_rate": 9.663767617411587e-06, "loss": 0.0611, "step": 651 }, { "epoch": 0.6183025130393551, "grad_norm": 0.7764764902550111, "learning_rate": 9.66177428159373e-06, "loss": 0.0676, "step": 652 }, { "epoch": 0.6192508297771455, "grad_norm": 0.44918893065172116, "learning_rate": 9.659775261352697e-06, "loss": 0.0474, "step": 653 }, { "epoch": 0.620199146514936, "grad_norm": 0.9162652994629981, "learning_rate": 9.657770559126034e-06, "loss": 0.0981, "step": 654 }, { "epoch": 0.6211474632527264, "grad_norm": 0.6543823860401999, "learning_rate": 9.655760177358208e-06, "loss": 0.0744, "step": 655 }, { "epoch": 0.6220957799905168, "grad_norm": 0.44085186666179094, "learning_rate": 9.653744118500623e-06, "loss": 0.0532, "step": 656 }, { "epoch": 0.6230440967283073, "grad_norm": 0.7980175435844092, "learning_rate": 9.651722385011592e-06, "loss": 0.0807, "step": 657 }, { "epoch": 0.6239924134660977, "grad_norm": 0.4853866988799319, "learning_rate": 9.649694979356358e-06, "loss": 0.0454, "step": 658 }, { "epoch": 0.6249407302038881, "grad_norm": 0.5662361885259662, "learning_rate": 9.647661904007076e-06, "loss": 0.0621, "step": 659 }, { "epoch": 0.6258890469416786, "grad_norm": 0.8127269026146419, "learning_rate": 9.645623161442814e-06, "loss": 0.0773, "step": 660 }, { "epoch": 0.626837363679469, "grad_norm": 0.6294162739235921, "learning_rate": 9.643578754149552e-06, "loss": 0.0599, "step": 661 }, { "epoch": 0.6277856804172594, "grad_norm": 0.6965237350859914, "learning_rate": 9.641528684620179e-06, "loss": 0.0542, "step": 662 }, { "epoch": 0.6287339971550497, "grad_norm": 0.5265921422928361, "learning_rate": 9.639472955354483e-06, "loss": 0.0496, "step": 663 }, { "epoch": 0.6296823138928402, "grad_norm": 0.8663040094375097, "learning_rate": 9.63741156885916e-06, "loss": 0.0733, "step": 664 }, { "epoch": 0.6306306306306306, "grad_norm": 0.7508837936313448, "learning_rate": 9.635344527647798e-06, "loss": 0.08, "step": 665 }, { "epoch": 0.631578947368421, "grad_norm": 0.6827540936282853, "learning_rate": 9.633271834240885e-06, "loss": 0.0732, "step": 666 }, { "epoch": 0.6325272641062115, "grad_norm": 0.7441700461651841, "learning_rate": 9.631193491165798e-06, "loss": 0.0555, "step": 667 }, { "epoch": 0.6334755808440019, "grad_norm": 0.8313881844290032, "learning_rate": 9.629109500956803e-06, "loss": 0.0782, "step": 668 }, { "epoch": 0.6344238975817923, "grad_norm": 0.47754915650781987, "learning_rate": 9.627019866155056e-06, "loss": 0.0547, "step": 669 }, { "epoch": 0.6353722143195828, "grad_norm": 0.6618532396312571, "learning_rate": 9.624924589308591e-06, "loss": 0.0515, "step": 670 }, { "epoch": 0.6363205310573732, "grad_norm": 1.147117197534475, "learning_rate": 9.622823672972323e-06, "loss": 0.0882, "step": 671 }, { "epoch": 0.6372688477951636, "grad_norm": 0.5779383814129484, "learning_rate": 9.620717119708047e-06, "loss": 0.0659, "step": 672 }, { "epoch": 0.638217164532954, "grad_norm": 0.5799389859663083, "learning_rate": 9.618604932084427e-06, "loss": 0.0606, "step": 673 }, { "epoch": 0.6391654812707445, "grad_norm": 6.608545253943764, "learning_rate": 9.616487112677e-06, "loss": 0.066, "step": 674 }, { "epoch": 0.6401137980085349, "grad_norm": 0.7235578117181891, "learning_rate": 9.614363664068168e-06, "loss": 0.0628, "step": 675 }, { "epoch": 0.6410621147463252, "grad_norm": 0.6994528460712487, "learning_rate": 9.6122345888472e-06, "loss": 0.0628, "step": 676 }, { "epoch": 0.6420104314841157, "grad_norm": 0.6208663188504899, "learning_rate": 9.610099889610224e-06, "loss": 0.0554, "step": 677 }, { "epoch": 0.6429587482219061, "grad_norm": 0.6345977149189366, "learning_rate": 9.607959568960226e-06, "loss": 0.0632, "step": 678 }, { "epoch": 0.6439070649596965, "grad_norm": 0.8061055021711904, "learning_rate": 9.605813629507046e-06, "loss": 0.0684, "step": 679 }, { "epoch": 0.6448553816974869, "grad_norm": 0.6913423639588181, "learning_rate": 9.603662073867375e-06, "loss": 0.0673, "step": 680 }, { "epoch": 0.6458036984352774, "grad_norm": 0.7586179752230898, "learning_rate": 9.601504904664758e-06, "loss": 0.0702, "step": 681 }, { "epoch": 0.6467520151730678, "grad_norm": 0.5215807067369997, "learning_rate": 9.599342124529576e-06, "loss": 0.0484, "step": 682 }, { "epoch": 0.6477003319108582, "grad_norm": 0.4193899811291156, "learning_rate": 9.597173736099056e-06, "loss": 0.0455, "step": 683 }, { "epoch": 0.6486486486486487, "grad_norm": 1.0231627903377674, "learning_rate": 9.594999742017267e-06, "loss": 0.0755, "step": 684 }, { "epoch": 0.6495969653864391, "grad_norm": 0.5818860445113369, "learning_rate": 9.592820144935107e-06, "loss": 0.0457, "step": 685 }, { "epoch": 0.6505452821242295, "grad_norm": 0.8523614115619248, "learning_rate": 9.590634947510312e-06, "loss": 0.0666, "step": 686 }, { "epoch": 0.65149359886202, "grad_norm": 0.6819462318103672, "learning_rate": 9.588444152407441e-06, "loss": 0.0621, "step": 687 }, { "epoch": 0.6524419155998104, "grad_norm": 0.7350860734842137, "learning_rate": 9.586247762297882e-06, "loss": 0.0616, "step": 688 }, { "epoch": 0.6533902323376007, "grad_norm": 0.6877200427996193, "learning_rate": 9.584045779859848e-06, "loss": 0.0691, "step": 689 }, { "epoch": 0.6543385490753911, "grad_norm": 0.7777410132259543, "learning_rate": 9.581838207778367e-06, "loss": 0.0672, "step": 690 }, { "epoch": 0.6552868658131816, "grad_norm": 1.0340407583447775, "learning_rate": 9.579625048745281e-06, "loss": 0.0692, "step": 691 }, { "epoch": 0.656235182550972, "grad_norm": 0.6061769180463831, "learning_rate": 9.577406305459251e-06, "loss": 0.0519, "step": 692 }, { "epoch": 0.6571834992887624, "grad_norm": 0.7287017758175208, "learning_rate": 9.575181980625743e-06, "loss": 0.0626, "step": 693 }, { "epoch": 0.6581318160265529, "grad_norm": 0.6923184185544935, "learning_rate": 9.57295207695703e-06, "loss": 0.0602, "step": 694 }, { "epoch": 0.6590801327643433, "grad_norm": 0.7441802004305137, "learning_rate": 9.570716597172187e-06, "loss": 0.0785, "step": 695 }, { "epoch": 0.6600284495021337, "grad_norm": 0.5600328414907927, "learning_rate": 9.568475543997088e-06, "loss": 0.0525, "step": 696 }, { "epoch": 0.6609767662399242, "grad_norm": 0.6179093672887623, "learning_rate": 9.566228920164405e-06, "loss": 0.0498, "step": 697 }, { "epoch": 0.6619250829777146, "grad_norm": 1.0001632318997007, "learning_rate": 9.563976728413602e-06, "loss": 0.1065, "step": 698 }, { "epoch": 0.662873399715505, "grad_norm": 0.6197443639375237, "learning_rate": 9.56171897149093e-06, "loss": 0.0429, "step": 699 }, { "epoch": 0.6638217164532954, "grad_norm": 0.7426532648337794, "learning_rate": 9.55945565214943e-06, "loss": 0.0603, "step": 700 }, { "epoch": 0.6647700331910859, "grad_norm": 0.9809220324323352, "learning_rate": 9.557186773148922e-06, "loss": 0.0844, "step": 701 }, { "epoch": 0.6657183499288762, "grad_norm": 0.6596268576375636, "learning_rate": 9.554912337256007e-06, "loss": 0.0627, "step": 702 }, { "epoch": 0.6666666666666666, "grad_norm": 0.6445430375796782, "learning_rate": 9.552632347244062e-06, "loss": 0.0621, "step": 703 }, { "epoch": 0.6676149834044571, "grad_norm": 0.6854389668990125, "learning_rate": 9.550346805893236e-06, "loss": 0.0709, "step": 704 }, { "epoch": 0.6685633001422475, "grad_norm": 0.9157472924094435, "learning_rate": 9.548055715990448e-06, "loss": 0.0669, "step": 705 }, { "epoch": 0.6695116168800379, "grad_norm": 0.6210182918721243, "learning_rate": 9.545759080329381e-06, "loss": 0.0642, "step": 706 }, { "epoch": 0.6704599336178284, "grad_norm": 0.5811606762164421, "learning_rate": 9.543456901710483e-06, "loss": 0.0734, "step": 707 }, { "epoch": 0.6714082503556188, "grad_norm": 0.6797271720519124, "learning_rate": 9.541149182940958e-06, "loss": 0.0543, "step": 708 }, { "epoch": 0.6723565670934092, "grad_norm": 0.5126068611905316, "learning_rate": 9.538835926834766e-06, "loss": 0.0504, "step": 709 }, { "epoch": 0.6733048838311996, "grad_norm": 0.6464058845065579, "learning_rate": 9.536517136212623e-06, "loss": 0.0596, "step": 710 }, { "epoch": 0.6742532005689901, "grad_norm": 0.5987248394746172, "learning_rate": 9.534192813901986e-06, "loss": 0.0561, "step": 711 }, { "epoch": 0.6752015173067805, "grad_norm": 0.5757268664620699, "learning_rate": 9.531862962737065e-06, "loss": 0.0662, "step": 712 }, { "epoch": 0.6761498340445709, "grad_norm": 0.6884820373956889, "learning_rate": 9.529527585558806e-06, "loss": 0.0734, "step": 713 }, { "epoch": 0.6770981507823614, "grad_norm": 0.5599551362853026, "learning_rate": 9.5271866852149e-06, "loss": 0.0497, "step": 714 }, { "epoch": 0.6780464675201517, "grad_norm": 1.2727013612767513, "learning_rate": 9.524840264559762e-06, "loss": 0.0806, "step": 715 }, { "epoch": 0.6789947842579421, "grad_norm": 0.5125594480614294, "learning_rate": 9.522488326454551e-06, "loss": 0.0464, "step": 716 }, { "epoch": 0.6799431009957326, "grad_norm": 0.9279881234599379, "learning_rate": 9.520130873767141e-06, "loss": 0.0466, "step": 717 }, { "epoch": 0.680891417733523, "grad_norm": 0.5884738866592291, "learning_rate": 9.517767909372143e-06, "loss": 0.0463, "step": 718 }, { "epoch": 0.6818397344713134, "grad_norm": 0.6405987798189022, "learning_rate": 9.515399436150879e-06, "loss": 0.0646, "step": 719 }, { "epoch": 0.6827880512091038, "grad_norm": 0.6141893191288851, "learning_rate": 9.513025456991394e-06, "loss": 0.0713, "step": 720 }, { "epoch": 0.6837363679468943, "grad_norm": 0.5294631004623913, "learning_rate": 9.510645974788441e-06, "loss": 0.0533, "step": 721 }, { "epoch": 0.6846846846846847, "grad_norm": 0.5983803884552171, "learning_rate": 9.508260992443492e-06, "loss": 0.0574, "step": 722 }, { "epoch": 0.6856330014224751, "grad_norm": 0.7168015362345571, "learning_rate": 9.505870512864715e-06, "loss": 0.0622, "step": 723 }, { "epoch": 0.6865813181602656, "grad_norm": 0.8061703745318712, "learning_rate": 9.503474538966992e-06, "loss": 0.072, "step": 724 }, { "epoch": 0.687529634898056, "grad_norm": 0.6410612258118752, "learning_rate": 9.501073073671896e-06, "loss": 0.0454, "step": 725 }, { "epoch": 0.6884779516358464, "grad_norm": 0.790215058142473, "learning_rate": 9.498666119907701e-06, "loss": 0.0677, "step": 726 }, { "epoch": 0.6894262683736367, "grad_norm": 0.6299133472058956, "learning_rate": 9.496253680609371e-06, "loss": 0.0585, "step": 727 }, { "epoch": 0.6903745851114272, "grad_norm": 1.0623017139889208, "learning_rate": 9.493835758718561e-06, "loss": 0.069, "step": 728 }, { "epoch": 0.6913229018492176, "grad_norm": 0.5536012592608316, "learning_rate": 9.491412357183607e-06, "loss": 0.0686, "step": 729 }, { "epoch": 0.692271218587008, "grad_norm": 0.6038206755461478, "learning_rate": 9.488983478959534e-06, "loss": 0.0706, "step": 730 }, { "epoch": 0.6932195353247985, "grad_norm": 0.6342419868913964, "learning_rate": 9.486549127008037e-06, "loss": 0.0496, "step": 731 }, { "epoch": 0.6941678520625889, "grad_norm": 1.1555208683238716, "learning_rate": 9.484109304297493e-06, "loss": 0.0834, "step": 732 }, { "epoch": 0.6951161688003793, "grad_norm": 0.8509380581545992, "learning_rate": 9.481664013802943e-06, "loss": 0.0794, "step": 733 }, { "epoch": 0.6960644855381698, "grad_norm": 0.8224046322343856, "learning_rate": 9.479213258506102e-06, "loss": 0.0869, "step": 734 }, { "epoch": 0.6970128022759602, "grad_norm": 0.6505920471844966, "learning_rate": 9.476757041395342e-06, "loss": 0.0642, "step": 735 }, { "epoch": 0.6979611190137506, "grad_norm": 0.5162948092375159, "learning_rate": 9.474295365465697e-06, "loss": 0.0539, "step": 736 }, { "epoch": 0.698909435751541, "grad_norm": 0.7194486779836317, "learning_rate": 9.471828233718863e-06, "loss": 0.0585, "step": 737 }, { "epoch": 0.6998577524893315, "grad_norm": 0.9014549238602243, "learning_rate": 9.46935564916318e-06, "loss": 0.0874, "step": 738 }, { "epoch": 0.7008060692271219, "grad_norm": 0.7378312572460828, "learning_rate": 9.466877614813645e-06, "loss": 0.0657, "step": 739 }, { "epoch": 0.7017543859649122, "grad_norm": 0.815800968244944, "learning_rate": 9.464394133691891e-06, "loss": 0.0538, "step": 740 }, { "epoch": 0.7027027027027027, "grad_norm": 0.5271528573688194, "learning_rate": 9.461905208826202e-06, "loss": 0.0619, "step": 741 }, { "epoch": 0.7036510194404931, "grad_norm": 0.9062594050922635, "learning_rate": 9.459410843251496e-06, "loss": 0.0659, "step": 742 }, { "epoch": 0.7045993361782835, "grad_norm": 0.6578698656781865, "learning_rate": 9.456911040009323e-06, "loss": 0.0577, "step": 743 }, { "epoch": 0.705547652916074, "grad_norm": 0.6791351680766123, "learning_rate": 9.454405802147864e-06, "loss": 0.0669, "step": 744 }, { "epoch": 0.7064959696538644, "grad_norm": 0.7662019136887008, "learning_rate": 9.451895132721933e-06, "loss": 0.0692, "step": 745 }, { "epoch": 0.7074442863916548, "grad_norm": 0.6997379483885225, "learning_rate": 9.449379034792961e-06, "loss": 0.0609, "step": 746 }, { "epoch": 0.7083926031294452, "grad_norm": 0.6231531262832446, "learning_rate": 9.446857511429e-06, "loss": 0.0568, "step": 747 }, { "epoch": 0.7093409198672357, "grad_norm": 0.638618143024491, "learning_rate": 9.444330565704715e-06, "loss": 0.0391, "step": 748 }, { "epoch": 0.7102892366050261, "grad_norm": 0.6101709327712237, "learning_rate": 9.441798200701388e-06, "loss": 0.0692, "step": 749 }, { "epoch": 0.7112375533428165, "grad_norm": 0.7771396965466206, "learning_rate": 9.439260419506906e-06, "loss": 0.0616, "step": 750 }, { "epoch": 0.712185870080607, "grad_norm": 0.663533581873393, "learning_rate": 9.436717225215761e-06, "loss": 0.0706, "step": 751 }, { "epoch": 0.7131341868183974, "grad_norm": 0.7406791150442034, "learning_rate": 9.434168620929045e-06, "loss": 0.0759, "step": 752 }, { "epoch": 0.7140825035561877, "grad_norm": 0.6589932311994989, "learning_rate": 9.431614609754446e-06, "loss": 0.0676, "step": 753 }, { "epoch": 0.7150308202939782, "grad_norm": 0.7873737037891946, "learning_rate": 9.429055194806247e-06, "loss": 0.0661, "step": 754 }, { "epoch": 0.7159791370317686, "grad_norm": 0.6588547169267579, "learning_rate": 9.42649037920532e-06, "loss": 0.068, "step": 755 }, { "epoch": 0.716927453769559, "grad_norm": 0.8208102856389554, "learning_rate": 9.423920166079122e-06, "loss": 0.0829, "step": 756 }, { "epoch": 0.7178757705073494, "grad_norm": 0.5652492127213, "learning_rate": 9.421344558561689e-06, "loss": 0.0754, "step": 757 }, { "epoch": 0.7188240872451399, "grad_norm": 2.03543668980321, "learning_rate": 9.418763559793639e-06, "loss": 0.0469, "step": 758 }, { "epoch": 0.7197724039829303, "grad_norm": 0.7132600676949169, "learning_rate": 9.41617717292216e-06, "loss": 0.058, "step": 759 }, { "epoch": 0.7207207207207207, "grad_norm": 0.5814418519545377, "learning_rate": 9.413585401101014e-06, "loss": 0.0676, "step": 760 }, { "epoch": 0.7216690374585112, "grad_norm": 0.778087468578043, "learning_rate": 9.410988247490527e-06, "loss": 0.0565, "step": 761 }, { "epoch": 0.7226173541963016, "grad_norm": 0.5978506887698309, "learning_rate": 9.408385715257589e-06, "loss": 0.0526, "step": 762 }, { "epoch": 0.723565670934092, "grad_norm": 0.7345386180038043, "learning_rate": 9.405777807575643e-06, "loss": 0.0779, "step": 763 }, { "epoch": 0.7245139876718824, "grad_norm": 0.6765882263629432, "learning_rate": 9.403164527624695e-06, "loss": 0.0739, "step": 764 }, { "epoch": 0.7254623044096729, "grad_norm": 0.6200059319183251, "learning_rate": 9.400545878591297e-06, "loss": 0.0425, "step": 765 }, { "epoch": 0.7264106211474632, "grad_norm": 0.5764913642807622, "learning_rate": 9.397921863668545e-06, "loss": 0.0525, "step": 766 }, { "epoch": 0.7273589378852536, "grad_norm": 0.5072870053545583, "learning_rate": 9.395292486056087e-06, "loss": 0.0466, "step": 767 }, { "epoch": 0.7283072546230441, "grad_norm": 0.6266493674563252, "learning_rate": 9.3926577489601e-06, "loss": 0.0564, "step": 768 }, { "epoch": 0.7292555713608345, "grad_norm": 0.6781903020718192, "learning_rate": 9.390017655593303e-06, "loss": 0.0625, "step": 769 }, { "epoch": 0.7302038880986249, "grad_norm": 0.6970906328583575, "learning_rate": 9.387372209174943e-06, "loss": 0.0499, "step": 770 }, { "epoch": 0.7311522048364154, "grad_norm": 0.4830643779006922, "learning_rate": 9.384721412930797e-06, "loss": 0.0522, "step": 771 }, { "epoch": 0.7321005215742058, "grad_norm": 0.5981146539751457, "learning_rate": 9.382065270093164e-06, "loss": 0.0503, "step": 772 }, { "epoch": 0.7330488383119962, "grad_norm": 0.6288690777841561, "learning_rate": 9.37940378390086e-06, "loss": 0.0505, "step": 773 }, { "epoch": 0.7339971550497866, "grad_norm": 0.6043657243192845, "learning_rate": 9.376736957599219e-06, "loss": 0.048, "step": 774 }, { "epoch": 0.7349454717875771, "grad_norm": 1.3199303132586044, "learning_rate": 9.37406479444009e-06, "loss": 0.0787, "step": 775 }, { "epoch": 0.7358937885253675, "grad_norm": 0.9970354985082576, "learning_rate": 9.37138729768182e-06, "loss": 0.0593, "step": 776 }, { "epoch": 0.7368421052631579, "grad_norm": 0.6154243426982743, "learning_rate": 9.36870447058927e-06, "loss": 0.0552, "step": 777 }, { "epoch": 0.7377904220009484, "grad_norm": 0.688917247579616, "learning_rate": 9.366016316433796e-06, "loss": 0.0688, "step": 778 }, { "epoch": 0.7387387387387387, "grad_norm": 0.8890574424533809, "learning_rate": 9.363322838493252e-06, "loss": 0.0616, "step": 779 }, { "epoch": 0.7396870554765291, "grad_norm": 0.5256518464793154, "learning_rate": 9.360624040051975e-06, "loss": 0.0449, "step": 780 }, { "epoch": 0.7406353722143196, "grad_norm": 0.7015686604630017, "learning_rate": 9.357919924400802e-06, "loss": 0.0744, "step": 781 }, { "epoch": 0.74158368895211, "grad_norm": 0.5444389461448026, "learning_rate": 9.355210494837046e-06, "loss": 0.058, "step": 782 }, { "epoch": 0.7425320056899004, "grad_norm": 0.8635005280396899, "learning_rate": 9.352495754664501e-06, "loss": 0.0817, "step": 783 }, { "epoch": 0.7434803224276908, "grad_norm": 0.3975227023619501, "learning_rate": 9.349775707193439e-06, "loss": 0.0325, "step": 784 }, { "epoch": 0.7444286391654813, "grad_norm": 0.9671794171858287, "learning_rate": 9.347050355740598e-06, "loss": 0.0942, "step": 785 }, { "epoch": 0.7453769559032717, "grad_norm": 0.8627076848581986, "learning_rate": 9.34431970362919e-06, "loss": 0.0603, "step": 786 }, { "epoch": 0.7463252726410621, "grad_norm": 0.676971569472859, "learning_rate": 9.341583754188887e-06, "loss": 0.0609, "step": 787 }, { "epoch": 0.7472735893788526, "grad_norm": 0.6234019106033082, "learning_rate": 9.338842510755822e-06, "loss": 0.0527, "step": 788 }, { "epoch": 0.748221906116643, "grad_norm": 0.5688808355503273, "learning_rate": 9.336095976672578e-06, "loss": 0.0746, "step": 789 }, { "epoch": 0.7491702228544334, "grad_norm": 0.8927220033190019, "learning_rate": 9.3333441552882e-06, "loss": 0.0663, "step": 790 }, { "epoch": 0.7501185395922239, "grad_norm": 0.6760705893906477, "learning_rate": 9.33058704995817e-06, "loss": 0.0607, "step": 791 }, { "epoch": 0.7510668563300142, "grad_norm": 0.6421619908578323, "learning_rate": 9.327824664044418e-06, "loss": 0.0601, "step": 792 }, { "epoch": 0.7520151730678046, "grad_norm": 0.7064042205046658, "learning_rate": 9.32505700091531e-06, "loss": 0.0656, "step": 793 }, { "epoch": 0.752963489805595, "grad_norm": 0.6789456621715841, "learning_rate": 9.322284063945651e-06, "loss": 0.0754, "step": 794 }, { "epoch": 0.7539118065433855, "grad_norm": 0.6349001762224292, "learning_rate": 9.319505856516674e-06, "loss": 0.055, "step": 795 }, { "epoch": 0.7548601232811759, "grad_norm": 0.7970733715082516, "learning_rate": 9.316722382016037e-06, "loss": 0.0606, "step": 796 }, { "epoch": 0.7558084400189663, "grad_norm": 0.6989262918440643, "learning_rate": 9.313933643837825e-06, "loss": 0.0419, "step": 797 }, { "epoch": 0.7567567567567568, "grad_norm": 0.45444386596780545, "learning_rate": 9.311139645382539e-06, "loss": 0.0499, "step": 798 }, { "epoch": 0.7577050734945472, "grad_norm": 0.7340919059070612, "learning_rate": 9.308340390057091e-06, "loss": 0.0605, "step": 799 }, { "epoch": 0.7586533902323376, "grad_norm": 0.49624254277855845, "learning_rate": 9.305535881274812e-06, "loss": 0.038, "step": 800 }, { "epoch": 0.7596017069701281, "grad_norm": 0.5558026345234958, "learning_rate": 9.302726122455425e-06, "loss": 0.0477, "step": 801 }, { "epoch": 0.7605500237079185, "grad_norm": 0.7377034794768454, "learning_rate": 9.299911117025071e-06, "loss": 0.0798, "step": 802 }, { "epoch": 0.7614983404457089, "grad_norm": 0.6642309640857783, "learning_rate": 9.297090868416276e-06, "loss": 0.0578, "step": 803 }, { "epoch": 0.7624466571834992, "grad_norm": 0.4901567357915514, "learning_rate": 9.294265380067965e-06, "loss": 0.0546, "step": 804 }, { "epoch": 0.7633949739212897, "grad_norm": 0.5504015183910195, "learning_rate": 9.291434655425452e-06, "loss": 0.0476, "step": 805 }, { "epoch": 0.7643432906590801, "grad_norm": 0.7880325712467479, "learning_rate": 9.288598697940433e-06, "loss": 0.0967, "step": 806 }, { "epoch": 0.7652916073968705, "grad_norm": 1.0094413699993006, "learning_rate": 9.285757511070987e-06, "loss": 0.0547, "step": 807 }, { "epoch": 0.766239924134661, "grad_norm": 0.5462824953438216, "learning_rate": 9.28291109828157e-06, "loss": 0.0622, "step": 808 }, { "epoch": 0.7671882408724514, "grad_norm": 0.6095693174069973, "learning_rate": 9.28005946304301e-06, "loss": 0.054, "step": 809 }, { "epoch": 0.7681365576102418, "grad_norm": 0.5522598480936777, "learning_rate": 9.277202608832502e-06, "loss": 0.0608, "step": 810 }, { "epoch": 0.7690848743480322, "grad_norm": 0.8887551561479244, "learning_rate": 9.274340539133604e-06, "loss": 0.0733, "step": 811 }, { "epoch": 0.7700331910858227, "grad_norm": 0.6536519593388536, "learning_rate": 9.271473257436239e-06, "loss": 0.0704, "step": 812 }, { "epoch": 0.7709815078236131, "grad_norm": 0.6903014054311826, "learning_rate": 9.268600767236677e-06, "loss": 0.0839, "step": 813 }, { "epoch": 0.7719298245614035, "grad_norm": 0.5929159416904847, "learning_rate": 9.265723072037546e-06, "loss": 0.0592, "step": 814 }, { "epoch": 0.772878141299194, "grad_norm": 0.7439638317959937, "learning_rate": 9.26284017534782e-06, "loss": 0.0568, "step": 815 }, { "epoch": 0.7738264580369844, "grad_norm": 0.5860050856048022, "learning_rate": 9.259952080682812e-06, "loss": 0.0667, "step": 816 }, { "epoch": 0.7747747747747747, "grad_norm": 0.4842910654706692, "learning_rate": 9.257058791564175e-06, "loss": 0.0513, "step": 817 }, { "epoch": 0.7757230915125652, "grad_norm": 0.789038697553299, "learning_rate": 9.254160311519896e-06, "loss": 0.0557, "step": 818 }, { "epoch": 0.7766714082503556, "grad_norm": 0.5387139258318481, "learning_rate": 9.251256644084292e-06, "loss": 0.0558, "step": 819 }, { "epoch": 0.777619724988146, "grad_norm": 0.8887946106511906, "learning_rate": 9.248347792798006e-06, "loss": 0.0776, "step": 820 }, { "epoch": 0.7785680417259364, "grad_norm": 0.7477907494684204, "learning_rate": 9.245433761208e-06, "loss": 0.0706, "step": 821 }, { "epoch": 0.7795163584637269, "grad_norm": 0.8176178183928178, "learning_rate": 9.242514552867556e-06, "loss": 0.0806, "step": 822 }, { "epoch": 0.7804646752015173, "grad_norm": 0.5104409829727489, "learning_rate": 9.239590171336262e-06, "loss": 0.0427, "step": 823 }, { "epoch": 0.7814129919393077, "grad_norm": 0.5922185838285359, "learning_rate": 9.236660620180024e-06, "loss": 0.0553, "step": 824 }, { "epoch": 0.7823613086770982, "grad_norm": 0.9414341871189567, "learning_rate": 9.23372590297104e-06, "loss": 0.0678, "step": 825 }, { "epoch": 0.7833096254148886, "grad_norm": 0.49939628701466243, "learning_rate": 9.230786023287819e-06, "loss": 0.0437, "step": 826 }, { "epoch": 0.784257942152679, "grad_norm": 0.519425273825053, "learning_rate": 9.227840984715154e-06, "loss": 0.0497, "step": 827 }, { "epoch": 0.7852062588904695, "grad_norm": 0.5443123255099412, "learning_rate": 9.224890790844137e-06, "loss": 0.0612, "step": 828 }, { "epoch": 0.7861545756282599, "grad_norm": 0.511905527310258, "learning_rate": 9.221935445272144e-06, "loss": 0.0449, "step": 829 }, { "epoch": 0.7871028923660502, "grad_norm": 0.6705781452415145, "learning_rate": 9.218974951602829e-06, "loss": 0.063, "step": 830 }, { "epoch": 0.7880512091038406, "grad_norm": 0.47754646141190604, "learning_rate": 9.216009313446125e-06, "loss": 0.0688, "step": 831 }, { "epoch": 0.7889995258416311, "grad_norm": 0.5705276893342319, "learning_rate": 9.213038534418244e-06, "loss": 0.0686, "step": 832 }, { "epoch": 0.7899478425794215, "grad_norm": 0.4253509537520698, "learning_rate": 9.21006261814166e-06, "loss": 0.0427, "step": 833 }, { "epoch": 0.7908961593172119, "grad_norm": 0.533220697742502, "learning_rate": 9.207081568245112e-06, "loss": 0.0394, "step": 834 }, { "epoch": 0.7918444760550024, "grad_norm": 0.5786737951816707, "learning_rate": 9.2040953883636e-06, "loss": 0.0556, "step": 835 }, { "epoch": 0.7927927927927928, "grad_norm": 1.05765776588404, "learning_rate": 9.20110408213838e-06, "loss": 0.0388, "step": 836 }, { "epoch": 0.7937411095305832, "grad_norm": 0.809530041430475, "learning_rate": 9.19810765321696e-06, "loss": 0.1042, "step": 837 }, { "epoch": 0.7946894262683737, "grad_norm": 0.4767483114016521, "learning_rate": 9.19510610525309e-06, "loss": 0.0586, "step": 838 }, { "epoch": 0.7956377430061641, "grad_norm": 0.6212000890855088, "learning_rate": 9.192099441906765e-06, "loss": 0.063, "step": 839 }, { "epoch": 0.7965860597439545, "grad_norm": 0.5793471462839893, "learning_rate": 9.189087666844219e-06, "loss": 0.0599, "step": 840 }, { "epoch": 0.7975343764817449, "grad_norm": 0.6109133021965912, "learning_rate": 9.186070783737915e-06, "loss": 0.0655, "step": 841 }, { "epoch": 0.7984826932195354, "grad_norm": 1.7579309929430755, "learning_rate": 9.183048796266547e-06, "loss": 0.0531, "step": 842 }, { "epoch": 0.7994310099573257, "grad_norm": 0.6305893305402994, "learning_rate": 9.180021708115034e-06, "loss": 0.069, "step": 843 }, { "epoch": 0.8003793266951161, "grad_norm": 0.5799218206040034, "learning_rate": 9.176989522974512e-06, "loss": 0.0548, "step": 844 }, { "epoch": 0.8013276434329066, "grad_norm": 0.5205329821796497, "learning_rate": 9.173952244542335e-06, "loss": 0.0551, "step": 845 }, { "epoch": 0.802275960170697, "grad_norm": 0.6401356176971456, "learning_rate": 9.170909876522067e-06, "loss": 0.0613, "step": 846 }, { "epoch": 0.8032242769084874, "grad_norm": 0.6283553782308525, "learning_rate": 9.167862422623474e-06, "loss": 0.0681, "step": 847 }, { "epoch": 0.8041725936462779, "grad_norm": 0.5291087716357314, "learning_rate": 9.164809886562532e-06, "loss": 0.0428, "step": 848 }, { "epoch": 0.8051209103840683, "grad_norm": 0.6176212098121372, "learning_rate": 9.161752272061405e-06, "loss": 0.0607, "step": 849 }, { "epoch": 0.8060692271218587, "grad_norm": 0.5258734780929885, "learning_rate": 9.158689582848454e-06, "loss": 0.0555, "step": 850 }, { "epoch": 0.8070175438596491, "grad_norm": 0.5473102285657928, "learning_rate": 9.155621822658229e-06, "loss": 0.0461, "step": 851 }, { "epoch": 0.8079658605974396, "grad_norm": 0.7147069989389465, "learning_rate": 9.15254899523146e-06, "loss": 0.0699, "step": 852 }, { "epoch": 0.80891417733523, "grad_norm": 0.5116476113725856, "learning_rate": 9.14947110431506e-06, "loss": 0.0593, "step": 853 }, { "epoch": 0.8098624940730204, "grad_norm": 0.599625799358922, "learning_rate": 9.146388153662109e-06, "loss": 0.0719, "step": 854 }, { "epoch": 0.8108108108108109, "grad_norm": 0.5657265833927722, "learning_rate": 9.143300147031866e-06, "loss": 0.0539, "step": 855 }, { "epoch": 0.8117591275486012, "grad_norm": 0.490285928003467, "learning_rate": 9.14020708818975e-06, "loss": 0.0551, "step": 856 }, { "epoch": 0.8127074442863916, "grad_norm": 0.5667257690255696, "learning_rate": 9.137108980907341e-06, "loss": 0.0485, "step": 857 }, { "epoch": 0.813655761024182, "grad_norm": 0.7288808283591064, "learning_rate": 9.134005828962373e-06, "loss": 0.0464, "step": 858 }, { "epoch": 0.8146040777619725, "grad_norm": 0.6578159612053353, "learning_rate": 9.130897636138736e-06, "loss": 0.0458, "step": 859 }, { "epoch": 0.8155523944997629, "grad_norm": 0.6699312438910595, "learning_rate": 9.127784406226462e-06, "loss": 0.0484, "step": 860 }, { "epoch": 0.8165007112375533, "grad_norm": 0.7577555099867996, "learning_rate": 9.124666143021728e-06, "loss": 0.0552, "step": 861 }, { "epoch": 0.8174490279753438, "grad_norm": 0.6571718231580975, "learning_rate": 9.121542850326849e-06, "loss": 0.0418, "step": 862 }, { "epoch": 0.8183973447131342, "grad_norm": 0.7375729716381728, "learning_rate": 9.118414531950268e-06, "loss": 0.0586, "step": 863 }, { "epoch": 0.8193456614509246, "grad_norm": 0.7778186396499422, "learning_rate": 9.115281191706563e-06, "loss": 0.0638, "step": 864 }, { "epoch": 0.8202939781887151, "grad_norm": 1.1753642296648885, "learning_rate": 9.11214283341643e-06, "loss": 0.0914, "step": 865 }, { "epoch": 0.8212422949265055, "grad_norm": 0.6221136965708439, "learning_rate": 9.108999460906687e-06, "loss": 0.0513, "step": 866 }, { "epoch": 0.8221906116642959, "grad_norm": 1.0126156537474953, "learning_rate": 9.105851078010265e-06, "loss": 0.0511, "step": 867 }, { "epoch": 0.8231389284020862, "grad_norm": 0.609505398312846, "learning_rate": 9.102697688566204e-06, "loss": 0.0607, "step": 868 }, { "epoch": 0.8240872451398767, "grad_norm": 0.6781545775462046, "learning_rate": 9.09953929641965e-06, "loss": 0.0537, "step": 869 }, { "epoch": 0.8250355618776671, "grad_norm": 0.6162914997785193, "learning_rate": 9.096375905421849e-06, "loss": 0.0514, "step": 870 }, { "epoch": 0.8259838786154575, "grad_norm": 0.9380195573648793, "learning_rate": 9.093207519430138e-06, "loss": 0.0592, "step": 871 }, { "epoch": 0.826932195353248, "grad_norm": 0.6891518456384623, "learning_rate": 9.090034142307955e-06, "loss": 0.0611, "step": 872 }, { "epoch": 0.8278805120910384, "grad_norm": 0.6860355795137043, "learning_rate": 9.086855777924813e-06, "loss": 0.0651, "step": 873 }, { "epoch": 0.8288288288288288, "grad_norm": 0.5941193542193252, "learning_rate": 9.083672430156313e-06, "loss": 0.0561, "step": 874 }, { "epoch": 0.8297771455666193, "grad_norm": 0.9859763647912905, "learning_rate": 9.080484102884132e-06, "loss": 0.0558, "step": 875 }, { "epoch": 0.8307254623044097, "grad_norm": 0.6607364577205248, "learning_rate": 9.077290799996015e-06, "loss": 0.0445, "step": 876 }, { "epoch": 0.8316737790422001, "grad_norm": 0.4579344621348973, "learning_rate": 9.074092525385777e-06, "loss": 0.0532, "step": 877 }, { "epoch": 0.8326220957799905, "grad_norm": 0.44020290978074095, "learning_rate": 9.070889282953297e-06, "loss": 0.0432, "step": 878 }, { "epoch": 0.833570412517781, "grad_norm": 0.7817453278171299, "learning_rate": 9.067681076604507e-06, "loss": 0.0622, "step": 879 }, { "epoch": 0.8345187292555714, "grad_norm": 0.6106825636941368, "learning_rate": 9.064467910251396e-06, "loss": 0.0499, "step": 880 }, { "epoch": 0.8354670459933617, "grad_norm": 0.5733918003298187, "learning_rate": 9.061249787812e-06, "loss": 0.058, "step": 881 }, { "epoch": 0.8364153627311522, "grad_norm": 0.734104839469145, "learning_rate": 9.058026713210396e-06, "loss": 0.0603, "step": 882 }, { "epoch": 0.8373636794689426, "grad_norm": 0.5863205921902287, "learning_rate": 9.054798690376702e-06, "loss": 0.0542, "step": 883 }, { "epoch": 0.838311996206733, "grad_norm": 0.6529541400114963, "learning_rate": 9.051565723247072e-06, "loss": 0.0546, "step": 884 }, { "epoch": 0.8392603129445235, "grad_norm": 0.8496840763418192, "learning_rate": 9.048327815763682e-06, "loss": 0.0499, "step": 885 }, { "epoch": 0.8402086296823139, "grad_norm": 0.4879463969986272, "learning_rate": 9.045084971874738e-06, "loss": 0.0404, "step": 886 }, { "epoch": 0.8411569464201043, "grad_norm": 0.48366631890428774, "learning_rate": 9.041837195534462e-06, "loss": 0.0438, "step": 887 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5668428745474414, "learning_rate": 9.038584490703095e-06, "loss": 0.0577, "step": 888 }, { "epoch": 0.8430535798956852, "grad_norm": 0.6139669391301276, "learning_rate": 9.03532686134688e-06, "loss": 0.0699, "step": 889 }, { "epoch": 0.8440018966334756, "grad_norm": 0.6191388281587789, "learning_rate": 9.032064311438073e-06, "loss": 0.0588, "step": 890 }, { "epoch": 0.844950213371266, "grad_norm": 0.6152385003685913, "learning_rate": 9.028796844954924e-06, "loss": 0.0537, "step": 891 }, { "epoch": 0.8458985301090565, "grad_norm": 0.755005932732524, "learning_rate": 9.025524465881683e-06, "loss": 0.0649, "step": 892 }, { "epoch": 0.8468468468468469, "grad_norm": 0.7676513218085563, "learning_rate": 9.022247178208585e-06, "loss": 0.0635, "step": 893 }, { "epoch": 0.8477951635846372, "grad_norm": 0.5920047067355723, "learning_rate": 9.018964985931856e-06, "loss": 0.06, "step": 894 }, { "epoch": 0.8487434803224277, "grad_norm": 0.5807083572715754, "learning_rate": 9.015677893053695e-06, "loss": 0.0505, "step": 895 }, { "epoch": 0.8496917970602181, "grad_norm": 0.7897487160161104, "learning_rate": 9.012385903582286e-06, "loss": 0.0714, "step": 896 }, { "epoch": 0.8506401137980085, "grad_norm": 0.5382652341176712, "learning_rate": 9.009089021531777e-06, "loss": 0.0512, "step": 897 }, { "epoch": 0.8515884305357989, "grad_norm": 0.8441756486986386, "learning_rate": 9.005787250922285e-06, "loss": 0.0766, "step": 898 }, { "epoch": 0.8525367472735894, "grad_norm": 0.42966299233294036, "learning_rate": 9.002480595779883e-06, "loss": 0.0469, "step": 899 }, { "epoch": 0.8534850640113798, "grad_norm": 0.5779848432711783, "learning_rate": 8.999169060136609e-06, "loss": 0.0549, "step": 900 }, { "epoch": 0.8544333807491702, "grad_norm": 0.49828391414464324, "learning_rate": 8.995852648030444e-06, "loss": 0.0513, "step": 901 }, { "epoch": 0.8553816974869607, "grad_norm": 0.5712972033755797, "learning_rate": 8.99253136350532e-06, "loss": 0.0642, "step": 902 }, { "epoch": 0.8563300142247511, "grad_norm": 0.7463859566833713, "learning_rate": 8.989205210611106e-06, "loss": 0.0669, "step": 903 }, { "epoch": 0.8572783309625415, "grad_norm": 0.6015754760898006, "learning_rate": 8.98587419340361e-06, "loss": 0.0518, "step": 904 }, { "epoch": 0.8582266477003319, "grad_norm": 0.7279488477743896, "learning_rate": 8.982538315944573e-06, "loss": 0.0603, "step": 905 }, { "epoch": 0.8591749644381224, "grad_norm": 0.41210687518386613, "learning_rate": 8.979197582301662e-06, "loss": 0.0508, "step": 906 }, { "epoch": 0.8601232811759127, "grad_norm": 1.4900739335277513, "learning_rate": 8.97585199654846e-06, "loss": 0.072, "step": 907 }, { "epoch": 0.8610715979137031, "grad_norm": 0.5450963951689192, "learning_rate": 8.972501562764476e-06, "loss": 0.0566, "step": 908 }, { "epoch": 0.8620199146514936, "grad_norm": 0.5356916948533633, "learning_rate": 8.969146285035119e-06, "loss": 0.0471, "step": 909 }, { "epoch": 0.862968231389284, "grad_norm": 0.6064958608566305, "learning_rate": 8.965786167451713e-06, "loss": 0.0586, "step": 910 }, { "epoch": 0.8639165481270744, "grad_norm": 0.6550030676781202, "learning_rate": 8.962421214111486e-06, "loss": 0.0622, "step": 911 }, { "epoch": 0.8648648648648649, "grad_norm": 0.5789487697080219, "learning_rate": 8.959051429117551e-06, "loss": 0.0587, "step": 912 }, { "epoch": 0.8658131816026553, "grad_norm": 0.6480466907010984, "learning_rate": 8.955676816578922e-06, "loss": 0.0596, "step": 913 }, { "epoch": 0.8667614983404457, "grad_norm": 0.703037972481164, "learning_rate": 8.9522973806105e-06, "loss": 0.0836, "step": 914 }, { "epoch": 0.8677098150782361, "grad_norm": 0.49499510899266297, "learning_rate": 8.94891312533306e-06, "loss": 0.0493, "step": 915 }, { "epoch": 0.8686581318160266, "grad_norm": 0.4679737716122778, "learning_rate": 8.945524054873261e-06, "loss": 0.0473, "step": 916 }, { "epoch": 0.869606448553817, "grad_norm": 0.4868047238192127, "learning_rate": 8.942130173363628e-06, "loss": 0.0617, "step": 917 }, { "epoch": 0.8705547652916074, "grad_norm": 0.48143223119722567, "learning_rate": 8.938731484942557e-06, "loss": 0.0459, "step": 918 }, { "epoch": 0.8715030820293979, "grad_norm": 0.5109365563225756, "learning_rate": 8.935327993754307e-06, "loss": 0.0603, "step": 919 }, { "epoch": 0.8724513987671882, "grad_norm": 0.5946328530954544, "learning_rate": 8.931919703948981e-06, "loss": 0.0663, "step": 920 }, { "epoch": 0.8733997155049786, "grad_norm": 0.6675396299202498, "learning_rate": 8.928506619682549e-06, "loss": 0.0522, "step": 921 }, { "epoch": 0.8743480322427691, "grad_norm": 0.5242785281728278, "learning_rate": 8.925088745116817e-06, "loss": 0.0477, "step": 922 }, { "epoch": 0.8752963489805595, "grad_norm": 0.4607255100157249, "learning_rate": 8.921666084419435e-06, "loss": 0.0444, "step": 923 }, { "epoch": 0.8762446657183499, "grad_norm": 0.6127086410246447, "learning_rate": 8.918238641763894e-06, "loss": 0.0505, "step": 924 }, { "epoch": 0.8771929824561403, "grad_norm": 0.7108664485212953, "learning_rate": 8.914806421329505e-06, "loss": 0.0372, "step": 925 }, { "epoch": 0.8781412991939308, "grad_norm": 0.48171514690034495, "learning_rate": 8.911369427301418e-06, "loss": 0.0467, "step": 926 }, { "epoch": 0.8790896159317212, "grad_norm": 0.5032020795283936, "learning_rate": 8.907927663870592e-06, "loss": 0.0383, "step": 927 }, { "epoch": 0.8800379326695116, "grad_norm": 0.6490864569323296, "learning_rate": 8.90448113523381e-06, "loss": 0.0703, "step": 928 }, { "epoch": 0.8809862494073021, "grad_norm": 0.5274849878368799, "learning_rate": 8.901029845593658e-06, "loss": 0.0497, "step": 929 }, { "epoch": 0.8819345661450925, "grad_norm": 0.7209898569229573, "learning_rate": 8.897573799158534e-06, "loss": 0.0845, "step": 930 }, { "epoch": 0.8828828828828829, "grad_norm": 0.653701403062353, "learning_rate": 8.894113000142636e-06, "loss": 0.0528, "step": 931 }, { "epoch": 0.8838311996206734, "grad_norm": 0.5252034559155617, "learning_rate": 8.890647452765954e-06, "loss": 0.054, "step": 932 }, { "epoch": 0.8847795163584637, "grad_norm": 0.6597062824750437, "learning_rate": 8.887177161254267e-06, "loss": 0.0508, "step": 933 }, { "epoch": 0.8857278330962541, "grad_norm": 0.9841434864966624, "learning_rate": 8.883702129839144e-06, "loss": 0.06, "step": 934 }, { "epoch": 0.8866761498340445, "grad_norm": 0.4716559195813748, "learning_rate": 8.880222362757928e-06, "loss": 0.0484, "step": 935 }, { "epoch": 0.887624466571835, "grad_norm": 0.6275887169553205, "learning_rate": 8.87673786425374e-06, "loss": 0.055, "step": 936 }, { "epoch": 0.8885727833096254, "grad_norm": 0.5480616561224483, "learning_rate": 8.87324863857547e-06, "loss": 0.0512, "step": 937 }, { "epoch": 0.8895211000474158, "grad_norm": 0.5716073816122306, "learning_rate": 8.869754689977774e-06, "loss": 0.0575, "step": 938 }, { "epoch": 0.8904694167852063, "grad_norm": 0.8761043849726794, "learning_rate": 8.866256022721062e-06, "loss": 0.0508, "step": 939 }, { "epoch": 0.8914177335229967, "grad_norm": 0.7017157731117182, "learning_rate": 8.862752641071499e-06, "loss": 0.0546, "step": 940 }, { "epoch": 0.8923660502607871, "grad_norm": 1.5138916151321196, "learning_rate": 8.859244549301005e-06, "loss": 0.0658, "step": 941 }, { "epoch": 0.8933143669985776, "grad_norm": 0.8433261605133346, "learning_rate": 8.855731751687233e-06, "loss": 0.0553, "step": 942 }, { "epoch": 0.894262683736368, "grad_norm": 0.5494966721887847, "learning_rate": 8.852214252513582e-06, "loss": 0.0494, "step": 943 }, { "epoch": 0.8952110004741584, "grad_norm": 0.6006177701179363, "learning_rate": 8.848692056069184e-06, "loss": 0.0612, "step": 944 }, { "epoch": 0.8961593172119487, "grad_norm": 0.6876171031491582, "learning_rate": 8.84516516664889e-06, "loss": 0.0609, "step": 945 }, { "epoch": 0.8971076339497392, "grad_norm": 0.846588378426009, "learning_rate": 8.841633588553287e-06, "loss": 0.0593, "step": 946 }, { "epoch": 0.8980559506875296, "grad_norm": 1.175631640532978, "learning_rate": 8.838097326088667e-06, "loss": 0.0767, "step": 947 }, { "epoch": 0.89900426742532, "grad_norm": 0.7010270158444133, "learning_rate": 8.834556383567042e-06, "loss": 0.0637, "step": 948 }, { "epoch": 0.8999525841631105, "grad_norm": 0.7103962193756044, "learning_rate": 8.831010765306124e-06, "loss": 0.047, "step": 949 }, { "epoch": 0.9009009009009009, "grad_norm": 0.9919713077792982, "learning_rate": 8.827460475629334e-06, "loss": 0.0699, "step": 950 }, { "epoch": 0.9018492176386913, "grad_norm": 0.9438936607800321, "learning_rate": 8.823905518865782e-06, "loss": 0.0962, "step": 951 }, { "epoch": 0.9027975343764817, "grad_norm": 0.41357107371942303, "learning_rate": 8.820345899350275e-06, "loss": 0.0393, "step": 952 }, { "epoch": 0.9037458511142722, "grad_norm": 0.6094306471098007, "learning_rate": 8.8167816214233e-06, "loss": 0.0547, "step": 953 }, { "epoch": 0.9046941678520626, "grad_norm": 0.45434395748515616, "learning_rate": 8.81321268943103e-06, "loss": 0.0458, "step": 954 }, { "epoch": 0.905642484589853, "grad_norm": 0.584662000585842, "learning_rate": 8.809639107725308e-06, "loss": 0.0684, "step": 955 }, { "epoch": 0.9065908013276435, "grad_norm": 0.6281479664499341, "learning_rate": 8.80606088066365e-06, "loss": 0.0485, "step": 956 }, { "epoch": 0.9075391180654339, "grad_norm": 0.5220137398785665, "learning_rate": 8.802478012609235e-06, "loss": 0.0478, "step": 957 }, { "epoch": 0.9084874348032242, "grad_norm": 0.7613507347001472, "learning_rate": 8.798890507930899e-06, "loss": 0.0534, "step": 958 }, { "epoch": 0.9094357515410147, "grad_norm": 0.5338153539509801, "learning_rate": 8.795298371003138e-06, "loss": 0.0467, "step": 959 }, { "epoch": 0.9103840682788051, "grad_norm": 0.508435320780577, "learning_rate": 8.791701606206092e-06, "loss": 0.05, "step": 960 }, { "epoch": 0.9113323850165955, "grad_norm": 0.6801979027503147, "learning_rate": 8.788100217925541e-06, "loss": 0.0654, "step": 961 }, { "epoch": 0.9122807017543859, "grad_norm": 0.5472159955708181, "learning_rate": 8.78449421055291e-06, "loss": 0.0566, "step": 962 }, { "epoch": 0.9132290184921764, "grad_norm": 0.5546852372370231, "learning_rate": 8.78088358848525e-06, "loss": 0.0544, "step": 963 }, { "epoch": 0.9141773352299668, "grad_norm": 0.7376086419870055, "learning_rate": 8.777268356125244e-06, "loss": 0.0618, "step": 964 }, { "epoch": 0.9151256519677572, "grad_norm": 0.461174714622349, "learning_rate": 8.773648517881194e-06, "loss": 0.0527, "step": 965 }, { "epoch": 0.9160739687055477, "grad_norm": 1.100649311314461, "learning_rate": 8.770024078167017e-06, "loss": 0.075, "step": 966 }, { "epoch": 0.9170222854433381, "grad_norm": 0.5385193734337945, "learning_rate": 8.766395041402245e-06, "loss": 0.056, "step": 967 }, { "epoch": 0.9179706021811285, "grad_norm": 0.4215583451342763, "learning_rate": 8.762761412012011e-06, "loss": 0.045, "step": 968 }, { "epoch": 0.918918918918919, "grad_norm": 0.5690890175604749, "learning_rate": 8.75912319442705e-06, "loss": 0.0568, "step": 969 }, { "epoch": 0.9198672356567094, "grad_norm": 0.5598668678593514, "learning_rate": 8.755480393083694e-06, "loss": 0.0629, "step": 970 }, { "epoch": 0.9208155523944997, "grad_norm": 0.4230299561301444, "learning_rate": 8.751833012423861e-06, "loss": 0.0402, "step": 971 }, { "epoch": 0.9217638691322901, "grad_norm": 0.8504416588391118, "learning_rate": 8.74818105689505e-06, "loss": 0.0521, "step": 972 }, { "epoch": 0.9227121858700806, "grad_norm": 0.461086821346764, "learning_rate": 8.744524530950351e-06, "loss": 0.0426, "step": 973 }, { "epoch": 0.923660502607871, "grad_norm": 0.5086789755859074, "learning_rate": 8.740863439048412e-06, "loss": 0.0487, "step": 974 }, { "epoch": 0.9246088193456614, "grad_norm": 0.45915883182777006, "learning_rate": 8.737197785653457e-06, "loss": 0.0444, "step": 975 }, { "epoch": 0.9255571360834519, "grad_norm": 0.6701095989032753, "learning_rate": 8.73352757523527e-06, "loss": 0.0707, "step": 976 }, { "epoch": 0.9265054528212423, "grad_norm": 0.460793794881083, "learning_rate": 8.729852812269192e-06, "loss": 0.0462, "step": 977 }, { "epoch": 0.9274537695590327, "grad_norm": 0.42146552351647865, "learning_rate": 8.726173501236115e-06, "loss": 0.0413, "step": 978 }, { "epoch": 0.9284020862968232, "grad_norm": 0.4515670497285217, "learning_rate": 8.722489646622477e-06, "loss": 0.0486, "step": 979 }, { "epoch": 0.9293504030346136, "grad_norm": 0.7298661971153528, "learning_rate": 8.718801252920257e-06, "loss": 0.0728, "step": 980 }, { "epoch": 0.930298719772404, "grad_norm": 0.6123325398467794, "learning_rate": 8.715108324626967e-06, "loss": 0.0528, "step": 981 }, { "epoch": 0.9312470365101944, "grad_norm": 0.5334963078534037, "learning_rate": 8.711410866245648e-06, "loss": 0.0409, "step": 982 }, { "epoch": 0.9321953532479849, "grad_norm": 0.44851971952458897, "learning_rate": 8.70770888228487e-06, "loss": 0.0509, "step": 983 }, { "epoch": 0.9331436699857752, "grad_norm": 0.9770313333004932, "learning_rate": 8.704002377258714e-06, "loss": 0.0463, "step": 984 }, { "epoch": 0.9340919867235656, "grad_norm": 0.7370636377202378, "learning_rate": 8.700291355686779e-06, "loss": 0.0637, "step": 985 }, { "epoch": 0.9350403034613561, "grad_norm": 0.6070776528057518, "learning_rate": 8.69657582209417e-06, "loss": 0.0488, "step": 986 }, { "epoch": 0.9359886201991465, "grad_norm": 0.7278417266877663, "learning_rate": 8.692855781011494e-06, "loss": 0.0501, "step": 987 }, { "epoch": 0.9369369369369369, "grad_norm": 0.4731052806759658, "learning_rate": 8.689131236974853e-06, "loss": 0.0417, "step": 988 }, { "epoch": 0.9378852536747273, "grad_norm": 0.45598792555472306, "learning_rate": 8.68540219452584e-06, "loss": 0.0396, "step": 989 }, { "epoch": 0.9388335704125178, "grad_norm": 0.5661429908370399, "learning_rate": 8.681668658211535e-06, "loss": 0.0577, "step": 990 }, { "epoch": 0.9397818871503082, "grad_norm": 0.41955875165931145, "learning_rate": 8.677930632584496e-06, "loss": 0.0432, "step": 991 }, { "epoch": 0.9407302038880986, "grad_norm": 0.4107826749470781, "learning_rate": 8.674188122202756e-06, "loss": 0.0535, "step": 992 }, { "epoch": 0.9416785206258891, "grad_norm": 0.47653411892607034, "learning_rate": 8.670441131629816e-06, "loss": 0.0586, "step": 993 }, { "epoch": 0.9426268373636795, "grad_norm": 0.53171021829938, "learning_rate": 8.66668966543464e-06, "loss": 0.0518, "step": 994 }, { "epoch": 0.9435751541014699, "grad_norm": 0.43148473645836083, "learning_rate": 8.662933728191651e-06, "loss": 0.0431, "step": 995 }, { "epoch": 0.9445234708392604, "grad_norm": 0.4471351558402442, "learning_rate": 8.659173324480722e-06, "loss": 0.0438, "step": 996 }, { "epoch": 0.9454717875770507, "grad_norm": 0.5782265716940447, "learning_rate": 8.65540845888717e-06, "loss": 0.0719, "step": 997 }, { "epoch": 0.9464201043148411, "grad_norm": 0.4141433604011682, "learning_rate": 8.651639136001762e-06, "loss": 0.0469, "step": 998 }, { "epoch": 0.9473684210526315, "grad_norm": 0.6061219180547935, "learning_rate": 8.647865360420686e-06, "loss": 0.0489, "step": 999 }, { "epoch": 0.948316737790422, "grad_norm": 0.48916698447496854, "learning_rate": 8.644087136745572e-06, "loss": 0.0513, "step": 1000 }, { "epoch": 0.9492650545282124, "grad_norm": 0.37441669864478105, "learning_rate": 8.640304469583469e-06, "loss": 0.0412, "step": 1001 }, { "epoch": 0.9502133712660028, "grad_norm": 1.0623152293680482, "learning_rate": 8.636517363546838e-06, "loss": 0.0655, "step": 1002 }, { "epoch": 0.9511616880037933, "grad_norm": 0.7061581986197312, "learning_rate": 8.63272582325357e-06, "loss": 0.0499, "step": 1003 }, { "epoch": 0.9521100047415837, "grad_norm": 0.5399127227606683, "learning_rate": 8.62892985332694e-06, "loss": 0.0518, "step": 1004 }, { "epoch": 0.9530583214793741, "grad_norm": 0.4559892605058489, "learning_rate": 8.625129458395643e-06, "loss": 0.0459, "step": 1005 }, { "epoch": 0.9540066382171646, "grad_norm": 0.485355373272851, "learning_rate": 8.621324643093762e-06, "loss": 0.0454, "step": 1006 }, { "epoch": 0.954954954954955, "grad_norm": 0.7459047370537332, "learning_rate": 8.617515412060771e-06, "loss": 0.06, "step": 1007 }, { "epoch": 0.9559032716927454, "grad_norm": 0.7374476556281685, "learning_rate": 8.613701769941526e-06, "loss": 0.0677, "step": 1008 }, { "epoch": 0.9568515884305357, "grad_norm": 0.5640575902917073, "learning_rate": 8.609883721386266e-06, "loss": 0.0464, "step": 1009 }, { "epoch": 0.9577999051683262, "grad_norm": 0.5329518829334081, "learning_rate": 8.606061271050601e-06, "loss": 0.0422, "step": 1010 }, { "epoch": 0.9587482219061166, "grad_norm": 0.5672285885118362, "learning_rate": 8.602234423595509e-06, "loss": 0.0432, "step": 1011 }, { "epoch": 0.959696538643907, "grad_norm": 0.49279890911522445, "learning_rate": 8.598403183687328e-06, "loss": 0.0411, "step": 1012 }, { "epoch": 0.9606448553816975, "grad_norm": 0.5195118583178678, "learning_rate": 8.594567555997755e-06, "loss": 0.0575, "step": 1013 }, { "epoch": 0.9615931721194879, "grad_norm": 1.372925234445775, "learning_rate": 8.590727545203833e-06, "loss": 0.0615, "step": 1014 }, { "epoch": 0.9625414888572783, "grad_norm": 0.7147315054833345, "learning_rate": 8.586883155987955e-06, "loss": 0.0712, "step": 1015 }, { "epoch": 0.9634898055950688, "grad_norm": 0.5802509696174448, "learning_rate": 8.583034393037848e-06, "loss": 0.0552, "step": 1016 }, { "epoch": 0.9644381223328592, "grad_norm": 0.49007583048635933, "learning_rate": 8.579181261046576e-06, "loss": 0.0449, "step": 1017 }, { "epoch": 0.9653864390706496, "grad_norm": 0.48751614831454176, "learning_rate": 8.57532376471253e-06, "loss": 0.0475, "step": 1018 }, { "epoch": 0.96633475580844, "grad_norm": 0.6496160692100631, "learning_rate": 8.571461908739415e-06, "loss": 0.0523, "step": 1019 }, { "epoch": 0.9672830725462305, "grad_norm": 0.481345745516473, "learning_rate": 8.567595697836266e-06, "loss": 0.0515, "step": 1020 }, { "epoch": 0.9682313892840209, "grad_norm": 0.5247818144993567, "learning_rate": 8.563725136717419e-06, "loss": 0.0494, "step": 1021 }, { "epoch": 0.9691797060218112, "grad_norm": 0.8474516614825078, "learning_rate": 8.559850230102513e-06, "loss": 0.0578, "step": 1022 }, { "epoch": 0.9701280227596017, "grad_norm": 0.7494686751693889, "learning_rate": 8.555970982716492e-06, "loss": 0.0613, "step": 1023 }, { "epoch": 0.9710763394973921, "grad_norm": 0.528161959351856, "learning_rate": 8.55208739928959e-06, "loss": 0.0446, "step": 1024 }, { "epoch": 0.9720246562351825, "grad_norm": 0.7556057248494816, "learning_rate": 8.54819948455733e-06, "loss": 0.0611, "step": 1025 }, { "epoch": 0.972972972972973, "grad_norm": 0.4857201457975449, "learning_rate": 8.54430724326051e-06, "loss": 0.0396, "step": 1026 }, { "epoch": 0.9739212897107634, "grad_norm": 0.4633933638270801, "learning_rate": 8.540410680145213e-06, "loss": 0.045, "step": 1027 }, { "epoch": 0.9748696064485538, "grad_norm": 0.5215732727679809, "learning_rate": 8.536509799962784e-06, "loss": 0.047, "step": 1028 }, { "epoch": 0.9758179231863442, "grad_norm": 1.5449712519877792, "learning_rate": 8.532604607469839e-06, "loss": 0.0717, "step": 1029 }, { "epoch": 0.9767662399241347, "grad_norm": 0.46693259860172376, "learning_rate": 8.528695107428247e-06, "loss": 0.0458, "step": 1030 }, { "epoch": 0.9777145566619251, "grad_norm": 0.5388054089062692, "learning_rate": 8.52478130460513e-06, "loss": 0.047, "step": 1031 }, { "epoch": 0.9786628733997155, "grad_norm": 0.5283181708144433, "learning_rate": 8.520863203772858e-06, "loss": 0.0496, "step": 1032 }, { "epoch": 0.979611190137506, "grad_norm": 0.5890035811704775, "learning_rate": 8.516940809709044e-06, "loss": 0.0437, "step": 1033 }, { "epoch": 0.9805595068752964, "grad_norm": 0.446739345865473, "learning_rate": 8.513014127196533e-06, "loss": 0.042, "step": 1034 }, { "epoch": 0.9815078236130867, "grad_norm": 0.49851759898580866, "learning_rate": 8.509083161023399e-06, "loss": 0.0553, "step": 1035 }, { "epoch": 0.9824561403508771, "grad_norm": 0.8986990099986447, "learning_rate": 8.505147915982943e-06, "loss": 0.0491, "step": 1036 }, { "epoch": 0.9834044570886676, "grad_norm": 0.4813313700157437, "learning_rate": 8.501208396873677e-06, "loss": 0.0524, "step": 1037 }, { "epoch": 0.984352773826458, "grad_norm": 0.7823009578163489, "learning_rate": 8.497264608499332e-06, "loss": 0.0542, "step": 1038 }, { "epoch": 0.9853010905642484, "grad_norm": 0.5256393060960738, "learning_rate": 8.49331655566884e-06, "loss": 0.0545, "step": 1039 }, { "epoch": 0.9862494073020389, "grad_norm": 0.5400471979930811, "learning_rate": 8.489364243196334e-06, "loss": 0.0495, "step": 1040 }, { "epoch": 0.9871977240398293, "grad_norm": 0.5862041954662611, "learning_rate": 8.485407675901142e-06, "loss": 0.0442, "step": 1041 }, { "epoch": 0.9881460407776197, "grad_norm": 0.6834922008296388, "learning_rate": 8.48144685860778e-06, "loss": 0.064, "step": 1042 }, { "epoch": 0.9890943575154102, "grad_norm": 0.8002369541010694, "learning_rate": 8.477481796145945e-06, "loss": 0.0464, "step": 1043 }, { "epoch": 0.9900426742532006, "grad_norm": 0.47393154077930216, "learning_rate": 8.47351249335051e-06, "loss": 0.0485, "step": 1044 }, { "epoch": 0.990990990990991, "grad_norm": 0.4987272807246751, "learning_rate": 8.469538955061525e-06, "loss": 0.0478, "step": 1045 }, { "epoch": 0.9919393077287814, "grad_norm": 0.6406968710094035, "learning_rate": 8.465561186124193e-06, "loss": 0.0494, "step": 1046 }, { "epoch": 0.9928876244665719, "grad_norm": 0.5319476049591959, "learning_rate": 8.46157919138889e-06, "loss": 0.038, "step": 1047 }, { "epoch": 0.9938359412043622, "grad_norm": 0.5377926003236448, "learning_rate": 8.457592975711128e-06, "loss": 0.0415, "step": 1048 }, { "epoch": 0.9947842579421526, "grad_norm": 0.5054973123174826, "learning_rate": 8.45360254395158e-06, "loss": 0.0509, "step": 1049 }, { "epoch": 0.9957325746799431, "grad_norm": 0.6511826899131821, "learning_rate": 8.449607900976056e-06, "loss": 0.0496, "step": 1050 }, { "epoch": 0.9966808914177335, "grad_norm": 0.34335574918053036, "learning_rate": 8.445609051655497e-06, "loss": 0.0322, "step": 1051 }, { "epoch": 0.9976292081555239, "grad_norm": 0.5324023086103392, "learning_rate": 8.441606000865978e-06, "loss": 0.0465, "step": 1052 }, { "epoch": 0.9985775248933144, "grad_norm": 0.3971741987281817, "learning_rate": 8.437598753488693e-06, "loss": 0.0316, "step": 1053 }, { "epoch": 0.9995258416311048, "grad_norm": 0.4702644191912913, "learning_rate": 8.43358731440996e-06, "loss": 0.0424, "step": 1054 }, { "epoch": 0.9995258416311048, "eval_loss": 0.05579984560608864, "eval_runtime": 205.6016, "eval_samples_per_second": 34.547, "eval_steps_per_second": 1.08, "step": 1054 } ], "logging_steps": 1, "max_steps": 3162, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.8007437509892506e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }