lillian039's picture
Upload folder using huggingface_hub
d92e353 verified
raw
history blame contribute delete
No virus
183 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995258416311048,
"eval_steps": 500,
"global_step": 1054,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000948316737790422,
"grad_norm": 12.246086019020522,
"learning_rate": 3.1545741324921134e-08,
"loss": 0.3675,
"step": 1
},
{
"epoch": 0.001896633475580844,
"grad_norm": 11.694026905196086,
"learning_rate": 6.309148264984227e-08,
"loss": 0.3513,
"step": 2
},
{
"epoch": 0.002844950213371266,
"grad_norm": 10.688899348005608,
"learning_rate": 9.463722397476342e-08,
"loss": 0.3177,
"step": 3
},
{
"epoch": 0.003793266951161688,
"grad_norm": 11.921298417211082,
"learning_rate": 1.2618296529968454e-07,
"loss": 0.3556,
"step": 4
},
{
"epoch": 0.00474158368895211,
"grad_norm": 11.282902382225787,
"learning_rate": 1.5772870662460568e-07,
"loss": 0.3662,
"step": 5
},
{
"epoch": 0.005689900426742532,
"grad_norm": 12.449826286939778,
"learning_rate": 1.8927444794952683e-07,
"loss": 0.3546,
"step": 6
},
{
"epoch": 0.006638217164532954,
"grad_norm": 11.727097480206721,
"learning_rate": 2.2082018927444798e-07,
"loss": 0.3635,
"step": 7
},
{
"epoch": 0.007586533902323376,
"grad_norm": 11.441751312661113,
"learning_rate": 2.5236593059936907e-07,
"loss": 0.3636,
"step": 8
},
{
"epoch": 0.008534850640113799,
"grad_norm": 10.632930884848795,
"learning_rate": 2.8391167192429027e-07,
"loss": 0.2923,
"step": 9
},
{
"epoch": 0.00948316737790422,
"grad_norm": 11.025857208188647,
"learning_rate": 3.1545741324921137e-07,
"loss": 0.3449,
"step": 10
},
{
"epoch": 0.010431484115694643,
"grad_norm": 11.86359857266447,
"learning_rate": 3.470031545741325e-07,
"loss": 0.3354,
"step": 11
},
{
"epoch": 0.011379800853485065,
"grad_norm": 11.01751351812872,
"learning_rate": 3.7854889589905366e-07,
"loss": 0.3369,
"step": 12
},
{
"epoch": 0.012328117591275486,
"grad_norm": 9.502190495628849,
"learning_rate": 4.100946372239748e-07,
"loss": 0.3179,
"step": 13
},
{
"epoch": 0.013276434329065908,
"grad_norm": 7.858408977040966,
"learning_rate": 4.4164037854889596e-07,
"loss": 0.2565,
"step": 14
},
{
"epoch": 0.01422475106685633,
"grad_norm": 8.154333698814211,
"learning_rate": 4.7318611987381705e-07,
"loss": 0.2589,
"step": 15
},
{
"epoch": 0.015173067804646752,
"grad_norm": 8.475444781638856,
"learning_rate": 5.047318611987381e-07,
"loss": 0.3001,
"step": 16
},
{
"epoch": 0.016121384542437174,
"grad_norm": 7.13737669899092,
"learning_rate": 5.362776025236594e-07,
"loss": 0.2641,
"step": 17
},
{
"epoch": 0.017069701280227598,
"grad_norm": 5.554684184392653,
"learning_rate": 5.678233438485805e-07,
"loss": 0.1902,
"step": 18
},
{
"epoch": 0.018018018018018018,
"grad_norm": 4.568211300813283,
"learning_rate": 5.993690851735017e-07,
"loss": 0.208,
"step": 19
},
{
"epoch": 0.01896633475580844,
"grad_norm": 4.7579569152913646,
"learning_rate": 6.309148264984227e-07,
"loss": 0.1994,
"step": 20
},
{
"epoch": 0.01991465149359886,
"grad_norm": 4.7128465676673486,
"learning_rate": 6.62460567823344e-07,
"loss": 0.229,
"step": 21
},
{
"epoch": 0.020862968231389285,
"grad_norm": 4.005405411985473,
"learning_rate": 6.94006309148265e-07,
"loss": 0.2095,
"step": 22
},
{
"epoch": 0.021811284969179705,
"grad_norm": 4.676075338959145,
"learning_rate": 7.255520504731863e-07,
"loss": 0.2178,
"step": 23
},
{
"epoch": 0.02275960170697013,
"grad_norm": 2.3652635706654435,
"learning_rate": 7.570977917981073e-07,
"loss": 0.1524,
"step": 24
},
{
"epoch": 0.02370791844476055,
"grad_norm": 2.685337556789167,
"learning_rate": 7.886435331230284e-07,
"loss": 0.1672,
"step": 25
},
{
"epoch": 0.024656235182550973,
"grad_norm": 2.430942973848189,
"learning_rate": 8.201892744479496e-07,
"loss": 0.1526,
"step": 26
},
{
"epoch": 0.025604551920341393,
"grad_norm": 2.6467399445694286,
"learning_rate": 8.517350157728707e-07,
"loss": 0.1605,
"step": 27
},
{
"epoch": 0.026552868658131817,
"grad_norm": 1.9805248826128374,
"learning_rate": 8.832807570977919e-07,
"loss": 0.1223,
"step": 28
},
{
"epoch": 0.027501185395922237,
"grad_norm": 2.2695664454959785,
"learning_rate": 9.148264984227131e-07,
"loss": 0.1991,
"step": 29
},
{
"epoch": 0.02844950213371266,
"grad_norm": 2.221333597086963,
"learning_rate": 9.463722397476341e-07,
"loss": 0.1466,
"step": 30
},
{
"epoch": 0.02939781887150308,
"grad_norm": 2.4412316593782633,
"learning_rate": 9.779179810725552e-07,
"loss": 0.1757,
"step": 31
},
{
"epoch": 0.030346135609293504,
"grad_norm": 2.3894901293863198,
"learning_rate": 1.0094637223974763e-06,
"loss": 0.1467,
"step": 32
},
{
"epoch": 0.031294452347083924,
"grad_norm": 2.2254181707911003,
"learning_rate": 1.0410094637223975e-06,
"loss": 0.1403,
"step": 33
},
{
"epoch": 0.03224276908487435,
"grad_norm": 2.0835670435267573,
"learning_rate": 1.0725552050473188e-06,
"loss": 0.1295,
"step": 34
},
{
"epoch": 0.03319108582266477,
"grad_norm": 2.490534255553767,
"learning_rate": 1.1041009463722398e-06,
"loss": 0.1781,
"step": 35
},
{
"epoch": 0.034139402560455195,
"grad_norm": 2.4852979753797526,
"learning_rate": 1.135646687697161e-06,
"loss": 0.1635,
"step": 36
},
{
"epoch": 0.03508771929824561,
"grad_norm": 2.1821267836376093,
"learning_rate": 1.1671924290220821e-06,
"loss": 0.1536,
"step": 37
},
{
"epoch": 0.036036036036036036,
"grad_norm": 2.0995981360286193,
"learning_rate": 1.1987381703470034e-06,
"loss": 0.1892,
"step": 38
},
{
"epoch": 0.03698435277382646,
"grad_norm": 1.7671849396147046,
"learning_rate": 1.2302839116719244e-06,
"loss": 0.1441,
"step": 39
},
{
"epoch": 0.03793266951161688,
"grad_norm": 1.9145592423222202,
"learning_rate": 1.2618296529968455e-06,
"loss": 0.1515,
"step": 40
},
{
"epoch": 0.0388809862494073,
"grad_norm": 2.909485299628588,
"learning_rate": 1.2933753943217667e-06,
"loss": 0.176,
"step": 41
},
{
"epoch": 0.03982930298719772,
"grad_norm": 1.6871888183428478,
"learning_rate": 1.324921135646688e-06,
"loss": 0.1215,
"step": 42
},
{
"epoch": 0.04077761972498815,
"grad_norm": 1.5396564532901138,
"learning_rate": 1.3564668769716088e-06,
"loss": 0.1334,
"step": 43
},
{
"epoch": 0.04172593646277857,
"grad_norm": 2.205033481070129,
"learning_rate": 1.38801261829653e-06,
"loss": 0.1397,
"step": 44
},
{
"epoch": 0.04267425320056899,
"grad_norm": 1.8497757762613358,
"learning_rate": 1.4195583596214513e-06,
"loss": 0.1274,
"step": 45
},
{
"epoch": 0.04362256993835941,
"grad_norm": 1.9376215434540043,
"learning_rate": 1.4511041009463726e-06,
"loss": 0.1228,
"step": 46
},
{
"epoch": 0.044570886676149835,
"grad_norm": 1.594889970345864,
"learning_rate": 1.4826498422712934e-06,
"loss": 0.1137,
"step": 47
},
{
"epoch": 0.04551920341394026,
"grad_norm": 1.7592570423176281,
"learning_rate": 1.5141955835962146e-06,
"loss": 0.141,
"step": 48
},
{
"epoch": 0.046467520151730675,
"grad_norm": 1.6146283602515956,
"learning_rate": 1.545741324921136e-06,
"loss": 0.1428,
"step": 49
},
{
"epoch": 0.0474158368895211,
"grad_norm": 1.503278573378982,
"learning_rate": 1.5772870662460567e-06,
"loss": 0.1318,
"step": 50
},
{
"epoch": 0.04836415362731152,
"grad_norm": 1.37572777178569,
"learning_rate": 1.608832807570978e-06,
"loss": 0.1315,
"step": 51
},
{
"epoch": 0.049312470365101946,
"grad_norm": 1.6002275154635794,
"learning_rate": 1.6403785488958992e-06,
"loss": 0.0935,
"step": 52
},
{
"epoch": 0.05026078710289237,
"grad_norm": 1.9567696662008847,
"learning_rate": 1.6719242902208203e-06,
"loss": 0.1271,
"step": 53
},
{
"epoch": 0.051209103840682786,
"grad_norm": 1.601626063178932,
"learning_rate": 1.7034700315457413e-06,
"loss": 0.0959,
"step": 54
},
{
"epoch": 0.05215742057847321,
"grad_norm": 1.886431535590579,
"learning_rate": 1.7350157728706626e-06,
"loss": 0.1218,
"step": 55
},
{
"epoch": 0.05310573731626363,
"grad_norm": 1.5354494166136305,
"learning_rate": 1.7665615141955838e-06,
"loss": 0.1139,
"step": 56
},
{
"epoch": 0.05405405405405406,
"grad_norm": 2.311230053300576,
"learning_rate": 1.7981072555205049e-06,
"loss": 0.1426,
"step": 57
},
{
"epoch": 0.055002370791844474,
"grad_norm": 1.6253071180005185,
"learning_rate": 1.8296529968454261e-06,
"loss": 0.1175,
"step": 58
},
{
"epoch": 0.0559506875296349,
"grad_norm": 1.3821063491809322,
"learning_rate": 1.8611987381703472e-06,
"loss": 0.132,
"step": 59
},
{
"epoch": 0.05689900426742532,
"grad_norm": 1.7624392868013044,
"learning_rate": 1.8927444794952682e-06,
"loss": 0.1221,
"step": 60
},
{
"epoch": 0.057847321005215745,
"grad_norm": 1.3398437874784876,
"learning_rate": 1.9242902208201892e-06,
"loss": 0.125,
"step": 61
},
{
"epoch": 0.05879563774300616,
"grad_norm": 1.562570182505017,
"learning_rate": 1.9558359621451105e-06,
"loss": 0.1413,
"step": 62
},
{
"epoch": 0.059743954480796585,
"grad_norm": 1.6769755616188486,
"learning_rate": 1.9873817034700317e-06,
"loss": 0.1559,
"step": 63
},
{
"epoch": 0.06069227121858701,
"grad_norm": 1.3917364499829268,
"learning_rate": 2.0189274447949526e-06,
"loss": 0.1377,
"step": 64
},
{
"epoch": 0.06164058795637743,
"grad_norm": 1.8502674559797263,
"learning_rate": 2.050473186119874e-06,
"loss": 0.1487,
"step": 65
},
{
"epoch": 0.06258890469416785,
"grad_norm": 3.158783977874437,
"learning_rate": 2.082018927444795e-06,
"loss": 0.119,
"step": 66
},
{
"epoch": 0.06353722143195828,
"grad_norm": 1.811584236109641,
"learning_rate": 2.1135646687697163e-06,
"loss": 0.122,
"step": 67
},
{
"epoch": 0.0644855381697487,
"grad_norm": 2.917344328319794,
"learning_rate": 2.1451104100946376e-06,
"loss": 0.1313,
"step": 68
},
{
"epoch": 0.06543385490753911,
"grad_norm": 1.8029019845335916,
"learning_rate": 2.1766561514195584e-06,
"loss": 0.1138,
"step": 69
},
{
"epoch": 0.06638217164532954,
"grad_norm": 1.6898543330406532,
"learning_rate": 2.2082018927444797e-06,
"loss": 0.1191,
"step": 70
},
{
"epoch": 0.06733048838311996,
"grad_norm": 2.2925732127308214,
"learning_rate": 2.239747634069401e-06,
"loss": 0.1306,
"step": 71
},
{
"epoch": 0.06827880512091039,
"grad_norm": 1.4433490292568716,
"learning_rate": 2.271293375394322e-06,
"loss": 0.1055,
"step": 72
},
{
"epoch": 0.06922712185870081,
"grad_norm": 1.3862506183642664,
"learning_rate": 2.302839116719243e-06,
"loss": 0.1075,
"step": 73
},
{
"epoch": 0.07017543859649122,
"grad_norm": 1.2816575561632197,
"learning_rate": 2.3343848580441643e-06,
"loss": 0.1028,
"step": 74
},
{
"epoch": 0.07112375533428165,
"grad_norm": 1.893923472034316,
"learning_rate": 2.3659305993690855e-06,
"loss": 0.1011,
"step": 75
},
{
"epoch": 0.07207207207207207,
"grad_norm": 1.6025824634915868,
"learning_rate": 2.3974763406940068e-06,
"loss": 0.1317,
"step": 76
},
{
"epoch": 0.07302038880986249,
"grad_norm": 1.7176261068301808,
"learning_rate": 2.4290220820189276e-06,
"loss": 0.1447,
"step": 77
},
{
"epoch": 0.07396870554765292,
"grad_norm": 2.4231160050612863,
"learning_rate": 2.460567823343849e-06,
"loss": 0.1384,
"step": 78
},
{
"epoch": 0.07491702228544334,
"grad_norm": 1.2193411546548798,
"learning_rate": 2.49211356466877e-06,
"loss": 0.0992,
"step": 79
},
{
"epoch": 0.07586533902323377,
"grad_norm": 1.5164983059809367,
"learning_rate": 2.523659305993691e-06,
"loss": 0.1001,
"step": 80
},
{
"epoch": 0.07681365576102418,
"grad_norm": 1.6017905795769134,
"learning_rate": 2.5552050473186126e-06,
"loss": 0.1048,
"step": 81
},
{
"epoch": 0.0777619724988146,
"grad_norm": 1.1836761079302904,
"learning_rate": 2.5867507886435334e-06,
"loss": 0.0982,
"step": 82
},
{
"epoch": 0.07871028923660503,
"grad_norm": 3.3493572839513566,
"learning_rate": 2.6182965299684543e-06,
"loss": 0.1184,
"step": 83
},
{
"epoch": 0.07965860597439545,
"grad_norm": 2.0313461386241722,
"learning_rate": 2.649842271293376e-06,
"loss": 0.1174,
"step": 84
},
{
"epoch": 0.08060692271218586,
"grad_norm": 1.7152579326543271,
"learning_rate": 2.6813880126182968e-06,
"loss": 0.117,
"step": 85
},
{
"epoch": 0.0815552394499763,
"grad_norm": 1.9082671591126898,
"learning_rate": 2.7129337539432176e-06,
"loss": 0.1538,
"step": 86
},
{
"epoch": 0.08250355618776671,
"grad_norm": 1.1544236926306861,
"learning_rate": 2.7444794952681393e-06,
"loss": 0.0906,
"step": 87
},
{
"epoch": 0.08345187292555714,
"grad_norm": 1.2516823902614436,
"learning_rate": 2.77602523659306e-06,
"loss": 0.1452,
"step": 88
},
{
"epoch": 0.08440018966334756,
"grad_norm": 1.0339206815219761,
"learning_rate": 2.807570977917981e-06,
"loss": 0.0836,
"step": 89
},
{
"epoch": 0.08534850640113797,
"grad_norm": 1.668394516826565,
"learning_rate": 2.8391167192429026e-06,
"loss": 0.1129,
"step": 90
},
{
"epoch": 0.0862968231389284,
"grad_norm": 1.497152077632149,
"learning_rate": 2.8706624605678234e-06,
"loss": 0.1062,
"step": 91
},
{
"epoch": 0.08724513987671882,
"grad_norm": 1.197731872894548,
"learning_rate": 2.902208201892745e-06,
"loss": 0.1102,
"step": 92
},
{
"epoch": 0.08819345661450925,
"grad_norm": 1.4271367098608596,
"learning_rate": 2.933753943217666e-06,
"loss": 0.1175,
"step": 93
},
{
"epoch": 0.08914177335229967,
"grad_norm": 1.6337936419255448,
"learning_rate": 2.9652996845425868e-06,
"loss": 0.1148,
"step": 94
},
{
"epoch": 0.09009009009009009,
"grad_norm": 1.7427850789821318,
"learning_rate": 2.9968454258675085e-06,
"loss": 0.1246,
"step": 95
},
{
"epoch": 0.09103840682788052,
"grad_norm": 1.2870967429199511,
"learning_rate": 3.0283911671924293e-06,
"loss": 0.087,
"step": 96
},
{
"epoch": 0.09198672356567093,
"grad_norm": 1.1311991406490958,
"learning_rate": 3.05993690851735e-06,
"loss": 0.0996,
"step": 97
},
{
"epoch": 0.09293504030346135,
"grad_norm": 1.3742928108454626,
"learning_rate": 3.091482649842272e-06,
"loss": 0.1293,
"step": 98
},
{
"epoch": 0.09388335704125178,
"grad_norm": 1.516028333204866,
"learning_rate": 3.1230283911671926e-06,
"loss": 0.1078,
"step": 99
},
{
"epoch": 0.0948316737790422,
"grad_norm": 1.1404699421620854,
"learning_rate": 3.1545741324921135e-06,
"loss": 0.1142,
"step": 100
},
{
"epoch": 0.09577999051683263,
"grad_norm": 1.7924070029612504,
"learning_rate": 3.186119873817035e-06,
"loss": 0.1419,
"step": 101
},
{
"epoch": 0.09672830725462304,
"grad_norm": 1.7297435544466835,
"learning_rate": 3.217665615141956e-06,
"loss": 0.117,
"step": 102
},
{
"epoch": 0.09767662399241346,
"grad_norm": 1.2304625316537265,
"learning_rate": 3.2492113564668772e-06,
"loss": 0.0834,
"step": 103
},
{
"epoch": 0.09862494073020389,
"grad_norm": 1.6554297434059837,
"learning_rate": 3.2807570977917985e-06,
"loss": 0.1251,
"step": 104
},
{
"epoch": 0.09957325746799431,
"grad_norm": 1.9749022078409877,
"learning_rate": 3.3123028391167193e-06,
"loss": 0.1485,
"step": 105
},
{
"epoch": 0.10052157420578474,
"grad_norm": 1.7816458729316766,
"learning_rate": 3.3438485804416405e-06,
"loss": 0.1168,
"step": 106
},
{
"epoch": 0.10146989094357516,
"grad_norm": 1.6366795934026652,
"learning_rate": 3.375394321766562e-06,
"loss": 0.1221,
"step": 107
},
{
"epoch": 0.10241820768136557,
"grad_norm": 1.0846701931516913,
"learning_rate": 3.4069400630914826e-06,
"loss": 0.086,
"step": 108
},
{
"epoch": 0.103366524419156,
"grad_norm": 1.374499027535318,
"learning_rate": 3.4384858044164043e-06,
"loss": 0.1097,
"step": 109
},
{
"epoch": 0.10431484115694642,
"grad_norm": 1.7733166976712489,
"learning_rate": 3.470031545741325e-06,
"loss": 0.1098,
"step": 110
},
{
"epoch": 0.10526315789473684,
"grad_norm": 2.980678296553409,
"learning_rate": 3.5015772870662464e-06,
"loss": 0.0917,
"step": 111
},
{
"epoch": 0.10621147463252727,
"grad_norm": 0.9904577188744437,
"learning_rate": 3.5331230283911676e-06,
"loss": 0.0777,
"step": 112
},
{
"epoch": 0.10715979137031768,
"grad_norm": 1.4107631975145143,
"learning_rate": 3.5646687697160885e-06,
"loss": 0.0902,
"step": 113
},
{
"epoch": 0.10810810810810811,
"grad_norm": 1.786967710835369,
"learning_rate": 3.5962145110410097e-06,
"loss": 0.0974,
"step": 114
},
{
"epoch": 0.10905642484589853,
"grad_norm": 1.6278373703409408,
"learning_rate": 3.627760252365931e-06,
"loss": 0.1031,
"step": 115
},
{
"epoch": 0.11000474158368895,
"grad_norm": 1.0856958566494381,
"learning_rate": 3.6593059936908522e-06,
"loss": 0.0872,
"step": 116
},
{
"epoch": 0.11095305832147938,
"grad_norm": 1.0746142572780863,
"learning_rate": 3.690851735015773e-06,
"loss": 0.0753,
"step": 117
},
{
"epoch": 0.1119013750592698,
"grad_norm": 1.794687772916648,
"learning_rate": 3.7223974763406943e-06,
"loss": 0.094,
"step": 118
},
{
"epoch": 0.11284969179706021,
"grad_norm": 2.0574961246450543,
"learning_rate": 3.7539432176656156e-06,
"loss": 0.1032,
"step": 119
},
{
"epoch": 0.11379800853485064,
"grad_norm": 1.0887603543641189,
"learning_rate": 3.7854889589905364e-06,
"loss": 0.0869,
"step": 120
},
{
"epoch": 0.11474632527264106,
"grad_norm": 1.4381581196511768,
"learning_rate": 3.817034700315458e-06,
"loss": 0.105,
"step": 121
},
{
"epoch": 0.11569464201043149,
"grad_norm": 2.0884869908112984,
"learning_rate": 3.8485804416403785e-06,
"loss": 0.1072,
"step": 122
},
{
"epoch": 0.1166429587482219,
"grad_norm": 1.3918016525882038,
"learning_rate": 3.8801261829653e-06,
"loss": 0.0995,
"step": 123
},
{
"epoch": 0.11759127548601232,
"grad_norm": 1.1199618265144746,
"learning_rate": 3.911671924290221e-06,
"loss": 0.0693,
"step": 124
},
{
"epoch": 0.11853959222380275,
"grad_norm": 2.913976726787567,
"learning_rate": 3.943217665615142e-06,
"loss": 0.1203,
"step": 125
},
{
"epoch": 0.11948790896159317,
"grad_norm": 1.4548880466216083,
"learning_rate": 3.9747634069400635e-06,
"loss": 0.0891,
"step": 126
},
{
"epoch": 0.1204362256993836,
"grad_norm": 3.1711394720986235,
"learning_rate": 4.006309148264985e-06,
"loss": 0.1223,
"step": 127
},
{
"epoch": 0.12138454243717402,
"grad_norm": 1.888765811166245,
"learning_rate": 4.037854889589905e-06,
"loss": 0.13,
"step": 128
},
{
"epoch": 0.12233285917496443,
"grad_norm": 1.2398551211997078,
"learning_rate": 4.069400630914827e-06,
"loss": 0.1103,
"step": 129
},
{
"epoch": 0.12328117591275486,
"grad_norm": 1.7438322556304724,
"learning_rate": 4.100946372239748e-06,
"loss": 0.1147,
"step": 130
},
{
"epoch": 0.12422949265054528,
"grad_norm": 0.9363387889716617,
"learning_rate": 4.132492113564669e-06,
"loss": 0.0995,
"step": 131
},
{
"epoch": 0.1251778093883357,
"grad_norm": 1.446859084810851,
"learning_rate": 4.16403785488959e-06,
"loss": 0.0994,
"step": 132
},
{
"epoch": 0.12612612612612611,
"grad_norm": 1.1856203072681963,
"learning_rate": 4.195583596214511e-06,
"loss": 0.0927,
"step": 133
},
{
"epoch": 0.12707444286391656,
"grad_norm": 1.103336827372462,
"learning_rate": 4.227129337539433e-06,
"loss": 0.0815,
"step": 134
},
{
"epoch": 0.12802275960170698,
"grad_norm": 1.897384655096208,
"learning_rate": 4.258675078864354e-06,
"loss": 0.1248,
"step": 135
},
{
"epoch": 0.1289710763394974,
"grad_norm": 1.6223901695891558,
"learning_rate": 4.290220820189275e-06,
"loss": 0.1456,
"step": 136
},
{
"epoch": 0.1299193930772878,
"grad_norm": 1.93689861193564,
"learning_rate": 4.321766561514196e-06,
"loss": 0.1236,
"step": 137
},
{
"epoch": 0.13086770981507823,
"grad_norm": 1.7202395942479507,
"learning_rate": 4.353312302839117e-06,
"loss": 0.0994,
"step": 138
},
{
"epoch": 0.13181602655286867,
"grad_norm": 2.1336251410837717,
"learning_rate": 4.384858044164038e-06,
"loss": 0.0963,
"step": 139
},
{
"epoch": 0.1327643432906591,
"grad_norm": 2.086908025505944,
"learning_rate": 4.416403785488959e-06,
"loss": 0.1397,
"step": 140
},
{
"epoch": 0.1337126600284495,
"grad_norm": 1.903049841336412,
"learning_rate": 4.447949526813881e-06,
"loss": 0.1188,
"step": 141
},
{
"epoch": 0.13466097676623992,
"grad_norm": 1.237639055790405,
"learning_rate": 4.479495268138802e-06,
"loss": 0.0864,
"step": 142
},
{
"epoch": 0.13560929350403034,
"grad_norm": 1.533833989919448,
"learning_rate": 4.511041009463723e-06,
"loss": 0.1188,
"step": 143
},
{
"epoch": 0.13655761024182078,
"grad_norm": 1.7546010414420699,
"learning_rate": 4.542586750788644e-06,
"loss": 0.1364,
"step": 144
},
{
"epoch": 0.1375059269796112,
"grad_norm": 2.9799276151902645,
"learning_rate": 4.574132492113565e-06,
"loss": 0.1226,
"step": 145
},
{
"epoch": 0.13845424371740161,
"grad_norm": 1.1723876001348499,
"learning_rate": 4.605678233438486e-06,
"loss": 0.086,
"step": 146
},
{
"epoch": 0.13940256045519203,
"grad_norm": 2.069220754870492,
"learning_rate": 4.637223974763407e-06,
"loss": 0.1196,
"step": 147
},
{
"epoch": 0.14035087719298245,
"grad_norm": 3.9795001087139124,
"learning_rate": 4.6687697160883285e-06,
"loss": 0.1152,
"step": 148
},
{
"epoch": 0.14129919393077287,
"grad_norm": 1.4634422746453415,
"learning_rate": 4.70031545741325e-06,
"loss": 0.0916,
"step": 149
},
{
"epoch": 0.1422475106685633,
"grad_norm": 1.3185726560010742,
"learning_rate": 4.731861198738171e-06,
"loss": 0.0904,
"step": 150
},
{
"epoch": 0.14319582740635373,
"grad_norm": 1.5552910898557228,
"learning_rate": 4.7634069400630914e-06,
"loss": 0.0899,
"step": 151
},
{
"epoch": 0.14414414414414414,
"grad_norm": 1.0997805514097108,
"learning_rate": 4.7949526813880135e-06,
"loss": 0.0795,
"step": 152
},
{
"epoch": 0.14509246088193456,
"grad_norm": 1.7076641753438397,
"learning_rate": 4.826498422712934e-06,
"loss": 0.1081,
"step": 153
},
{
"epoch": 0.14604077761972498,
"grad_norm": 1.6735518250841006,
"learning_rate": 4.858044164037855e-06,
"loss": 0.1068,
"step": 154
},
{
"epoch": 0.14698909435751542,
"grad_norm": 1.2033878521779449,
"learning_rate": 4.8895899053627764e-06,
"loss": 0.0934,
"step": 155
},
{
"epoch": 0.14793741109530584,
"grad_norm": 1.4908718795559122,
"learning_rate": 4.921135646687698e-06,
"loss": 0.1072,
"step": 156
},
{
"epoch": 0.14888572783309625,
"grad_norm": 1.3234990953707453,
"learning_rate": 4.952681388012618e-06,
"loss": 0.104,
"step": 157
},
{
"epoch": 0.14983404457088667,
"grad_norm": 1.3354249814975963,
"learning_rate": 4.98422712933754e-06,
"loss": 0.1189,
"step": 158
},
{
"epoch": 0.1507823613086771,
"grad_norm": 1.224445144859879,
"learning_rate": 5.015772870662461e-06,
"loss": 0.1114,
"step": 159
},
{
"epoch": 0.15173067804646753,
"grad_norm": 1.3554440133569026,
"learning_rate": 5.047318611987382e-06,
"loss": 0.1278,
"step": 160
},
{
"epoch": 0.15267899478425795,
"grad_norm": 1.4393478098545054,
"learning_rate": 5.078864353312303e-06,
"loss": 0.1201,
"step": 161
},
{
"epoch": 0.15362731152204837,
"grad_norm": 1.032684740456474,
"learning_rate": 5.110410094637225e-06,
"loss": 0.0841,
"step": 162
},
{
"epoch": 0.15457562825983878,
"grad_norm": 1.2508286920209446,
"learning_rate": 5.141955835962146e-06,
"loss": 0.0863,
"step": 163
},
{
"epoch": 0.1555239449976292,
"grad_norm": 1.899093372512286,
"learning_rate": 5.173501577287067e-06,
"loss": 0.1662,
"step": 164
},
{
"epoch": 0.15647226173541964,
"grad_norm": 1.4000014551423334,
"learning_rate": 5.205047318611987e-06,
"loss": 0.0909,
"step": 165
},
{
"epoch": 0.15742057847321006,
"grad_norm": 1.9418542456678585,
"learning_rate": 5.2365930599369085e-06,
"loss": 0.1013,
"step": 166
},
{
"epoch": 0.15836889521100048,
"grad_norm": 1.5538903766146939,
"learning_rate": 5.268138801261831e-06,
"loss": 0.1177,
"step": 167
},
{
"epoch": 0.1593172119487909,
"grad_norm": 1.3035129364423688,
"learning_rate": 5.299684542586752e-06,
"loss": 0.0961,
"step": 168
},
{
"epoch": 0.1602655286865813,
"grad_norm": 1.273421849890499,
"learning_rate": 5.331230283911672e-06,
"loss": 0.1252,
"step": 169
},
{
"epoch": 0.16121384542437173,
"grad_norm": 1.123016604976548,
"learning_rate": 5.3627760252365935e-06,
"loss": 0.0999,
"step": 170
},
{
"epoch": 0.16216216216216217,
"grad_norm": 1.2409364166994,
"learning_rate": 5.394321766561515e-06,
"loss": 0.1095,
"step": 171
},
{
"epoch": 0.1631104788999526,
"grad_norm": 1.1082140455460585,
"learning_rate": 5.425867507886435e-06,
"loss": 0.0736,
"step": 172
},
{
"epoch": 0.164058795637743,
"grad_norm": 1.2872459579560394,
"learning_rate": 5.457413249211357e-06,
"loss": 0.0928,
"step": 173
},
{
"epoch": 0.16500711237553342,
"grad_norm": 1.3830237110418746,
"learning_rate": 5.4889589905362786e-06,
"loss": 0.0973,
"step": 174
},
{
"epoch": 0.16595542911332384,
"grad_norm": 1.2546887092347754,
"learning_rate": 5.520504731861199e-06,
"loss": 0.0832,
"step": 175
},
{
"epoch": 0.16690374585111428,
"grad_norm": 1.1708284069676944,
"learning_rate": 5.55205047318612e-06,
"loss": 0.1075,
"step": 176
},
{
"epoch": 0.1678520625889047,
"grad_norm": 1.101853335061695,
"learning_rate": 5.5835962145110415e-06,
"loss": 0.0897,
"step": 177
},
{
"epoch": 0.16880037932669512,
"grad_norm": 1.015907357215909,
"learning_rate": 5.615141955835962e-06,
"loss": 0.0819,
"step": 178
},
{
"epoch": 0.16974869606448553,
"grad_norm": 1.8752154604515816,
"learning_rate": 5.646687697160884e-06,
"loss": 0.1021,
"step": 179
},
{
"epoch": 0.17069701280227595,
"grad_norm": 1.6971011710183759,
"learning_rate": 5.678233438485805e-06,
"loss": 0.0996,
"step": 180
},
{
"epoch": 0.1716453295400664,
"grad_norm": 1.2212507178791898,
"learning_rate": 5.709779179810726e-06,
"loss": 0.1079,
"step": 181
},
{
"epoch": 0.1725936462778568,
"grad_norm": 1.7343284525300247,
"learning_rate": 5.741324921135647e-06,
"loss": 0.1292,
"step": 182
},
{
"epoch": 0.17354196301564723,
"grad_norm": 1.4376592014404461,
"learning_rate": 5.772870662460568e-06,
"loss": 0.1312,
"step": 183
},
{
"epoch": 0.17449027975343764,
"grad_norm": 1.2528619821880524,
"learning_rate": 5.80441640378549e-06,
"loss": 0.0762,
"step": 184
},
{
"epoch": 0.17543859649122806,
"grad_norm": 1.9247297159171304,
"learning_rate": 5.835962145110411e-06,
"loss": 0.1403,
"step": 185
},
{
"epoch": 0.1763869132290185,
"grad_norm": 1.5028101353474104,
"learning_rate": 5.867507886435332e-06,
"loss": 0.1147,
"step": 186
},
{
"epoch": 0.17733522996680892,
"grad_norm": 2.4179600186213714,
"learning_rate": 5.899053627760253e-06,
"loss": 0.0913,
"step": 187
},
{
"epoch": 0.17828354670459934,
"grad_norm": 1.518835105924909,
"learning_rate": 5.9305993690851736e-06,
"loss": 0.0918,
"step": 188
},
{
"epoch": 0.17923186344238975,
"grad_norm": 1.6543687104918372,
"learning_rate": 5.962145110410095e-06,
"loss": 0.122,
"step": 189
},
{
"epoch": 0.18018018018018017,
"grad_norm": 1.4531807393638785,
"learning_rate": 5.993690851735017e-06,
"loss": 0.1228,
"step": 190
},
{
"epoch": 0.1811284969179706,
"grad_norm": 1.4665808153812976,
"learning_rate": 6.025236593059937e-06,
"loss": 0.1014,
"step": 191
},
{
"epoch": 0.18207681365576103,
"grad_norm": 1.2889682170490027,
"learning_rate": 6.056782334384859e-06,
"loss": 0.1055,
"step": 192
},
{
"epoch": 0.18302513039355145,
"grad_norm": 1.3310497561635966,
"learning_rate": 6.08832807570978e-06,
"loss": 0.119,
"step": 193
},
{
"epoch": 0.18397344713134187,
"grad_norm": 1.3246051325093873,
"learning_rate": 6.1198738170347e-06,
"loss": 0.1288,
"step": 194
},
{
"epoch": 0.18492176386913228,
"grad_norm": 1.1979924093987135,
"learning_rate": 6.1514195583596215e-06,
"loss": 0.0877,
"step": 195
},
{
"epoch": 0.1858700806069227,
"grad_norm": 1.1280419900810446,
"learning_rate": 6.182965299684544e-06,
"loss": 0.1085,
"step": 196
},
{
"epoch": 0.18681839734471314,
"grad_norm": 1.3307017446168579,
"learning_rate": 6.214511041009465e-06,
"loss": 0.0853,
"step": 197
},
{
"epoch": 0.18776671408250356,
"grad_norm": 1.1814823672365349,
"learning_rate": 6.246056782334385e-06,
"loss": 0.1066,
"step": 198
},
{
"epoch": 0.18871503082029398,
"grad_norm": 0.7829348670836794,
"learning_rate": 6.2776025236593065e-06,
"loss": 0.0662,
"step": 199
},
{
"epoch": 0.1896633475580844,
"grad_norm": 1.2435224715978643,
"learning_rate": 6.309148264984227e-06,
"loss": 0.088,
"step": 200
},
{
"epoch": 0.1906116642958748,
"grad_norm": 1.0014149948809556,
"learning_rate": 6.340694006309149e-06,
"loss": 0.0975,
"step": 201
},
{
"epoch": 0.19155998103366526,
"grad_norm": 0.9250673471848995,
"learning_rate": 6.37223974763407e-06,
"loss": 0.0877,
"step": 202
},
{
"epoch": 0.19250829777145567,
"grad_norm": 1.056412139362465,
"learning_rate": 6.4037854889589915e-06,
"loss": 0.0763,
"step": 203
},
{
"epoch": 0.1934566145092461,
"grad_norm": 0.9891782097788515,
"learning_rate": 6.435331230283912e-06,
"loss": 0.0834,
"step": 204
},
{
"epoch": 0.1944049312470365,
"grad_norm": 1.0792725374885792,
"learning_rate": 6.466876971608833e-06,
"loss": 0.0885,
"step": 205
},
{
"epoch": 0.19535324798482692,
"grad_norm": 1.2366811021393578,
"learning_rate": 6.4984227129337544e-06,
"loss": 0.0954,
"step": 206
},
{
"epoch": 0.19630156472261737,
"grad_norm": 1.024115365006771,
"learning_rate": 6.529968454258676e-06,
"loss": 0.1215,
"step": 207
},
{
"epoch": 0.19724988146040778,
"grad_norm": 1.2203185957532192,
"learning_rate": 6.561514195583597e-06,
"loss": 0.1202,
"step": 208
},
{
"epoch": 0.1981981981981982,
"grad_norm": 0.9501403270885721,
"learning_rate": 6.593059936908518e-06,
"loss": 0.0715,
"step": 209
},
{
"epoch": 0.19914651493598862,
"grad_norm": 1.5511308370546482,
"learning_rate": 6.624605678233439e-06,
"loss": 0.1089,
"step": 210
},
{
"epoch": 0.20009483167377903,
"grad_norm": 0.9433860573102355,
"learning_rate": 6.65615141955836e-06,
"loss": 0.0648,
"step": 211
},
{
"epoch": 0.20104314841156948,
"grad_norm": 1.0981902231687461,
"learning_rate": 6.687697160883281e-06,
"loss": 0.0663,
"step": 212
},
{
"epoch": 0.2019914651493599,
"grad_norm": 1.064443363672458,
"learning_rate": 6.719242902208203e-06,
"loss": 0.077,
"step": 213
},
{
"epoch": 0.2029397818871503,
"grad_norm": 1.3753290546304533,
"learning_rate": 6.750788643533124e-06,
"loss": 0.1093,
"step": 214
},
{
"epoch": 0.20388809862494073,
"grad_norm": 1.2200081175269764,
"learning_rate": 6.782334384858045e-06,
"loss": 0.1094,
"step": 215
},
{
"epoch": 0.20483641536273114,
"grad_norm": 0.9141258918864384,
"learning_rate": 6.813880126182965e-06,
"loss": 0.0911,
"step": 216
},
{
"epoch": 0.20578473210052156,
"grad_norm": 2.528170753397052,
"learning_rate": 6.8454258675078865e-06,
"loss": 0.1079,
"step": 217
},
{
"epoch": 0.206733048838312,
"grad_norm": 1.4430688823297448,
"learning_rate": 6.876971608832809e-06,
"loss": 0.1053,
"step": 218
},
{
"epoch": 0.20768136557610242,
"grad_norm": 1.0186932336289805,
"learning_rate": 6.90851735015773e-06,
"loss": 0.0861,
"step": 219
},
{
"epoch": 0.20862968231389284,
"grad_norm": 1.1420742589304766,
"learning_rate": 6.94006309148265e-06,
"loss": 0.094,
"step": 220
},
{
"epoch": 0.20957799905168326,
"grad_norm": 1.2741420533987797,
"learning_rate": 6.9716088328075715e-06,
"loss": 0.0951,
"step": 221
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.9075216722351295,
"learning_rate": 7.003154574132493e-06,
"loss": 0.0866,
"step": 222
},
{
"epoch": 0.21147463252726412,
"grad_norm": 1.1980754719122302,
"learning_rate": 7.034700315457413e-06,
"loss": 0.0914,
"step": 223
},
{
"epoch": 0.21242294926505453,
"grad_norm": 1.1939921471415105,
"learning_rate": 7.066246056782335e-06,
"loss": 0.1047,
"step": 224
},
{
"epoch": 0.21337126600284495,
"grad_norm": 0.8519438677271276,
"learning_rate": 7.0977917981072565e-06,
"loss": 0.0941,
"step": 225
},
{
"epoch": 0.21431958274063537,
"grad_norm": 0.789532854502906,
"learning_rate": 7.129337539432177e-06,
"loss": 0.0819,
"step": 226
},
{
"epoch": 0.21526789947842578,
"grad_norm": 1.2111156014392817,
"learning_rate": 7.160883280757098e-06,
"loss": 0.1027,
"step": 227
},
{
"epoch": 0.21621621621621623,
"grad_norm": 1.0588737043402552,
"learning_rate": 7.1924290220820195e-06,
"loss": 0.0952,
"step": 228
},
{
"epoch": 0.21716453295400664,
"grad_norm": 0.933483217055125,
"learning_rate": 7.22397476340694e-06,
"loss": 0.0763,
"step": 229
},
{
"epoch": 0.21811284969179706,
"grad_norm": 1.049586247769339,
"learning_rate": 7.255520504731862e-06,
"loss": 0.0789,
"step": 230
},
{
"epoch": 0.21906116642958748,
"grad_norm": 1.1220808424289264,
"learning_rate": 7.287066246056783e-06,
"loss": 0.074,
"step": 231
},
{
"epoch": 0.2200094831673779,
"grad_norm": 1.254391611101815,
"learning_rate": 7.3186119873817045e-06,
"loss": 0.093,
"step": 232
},
{
"epoch": 0.22095779990516834,
"grad_norm": 1.274839766592392,
"learning_rate": 7.350157728706625e-06,
"loss": 0.0938,
"step": 233
},
{
"epoch": 0.22190611664295876,
"grad_norm": 1.2629251738997191,
"learning_rate": 7.381703470031546e-06,
"loss": 0.1129,
"step": 234
},
{
"epoch": 0.22285443338074917,
"grad_norm": 1.3595829605121952,
"learning_rate": 7.413249211356468e-06,
"loss": 0.1062,
"step": 235
},
{
"epoch": 0.2238027501185396,
"grad_norm": 1.353026352957774,
"learning_rate": 7.444794952681389e-06,
"loss": 0.117,
"step": 236
},
{
"epoch": 0.22475106685633,
"grad_norm": 1.3472351125895725,
"learning_rate": 7.47634069400631e-06,
"loss": 0.0827,
"step": 237
},
{
"epoch": 0.22569938359412042,
"grad_norm": 0.9510770172761661,
"learning_rate": 7.507886435331231e-06,
"loss": 0.0759,
"step": 238
},
{
"epoch": 0.22664770033191087,
"grad_norm": 1.2025915899822757,
"learning_rate": 7.5394321766561515e-06,
"loss": 0.0807,
"step": 239
},
{
"epoch": 0.22759601706970128,
"grad_norm": 1.1640028047547857,
"learning_rate": 7.570977917981073e-06,
"loss": 0.0709,
"step": 240
},
{
"epoch": 0.2285443338074917,
"grad_norm": 1.5223127858935517,
"learning_rate": 7.602523659305995e-06,
"loss": 0.1018,
"step": 241
},
{
"epoch": 0.22949265054528212,
"grad_norm": 1.8495916864800697,
"learning_rate": 7.634069400630916e-06,
"loss": 0.0968,
"step": 242
},
{
"epoch": 0.23044096728307253,
"grad_norm": 1.8476848640745251,
"learning_rate": 7.665615141955837e-06,
"loss": 0.086,
"step": 243
},
{
"epoch": 0.23138928402086298,
"grad_norm": 1.4644626825262619,
"learning_rate": 7.697160883280757e-06,
"loss": 0.0974,
"step": 244
},
{
"epoch": 0.2323376007586534,
"grad_norm": 1.8857810882326624,
"learning_rate": 7.728706624605679e-06,
"loss": 0.1036,
"step": 245
},
{
"epoch": 0.2332859174964438,
"grad_norm": 1.7638762752182895,
"learning_rate": 7.7602523659306e-06,
"loss": 0.1097,
"step": 246
},
{
"epoch": 0.23423423423423423,
"grad_norm": 1.2348758426158113,
"learning_rate": 7.791798107255522e-06,
"loss": 0.0866,
"step": 247
},
{
"epoch": 0.23518255097202465,
"grad_norm": 1.1223471436540764,
"learning_rate": 7.823343848580442e-06,
"loss": 0.0564,
"step": 248
},
{
"epoch": 0.2361308677098151,
"grad_norm": 0.8821001750676984,
"learning_rate": 7.854889589905364e-06,
"loss": 0.0696,
"step": 249
},
{
"epoch": 0.2370791844476055,
"grad_norm": 0.9899264223411232,
"learning_rate": 7.886435331230284e-06,
"loss": 0.0702,
"step": 250
},
{
"epoch": 0.23802750118539592,
"grad_norm": 0.9289219027994224,
"learning_rate": 7.917981072555205e-06,
"loss": 0.0843,
"step": 251
},
{
"epoch": 0.23897581792318634,
"grad_norm": 1.0579670590751298,
"learning_rate": 7.949526813880127e-06,
"loss": 0.0921,
"step": 252
},
{
"epoch": 0.23992413466097676,
"grad_norm": 1.4593486745973783,
"learning_rate": 7.981072555205049e-06,
"loss": 0.1229,
"step": 253
},
{
"epoch": 0.2408724513987672,
"grad_norm": 0.9496576247693762,
"learning_rate": 8.01261829652997e-06,
"loss": 0.0861,
"step": 254
},
{
"epoch": 0.24182076813655762,
"grad_norm": 1.1030565317688061,
"learning_rate": 8.04416403785489e-06,
"loss": 0.0893,
"step": 255
},
{
"epoch": 0.24276908487434803,
"grad_norm": 0.9907604990146169,
"learning_rate": 8.07570977917981e-06,
"loss": 0.0928,
"step": 256
},
{
"epoch": 0.24371740161213845,
"grad_norm": 0.9460810229319789,
"learning_rate": 8.107255520504732e-06,
"loss": 0.0974,
"step": 257
},
{
"epoch": 0.24466571834992887,
"grad_norm": 0.8329291976282354,
"learning_rate": 8.138801261829655e-06,
"loss": 0.077,
"step": 258
},
{
"epoch": 0.24561403508771928,
"grad_norm": 0.8587085474520708,
"learning_rate": 8.170347003154575e-06,
"loss": 0.0837,
"step": 259
},
{
"epoch": 0.24656235182550973,
"grad_norm": 0.9113223159844124,
"learning_rate": 8.201892744479495e-06,
"loss": 0.088,
"step": 260
},
{
"epoch": 0.24751066856330015,
"grad_norm": 0.8328940868524983,
"learning_rate": 8.233438485804417e-06,
"loss": 0.091,
"step": 261
},
{
"epoch": 0.24845898530109056,
"grad_norm": 1.4264090310082065,
"learning_rate": 8.264984227129338e-06,
"loss": 0.1354,
"step": 262
},
{
"epoch": 0.24940730203888098,
"grad_norm": 1.0550225951223755,
"learning_rate": 8.296529968454258e-06,
"loss": 0.0972,
"step": 263
},
{
"epoch": 0.2503556187766714,
"grad_norm": 1.053508559451355,
"learning_rate": 8.32807570977918e-06,
"loss": 0.1035,
"step": 264
},
{
"epoch": 0.25130393551446184,
"grad_norm": 1.4971087544821369,
"learning_rate": 8.359621451104102e-06,
"loss": 0.1001,
"step": 265
},
{
"epoch": 0.25225225225225223,
"grad_norm": 1.075521297085326,
"learning_rate": 8.391167192429023e-06,
"loss": 0.0923,
"step": 266
},
{
"epoch": 0.2532005689900427,
"grad_norm": 1.6910075728505873,
"learning_rate": 8.422712933753943e-06,
"loss": 0.1212,
"step": 267
},
{
"epoch": 0.2541488857278331,
"grad_norm": 1.5073460991202734,
"learning_rate": 8.454258675078865e-06,
"loss": 0.087,
"step": 268
},
{
"epoch": 0.2550972024656235,
"grad_norm": 1.0201575671512444,
"learning_rate": 8.485804416403787e-06,
"loss": 0.0871,
"step": 269
},
{
"epoch": 0.25604551920341395,
"grad_norm": 1.1193230353064818,
"learning_rate": 8.517350157728708e-06,
"loss": 0.1031,
"step": 270
},
{
"epoch": 0.25699383594120434,
"grad_norm": 1.3593779355277376,
"learning_rate": 8.548895899053628e-06,
"loss": 0.0861,
"step": 271
},
{
"epoch": 0.2579421526789948,
"grad_norm": 1.5824627519870196,
"learning_rate": 8.58044164037855e-06,
"loss": 0.0998,
"step": 272
},
{
"epoch": 0.25889046941678523,
"grad_norm": 2.316620691088296,
"learning_rate": 8.61198738170347e-06,
"loss": 0.1237,
"step": 273
},
{
"epoch": 0.2598387861545756,
"grad_norm": 1.3708391836342668,
"learning_rate": 8.643533123028391e-06,
"loss": 0.0806,
"step": 274
},
{
"epoch": 0.26078710289236606,
"grad_norm": 1.259879695037933,
"learning_rate": 8.675078864353313e-06,
"loss": 0.088,
"step": 275
},
{
"epoch": 0.26173541963015645,
"grad_norm": 1.236718933875791,
"learning_rate": 8.706624605678234e-06,
"loss": 0.0842,
"step": 276
},
{
"epoch": 0.2626837363679469,
"grad_norm": 1.438488419989871,
"learning_rate": 8.738170347003156e-06,
"loss": 0.0955,
"step": 277
},
{
"epoch": 0.26363205310573734,
"grad_norm": 0.9563516338397714,
"learning_rate": 8.769716088328076e-06,
"loss": 0.0761,
"step": 278
},
{
"epoch": 0.26458036984352773,
"grad_norm": 1.2728124128011007,
"learning_rate": 8.801261829652997e-06,
"loss": 0.0805,
"step": 279
},
{
"epoch": 0.2655286865813182,
"grad_norm": 1.2205595373118223,
"learning_rate": 8.832807570977919e-06,
"loss": 0.0879,
"step": 280
},
{
"epoch": 0.26647700331910856,
"grad_norm": 0.959493141925286,
"learning_rate": 8.86435331230284e-06,
"loss": 0.0728,
"step": 281
},
{
"epoch": 0.267425320056899,
"grad_norm": 1.4340945839201555,
"learning_rate": 8.895899053627761e-06,
"loss": 0.0897,
"step": 282
},
{
"epoch": 0.26837363679468945,
"grad_norm": 1.0061297486879381,
"learning_rate": 8.927444794952682e-06,
"loss": 0.0857,
"step": 283
},
{
"epoch": 0.26932195353247984,
"grad_norm": 1.5459293734675696,
"learning_rate": 8.958990536277604e-06,
"loss": 0.1029,
"step": 284
},
{
"epoch": 0.2702702702702703,
"grad_norm": 1.3222303946698841,
"learning_rate": 8.990536277602524e-06,
"loss": 0.084,
"step": 285
},
{
"epoch": 0.2712185870080607,
"grad_norm": 1.185863549947665,
"learning_rate": 9.022082018927446e-06,
"loss": 0.1311,
"step": 286
},
{
"epoch": 0.2721669037458511,
"grad_norm": 0.8959238307125761,
"learning_rate": 9.053627760252367e-06,
"loss": 0.067,
"step": 287
},
{
"epoch": 0.27311522048364156,
"grad_norm": 1.369443136318961,
"learning_rate": 9.085173501577289e-06,
"loss": 0.1093,
"step": 288
},
{
"epoch": 0.27406353722143195,
"grad_norm": 1.1052390238476015,
"learning_rate": 9.116719242902209e-06,
"loss": 0.103,
"step": 289
},
{
"epoch": 0.2750118539592224,
"grad_norm": 1.325059650748033,
"learning_rate": 9.14826498422713e-06,
"loss": 0.1111,
"step": 290
},
{
"epoch": 0.2759601706970128,
"grad_norm": 1.3248936963910136,
"learning_rate": 9.17981072555205e-06,
"loss": 0.0933,
"step": 291
},
{
"epoch": 0.27690848743480323,
"grad_norm": 1.127118183479871,
"learning_rate": 9.211356466876972e-06,
"loss": 0.0891,
"step": 292
},
{
"epoch": 0.2778568041725936,
"grad_norm": 1.3108916887707827,
"learning_rate": 9.242902208201894e-06,
"loss": 0.0939,
"step": 293
},
{
"epoch": 0.27880512091038406,
"grad_norm": 1.0013886049046197,
"learning_rate": 9.274447949526815e-06,
"loss": 0.0692,
"step": 294
},
{
"epoch": 0.2797534376481745,
"grad_norm": 1.1156101698361054,
"learning_rate": 9.305993690851735e-06,
"loss": 0.0868,
"step": 295
},
{
"epoch": 0.2807017543859649,
"grad_norm": 1.2522202479933553,
"learning_rate": 9.337539432176657e-06,
"loss": 0.0914,
"step": 296
},
{
"epoch": 0.28165007112375534,
"grad_norm": 1.3755827124206237,
"learning_rate": 9.369085173501577e-06,
"loss": 0.0936,
"step": 297
},
{
"epoch": 0.28259838786154573,
"grad_norm": 1.4694162511089293,
"learning_rate": 9.4006309148265e-06,
"loss": 0.1071,
"step": 298
},
{
"epoch": 0.2835467045993362,
"grad_norm": 1.255879045911956,
"learning_rate": 9.43217665615142e-06,
"loss": 0.0815,
"step": 299
},
{
"epoch": 0.2844950213371266,
"grad_norm": 1.560204819302283,
"learning_rate": 9.463722397476342e-06,
"loss": 0.1234,
"step": 300
},
{
"epoch": 0.285443338074917,
"grad_norm": 1.0121817898281276,
"learning_rate": 9.495268138801262e-06,
"loss": 0.0595,
"step": 301
},
{
"epoch": 0.28639165481270745,
"grad_norm": 1.0711466156341418,
"learning_rate": 9.526813880126183e-06,
"loss": 0.0641,
"step": 302
},
{
"epoch": 0.28733997155049784,
"grad_norm": 1.1496695710149105,
"learning_rate": 9.558359621451105e-06,
"loss": 0.0761,
"step": 303
},
{
"epoch": 0.2882882882882883,
"grad_norm": 1.2059272704315518,
"learning_rate": 9.589905362776027e-06,
"loss": 0.0756,
"step": 304
},
{
"epoch": 0.28923660502607873,
"grad_norm": 1.0424292745296735,
"learning_rate": 9.621451104100947e-06,
"loss": 0.0855,
"step": 305
},
{
"epoch": 0.2901849217638691,
"grad_norm": 1.1497786768197902,
"learning_rate": 9.652996845425868e-06,
"loss": 0.071,
"step": 306
},
{
"epoch": 0.29113323850165956,
"grad_norm": 1.3472444992692172,
"learning_rate": 9.68454258675079e-06,
"loss": 0.0934,
"step": 307
},
{
"epoch": 0.29208155523944995,
"grad_norm": 1.3345310370843513,
"learning_rate": 9.71608832807571e-06,
"loss": 0.0998,
"step": 308
},
{
"epoch": 0.2930298719772404,
"grad_norm": 1.01109508034154,
"learning_rate": 9.747634069400632e-06,
"loss": 0.0762,
"step": 309
},
{
"epoch": 0.29397818871503084,
"grad_norm": 0.9249973635125475,
"learning_rate": 9.779179810725553e-06,
"loss": 0.074,
"step": 310
},
{
"epoch": 0.29492650545282123,
"grad_norm": 0.804446344253587,
"learning_rate": 9.810725552050473e-06,
"loss": 0.0517,
"step": 311
},
{
"epoch": 0.2958748221906117,
"grad_norm": 0.965596925556689,
"learning_rate": 9.842271293375395e-06,
"loss": 0.098,
"step": 312
},
{
"epoch": 0.29682313892840206,
"grad_norm": 2.012807451707843,
"learning_rate": 9.873817034700316e-06,
"loss": 0.1038,
"step": 313
},
{
"epoch": 0.2977714556661925,
"grad_norm": 1.2864066063043205,
"learning_rate": 9.905362776025236e-06,
"loss": 0.1102,
"step": 314
},
{
"epoch": 0.29871977240398295,
"grad_norm": 0.8775284858258785,
"learning_rate": 9.936908517350158e-06,
"loss": 0.0913,
"step": 315
},
{
"epoch": 0.29966808914177334,
"grad_norm": 0.9395466275555749,
"learning_rate": 9.96845425867508e-06,
"loss": 0.1156,
"step": 316
},
{
"epoch": 0.3006164058795638,
"grad_norm": 1.031977177693936,
"learning_rate": 1e-05,
"loss": 0.0772,
"step": 317
},
{
"epoch": 0.3015647226173542,
"grad_norm": 0.906696222035988,
"learning_rate": 9.999996951577431e-06,
"loss": 0.0745,
"step": 318
},
{
"epoch": 0.3025130393551446,
"grad_norm": 1.6486632782552955,
"learning_rate": 9.999987806313436e-06,
"loss": 0.1295,
"step": 319
},
{
"epoch": 0.30346135609293506,
"grad_norm": 1.0682004904191784,
"learning_rate": 9.999972564219169e-06,
"loss": 0.089,
"step": 320
},
{
"epoch": 0.30440967283072545,
"grad_norm": 1.0160084965418597,
"learning_rate": 9.999951225313217e-06,
"loss": 0.0795,
"step": 321
},
{
"epoch": 0.3053579895685159,
"grad_norm": 1.1229797355618714,
"learning_rate": 9.999923789621598e-06,
"loss": 0.0924,
"step": 322
},
{
"epoch": 0.3063063063063063,
"grad_norm": 0.9925832526069106,
"learning_rate": 9.999890257177766e-06,
"loss": 0.0803,
"step": 323
},
{
"epoch": 0.30725462304409673,
"grad_norm": 1.1785860516178814,
"learning_rate": 9.999850628022611e-06,
"loss": 0.0797,
"step": 324
},
{
"epoch": 0.3082029397818872,
"grad_norm": 1.1520304204509717,
"learning_rate": 9.999804902204455e-06,
"loss": 0.0775,
"step": 325
},
{
"epoch": 0.30915125651967756,
"grad_norm": 1.0880132191910508,
"learning_rate": 9.999753079779054e-06,
"loss": 0.0906,
"step": 326
},
{
"epoch": 0.310099573257468,
"grad_norm": 1.5767657455822397,
"learning_rate": 9.999695160809598e-06,
"loss": 0.0956,
"step": 327
},
{
"epoch": 0.3110478899952584,
"grad_norm": 0.7125012678361342,
"learning_rate": 9.999631145366713e-06,
"loss": 0.0661,
"step": 328
},
{
"epoch": 0.31199620673304884,
"grad_norm": 1.088584252037159,
"learning_rate": 9.999561033528457e-06,
"loss": 0.1149,
"step": 329
},
{
"epoch": 0.3129445234708393,
"grad_norm": 0.8523222222870042,
"learning_rate": 9.999484825380323e-06,
"loss": 0.0913,
"step": 330
},
{
"epoch": 0.3138928402086297,
"grad_norm": 1.0164571883774136,
"learning_rate": 9.999402521015236e-06,
"loss": 0.0878,
"step": 331
},
{
"epoch": 0.3148411569464201,
"grad_norm": 0.7164573705993513,
"learning_rate": 9.999314120533557e-06,
"loss": 0.0866,
"step": 332
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.7954216406429697,
"learning_rate": 9.999219624043075e-06,
"loss": 0.0702,
"step": 333
},
{
"epoch": 0.31673779042200095,
"grad_norm": 0.7996263107367133,
"learning_rate": 9.99911903165902e-06,
"loss": 0.0758,
"step": 334
},
{
"epoch": 0.3176861071597914,
"grad_norm": 1.101451187378474,
"learning_rate": 9.999012343504049e-06,
"loss": 0.0957,
"step": 335
},
{
"epoch": 0.3186344238975818,
"grad_norm": 0.7265535166036453,
"learning_rate": 9.998899559708254e-06,
"loss": 0.0743,
"step": 336
},
{
"epoch": 0.31958274063537223,
"grad_norm": 1.272801256055057,
"learning_rate": 9.998780680409161e-06,
"loss": 0.0952,
"step": 337
},
{
"epoch": 0.3205310573731626,
"grad_norm": 0.8770881337944402,
"learning_rate": 9.99865570575173e-06,
"loss": 0.066,
"step": 338
},
{
"epoch": 0.32147937411095306,
"grad_norm": 1.0607119132841634,
"learning_rate": 9.998524635888347e-06,
"loss": 0.0913,
"step": 339
},
{
"epoch": 0.32242769084874345,
"grad_norm": 0.9189346974278031,
"learning_rate": 9.998387470978837e-06,
"loss": 0.0881,
"step": 340
},
{
"epoch": 0.3233760075865339,
"grad_norm": 0.7272168469454553,
"learning_rate": 9.998244211190454e-06,
"loss": 0.0713,
"step": 341
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.9819255696828616,
"learning_rate": 9.998094856697885e-06,
"loss": 0.0834,
"step": 342
},
{
"epoch": 0.32527264106211473,
"grad_norm": 0.6857773270509248,
"learning_rate": 9.997939407683249e-06,
"loss": 0.0524,
"step": 343
},
{
"epoch": 0.3262209577999052,
"grad_norm": 1.0324591704355464,
"learning_rate": 9.99777786433609e-06,
"loss": 0.1108,
"step": 344
},
{
"epoch": 0.32716927453769556,
"grad_norm": 1.1264206703681527,
"learning_rate": 9.997610226853399e-06,
"loss": 0.0987,
"step": 345
},
{
"epoch": 0.328117591275486,
"grad_norm": 0.95789066514891,
"learning_rate": 9.997436495439581e-06,
"loss": 0.093,
"step": 346
},
{
"epoch": 0.32906590801327645,
"grad_norm": 1.0448222803112024,
"learning_rate": 9.997256670306478e-06,
"loss": 0.0983,
"step": 347
},
{
"epoch": 0.33001422475106684,
"grad_norm": 0.7737283316563024,
"learning_rate": 9.997070751673367e-06,
"loss": 0.0706,
"step": 348
},
{
"epoch": 0.3309625414888573,
"grad_norm": 0.9596984880180834,
"learning_rate": 9.99687873976695e-06,
"loss": 0.0991,
"step": 349
},
{
"epoch": 0.3319108582266477,
"grad_norm": 0.8411109119380658,
"learning_rate": 9.99668063482136e-06,
"loss": 0.0678,
"step": 350
},
{
"epoch": 0.3328591749644381,
"grad_norm": 1.136491883808786,
"learning_rate": 9.996476437078162e-06,
"loss": 0.0986,
"step": 351
},
{
"epoch": 0.33380749170222856,
"grad_norm": 3.03438587624818,
"learning_rate": 9.996266146786344e-06,
"loss": 0.0969,
"step": 352
},
{
"epoch": 0.33475580844001895,
"grad_norm": 1.2333568047254937,
"learning_rate": 9.996049764202332e-06,
"loss": 0.0832,
"step": 353
},
{
"epoch": 0.3357041251778094,
"grad_norm": 1.1301139087376384,
"learning_rate": 9.995827289589974e-06,
"loss": 0.0994,
"step": 354
},
{
"epoch": 0.3366524419155998,
"grad_norm": 1.0303329732235522,
"learning_rate": 9.995598723220548e-06,
"loss": 0.0757,
"step": 355
},
{
"epoch": 0.33760075865339023,
"grad_norm": 1.0605991674508604,
"learning_rate": 9.995364065372762e-06,
"loss": 0.0815,
"step": 356
},
{
"epoch": 0.3385490753911807,
"grad_norm": 0.7941030771981634,
"learning_rate": 9.995123316332752e-06,
"loss": 0.0747,
"step": 357
},
{
"epoch": 0.33949739212897106,
"grad_norm": 1.2313896272302265,
"learning_rate": 9.994876476394075e-06,
"loss": 0.0769,
"step": 358
},
{
"epoch": 0.3404457088667615,
"grad_norm": 1.1944743493159886,
"learning_rate": 9.994623545857727e-06,
"loss": 0.0979,
"step": 359
},
{
"epoch": 0.3413940256045519,
"grad_norm": 0.8285281294809631,
"learning_rate": 9.994364525032116e-06,
"loss": 0.0793,
"step": 360
},
{
"epoch": 0.34234234234234234,
"grad_norm": 1.4761389910370195,
"learning_rate": 9.994099414233091e-06,
"loss": 0.0913,
"step": 361
},
{
"epoch": 0.3432906590801328,
"grad_norm": 1.5408966458771916,
"learning_rate": 9.993828213783915e-06,
"loss": 0.0973,
"step": 362
},
{
"epoch": 0.3442389758179232,
"grad_norm": 1.4559933930399096,
"learning_rate": 9.993550924015283e-06,
"loss": 0.0999,
"step": 363
},
{
"epoch": 0.3451872925557136,
"grad_norm": 0.8454336561992738,
"learning_rate": 9.993267545265314e-06,
"loss": 0.0655,
"step": 364
},
{
"epoch": 0.346135609293504,
"grad_norm": 0.796992439441769,
"learning_rate": 9.992978077879552e-06,
"loss": 0.0696,
"step": 365
},
{
"epoch": 0.34708392603129445,
"grad_norm": 1.0553149426590827,
"learning_rate": 9.992682522210963e-06,
"loss": 0.0787,
"step": 366
},
{
"epoch": 0.3480322427690849,
"grad_norm": 1.4860431297237584,
"learning_rate": 9.992380878619939e-06,
"loss": 0.106,
"step": 367
},
{
"epoch": 0.3489805595068753,
"grad_norm": 1.3032907057151817,
"learning_rate": 9.992073147474292e-06,
"loss": 0.1021,
"step": 368
},
{
"epoch": 0.34992887624466573,
"grad_norm": 1.0894704335759804,
"learning_rate": 9.991759329149266e-06,
"loss": 0.0905,
"step": 369
},
{
"epoch": 0.3508771929824561,
"grad_norm": 1.1130576081628205,
"learning_rate": 9.991439424027518e-06,
"loss": 0.0846,
"step": 370
},
{
"epoch": 0.35182550972024657,
"grad_norm": 0.9253664091514998,
"learning_rate": 9.991113432499128e-06,
"loss": 0.0882,
"step": 371
},
{
"epoch": 0.352773826458037,
"grad_norm": 0.841899923853967,
"learning_rate": 9.990781354961605e-06,
"loss": 0.0806,
"step": 372
},
{
"epoch": 0.3537221431958274,
"grad_norm": 0.9407729946270026,
"learning_rate": 9.99044319181987e-06,
"loss": 0.0939,
"step": 373
},
{
"epoch": 0.35467045993361784,
"grad_norm": 0.9090058769044609,
"learning_rate": 9.99009894348627e-06,
"loss": 0.0891,
"step": 374
},
{
"epoch": 0.35561877667140823,
"grad_norm": 0.6294083333837054,
"learning_rate": 9.989748610380571e-06,
"loss": 0.0706,
"step": 375
},
{
"epoch": 0.3565670934091987,
"grad_norm": 0.9163781177038506,
"learning_rate": 9.98939219292996e-06,
"loss": 0.0697,
"step": 376
},
{
"epoch": 0.3575154101469891,
"grad_norm": 1.1693511630739546,
"learning_rate": 9.989029691569037e-06,
"loss": 0.1056,
"step": 377
},
{
"epoch": 0.3584637268847795,
"grad_norm": 1.0414233510818562,
"learning_rate": 9.988661106739827e-06,
"loss": 0.0988,
"step": 378
},
{
"epoch": 0.35941204362256995,
"grad_norm": 1.2822153621266594,
"learning_rate": 9.988286438891774e-06,
"loss": 0.1189,
"step": 379
},
{
"epoch": 0.36036036036036034,
"grad_norm": 0.63669429794073,
"learning_rate": 9.987905688481732e-06,
"loss": 0.0828,
"step": 380
},
{
"epoch": 0.3613086770981508,
"grad_norm": 0.826754093590745,
"learning_rate": 9.98751885597398e-06,
"loss": 0.0848,
"step": 381
},
{
"epoch": 0.3622569938359412,
"grad_norm": 0.8825949393702691,
"learning_rate": 9.987125941840205e-06,
"loss": 0.092,
"step": 382
},
{
"epoch": 0.3632053105737316,
"grad_norm": 0.6103241173744877,
"learning_rate": 9.986726946559517e-06,
"loss": 0.08,
"step": 383
},
{
"epoch": 0.36415362731152207,
"grad_norm": 0.7105367439957658,
"learning_rate": 9.986321870618441e-06,
"loss": 0.0685,
"step": 384
},
{
"epoch": 0.36510194404931245,
"grad_norm": 1.802287343988455,
"learning_rate": 9.985910714510908e-06,
"loss": 0.0818,
"step": 385
},
{
"epoch": 0.3660502607871029,
"grad_norm": 0.7732813708584271,
"learning_rate": 9.985493478738275e-06,
"loss": 0.07,
"step": 386
},
{
"epoch": 0.3669985775248933,
"grad_norm": 0.8451643375246307,
"learning_rate": 9.985070163809306e-06,
"loss": 0.0744,
"step": 387
},
{
"epoch": 0.36794689426268373,
"grad_norm": 1.126067442650852,
"learning_rate": 9.984640770240173e-06,
"loss": 0.1101,
"step": 388
},
{
"epoch": 0.3688952110004742,
"grad_norm": 0.6652401258855057,
"learning_rate": 9.984205298554467e-06,
"loss": 0.0663,
"step": 389
},
{
"epoch": 0.36984352773826457,
"grad_norm": 1.0802552975196003,
"learning_rate": 9.983763749283193e-06,
"loss": 0.0975,
"step": 390
},
{
"epoch": 0.370791844476055,
"grad_norm": 0.7496808510910429,
"learning_rate": 9.983316122964757e-06,
"loss": 0.0701,
"step": 391
},
{
"epoch": 0.3717401612138454,
"grad_norm": 0.6248602765248035,
"learning_rate": 9.982862420144986e-06,
"loss": 0.0643,
"step": 392
},
{
"epoch": 0.37268847795163584,
"grad_norm": 1.7058022738803864,
"learning_rate": 9.982402641377105e-06,
"loss": 0.0936,
"step": 393
},
{
"epoch": 0.3736367946894263,
"grad_norm": 1.205579756742393,
"learning_rate": 9.98193678722176e-06,
"loss": 0.0811,
"step": 394
},
{
"epoch": 0.3745851114272167,
"grad_norm": 0.8021701752607538,
"learning_rate": 9.981464858246993e-06,
"loss": 0.0719,
"step": 395
},
{
"epoch": 0.3755334281650071,
"grad_norm": 0.9210208736552777,
"learning_rate": 9.980986855028267e-06,
"loss": 0.0589,
"step": 396
},
{
"epoch": 0.3764817449027975,
"grad_norm": 1.0458476195224804,
"learning_rate": 9.980502778148438e-06,
"loss": 0.0696,
"step": 397
},
{
"epoch": 0.37743006164058795,
"grad_norm": 1.5095103680379303,
"learning_rate": 9.980012628197778e-06,
"loss": 0.0909,
"step": 398
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.9521689001456719,
"learning_rate": 9.979516405773956e-06,
"loss": 0.0844,
"step": 399
},
{
"epoch": 0.3793266951161688,
"grad_norm": 0.9909335290642662,
"learning_rate": 9.979014111482057e-06,
"loss": 0.079,
"step": 400
},
{
"epoch": 0.38027501185395923,
"grad_norm": 1.300023515267878,
"learning_rate": 9.978505745934559e-06,
"loss": 0.1087,
"step": 401
},
{
"epoch": 0.3812233285917496,
"grad_norm": 0.8905160216053487,
"learning_rate": 9.977991309751347e-06,
"loss": 0.0654,
"step": 402
},
{
"epoch": 0.38217164532954007,
"grad_norm": 0.7908744916198801,
"learning_rate": 9.97747080355971e-06,
"loss": 0.0697,
"step": 403
},
{
"epoch": 0.3831199620673305,
"grad_norm": 1.0819522254088034,
"learning_rate": 9.976944227994337e-06,
"loss": 0.0729,
"step": 404
},
{
"epoch": 0.3840682788051209,
"grad_norm": 0.9319836261266163,
"learning_rate": 9.976411583697316e-06,
"loss": 0.077,
"step": 405
},
{
"epoch": 0.38501659554291134,
"grad_norm": 0.7209233770781128,
"learning_rate": 9.97587287131814e-06,
"loss": 0.0708,
"step": 406
},
{
"epoch": 0.38596491228070173,
"grad_norm": 0.8430932582390814,
"learning_rate": 9.975328091513696e-06,
"loss": 0.07,
"step": 407
},
{
"epoch": 0.3869132290184922,
"grad_norm": 0.7932090811238357,
"learning_rate": 9.974777244948271e-06,
"loss": 0.0648,
"step": 408
},
{
"epoch": 0.3878615457562826,
"grad_norm": 0.9213278429313838,
"learning_rate": 9.974220332293554e-06,
"loss": 0.0737,
"step": 409
},
{
"epoch": 0.388809862494073,
"grad_norm": 0.4369389269684112,
"learning_rate": 9.973657354228623e-06,
"loss": 0.0509,
"step": 410
},
{
"epoch": 0.38975817923186346,
"grad_norm": 0.7988805293653696,
"learning_rate": 9.973088311439957e-06,
"loss": 0.0684,
"step": 411
},
{
"epoch": 0.39070649596965384,
"grad_norm": 0.9648310793568026,
"learning_rate": 9.97251320462143e-06,
"loss": 0.0849,
"step": 412
},
{
"epoch": 0.3916548127074443,
"grad_norm": 0.7585613690692753,
"learning_rate": 9.97193203447431e-06,
"loss": 0.077,
"step": 413
},
{
"epoch": 0.39260312944523473,
"grad_norm": 0.9380377046145346,
"learning_rate": 9.971344801707256e-06,
"loss": 0.0771,
"step": 414
},
{
"epoch": 0.3935514461830251,
"grad_norm": 0.9822247506181627,
"learning_rate": 9.970751507036323e-06,
"loss": 0.1123,
"step": 415
},
{
"epoch": 0.39449976292081557,
"grad_norm": 0.7156423865364446,
"learning_rate": 9.970152151184956e-06,
"loss": 0.0801,
"step": 416
},
{
"epoch": 0.39544807965860596,
"grad_norm": 1.05912629502688,
"learning_rate": 9.96954673488399e-06,
"loss": 0.0804,
"step": 417
},
{
"epoch": 0.3963963963963964,
"grad_norm": 1.1230479850270394,
"learning_rate": 9.968935258871652e-06,
"loss": 0.0799,
"step": 418
},
{
"epoch": 0.39734471313418684,
"grad_norm": 1.0054642393242061,
"learning_rate": 9.968317723893556e-06,
"loss": 0.082,
"step": 419
},
{
"epoch": 0.39829302987197723,
"grad_norm": 1.227859524837509,
"learning_rate": 9.967694130702706e-06,
"loss": 0.1069,
"step": 420
},
{
"epoch": 0.3992413466097677,
"grad_norm": 1.2136272659300074,
"learning_rate": 9.96706448005949e-06,
"loss": 0.1112,
"step": 421
},
{
"epoch": 0.40018966334755807,
"grad_norm": 0.9692912194018656,
"learning_rate": 9.96642877273169e-06,
"loss": 0.0837,
"step": 422
},
{
"epoch": 0.4011379800853485,
"grad_norm": 0.7181203670103851,
"learning_rate": 9.965787009494458e-06,
"loss": 0.0648,
"step": 423
},
{
"epoch": 0.40208629682313896,
"grad_norm": 0.9389223502528147,
"learning_rate": 9.96513919113035e-06,
"loss": 0.0846,
"step": 424
},
{
"epoch": 0.40303461356092934,
"grad_norm": 0.6566856036851983,
"learning_rate": 9.964485318429292e-06,
"loss": 0.0776,
"step": 425
},
{
"epoch": 0.4039829302987198,
"grad_norm": 1.0028156563396406,
"learning_rate": 9.963825392188595e-06,
"loss": 0.0719,
"step": 426
},
{
"epoch": 0.4049312470365102,
"grad_norm": 0.9682157984093804,
"learning_rate": 9.963159413212952e-06,
"loss": 0.1058,
"step": 427
},
{
"epoch": 0.4058795637743006,
"grad_norm": 1.1561667939356075,
"learning_rate": 9.96248738231444e-06,
"loss": 0.0982,
"step": 428
},
{
"epoch": 0.406827880512091,
"grad_norm": 0.7960344078481167,
"learning_rate": 9.961809300312512e-06,
"loss": 0.0643,
"step": 429
},
{
"epoch": 0.40777619724988146,
"grad_norm": 0.914323773268032,
"learning_rate": 9.961125168034e-06,
"loss": 0.0835,
"step": 430
},
{
"epoch": 0.4087245139876719,
"grad_norm": 0.7441869330920762,
"learning_rate": 9.960434986313113e-06,
"loss": 0.0559,
"step": 431
},
{
"epoch": 0.4096728307254623,
"grad_norm": 2.4732017252552367,
"learning_rate": 9.959738755991437e-06,
"loss": 0.1445,
"step": 432
},
{
"epoch": 0.41062114746325273,
"grad_norm": 0.8533585342555405,
"learning_rate": 9.959036477917935e-06,
"loss": 0.0575,
"step": 433
},
{
"epoch": 0.4115694642010431,
"grad_norm": 0.8190438451317316,
"learning_rate": 9.95832815294894e-06,
"loss": 0.0794,
"step": 434
},
{
"epoch": 0.41251778093883357,
"grad_norm": 1.0046620676404385,
"learning_rate": 9.957613781948164e-06,
"loss": 0.0686,
"step": 435
},
{
"epoch": 0.413466097676624,
"grad_norm": 0.9887051267008984,
"learning_rate": 9.956893365786691e-06,
"loss": 0.0618,
"step": 436
},
{
"epoch": 0.4144144144144144,
"grad_norm": 0.6105909207601089,
"learning_rate": 9.95616690534297e-06,
"loss": 0.0572,
"step": 437
},
{
"epoch": 0.41536273115220484,
"grad_norm": 1.5234824479103468,
"learning_rate": 9.955434401502825e-06,
"loss": 0.0994,
"step": 438
},
{
"epoch": 0.41631104788999523,
"grad_norm": 1.1295839815001452,
"learning_rate": 9.954695855159454e-06,
"loss": 0.073,
"step": 439
},
{
"epoch": 0.4172593646277857,
"grad_norm": 0.6583329952843571,
"learning_rate": 9.95395126721341e-06,
"loss": 0.0699,
"step": 440
},
{
"epoch": 0.4182076813655761,
"grad_norm": 0.955937586299997,
"learning_rate": 9.953200638572625e-06,
"loss": 0.0815,
"step": 441
},
{
"epoch": 0.4191559981033665,
"grad_norm": 1.5323108400108396,
"learning_rate": 9.95244397015239e-06,
"loss": 0.0732,
"step": 442
},
{
"epoch": 0.42010431484115696,
"grad_norm": 1.677920724371183,
"learning_rate": 9.951681262875365e-06,
"loss": 0.0944,
"step": 443
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.8926328574943209,
"learning_rate": 9.95091251767157e-06,
"loss": 0.0731,
"step": 444
},
{
"epoch": 0.4220009483167378,
"grad_norm": 1.2692898943255297,
"learning_rate": 9.950137735478389e-06,
"loss": 0.1029,
"step": 445
},
{
"epoch": 0.42294926505452823,
"grad_norm": 0.7345506207483801,
"learning_rate": 9.949356917240569e-06,
"loss": 0.0748,
"step": 446
},
{
"epoch": 0.4238975817923186,
"grad_norm": 1.2435473519034808,
"learning_rate": 9.948570063910216e-06,
"loss": 0.1009,
"step": 447
},
{
"epoch": 0.42484589853010907,
"grad_norm": 0.7650866909769807,
"learning_rate": 9.947777176446792e-06,
"loss": 0.0746,
"step": 448
},
{
"epoch": 0.42579421526789946,
"grad_norm": 1.3807429981979404,
"learning_rate": 9.946978255817121e-06,
"loss": 0.0701,
"step": 449
},
{
"epoch": 0.4267425320056899,
"grad_norm": 0.5315623424461096,
"learning_rate": 9.946173302995382e-06,
"loss": 0.0574,
"step": 450
},
{
"epoch": 0.42769084874348035,
"grad_norm": 0.8562951763201797,
"learning_rate": 9.94536231896311e-06,
"loss": 0.0951,
"step": 451
},
{
"epoch": 0.42863916548127073,
"grad_norm": 1.1965590998104225,
"learning_rate": 9.944545304709192e-06,
"loss": 0.0877,
"step": 452
},
{
"epoch": 0.4295874822190612,
"grad_norm": 1.2735339749816497,
"learning_rate": 9.943722261229872e-06,
"loss": 0.0768,
"step": 453
},
{
"epoch": 0.43053579895685157,
"grad_norm": 0.9370658659046329,
"learning_rate": 9.942893189528743e-06,
"loss": 0.0782,
"step": 454
},
{
"epoch": 0.431484115694642,
"grad_norm": 1.5520551397042521,
"learning_rate": 9.942058090616748e-06,
"loss": 0.1039,
"step": 455
},
{
"epoch": 0.43243243243243246,
"grad_norm": 1.3529615602541014,
"learning_rate": 9.941216965512183e-06,
"loss": 0.0867,
"step": 456
},
{
"epoch": 0.43338074917022285,
"grad_norm": 1.192234505990805,
"learning_rate": 9.940369815240688e-06,
"loss": 0.0809,
"step": 457
},
{
"epoch": 0.4343290659080133,
"grad_norm": 0.9763205758532367,
"learning_rate": 9.939516640835254e-06,
"loss": 0.0652,
"step": 458
},
{
"epoch": 0.4352773826458037,
"grad_norm": 1.3415645605638937,
"learning_rate": 9.938657443336212e-06,
"loss": 0.109,
"step": 459
},
{
"epoch": 0.4362256993835941,
"grad_norm": 1.1595154129634277,
"learning_rate": 9.937792223791244e-06,
"loss": 0.1002,
"step": 460
},
{
"epoch": 0.43717401612138457,
"grad_norm": 1.33436975844217,
"learning_rate": 9.936920983255372e-06,
"loss": 0.114,
"step": 461
},
{
"epoch": 0.43812233285917496,
"grad_norm": 1.0009653043703806,
"learning_rate": 9.936043722790956e-06,
"loss": 0.0827,
"step": 462
},
{
"epoch": 0.4390706495969654,
"grad_norm": 1.1900315382859075,
"learning_rate": 9.935160443467704e-06,
"loss": 0.0991,
"step": 463
},
{
"epoch": 0.4400189663347558,
"grad_norm": 0.7796648666540394,
"learning_rate": 9.934271146362658e-06,
"loss": 0.0729,
"step": 464
},
{
"epoch": 0.44096728307254623,
"grad_norm": 0.7692033539386839,
"learning_rate": 9.933375832560199e-06,
"loss": 0.0752,
"step": 465
},
{
"epoch": 0.4419155998103367,
"grad_norm": 0.7898679053377281,
"learning_rate": 9.932474503152047e-06,
"loss": 0.0557,
"step": 466
},
{
"epoch": 0.44286391654812707,
"grad_norm": 1.308054442070126,
"learning_rate": 9.931567159237252e-06,
"loss": 0.1,
"step": 467
},
{
"epoch": 0.4438122332859175,
"grad_norm": 0.8281027248286734,
"learning_rate": 9.930653801922205e-06,
"loss": 0.1066,
"step": 468
},
{
"epoch": 0.4447605500237079,
"grad_norm": 0.6589498594732086,
"learning_rate": 9.929734432320621e-06,
"loss": 0.061,
"step": 469
},
{
"epoch": 0.44570886676149835,
"grad_norm": 1.0105820136512023,
"learning_rate": 9.928809051553554e-06,
"loss": 0.0771,
"step": 470
},
{
"epoch": 0.4466571834992888,
"grad_norm": 1.174475732403723,
"learning_rate": 9.927877660749385e-06,
"loss": 0.1029,
"step": 471
},
{
"epoch": 0.4476055002370792,
"grad_norm": 0.7007588523937572,
"learning_rate": 9.92694026104382e-06,
"loss": 0.0548,
"step": 472
},
{
"epoch": 0.4485538169748696,
"grad_norm": 0.7548622992450297,
"learning_rate": 9.925996853579897e-06,
"loss": 0.071,
"step": 473
},
{
"epoch": 0.44950213371266,
"grad_norm": 0.9151211373906433,
"learning_rate": 9.92504743950798e-06,
"loss": 0.0728,
"step": 474
},
{
"epoch": 0.45045045045045046,
"grad_norm": 1.3188113799099948,
"learning_rate": 9.924092019985751e-06,
"loss": 0.071,
"step": 475
},
{
"epoch": 0.45139876718824085,
"grad_norm": 0.834826643366671,
"learning_rate": 9.923130596178221e-06,
"loss": 0.0827,
"step": 476
},
{
"epoch": 0.4523470839260313,
"grad_norm": 0.8853088211117691,
"learning_rate": 9.922163169257722e-06,
"loss": 0.0714,
"step": 477
},
{
"epoch": 0.45329540066382173,
"grad_norm": 0.9773650061711494,
"learning_rate": 9.921189740403902e-06,
"loss": 0.0902,
"step": 478
},
{
"epoch": 0.4542437174016121,
"grad_norm": 0.8530429782086267,
"learning_rate": 9.92021031080373e-06,
"loss": 0.0896,
"step": 479
},
{
"epoch": 0.45519203413940257,
"grad_norm": 0.6841245724165017,
"learning_rate": 9.919224881651494e-06,
"loss": 0.0574,
"step": 480
},
{
"epoch": 0.45614035087719296,
"grad_norm": 0.8751901827667304,
"learning_rate": 9.918233454148795e-06,
"loss": 0.0712,
"step": 481
},
{
"epoch": 0.4570886676149834,
"grad_norm": 0.8605318101074332,
"learning_rate": 9.917236029504549e-06,
"loss": 0.0758,
"step": 482
},
{
"epoch": 0.45803698435277385,
"grad_norm": 0.6297402738230038,
"learning_rate": 9.916232608934982e-06,
"loss": 0.0835,
"step": 483
},
{
"epoch": 0.45898530109056423,
"grad_norm": 1.2633792305334934,
"learning_rate": 9.915223193663639e-06,
"loss": 0.097,
"step": 484
},
{
"epoch": 0.4599336178283547,
"grad_norm": 0.9453282561376489,
"learning_rate": 9.914207784921366e-06,
"loss": 0.0813,
"step": 485
},
{
"epoch": 0.46088193456614507,
"grad_norm": 1.0981998450683066,
"learning_rate": 9.913186383946322e-06,
"loss": 0.0831,
"step": 486
},
{
"epoch": 0.4618302513039355,
"grad_norm": 0.9453607555522517,
"learning_rate": 9.91215899198397e-06,
"loss": 0.0668,
"step": 487
},
{
"epoch": 0.46277856804172596,
"grad_norm": 0.8480655824160724,
"learning_rate": 9.911125610287085e-06,
"loss": 0.0803,
"step": 488
},
{
"epoch": 0.46372688477951635,
"grad_norm": 0.7365032755805906,
"learning_rate": 9.910086240115738e-06,
"loss": 0.0503,
"step": 489
},
{
"epoch": 0.4646752015173068,
"grad_norm": 0.9926545138390478,
"learning_rate": 9.909040882737301e-06,
"loss": 0.0785,
"step": 490
},
{
"epoch": 0.4656235182550972,
"grad_norm": 1.078153469225969,
"learning_rate": 9.907989539426455e-06,
"loss": 0.0942,
"step": 491
},
{
"epoch": 0.4665718349928876,
"grad_norm": 0.891582918999742,
"learning_rate": 9.906932211465173e-06,
"loss": 0.0713,
"step": 492
},
{
"epoch": 0.46752015173067807,
"grad_norm": 0.8352029023952229,
"learning_rate": 9.90586890014273e-06,
"loss": 0.0871,
"step": 493
},
{
"epoch": 0.46846846846846846,
"grad_norm": 1.4543230270611818,
"learning_rate": 9.904799606755695e-06,
"loss": 0.1049,
"step": 494
},
{
"epoch": 0.4694167852062589,
"grad_norm": 0.9571877161884975,
"learning_rate": 9.90372433260793e-06,
"loss": 0.0856,
"step": 495
},
{
"epoch": 0.4703651019440493,
"grad_norm": 0.6657483404024113,
"learning_rate": 9.90264307901059e-06,
"loss": 0.0631,
"step": 496
},
{
"epoch": 0.47131341868183974,
"grad_norm": 1.2493973473928695,
"learning_rate": 9.901555847282123e-06,
"loss": 0.0973,
"step": 497
},
{
"epoch": 0.4722617354196302,
"grad_norm": 0.6689914382563446,
"learning_rate": 9.900462638748266e-06,
"loss": 0.0582,
"step": 498
},
{
"epoch": 0.47321005215742057,
"grad_norm": 0.8246501895880392,
"learning_rate": 9.899363454742044e-06,
"loss": 0.0727,
"step": 499
},
{
"epoch": 0.474158368895211,
"grad_norm": 1.442170890658491,
"learning_rate": 9.898258296603769e-06,
"loss": 0.0931,
"step": 500
},
{
"epoch": 0.4751066856330014,
"grad_norm": 0.7582565389247256,
"learning_rate": 9.897147165681034e-06,
"loss": 0.0722,
"step": 501
},
{
"epoch": 0.47605500237079185,
"grad_norm": 0.627525129279453,
"learning_rate": 9.896030063328718e-06,
"loss": 0.0597,
"step": 502
},
{
"epoch": 0.4770033191085823,
"grad_norm": 0.6342149242840518,
"learning_rate": 9.894906990908982e-06,
"loss": 0.0725,
"step": 503
},
{
"epoch": 0.4779516358463727,
"grad_norm": 0.8212079234115165,
"learning_rate": 9.893777949791266e-06,
"loss": 0.0649,
"step": 504
},
{
"epoch": 0.4788999525841631,
"grad_norm": 0.8923951454231676,
"learning_rate": 9.89264294135229e-06,
"loss": 0.0595,
"step": 505
},
{
"epoch": 0.4798482693219535,
"grad_norm": 1.0318440665130484,
"learning_rate": 9.891501966976041e-06,
"loss": 0.0842,
"step": 506
},
{
"epoch": 0.48079658605974396,
"grad_norm": 0.6944537972828242,
"learning_rate": 9.890355028053793e-06,
"loss": 0.0752,
"step": 507
},
{
"epoch": 0.4817449027975344,
"grad_norm": 1.0705584030604105,
"learning_rate": 9.889202125984088e-06,
"loss": 0.0647,
"step": 508
},
{
"epoch": 0.4826932195353248,
"grad_norm": 0.9754252622446561,
"learning_rate": 9.88804326217274e-06,
"loss": 0.0687,
"step": 509
},
{
"epoch": 0.48364153627311524,
"grad_norm": 0.9660762094606946,
"learning_rate": 9.886878438032828e-06,
"loss": 0.0789,
"step": 510
},
{
"epoch": 0.4845898530109056,
"grad_norm": 0.5832722133461282,
"learning_rate": 9.885707654984703e-06,
"loss": 0.0636,
"step": 511
},
{
"epoch": 0.48553816974869607,
"grad_norm": 0.7052006552554221,
"learning_rate": 9.884530914455984e-06,
"loss": 0.0586,
"step": 512
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.9822072228951928,
"learning_rate": 9.88334821788155e-06,
"loss": 0.0645,
"step": 513
},
{
"epoch": 0.4874348032242769,
"grad_norm": 0.9641946540266126,
"learning_rate": 9.882159566703547e-06,
"loss": 0.0885,
"step": 514
},
{
"epoch": 0.48838311996206735,
"grad_norm": 0.6403136140606015,
"learning_rate": 9.880964962371378e-06,
"loss": 0.0678,
"step": 515
},
{
"epoch": 0.48933143669985774,
"grad_norm": 0.7486541793123711,
"learning_rate": 9.879764406341705e-06,
"loss": 0.0741,
"step": 516
},
{
"epoch": 0.4902797534376482,
"grad_norm": 0.5779229700891555,
"learning_rate": 9.87855790007845e-06,
"loss": 0.0646,
"step": 517
},
{
"epoch": 0.49122807017543857,
"grad_norm": 0.7611283230447122,
"learning_rate": 9.87734544505279e-06,
"loss": 0.0768,
"step": 518
},
{
"epoch": 0.492176386913229,
"grad_norm": 0.5823535883100547,
"learning_rate": 9.876127042743155e-06,
"loss": 0.0703,
"step": 519
},
{
"epoch": 0.49312470365101946,
"grad_norm": 0.6827829977739827,
"learning_rate": 9.874902694635226e-06,
"loss": 0.0772,
"step": 520
},
{
"epoch": 0.49407302038880985,
"grad_norm": 0.7254200544564426,
"learning_rate": 9.873672402221937e-06,
"loss": 0.0634,
"step": 521
},
{
"epoch": 0.4950213371266003,
"grad_norm": 0.6425214796651868,
"learning_rate": 9.872436167003468e-06,
"loss": 0.064,
"step": 522
},
{
"epoch": 0.4959696538643907,
"grad_norm": 0.623192525545158,
"learning_rate": 9.871193990487242e-06,
"loss": 0.077,
"step": 523
},
{
"epoch": 0.4969179706021811,
"grad_norm": 0.7225947749173619,
"learning_rate": 9.869945874187936e-06,
"loss": 0.075,
"step": 524
},
{
"epoch": 0.49786628733997157,
"grad_norm": 2.0516616577595435,
"learning_rate": 9.868691819627462e-06,
"loss": 0.0867,
"step": 525
},
{
"epoch": 0.49881460407776196,
"grad_norm": 1.0257158284306434,
"learning_rate": 9.867431828334974e-06,
"loss": 0.0588,
"step": 526
},
{
"epoch": 0.4997629208155524,
"grad_norm": 0.8403229438927825,
"learning_rate": 9.86616590184687e-06,
"loss": 0.0823,
"step": 527
},
{
"epoch": 0.5007112375533428,
"grad_norm": 0.6449240492145598,
"learning_rate": 9.864894041706779e-06,
"loss": 0.0567,
"step": 528
},
{
"epoch": 0.5016595542911333,
"grad_norm": 0.8789018684523284,
"learning_rate": 9.863616249465567e-06,
"loss": 0.0713,
"step": 529
},
{
"epoch": 0.5026078710289237,
"grad_norm": 0.9524887983478211,
"learning_rate": 9.862332526681336e-06,
"loss": 0.0835,
"step": 530
},
{
"epoch": 0.5035561877667141,
"grad_norm": 0.6422268170348604,
"learning_rate": 9.861042874919417e-06,
"loss": 0.0606,
"step": 531
},
{
"epoch": 0.5045045045045045,
"grad_norm": 0.9032374038451735,
"learning_rate": 9.859747295752374e-06,
"loss": 0.0773,
"step": 532
},
{
"epoch": 0.505452821242295,
"grad_norm": 0.9269404822199643,
"learning_rate": 9.858445790759992e-06,
"loss": 0.0822,
"step": 533
},
{
"epoch": 0.5064011379800853,
"grad_norm": 0.7043514434980399,
"learning_rate": 9.857138361529288e-06,
"loss": 0.0688,
"step": 534
},
{
"epoch": 0.5073494547178757,
"grad_norm": 0.8239211698855243,
"learning_rate": 9.8558250096545e-06,
"loss": 0.0542,
"step": 535
},
{
"epoch": 0.5082977714556662,
"grad_norm": 0.8633975590563754,
"learning_rate": 9.85450573673709e-06,
"loss": 0.0744,
"step": 536
},
{
"epoch": 0.5092460881934566,
"grad_norm": 0.6985004021466871,
"learning_rate": 9.853180544385737e-06,
"loss": 0.047,
"step": 537
},
{
"epoch": 0.510194404931247,
"grad_norm": 0.5889042803503781,
"learning_rate": 9.851849434216338e-06,
"loss": 0.0557,
"step": 538
},
{
"epoch": 0.5111427216690374,
"grad_norm": 0.7765705663935071,
"learning_rate": 9.850512407852012e-06,
"loss": 0.0669,
"step": 539
},
{
"epoch": 0.5120910384068279,
"grad_norm": 0.8204550382112847,
"learning_rate": 9.849169466923086e-06,
"loss": 0.0685,
"step": 540
},
{
"epoch": 0.5130393551446183,
"grad_norm": 0.5256883407913393,
"learning_rate": 9.847820613067098e-06,
"loss": 0.0537,
"step": 541
},
{
"epoch": 0.5139876718824087,
"grad_norm": 0.6838576750776693,
"learning_rate": 9.8464658479288e-06,
"loss": 0.0704,
"step": 542
},
{
"epoch": 0.5149359886201992,
"grad_norm": 0.8974806559813661,
"learning_rate": 9.845105173160152e-06,
"loss": 0.0899,
"step": 543
},
{
"epoch": 0.5158843053579896,
"grad_norm": 0.7219053990698988,
"learning_rate": 9.843738590420317e-06,
"loss": 0.0468,
"step": 544
},
{
"epoch": 0.51683262209578,
"grad_norm": 1.032987889739876,
"learning_rate": 9.842366101375664e-06,
"loss": 0.0562,
"step": 545
},
{
"epoch": 0.5177809388335705,
"grad_norm": 0.7651951768284668,
"learning_rate": 9.840987707699765e-06,
"loss": 0.0669,
"step": 546
},
{
"epoch": 0.5187292555713608,
"grad_norm": 0.6813496832389402,
"learning_rate": 9.839603411073388e-06,
"loss": 0.0706,
"step": 547
},
{
"epoch": 0.5196775723091512,
"grad_norm": 0.7229692269198181,
"learning_rate": 9.838213213184505e-06,
"loss": 0.0771,
"step": 548
},
{
"epoch": 0.5206258890469416,
"grad_norm": 1.157471128375012,
"learning_rate": 9.836817115728277e-06,
"loss": 0.0932,
"step": 549
},
{
"epoch": 0.5215742057847321,
"grad_norm": 0.8058138449457062,
"learning_rate": 9.835415120407063e-06,
"loss": 0.0539,
"step": 550
},
{
"epoch": 0.5225225225225225,
"grad_norm": 0.6915528599019737,
"learning_rate": 9.834007228930414e-06,
"loss": 0.0688,
"step": 551
},
{
"epoch": 0.5234708392603129,
"grad_norm": 0.8835152385091712,
"learning_rate": 9.832593443015068e-06,
"loss": 0.0605,
"step": 552
},
{
"epoch": 0.5244191559981034,
"grad_norm": 0.6896706794263241,
"learning_rate": 9.83117376438495e-06,
"loss": 0.0668,
"step": 553
},
{
"epoch": 0.5253674727358938,
"grad_norm": 0.7651857964351815,
"learning_rate": 9.829748194771175e-06,
"loss": 0.064,
"step": 554
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.6216741056003758,
"learning_rate": 9.828316735912037e-06,
"loss": 0.0541,
"step": 555
},
{
"epoch": 0.5272641062114747,
"grad_norm": 0.6813673301708452,
"learning_rate": 9.826879389553014e-06,
"loss": 0.0574,
"step": 556
},
{
"epoch": 0.5282124229492651,
"grad_norm": 0.7147998418504048,
"learning_rate": 9.825436157446761e-06,
"loss": 0.0576,
"step": 557
},
{
"epoch": 0.5291607396870555,
"grad_norm": 0.6352148290105686,
"learning_rate": 9.82398704135311e-06,
"loss": 0.066,
"step": 558
},
{
"epoch": 0.5301090564248458,
"grad_norm": 0.8511240887028577,
"learning_rate": 9.822532043039068e-06,
"loss": 0.0687,
"step": 559
},
{
"epoch": 0.5310573731626363,
"grad_norm": 0.6876408977841421,
"learning_rate": 9.821071164278815e-06,
"loss": 0.0838,
"step": 560
},
{
"epoch": 0.5320056899004267,
"grad_norm": 0.7354217835184531,
"learning_rate": 9.819604406853703e-06,
"loss": 0.0552,
"step": 561
},
{
"epoch": 0.5329540066382171,
"grad_norm": 0.9572067784227991,
"learning_rate": 9.818131772552249e-06,
"loss": 0.1099,
"step": 562
},
{
"epoch": 0.5339023233760076,
"grad_norm": 0.7931127239607592,
"learning_rate": 9.816653263170137e-06,
"loss": 0.0706,
"step": 563
},
{
"epoch": 0.534850640113798,
"grad_norm": 0.8242420526129728,
"learning_rate": 9.815168880510218e-06,
"loss": 0.0946,
"step": 564
},
{
"epoch": 0.5357989568515884,
"grad_norm": 1.0330372476146157,
"learning_rate": 9.8136786263825e-06,
"loss": 0.0951,
"step": 565
},
{
"epoch": 0.5367472735893789,
"grad_norm": 0.7553297432270302,
"learning_rate": 9.812182502604151e-06,
"loss": 0.0663,
"step": 566
},
{
"epoch": 0.5376955903271693,
"grad_norm": 0.8446853429895546,
"learning_rate": 9.810680510999505e-06,
"loss": 0.0728,
"step": 567
},
{
"epoch": 0.5386439070649597,
"grad_norm": 0.5089680701907852,
"learning_rate": 9.809172653400036e-06,
"loss": 0.0501,
"step": 568
},
{
"epoch": 0.5395922238027501,
"grad_norm": 0.7258180066288827,
"learning_rate": 9.807658931644382e-06,
"loss": 0.0752,
"step": 569
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.7028402619162881,
"learning_rate": 9.806139347578331e-06,
"loss": 0.059,
"step": 570
},
{
"epoch": 0.541488857278331,
"grad_norm": 0.7248854010393692,
"learning_rate": 9.804613903054813e-06,
"loss": 0.0851,
"step": 571
},
{
"epoch": 0.5424371740161213,
"grad_norm": 0.7176555652391681,
"learning_rate": 9.803082599933911e-06,
"loss": 0.0697,
"step": 572
},
{
"epoch": 0.5433854907539118,
"grad_norm": 0.4808404612456389,
"learning_rate": 9.801545440082845e-06,
"loss": 0.0569,
"step": 573
},
{
"epoch": 0.5443338074917022,
"grad_norm": 0.8731137568130377,
"learning_rate": 9.800002425375984e-06,
"loss": 0.0657,
"step": 574
},
{
"epoch": 0.5452821242294926,
"grad_norm": 0.7816194292982013,
"learning_rate": 9.798453557694828e-06,
"loss": 0.0724,
"step": 575
},
{
"epoch": 0.5462304409672831,
"grad_norm": 0.9042436959378762,
"learning_rate": 9.796898838928022e-06,
"loss": 0.0784,
"step": 576
},
{
"epoch": 0.5471787577050735,
"grad_norm": 1.0293154765529384,
"learning_rate": 9.79533827097134e-06,
"loss": 0.098,
"step": 577
},
{
"epoch": 0.5481270744428639,
"grad_norm": 0.8678391414260259,
"learning_rate": 9.793771855727691e-06,
"loss": 0.0635,
"step": 578
},
{
"epoch": 0.5490753911806543,
"grad_norm": 0.6041409950077287,
"learning_rate": 9.792199595107115e-06,
"loss": 0.0524,
"step": 579
},
{
"epoch": 0.5500237079184448,
"grad_norm": 1.0292476772898875,
"learning_rate": 9.790621491026773e-06,
"loss": 0.0829,
"step": 580
},
{
"epoch": 0.5509720246562352,
"grad_norm": 0.7074515600768486,
"learning_rate": 9.78903754541096e-06,
"loss": 0.0704,
"step": 581
},
{
"epoch": 0.5519203413940256,
"grad_norm": 0.7603340975922476,
"learning_rate": 9.787447760191092e-06,
"loss": 0.0788,
"step": 582
},
{
"epoch": 0.5528686581318161,
"grad_norm": 1.0766706695954442,
"learning_rate": 9.785852137305699e-06,
"loss": 0.079,
"step": 583
},
{
"epoch": 0.5538169748696065,
"grad_norm": 0.7555731931730972,
"learning_rate": 9.784250678700435e-06,
"loss": 0.0705,
"step": 584
},
{
"epoch": 0.5547652916073968,
"grad_norm": 0.7010961175305198,
"learning_rate": 9.782643386328073e-06,
"loss": 0.0713,
"step": 585
},
{
"epoch": 0.5557136083451872,
"grad_norm": 1.0580272254821363,
"learning_rate": 9.781030262148492e-06,
"loss": 0.0671,
"step": 586
},
{
"epoch": 0.5566619250829777,
"grad_norm": 0.6594876081209583,
"learning_rate": 9.779411308128685e-06,
"loss": 0.0867,
"step": 587
},
{
"epoch": 0.5576102418207681,
"grad_norm": 1.3649847896410103,
"learning_rate": 9.777786526242759e-06,
"loss": 0.0847,
"step": 588
},
{
"epoch": 0.5585585585585585,
"grad_norm": 0.6223880228627037,
"learning_rate": 9.776155918471916e-06,
"loss": 0.0579,
"step": 589
},
{
"epoch": 0.559506875296349,
"grad_norm": 0.6862572922646061,
"learning_rate": 9.774519486804476e-06,
"loss": 0.053,
"step": 590
},
{
"epoch": 0.5604551920341394,
"grad_norm": 0.6562455064809456,
"learning_rate": 9.772877233235848e-06,
"loss": 0.0651,
"step": 591
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.7150505236504866,
"learning_rate": 9.771229159768547e-06,
"loss": 0.0697,
"step": 592
},
{
"epoch": 0.5623518255097203,
"grad_norm": 0.7505406859172821,
"learning_rate": 9.769575268412182e-06,
"loss": 0.0691,
"step": 593
},
{
"epoch": 0.5633001422475107,
"grad_norm": 0.7340490905887499,
"learning_rate": 9.767915561183456e-06,
"loss": 0.0748,
"step": 594
},
{
"epoch": 0.5642484589853011,
"grad_norm": 0.7987611706335997,
"learning_rate": 9.766250040106166e-06,
"loss": 0.0682,
"step": 595
},
{
"epoch": 0.5651967757230915,
"grad_norm": 1.2974449597341617,
"learning_rate": 9.764578707211199e-06,
"loss": 0.0751,
"step": 596
},
{
"epoch": 0.566145092460882,
"grad_norm": 0.6191420122018653,
"learning_rate": 9.762901564536523e-06,
"loss": 0.0667,
"step": 597
},
{
"epoch": 0.5670934091986723,
"grad_norm": 0.6903639931399153,
"learning_rate": 9.761218614127193e-06,
"loss": 0.0653,
"step": 598
},
{
"epoch": 0.5680417259364627,
"grad_norm": 0.7974449669867185,
"learning_rate": 9.759529858035351e-06,
"loss": 0.0662,
"step": 599
},
{
"epoch": 0.5689900426742532,
"grad_norm": 1.6445977802603875,
"learning_rate": 9.75783529832021e-06,
"loss": 0.0781,
"step": 600
},
{
"epoch": 0.5699383594120436,
"grad_norm": 0.7682344601188886,
"learning_rate": 9.756134937048066e-06,
"loss": 0.0516,
"step": 601
},
{
"epoch": 0.570886676149834,
"grad_norm": 0.6505039594954853,
"learning_rate": 9.754428776292287e-06,
"loss": 0.0522,
"step": 602
},
{
"epoch": 0.5718349928876245,
"grad_norm": 1.0748139183671632,
"learning_rate": 9.752716818133309e-06,
"loss": 0.0787,
"step": 603
},
{
"epoch": 0.5727833096254149,
"grad_norm": 0.7575374337239762,
"learning_rate": 9.750999064658644e-06,
"loss": 0.0618,
"step": 604
},
{
"epoch": 0.5737316263632053,
"grad_norm": 0.5005741056916544,
"learning_rate": 9.749275517962868e-06,
"loss": 0.0579,
"step": 605
},
{
"epoch": 0.5746799431009957,
"grad_norm": 0.9747236186565804,
"learning_rate": 9.747546180147618e-06,
"loss": 0.1137,
"step": 606
},
{
"epoch": 0.5756282598387862,
"grad_norm": 0.5945741852680105,
"learning_rate": 9.745811053321597e-06,
"loss": 0.0528,
"step": 607
},
{
"epoch": 0.5765765765765766,
"grad_norm": 0.8767385416979725,
"learning_rate": 9.744070139600564e-06,
"loss": 0.0756,
"step": 608
},
{
"epoch": 0.577524893314367,
"grad_norm": 0.805183732938404,
"learning_rate": 9.742323441107335e-06,
"loss": 0.0796,
"step": 609
},
{
"epoch": 0.5784732100521575,
"grad_norm": 0.4622182813428181,
"learning_rate": 9.74057095997178e-06,
"loss": 0.0466,
"step": 610
},
{
"epoch": 0.5794215267899478,
"grad_norm": 1.323185570736391,
"learning_rate": 9.738812698330821e-06,
"loss": 0.0803,
"step": 611
},
{
"epoch": 0.5803698435277382,
"grad_norm": 0.6017510939556475,
"learning_rate": 9.737048658328428e-06,
"loss": 0.0473,
"step": 612
},
{
"epoch": 0.5813181602655287,
"grad_norm": 0.9340483579893749,
"learning_rate": 9.735278842115616e-06,
"loss": 0.0726,
"step": 613
},
{
"epoch": 0.5822664770033191,
"grad_norm": 0.8017302866486061,
"learning_rate": 9.733503251850443e-06,
"loss": 0.0508,
"step": 614
},
{
"epoch": 0.5832147937411095,
"grad_norm": 0.4915103436956615,
"learning_rate": 9.73172188969801e-06,
"loss": 0.0511,
"step": 615
},
{
"epoch": 0.5841631104788999,
"grad_norm": 0.5454251857464146,
"learning_rate": 9.729934757830455e-06,
"loss": 0.043,
"step": 616
},
{
"epoch": 0.5851114272166904,
"grad_norm": 0.45382702737394764,
"learning_rate": 9.728141858426953e-06,
"loss": 0.046,
"step": 617
},
{
"epoch": 0.5860597439544808,
"grad_norm": 0.5609546349379012,
"learning_rate": 9.726343193673707e-06,
"loss": 0.0528,
"step": 618
},
{
"epoch": 0.5870080606922712,
"grad_norm": 0.600673482298699,
"learning_rate": 9.724538765763953e-06,
"loss": 0.0539,
"step": 619
},
{
"epoch": 0.5879563774300617,
"grad_norm": 0.9417089865736203,
"learning_rate": 9.722728576897956e-06,
"loss": 0.0583,
"step": 620
},
{
"epoch": 0.5889046941678521,
"grad_norm": 0.4653439643190733,
"learning_rate": 9.720912629283004e-06,
"loss": 0.05,
"step": 621
},
{
"epoch": 0.5898530109056425,
"grad_norm": 1.026549188147293,
"learning_rate": 9.719090925133408e-06,
"loss": 0.0643,
"step": 622
},
{
"epoch": 0.590801327643433,
"grad_norm": 0.7947545630855374,
"learning_rate": 9.717263466670496e-06,
"loss": 0.0827,
"step": 623
},
{
"epoch": 0.5917496443812233,
"grad_norm": 0.5505357789361721,
"learning_rate": 9.715430256122616e-06,
"loss": 0.057,
"step": 624
},
{
"epoch": 0.5926979611190137,
"grad_norm": 0.6227650085275758,
"learning_rate": 9.713591295725126e-06,
"loss": 0.0613,
"step": 625
},
{
"epoch": 0.5936462778568041,
"grad_norm": 0.8089764410308476,
"learning_rate": 9.711746587720398e-06,
"loss": 0.0575,
"step": 626
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.8681782262186932,
"learning_rate": 9.709896134357815e-06,
"loss": 0.0664,
"step": 627
},
{
"epoch": 0.595542911332385,
"grad_norm": 0.682165737662686,
"learning_rate": 9.708039937893759e-06,
"loss": 0.0558,
"step": 628
},
{
"epoch": 0.5964912280701754,
"grad_norm": 0.6331915650172267,
"learning_rate": 9.706178000591617e-06,
"loss": 0.0628,
"step": 629
},
{
"epoch": 0.5974395448079659,
"grad_norm": 0.5712611189361939,
"learning_rate": 9.704310324721782e-06,
"loss": 0.0741,
"step": 630
},
{
"epoch": 0.5983878615457563,
"grad_norm": 0.4974903145873453,
"learning_rate": 9.70243691256164e-06,
"loss": 0.0569,
"step": 631
},
{
"epoch": 0.5993361782835467,
"grad_norm": 0.8755421451427193,
"learning_rate": 9.700557766395567e-06,
"loss": 0.0884,
"step": 632
},
{
"epoch": 0.6002844950213371,
"grad_norm": 0.5236784076286586,
"learning_rate": 9.698672888514938e-06,
"loss": 0.0493,
"step": 633
},
{
"epoch": 0.6012328117591276,
"grad_norm": 0.6525012362182552,
"learning_rate": 9.696782281218117e-06,
"loss": 0.0683,
"step": 634
},
{
"epoch": 0.602181128496918,
"grad_norm": 0.5119217968942416,
"learning_rate": 9.69488594681045e-06,
"loss": 0.0449,
"step": 635
},
{
"epoch": 0.6031294452347084,
"grad_norm": 0.6576021927278618,
"learning_rate": 9.692983887604269e-06,
"loss": 0.0674,
"step": 636
},
{
"epoch": 0.6040777619724989,
"grad_norm": 0.7157400695119305,
"learning_rate": 9.691076105918885e-06,
"loss": 0.0692,
"step": 637
},
{
"epoch": 0.6050260787102892,
"grad_norm": 0.873028935018846,
"learning_rate": 9.689162604080589e-06,
"loss": 0.0999,
"step": 638
},
{
"epoch": 0.6059743954480796,
"grad_norm": 0.8384167589559871,
"learning_rate": 9.687243384422646e-06,
"loss": 0.0771,
"step": 639
},
{
"epoch": 0.6069227121858701,
"grad_norm": 0.5020655439555515,
"learning_rate": 9.685318449285292e-06,
"loss": 0.0512,
"step": 640
},
{
"epoch": 0.6078710289236605,
"grad_norm": 0.36608001502573706,
"learning_rate": 9.683387801015733e-06,
"loss": 0.0377,
"step": 641
},
{
"epoch": 0.6088193456614509,
"grad_norm": 0.7919506442179929,
"learning_rate": 9.681451441968144e-06,
"loss": 0.0775,
"step": 642
},
{
"epoch": 0.6097676623992413,
"grad_norm": 0.6274619623629013,
"learning_rate": 9.67950937450366e-06,
"loss": 0.0645,
"step": 643
},
{
"epoch": 0.6107159791370318,
"grad_norm": 0.5896565427831529,
"learning_rate": 9.677561600990378e-06,
"loss": 0.0595,
"step": 644
},
{
"epoch": 0.6116642958748222,
"grad_norm": 0.5142338666265971,
"learning_rate": 9.67560812380335e-06,
"loss": 0.0597,
"step": 645
},
{
"epoch": 0.6126126126126126,
"grad_norm": 0.6109668570207277,
"learning_rate": 9.67364894532459e-06,
"loss": 0.07,
"step": 646
},
{
"epoch": 0.6135609293504031,
"grad_norm": 0.6756478515313759,
"learning_rate": 9.671684067943056e-06,
"loss": 0.0612,
"step": 647
},
{
"epoch": 0.6145092460881935,
"grad_norm": 0.6142876685386528,
"learning_rate": 9.669713494054662e-06,
"loss": 0.06,
"step": 648
},
{
"epoch": 0.6154575628259839,
"grad_norm": 0.8252522199066464,
"learning_rate": 9.667737226062262e-06,
"loss": 0.118,
"step": 649
},
{
"epoch": 0.6164058795637744,
"grad_norm": 0.48924053020562824,
"learning_rate": 9.665755266375657e-06,
"loss": 0.0542,
"step": 650
},
{
"epoch": 0.6173541963015647,
"grad_norm": 0.9087121397095356,
"learning_rate": 9.663767617411587e-06,
"loss": 0.0611,
"step": 651
},
{
"epoch": 0.6183025130393551,
"grad_norm": 0.7764764902550111,
"learning_rate": 9.66177428159373e-06,
"loss": 0.0676,
"step": 652
},
{
"epoch": 0.6192508297771455,
"grad_norm": 0.44918893065172116,
"learning_rate": 9.659775261352697e-06,
"loss": 0.0474,
"step": 653
},
{
"epoch": 0.620199146514936,
"grad_norm": 0.9162652994629981,
"learning_rate": 9.657770559126034e-06,
"loss": 0.0981,
"step": 654
},
{
"epoch": 0.6211474632527264,
"grad_norm": 0.6543823860401999,
"learning_rate": 9.655760177358208e-06,
"loss": 0.0744,
"step": 655
},
{
"epoch": 0.6220957799905168,
"grad_norm": 0.44085186666179094,
"learning_rate": 9.653744118500623e-06,
"loss": 0.0532,
"step": 656
},
{
"epoch": 0.6230440967283073,
"grad_norm": 0.7980175435844092,
"learning_rate": 9.651722385011592e-06,
"loss": 0.0807,
"step": 657
},
{
"epoch": 0.6239924134660977,
"grad_norm": 0.4853866988799319,
"learning_rate": 9.649694979356358e-06,
"loss": 0.0454,
"step": 658
},
{
"epoch": 0.6249407302038881,
"grad_norm": 0.5662361885259662,
"learning_rate": 9.647661904007076e-06,
"loss": 0.0621,
"step": 659
},
{
"epoch": 0.6258890469416786,
"grad_norm": 0.8127269026146419,
"learning_rate": 9.645623161442814e-06,
"loss": 0.0773,
"step": 660
},
{
"epoch": 0.626837363679469,
"grad_norm": 0.6294162739235921,
"learning_rate": 9.643578754149552e-06,
"loss": 0.0599,
"step": 661
},
{
"epoch": 0.6277856804172594,
"grad_norm": 0.6965237350859914,
"learning_rate": 9.641528684620179e-06,
"loss": 0.0542,
"step": 662
},
{
"epoch": 0.6287339971550497,
"grad_norm": 0.5265921422928361,
"learning_rate": 9.639472955354483e-06,
"loss": 0.0496,
"step": 663
},
{
"epoch": 0.6296823138928402,
"grad_norm": 0.8663040094375097,
"learning_rate": 9.63741156885916e-06,
"loss": 0.0733,
"step": 664
},
{
"epoch": 0.6306306306306306,
"grad_norm": 0.7508837936313448,
"learning_rate": 9.635344527647798e-06,
"loss": 0.08,
"step": 665
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.6827540936282853,
"learning_rate": 9.633271834240885e-06,
"loss": 0.0732,
"step": 666
},
{
"epoch": 0.6325272641062115,
"grad_norm": 0.7441700461651841,
"learning_rate": 9.631193491165798e-06,
"loss": 0.0555,
"step": 667
},
{
"epoch": 0.6334755808440019,
"grad_norm": 0.8313881844290032,
"learning_rate": 9.629109500956803e-06,
"loss": 0.0782,
"step": 668
},
{
"epoch": 0.6344238975817923,
"grad_norm": 0.47754915650781987,
"learning_rate": 9.627019866155056e-06,
"loss": 0.0547,
"step": 669
},
{
"epoch": 0.6353722143195828,
"grad_norm": 0.6618532396312571,
"learning_rate": 9.624924589308591e-06,
"loss": 0.0515,
"step": 670
},
{
"epoch": 0.6363205310573732,
"grad_norm": 1.147117197534475,
"learning_rate": 9.622823672972323e-06,
"loss": 0.0882,
"step": 671
},
{
"epoch": 0.6372688477951636,
"grad_norm": 0.5779383814129484,
"learning_rate": 9.620717119708047e-06,
"loss": 0.0659,
"step": 672
},
{
"epoch": 0.638217164532954,
"grad_norm": 0.5799389859663083,
"learning_rate": 9.618604932084427e-06,
"loss": 0.0606,
"step": 673
},
{
"epoch": 0.6391654812707445,
"grad_norm": 6.608545253943764,
"learning_rate": 9.616487112677e-06,
"loss": 0.066,
"step": 674
},
{
"epoch": 0.6401137980085349,
"grad_norm": 0.7235578117181891,
"learning_rate": 9.614363664068168e-06,
"loss": 0.0628,
"step": 675
},
{
"epoch": 0.6410621147463252,
"grad_norm": 0.6994528460712487,
"learning_rate": 9.6122345888472e-06,
"loss": 0.0628,
"step": 676
},
{
"epoch": 0.6420104314841157,
"grad_norm": 0.6208663188504899,
"learning_rate": 9.610099889610224e-06,
"loss": 0.0554,
"step": 677
},
{
"epoch": 0.6429587482219061,
"grad_norm": 0.6345977149189366,
"learning_rate": 9.607959568960226e-06,
"loss": 0.0632,
"step": 678
},
{
"epoch": 0.6439070649596965,
"grad_norm": 0.8061055021711904,
"learning_rate": 9.605813629507046e-06,
"loss": 0.0684,
"step": 679
},
{
"epoch": 0.6448553816974869,
"grad_norm": 0.6913423639588181,
"learning_rate": 9.603662073867375e-06,
"loss": 0.0673,
"step": 680
},
{
"epoch": 0.6458036984352774,
"grad_norm": 0.7586179752230898,
"learning_rate": 9.601504904664758e-06,
"loss": 0.0702,
"step": 681
},
{
"epoch": 0.6467520151730678,
"grad_norm": 0.5215807067369997,
"learning_rate": 9.599342124529576e-06,
"loss": 0.0484,
"step": 682
},
{
"epoch": 0.6477003319108582,
"grad_norm": 0.4193899811291156,
"learning_rate": 9.597173736099056e-06,
"loss": 0.0455,
"step": 683
},
{
"epoch": 0.6486486486486487,
"grad_norm": 1.0231627903377674,
"learning_rate": 9.594999742017267e-06,
"loss": 0.0755,
"step": 684
},
{
"epoch": 0.6495969653864391,
"grad_norm": 0.5818860445113369,
"learning_rate": 9.592820144935107e-06,
"loss": 0.0457,
"step": 685
},
{
"epoch": 0.6505452821242295,
"grad_norm": 0.8523614115619248,
"learning_rate": 9.590634947510312e-06,
"loss": 0.0666,
"step": 686
},
{
"epoch": 0.65149359886202,
"grad_norm": 0.6819462318103672,
"learning_rate": 9.588444152407441e-06,
"loss": 0.0621,
"step": 687
},
{
"epoch": 0.6524419155998104,
"grad_norm": 0.7350860734842137,
"learning_rate": 9.586247762297882e-06,
"loss": 0.0616,
"step": 688
},
{
"epoch": 0.6533902323376007,
"grad_norm": 0.6877200427996193,
"learning_rate": 9.584045779859848e-06,
"loss": 0.0691,
"step": 689
},
{
"epoch": 0.6543385490753911,
"grad_norm": 0.7777410132259543,
"learning_rate": 9.581838207778367e-06,
"loss": 0.0672,
"step": 690
},
{
"epoch": 0.6552868658131816,
"grad_norm": 1.0340407583447775,
"learning_rate": 9.579625048745281e-06,
"loss": 0.0692,
"step": 691
},
{
"epoch": 0.656235182550972,
"grad_norm": 0.6061769180463831,
"learning_rate": 9.577406305459251e-06,
"loss": 0.0519,
"step": 692
},
{
"epoch": 0.6571834992887624,
"grad_norm": 0.7287017758175208,
"learning_rate": 9.575181980625743e-06,
"loss": 0.0626,
"step": 693
},
{
"epoch": 0.6581318160265529,
"grad_norm": 0.6923184185544935,
"learning_rate": 9.57295207695703e-06,
"loss": 0.0602,
"step": 694
},
{
"epoch": 0.6590801327643433,
"grad_norm": 0.7441802004305137,
"learning_rate": 9.570716597172187e-06,
"loss": 0.0785,
"step": 695
},
{
"epoch": 0.6600284495021337,
"grad_norm": 0.5600328414907927,
"learning_rate": 9.568475543997088e-06,
"loss": 0.0525,
"step": 696
},
{
"epoch": 0.6609767662399242,
"grad_norm": 0.6179093672887623,
"learning_rate": 9.566228920164405e-06,
"loss": 0.0498,
"step": 697
},
{
"epoch": 0.6619250829777146,
"grad_norm": 1.0001632318997007,
"learning_rate": 9.563976728413602e-06,
"loss": 0.1065,
"step": 698
},
{
"epoch": 0.662873399715505,
"grad_norm": 0.6197443639375237,
"learning_rate": 9.56171897149093e-06,
"loss": 0.0429,
"step": 699
},
{
"epoch": 0.6638217164532954,
"grad_norm": 0.7426532648337794,
"learning_rate": 9.55945565214943e-06,
"loss": 0.0603,
"step": 700
},
{
"epoch": 0.6647700331910859,
"grad_norm": 0.9809220324323352,
"learning_rate": 9.557186773148922e-06,
"loss": 0.0844,
"step": 701
},
{
"epoch": 0.6657183499288762,
"grad_norm": 0.6596268576375636,
"learning_rate": 9.554912337256007e-06,
"loss": 0.0627,
"step": 702
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.6445430375796782,
"learning_rate": 9.552632347244062e-06,
"loss": 0.0621,
"step": 703
},
{
"epoch": 0.6676149834044571,
"grad_norm": 0.6854389668990125,
"learning_rate": 9.550346805893236e-06,
"loss": 0.0709,
"step": 704
},
{
"epoch": 0.6685633001422475,
"grad_norm": 0.9157472924094435,
"learning_rate": 9.548055715990448e-06,
"loss": 0.0669,
"step": 705
},
{
"epoch": 0.6695116168800379,
"grad_norm": 0.6210182918721243,
"learning_rate": 9.545759080329381e-06,
"loss": 0.0642,
"step": 706
},
{
"epoch": 0.6704599336178284,
"grad_norm": 0.5811606762164421,
"learning_rate": 9.543456901710483e-06,
"loss": 0.0734,
"step": 707
},
{
"epoch": 0.6714082503556188,
"grad_norm": 0.6797271720519124,
"learning_rate": 9.541149182940958e-06,
"loss": 0.0543,
"step": 708
},
{
"epoch": 0.6723565670934092,
"grad_norm": 0.5126068611905316,
"learning_rate": 9.538835926834766e-06,
"loss": 0.0504,
"step": 709
},
{
"epoch": 0.6733048838311996,
"grad_norm": 0.6464058845065579,
"learning_rate": 9.536517136212623e-06,
"loss": 0.0596,
"step": 710
},
{
"epoch": 0.6742532005689901,
"grad_norm": 0.5987248394746172,
"learning_rate": 9.534192813901986e-06,
"loss": 0.0561,
"step": 711
},
{
"epoch": 0.6752015173067805,
"grad_norm": 0.5757268664620699,
"learning_rate": 9.531862962737065e-06,
"loss": 0.0662,
"step": 712
},
{
"epoch": 0.6761498340445709,
"grad_norm": 0.6884820373956889,
"learning_rate": 9.529527585558806e-06,
"loss": 0.0734,
"step": 713
},
{
"epoch": 0.6770981507823614,
"grad_norm": 0.5599551362853026,
"learning_rate": 9.5271866852149e-06,
"loss": 0.0497,
"step": 714
},
{
"epoch": 0.6780464675201517,
"grad_norm": 1.2727013612767513,
"learning_rate": 9.524840264559762e-06,
"loss": 0.0806,
"step": 715
},
{
"epoch": 0.6789947842579421,
"grad_norm": 0.5125594480614294,
"learning_rate": 9.522488326454551e-06,
"loss": 0.0464,
"step": 716
},
{
"epoch": 0.6799431009957326,
"grad_norm": 0.9279881234599379,
"learning_rate": 9.520130873767141e-06,
"loss": 0.0466,
"step": 717
},
{
"epoch": 0.680891417733523,
"grad_norm": 0.5884738866592291,
"learning_rate": 9.517767909372143e-06,
"loss": 0.0463,
"step": 718
},
{
"epoch": 0.6818397344713134,
"grad_norm": 0.6405987798189022,
"learning_rate": 9.515399436150879e-06,
"loss": 0.0646,
"step": 719
},
{
"epoch": 0.6827880512091038,
"grad_norm": 0.6141893191288851,
"learning_rate": 9.513025456991394e-06,
"loss": 0.0713,
"step": 720
},
{
"epoch": 0.6837363679468943,
"grad_norm": 0.5294631004623913,
"learning_rate": 9.510645974788441e-06,
"loss": 0.0533,
"step": 721
},
{
"epoch": 0.6846846846846847,
"grad_norm": 0.5983803884552171,
"learning_rate": 9.508260992443492e-06,
"loss": 0.0574,
"step": 722
},
{
"epoch": 0.6856330014224751,
"grad_norm": 0.7168015362345571,
"learning_rate": 9.505870512864715e-06,
"loss": 0.0622,
"step": 723
},
{
"epoch": 0.6865813181602656,
"grad_norm": 0.8061703745318712,
"learning_rate": 9.503474538966992e-06,
"loss": 0.072,
"step": 724
},
{
"epoch": 0.687529634898056,
"grad_norm": 0.6410612258118752,
"learning_rate": 9.501073073671896e-06,
"loss": 0.0454,
"step": 725
},
{
"epoch": 0.6884779516358464,
"grad_norm": 0.790215058142473,
"learning_rate": 9.498666119907701e-06,
"loss": 0.0677,
"step": 726
},
{
"epoch": 0.6894262683736367,
"grad_norm": 0.6299133472058956,
"learning_rate": 9.496253680609371e-06,
"loss": 0.0585,
"step": 727
},
{
"epoch": 0.6903745851114272,
"grad_norm": 1.0623017139889208,
"learning_rate": 9.493835758718561e-06,
"loss": 0.069,
"step": 728
},
{
"epoch": 0.6913229018492176,
"grad_norm": 0.5536012592608316,
"learning_rate": 9.491412357183607e-06,
"loss": 0.0686,
"step": 729
},
{
"epoch": 0.692271218587008,
"grad_norm": 0.6038206755461478,
"learning_rate": 9.488983478959534e-06,
"loss": 0.0706,
"step": 730
},
{
"epoch": 0.6932195353247985,
"grad_norm": 0.6342419868913964,
"learning_rate": 9.486549127008037e-06,
"loss": 0.0496,
"step": 731
},
{
"epoch": 0.6941678520625889,
"grad_norm": 1.1555208683238716,
"learning_rate": 9.484109304297493e-06,
"loss": 0.0834,
"step": 732
},
{
"epoch": 0.6951161688003793,
"grad_norm": 0.8509380581545992,
"learning_rate": 9.481664013802943e-06,
"loss": 0.0794,
"step": 733
},
{
"epoch": 0.6960644855381698,
"grad_norm": 0.8224046322343856,
"learning_rate": 9.479213258506102e-06,
"loss": 0.0869,
"step": 734
},
{
"epoch": 0.6970128022759602,
"grad_norm": 0.6505920471844966,
"learning_rate": 9.476757041395342e-06,
"loss": 0.0642,
"step": 735
},
{
"epoch": 0.6979611190137506,
"grad_norm": 0.5162948092375159,
"learning_rate": 9.474295365465697e-06,
"loss": 0.0539,
"step": 736
},
{
"epoch": 0.698909435751541,
"grad_norm": 0.7194486779836317,
"learning_rate": 9.471828233718863e-06,
"loss": 0.0585,
"step": 737
},
{
"epoch": 0.6998577524893315,
"grad_norm": 0.9014549238602243,
"learning_rate": 9.46935564916318e-06,
"loss": 0.0874,
"step": 738
},
{
"epoch": 0.7008060692271219,
"grad_norm": 0.7378312572460828,
"learning_rate": 9.466877614813645e-06,
"loss": 0.0657,
"step": 739
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.815800968244944,
"learning_rate": 9.464394133691891e-06,
"loss": 0.0538,
"step": 740
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.5271528573688194,
"learning_rate": 9.461905208826202e-06,
"loss": 0.0619,
"step": 741
},
{
"epoch": 0.7036510194404931,
"grad_norm": 0.9062594050922635,
"learning_rate": 9.459410843251496e-06,
"loss": 0.0659,
"step": 742
},
{
"epoch": 0.7045993361782835,
"grad_norm": 0.6578698656781865,
"learning_rate": 9.456911040009323e-06,
"loss": 0.0577,
"step": 743
},
{
"epoch": 0.705547652916074,
"grad_norm": 0.6791351680766123,
"learning_rate": 9.454405802147864e-06,
"loss": 0.0669,
"step": 744
},
{
"epoch": 0.7064959696538644,
"grad_norm": 0.7662019136887008,
"learning_rate": 9.451895132721933e-06,
"loss": 0.0692,
"step": 745
},
{
"epoch": 0.7074442863916548,
"grad_norm": 0.6997379483885225,
"learning_rate": 9.449379034792961e-06,
"loss": 0.0609,
"step": 746
},
{
"epoch": 0.7083926031294452,
"grad_norm": 0.6231531262832446,
"learning_rate": 9.446857511429e-06,
"loss": 0.0568,
"step": 747
},
{
"epoch": 0.7093409198672357,
"grad_norm": 0.638618143024491,
"learning_rate": 9.444330565704715e-06,
"loss": 0.0391,
"step": 748
},
{
"epoch": 0.7102892366050261,
"grad_norm": 0.6101709327712237,
"learning_rate": 9.441798200701388e-06,
"loss": 0.0692,
"step": 749
},
{
"epoch": 0.7112375533428165,
"grad_norm": 0.7771396965466206,
"learning_rate": 9.439260419506906e-06,
"loss": 0.0616,
"step": 750
},
{
"epoch": 0.712185870080607,
"grad_norm": 0.663533581873393,
"learning_rate": 9.436717225215761e-06,
"loss": 0.0706,
"step": 751
},
{
"epoch": 0.7131341868183974,
"grad_norm": 0.7406791150442034,
"learning_rate": 9.434168620929045e-06,
"loss": 0.0759,
"step": 752
},
{
"epoch": 0.7140825035561877,
"grad_norm": 0.6589932311994989,
"learning_rate": 9.431614609754446e-06,
"loss": 0.0676,
"step": 753
},
{
"epoch": 0.7150308202939782,
"grad_norm": 0.7873737037891946,
"learning_rate": 9.429055194806247e-06,
"loss": 0.0661,
"step": 754
},
{
"epoch": 0.7159791370317686,
"grad_norm": 0.6588547169267579,
"learning_rate": 9.42649037920532e-06,
"loss": 0.068,
"step": 755
},
{
"epoch": 0.716927453769559,
"grad_norm": 0.8208102856389554,
"learning_rate": 9.423920166079122e-06,
"loss": 0.0829,
"step": 756
},
{
"epoch": 0.7178757705073494,
"grad_norm": 0.5652492127213,
"learning_rate": 9.421344558561689e-06,
"loss": 0.0754,
"step": 757
},
{
"epoch": 0.7188240872451399,
"grad_norm": 2.03543668980321,
"learning_rate": 9.418763559793639e-06,
"loss": 0.0469,
"step": 758
},
{
"epoch": 0.7197724039829303,
"grad_norm": 0.7132600676949169,
"learning_rate": 9.41617717292216e-06,
"loss": 0.058,
"step": 759
},
{
"epoch": 0.7207207207207207,
"grad_norm": 0.5814418519545377,
"learning_rate": 9.413585401101014e-06,
"loss": 0.0676,
"step": 760
},
{
"epoch": 0.7216690374585112,
"grad_norm": 0.778087468578043,
"learning_rate": 9.410988247490527e-06,
"loss": 0.0565,
"step": 761
},
{
"epoch": 0.7226173541963016,
"grad_norm": 0.5978506887698309,
"learning_rate": 9.408385715257589e-06,
"loss": 0.0526,
"step": 762
},
{
"epoch": 0.723565670934092,
"grad_norm": 0.7345386180038043,
"learning_rate": 9.405777807575643e-06,
"loss": 0.0779,
"step": 763
},
{
"epoch": 0.7245139876718824,
"grad_norm": 0.6765882263629432,
"learning_rate": 9.403164527624695e-06,
"loss": 0.0739,
"step": 764
},
{
"epoch": 0.7254623044096729,
"grad_norm": 0.6200059319183251,
"learning_rate": 9.400545878591297e-06,
"loss": 0.0425,
"step": 765
},
{
"epoch": 0.7264106211474632,
"grad_norm": 0.5764913642807622,
"learning_rate": 9.397921863668545e-06,
"loss": 0.0525,
"step": 766
},
{
"epoch": 0.7273589378852536,
"grad_norm": 0.5072870053545583,
"learning_rate": 9.395292486056087e-06,
"loss": 0.0466,
"step": 767
},
{
"epoch": 0.7283072546230441,
"grad_norm": 0.6266493674563252,
"learning_rate": 9.3926577489601e-06,
"loss": 0.0564,
"step": 768
},
{
"epoch": 0.7292555713608345,
"grad_norm": 0.6781903020718192,
"learning_rate": 9.390017655593303e-06,
"loss": 0.0625,
"step": 769
},
{
"epoch": 0.7302038880986249,
"grad_norm": 0.6970906328583575,
"learning_rate": 9.387372209174943e-06,
"loss": 0.0499,
"step": 770
},
{
"epoch": 0.7311522048364154,
"grad_norm": 0.4830643779006922,
"learning_rate": 9.384721412930797e-06,
"loss": 0.0522,
"step": 771
},
{
"epoch": 0.7321005215742058,
"grad_norm": 0.5981146539751457,
"learning_rate": 9.382065270093164e-06,
"loss": 0.0503,
"step": 772
},
{
"epoch": 0.7330488383119962,
"grad_norm": 0.6288690777841561,
"learning_rate": 9.37940378390086e-06,
"loss": 0.0505,
"step": 773
},
{
"epoch": 0.7339971550497866,
"grad_norm": 0.6043657243192845,
"learning_rate": 9.376736957599219e-06,
"loss": 0.048,
"step": 774
},
{
"epoch": 0.7349454717875771,
"grad_norm": 1.3199303132586044,
"learning_rate": 9.37406479444009e-06,
"loss": 0.0787,
"step": 775
},
{
"epoch": 0.7358937885253675,
"grad_norm": 0.9970354985082576,
"learning_rate": 9.37138729768182e-06,
"loss": 0.0593,
"step": 776
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.6154243426982743,
"learning_rate": 9.36870447058927e-06,
"loss": 0.0552,
"step": 777
},
{
"epoch": 0.7377904220009484,
"grad_norm": 0.688917247579616,
"learning_rate": 9.366016316433796e-06,
"loss": 0.0688,
"step": 778
},
{
"epoch": 0.7387387387387387,
"grad_norm": 0.8890574424533809,
"learning_rate": 9.363322838493252e-06,
"loss": 0.0616,
"step": 779
},
{
"epoch": 0.7396870554765291,
"grad_norm": 0.5256518464793154,
"learning_rate": 9.360624040051975e-06,
"loss": 0.0449,
"step": 780
},
{
"epoch": 0.7406353722143196,
"grad_norm": 0.7015686604630017,
"learning_rate": 9.357919924400802e-06,
"loss": 0.0744,
"step": 781
},
{
"epoch": 0.74158368895211,
"grad_norm": 0.5444389461448026,
"learning_rate": 9.355210494837046e-06,
"loss": 0.058,
"step": 782
},
{
"epoch": 0.7425320056899004,
"grad_norm": 0.8635005280396899,
"learning_rate": 9.352495754664501e-06,
"loss": 0.0817,
"step": 783
},
{
"epoch": 0.7434803224276908,
"grad_norm": 0.3975227023619501,
"learning_rate": 9.349775707193439e-06,
"loss": 0.0325,
"step": 784
},
{
"epoch": 0.7444286391654813,
"grad_norm": 0.9671794171858287,
"learning_rate": 9.347050355740598e-06,
"loss": 0.0942,
"step": 785
},
{
"epoch": 0.7453769559032717,
"grad_norm": 0.8627076848581986,
"learning_rate": 9.34431970362919e-06,
"loss": 0.0603,
"step": 786
},
{
"epoch": 0.7463252726410621,
"grad_norm": 0.676971569472859,
"learning_rate": 9.341583754188887e-06,
"loss": 0.0609,
"step": 787
},
{
"epoch": 0.7472735893788526,
"grad_norm": 0.6234019106033082,
"learning_rate": 9.338842510755822e-06,
"loss": 0.0527,
"step": 788
},
{
"epoch": 0.748221906116643,
"grad_norm": 0.5688808355503273,
"learning_rate": 9.336095976672578e-06,
"loss": 0.0746,
"step": 789
},
{
"epoch": 0.7491702228544334,
"grad_norm": 0.8927220033190019,
"learning_rate": 9.3333441552882e-06,
"loss": 0.0663,
"step": 790
},
{
"epoch": 0.7501185395922239,
"grad_norm": 0.6760705893906477,
"learning_rate": 9.33058704995817e-06,
"loss": 0.0607,
"step": 791
},
{
"epoch": 0.7510668563300142,
"grad_norm": 0.6421619908578323,
"learning_rate": 9.327824664044418e-06,
"loss": 0.0601,
"step": 792
},
{
"epoch": 0.7520151730678046,
"grad_norm": 0.7064042205046658,
"learning_rate": 9.32505700091531e-06,
"loss": 0.0656,
"step": 793
},
{
"epoch": 0.752963489805595,
"grad_norm": 0.6789456621715841,
"learning_rate": 9.322284063945651e-06,
"loss": 0.0754,
"step": 794
},
{
"epoch": 0.7539118065433855,
"grad_norm": 0.6349001762224292,
"learning_rate": 9.319505856516674e-06,
"loss": 0.055,
"step": 795
},
{
"epoch": 0.7548601232811759,
"grad_norm": 0.7970733715082516,
"learning_rate": 9.316722382016037e-06,
"loss": 0.0606,
"step": 796
},
{
"epoch": 0.7558084400189663,
"grad_norm": 0.6989262918440643,
"learning_rate": 9.313933643837825e-06,
"loss": 0.0419,
"step": 797
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.45444386596780545,
"learning_rate": 9.311139645382539e-06,
"loss": 0.0499,
"step": 798
},
{
"epoch": 0.7577050734945472,
"grad_norm": 0.7340919059070612,
"learning_rate": 9.308340390057091e-06,
"loss": 0.0605,
"step": 799
},
{
"epoch": 0.7586533902323376,
"grad_norm": 0.49624254277855845,
"learning_rate": 9.305535881274812e-06,
"loss": 0.038,
"step": 800
},
{
"epoch": 0.7596017069701281,
"grad_norm": 0.5558026345234958,
"learning_rate": 9.302726122455425e-06,
"loss": 0.0477,
"step": 801
},
{
"epoch": 0.7605500237079185,
"grad_norm": 0.7377034794768454,
"learning_rate": 9.299911117025071e-06,
"loss": 0.0798,
"step": 802
},
{
"epoch": 0.7614983404457089,
"grad_norm": 0.6642309640857783,
"learning_rate": 9.297090868416276e-06,
"loss": 0.0578,
"step": 803
},
{
"epoch": 0.7624466571834992,
"grad_norm": 0.4901567357915514,
"learning_rate": 9.294265380067965e-06,
"loss": 0.0546,
"step": 804
},
{
"epoch": 0.7633949739212897,
"grad_norm": 0.5504015183910195,
"learning_rate": 9.291434655425452e-06,
"loss": 0.0476,
"step": 805
},
{
"epoch": 0.7643432906590801,
"grad_norm": 0.7880325712467479,
"learning_rate": 9.288598697940433e-06,
"loss": 0.0967,
"step": 806
},
{
"epoch": 0.7652916073968705,
"grad_norm": 1.0094413699993006,
"learning_rate": 9.285757511070987e-06,
"loss": 0.0547,
"step": 807
},
{
"epoch": 0.766239924134661,
"grad_norm": 0.5462824953438216,
"learning_rate": 9.28291109828157e-06,
"loss": 0.0622,
"step": 808
},
{
"epoch": 0.7671882408724514,
"grad_norm": 0.6095693174069973,
"learning_rate": 9.28005946304301e-06,
"loss": 0.054,
"step": 809
},
{
"epoch": 0.7681365576102418,
"grad_norm": 0.5522598480936777,
"learning_rate": 9.277202608832502e-06,
"loss": 0.0608,
"step": 810
},
{
"epoch": 0.7690848743480322,
"grad_norm": 0.8887551561479244,
"learning_rate": 9.274340539133604e-06,
"loss": 0.0733,
"step": 811
},
{
"epoch": 0.7700331910858227,
"grad_norm": 0.6536519593388536,
"learning_rate": 9.271473257436239e-06,
"loss": 0.0704,
"step": 812
},
{
"epoch": 0.7709815078236131,
"grad_norm": 0.6903014054311826,
"learning_rate": 9.268600767236677e-06,
"loss": 0.0839,
"step": 813
},
{
"epoch": 0.7719298245614035,
"grad_norm": 0.5929159416904847,
"learning_rate": 9.265723072037546e-06,
"loss": 0.0592,
"step": 814
},
{
"epoch": 0.772878141299194,
"grad_norm": 0.7439638317959937,
"learning_rate": 9.26284017534782e-06,
"loss": 0.0568,
"step": 815
},
{
"epoch": 0.7738264580369844,
"grad_norm": 0.5860050856048022,
"learning_rate": 9.259952080682812e-06,
"loss": 0.0667,
"step": 816
},
{
"epoch": 0.7747747747747747,
"grad_norm": 0.4842910654706692,
"learning_rate": 9.257058791564175e-06,
"loss": 0.0513,
"step": 817
},
{
"epoch": 0.7757230915125652,
"grad_norm": 0.789038697553299,
"learning_rate": 9.254160311519896e-06,
"loss": 0.0557,
"step": 818
},
{
"epoch": 0.7766714082503556,
"grad_norm": 0.5387139258318481,
"learning_rate": 9.251256644084292e-06,
"loss": 0.0558,
"step": 819
},
{
"epoch": 0.777619724988146,
"grad_norm": 0.8887946106511906,
"learning_rate": 9.248347792798006e-06,
"loss": 0.0776,
"step": 820
},
{
"epoch": 0.7785680417259364,
"grad_norm": 0.7477907494684204,
"learning_rate": 9.245433761208e-06,
"loss": 0.0706,
"step": 821
},
{
"epoch": 0.7795163584637269,
"grad_norm": 0.8176178183928178,
"learning_rate": 9.242514552867556e-06,
"loss": 0.0806,
"step": 822
},
{
"epoch": 0.7804646752015173,
"grad_norm": 0.5104409829727489,
"learning_rate": 9.239590171336262e-06,
"loss": 0.0427,
"step": 823
},
{
"epoch": 0.7814129919393077,
"grad_norm": 0.5922185838285359,
"learning_rate": 9.236660620180024e-06,
"loss": 0.0553,
"step": 824
},
{
"epoch": 0.7823613086770982,
"grad_norm": 0.9414341871189567,
"learning_rate": 9.23372590297104e-06,
"loss": 0.0678,
"step": 825
},
{
"epoch": 0.7833096254148886,
"grad_norm": 0.49939628701466243,
"learning_rate": 9.230786023287819e-06,
"loss": 0.0437,
"step": 826
},
{
"epoch": 0.784257942152679,
"grad_norm": 0.519425273825053,
"learning_rate": 9.227840984715154e-06,
"loss": 0.0497,
"step": 827
},
{
"epoch": 0.7852062588904695,
"grad_norm": 0.5443123255099412,
"learning_rate": 9.224890790844137e-06,
"loss": 0.0612,
"step": 828
},
{
"epoch": 0.7861545756282599,
"grad_norm": 0.511905527310258,
"learning_rate": 9.221935445272144e-06,
"loss": 0.0449,
"step": 829
},
{
"epoch": 0.7871028923660502,
"grad_norm": 0.6705781452415145,
"learning_rate": 9.218974951602829e-06,
"loss": 0.063,
"step": 830
},
{
"epoch": 0.7880512091038406,
"grad_norm": 0.47754646141190604,
"learning_rate": 9.216009313446125e-06,
"loss": 0.0688,
"step": 831
},
{
"epoch": 0.7889995258416311,
"grad_norm": 0.5705276893342319,
"learning_rate": 9.213038534418244e-06,
"loss": 0.0686,
"step": 832
},
{
"epoch": 0.7899478425794215,
"grad_norm": 0.4253509537520698,
"learning_rate": 9.21006261814166e-06,
"loss": 0.0427,
"step": 833
},
{
"epoch": 0.7908961593172119,
"grad_norm": 0.533220697742502,
"learning_rate": 9.207081568245112e-06,
"loss": 0.0394,
"step": 834
},
{
"epoch": 0.7918444760550024,
"grad_norm": 0.5786737951816707,
"learning_rate": 9.2040953883636e-06,
"loss": 0.0556,
"step": 835
},
{
"epoch": 0.7927927927927928,
"grad_norm": 1.05765776588404,
"learning_rate": 9.20110408213838e-06,
"loss": 0.0388,
"step": 836
},
{
"epoch": 0.7937411095305832,
"grad_norm": 0.809530041430475,
"learning_rate": 9.19810765321696e-06,
"loss": 0.1042,
"step": 837
},
{
"epoch": 0.7946894262683737,
"grad_norm": 0.4767483114016521,
"learning_rate": 9.19510610525309e-06,
"loss": 0.0586,
"step": 838
},
{
"epoch": 0.7956377430061641,
"grad_norm": 0.6212000890855088,
"learning_rate": 9.192099441906765e-06,
"loss": 0.063,
"step": 839
},
{
"epoch": 0.7965860597439545,
"grad_norm": 0.5793471462839893,
"learning_rate": 9.189087666844219e-06,
"loss": 0.0599,
"step": 840
},
{
"epoch": 0.7975343764817449,
"grad_norm": 0.6109133021965912,
"learning_rate": 9.186070783737915e-06,
"loss": 0.0655,
"step": 841
},
{
"epoch": 0.7984826932195354,
"grad_norm": 1.7579309929430755,
"learning_rate": 9.183048796266547e-06,
"loss": 0.0531,
"step": 842
},
{
"epoch": 0.7994310099573257,
"grad_norm": 0.6305893305402994,
"learning_rate": 9.180021708115034e-06,
"loss": 0.069,
"step": 843
},
{
"epoch": 0.8003793266951161,
"grad_norm": 0.5799218206040034,
"learning_rate": 9.176989522974512e-06,
"loss": 0.0548,
"step": 844
},
{
"epoch": 0.8013276434329066,
"grad_norm": 0.5205329821796497,
"learning_rate": 9.173952244542335e-06,
"loss": 0.0551,
"step": 845
},
{
"epoch": 0.802275960170697,
"grad_norm": 0.6401356176971456,
"learning_rate": 9.170909876522067e-06,
"loss": 0.0613,
"step": 846
},
{
"epoch": 0.8032242769084874,
"grad_norm": 0.6283553782308525,
"learning_rate": 9.167862422623474e-06,
"loss": 0.0681,
"step": 847
},
{
"epoch": 0.8041725936462779,
"grad_norm": 0.5291087716357314,
"learning_rate": 9.164809886562532e-06,
"loss": 0.0428,
"step": 848
},
{
"epoch": 0.8051209103840683,
"grad_norm": 0.6176212098121372,
"learning_rate": 9.161752272061405e-06,
"loss": 0.0607,
"step": 849
},
{
"epoch": 0.8060692271218587,
"grad_norm": 0.5258734780929885,
"learning_rate": 9.158689582848454e-06,
"loss": 0.0555,
"step": 850
},
{
"epoch": 0.8070175438596491,
"grad_norm": 0.5473102285657928,
"learning_rate": 9.155621822658229e-06,
"loss": 0.0461,
"step": 851
},
{
"epoch": 0.8079658605974396,
"grad_norm": 0.7147069989389465,
"learning_rate": 9.15254899523146e-06,
"loss": 0.0699,
"step": 852
},
{
"epoch": 0.80891417733523,
"grad_norm": 0.5116476113725856,
"learning_rate": 9.14947110431506e-06,
"loss": 0.0593,
"step": 853
},
{
"epoch": 0.8098624940730204,
"grad_norm": 0.599625799358922,
"learning_rate": 9.146388153662109e-06,
"loss": 0.0719,
"step": 854
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.5657265833927722,
"learning_rate": 9.143300147031866e-06,
"loss": 0.0539,
"step": 855
},
{
"epoch": 0.8117591275486012,
"grad_norm": 0.490285928003467,
"learning_rate": 9.14020708818975e-06,
"loss": 0.0551,
"step": 856
},
{
"epoch": 0.8127074442863916,
"grad_norm": 0.5667257690255696,
"learning_rate": 9.137108980907341e-06,
"loss": 0.0485,
"step": 857
},
{
"epoch": 0.813655761024182,
"grad_norm": 0.7288808283591064,
"learning_rate": 9.134005828962373e-06,
"loss": 0.0464,
"step": 858
},
{
"epoch": 0.8146040777619725,
"grad_norm": 0.6578159612053353,
"learning_rate": 9.130897636138736e-06,
"loss": 0.0458,
"step": 859
},
{
"epoch": 0.8155523944997629,
"grad_norm": 0.6699312438910595,
"learning_rate": 9.127784406226462e-06,
"loss": 0.0484,
"step": 860
},
{
"epoch": 0.8165007112375533,
"grad_norm": 0.7577555099867996,
"learning_rate": 9.124666143021728e-06,
"loss": 0.0552,
"step": 861
},
{
"epoch": 0.8174490279753438,
"grad_norm": 0.6571718231580975,
"learning_rate": 9.121542850326849e-06,
"loss": 0.0418,
"step": 862
},
{
"epoch": 0.8183973447131342,
"grad_norm": 0.7375729716381728,
"learning_rate": 9.118414531950268e-06,
"loss": 0.0586,
"step": 863
},
{
"epoch": 0.8193456614509246,
"grad_norm": 0.7778186396499422,
"learning_rate": 9.115281191706563e-06,
"loss": 0.0638,
"step": 864
},
{
"epoch": 0.8202939781887151,
"grad_norm": 1.1753642296648885,
"learning_rate": 9.11214283341643e-06,
"loss": 0.0914,
"step": 865
},
{
"epoch": 0.8212422949265055,
"grad_norm": 0.6221136965708439,
"learning_rate": 9.108999460906687e-06,
"loss": 0.0513,
"step": 866
},
{
"epoch": 0.8221906116642959,
"grad_norm": 1.0126156537474953,
"learning_rate": 9.105851078010265e-06,
"loss": 0.0511,
"step": 867
},
{
"epoch": 0.8231389284020862,
"grad_norm": 0.609505398312846,
"learning_rate": 9.102697688566204e-06,
"loss": 0.0607,
"step": 868
},
{
"epoch": 0.8240872451398767,
"grad_norm": 0.6781545775462046,
"learning_rate": 9.09953929641965e-06,
"loss": 0.0537,
"step": 869
},
{
"epoch": 0.8250355618776671,
"grad_norm": 0.6162914997785193,
"learning_rate": 9.096375905421849e-06,
"loss": 0.0514,
"step": 870
},
{
"epoch": 0.8259838786154575,
"grad_norm": 0.9380195573648793,
"learning_rate": 9.093207519430138e-06,
"loss": 0.0592,
"step": 871
},
{
"epoch": 0.826932195353248,
"grad_norm": 0.6891518456384623,
"learning_rate": 9.090034142307955e-06,
"loss": 0.0611,
"step": 872
},
{
"epoch": 0.8278805120910384,
"grad_norm": 0.6860355795137043,
"learning_rate": 9.086855777924813e-06,
"loss": 0.0651,
"step": 873
},
{
"epoch": 0.8288288288288288,
"grad_norm": 0.5941193542193252,
"learning_rate": 9.083672430156313e-06,
"loss": 0.0561,
"step": 874
},
{
"epoch": 0.8297771455666193,
"grad_norm": 0.9859763647912905,
"learning_rate": 9.080484102884132e-06,
"loss": 0.0558,
"step": 875
},
{
"epoch": 0.8307254623044097,
"grad_norm": 0.6607364577205248,
"learning_rate": 9.077290799996015e-06,
"loss": 0.0445,
"step": 876
},
{
"epoch": 0.8316737790422001,
"grad_norm": 0.4579344621348973,
"learning_rate": 9.074092525385777e-06,
"loss": 0.0532,
"step": 877
},
{
"epoch": 0.8326220957799905,
"grad_norm": 0.44020290978074095,
"learning_rate": 9.070889282953297e-06,
"loss": 0.0432,
"step": 878
},
{
"epoch": 0.833570412517781,
"grad_norm": 0.7817453278171299,
"learning_rate": 9.067681076604507e-06,
"loss": 0.0622,
"step": 879
},
{
"epoch": 0.8345187292555714,
"grad_norm": 0.6106825636941368,
"learning_rate": 9.064467910251396e-06,
"loss": 0.0499,
"step": 880
},
{
"epoch": 0.8354670459933617,
"grad_norm": 0.5733918003298187,
"learning_rate": 9.061249787812e-06,
"loss": 0.058,
"step": 881
},
{
"epoch": 0.8364153627311522,
"grad_norm": 0.734104839469145,
"learning_rate": 9.058026713210396e-06,
"loss": 0.0603,
"step": 882
},
{
"epoch": 0.8373636794689426,
"grad_norm": 0.5863205921902287,
"learning_rate": 9.054798690376702e-06,
"loss": 0.0542,
"step": 883
},
{
"epoch": 0.838311996206733,
"grad_norm": 0.6529541400114963,
"learning_rate": 9.051565723247072e-06,
"loss": 0.0546,
"step": 884
},
{
"epoch": 0.8392603129445235,
"grad_norm": 0.8496840763418192,
"learning_rate": 9.048327815763682e-06,
"loss": 0.0499,
"step": 885
},
{
"epoch": 0.8402086296823139,
"grad_norm": 0.4879463969986272,
"learning_rate": 9.045084971874738e-06,
"loss": 0.0404,
"step": 886
},
{
"epoch": 0.8411569464201043,
"grad_norm": 0.48366631890428774,
"learning_rate": 9.041837195534462e-06,
"loss": 0.0438,
"step": 887
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.5668428745474414,
"learning_rate": 9.038584490703095e-06,
"loss": 0.0577,
"step": 888
},
{
"epoch": 0.8430535798956852,
"grad_norm": 0.6139669391301276,
"learning_rate": 9.03532686134688e-06,
"loss": 0.0699,
"step": 889
},
{
"epoch": 0.8440018966334756,
"grad_norm": 0.6191388281587789,
"learning_rate": 9.032064311438073e-06,
"loss": 0.0588,
"step": 890
},
{
"epoch": 0.844950213371266,
"grad_norm": 0.6152385003685913,
"learning_rate": 9.028796844954924e-06,
"loss": 0.0537,
"step": 891
},
{
"epoch": 0.8458985301090565,
"grad_norm": 0.755005932732524,
"learning_rate": 9.025524465881683e-06,
"loss": 0.0649,
"step": 892
},
{
"epoch": 0.8468468468468469,
"grad_norm": 0.7676513218085563,
"learning_rate": 9.022247178208585e-06,
"loss": 0.0635,
"step": 893
},
{
"epoch": 0.8477951635846372,
"grad_norm": 0.5920047067355723,
"learning_rate": 9.018964985931856e-06,
"loss": 0.06,
"step": 894
},
{
"epoch": 0.8487434803224277,
"grad_norm": 0.5807083572715754,
"learning_rate": 9.015677893053695e-06,
"loss": 0.0505,
"step": 895
},
{
"epoch": 0.8496917970602181,
"grad_norm": 0.7897487160161104,
"learning_rate": 9.012385903582286e-06,
"loss": 0.0714,
"step": 896
},
{
"epoch": 0.8506401137980085,
"grad_norm": 0.5382652341176712,
"learning_rate": 9.009089021531777e-06,
"loss": 0.0512,
"step": 897
},
{
"epoch": 0.8515884305357989,
"grad_norm": 0.8441756486986386,
"learning_rate": 9.005787250922285e-06,
"loss": 0.0766,
"step": 898
},
{
"epoch": 0.8525367472735894,
"grad_norm": 0.42966299233294036,
"learning_rate": 9.002480595779883e-06,
"loss": 0.0469,
"step": 899
},
{
"epoch": 0.8534850640113798,
"grad_norm": 0.5779848432711783,
"learning_rate": 8.999169060136609e-06,
"loss": 0.0549,
"step": 900
},
{
"epoch": 0.8544333807491702,
"grad_norm": 0.49828391414464324,
"learning_rate": 8.995852648030444e-06,
"loss": 0.0513,
"step": 901
},
{
"epoch": 0.8553816974869607,
"grad_norm": 0.5712972033755797,
"learning_rate": 8.99253136350532e-06,
"loss": 0.0642,
"step": 902
},
{
"epoch": 0.8563300142247511,
"grad_norm": 0.7463859566833713,
"learning_rate": 8.989205210611106e-06,
"loss": 0.0669,
"step": 903
},
{
"epoch": 0.8572783309625415,
"grad_norm": 0.6015754760898006,
"learning_rate": 8.98587419340361e-06,
"loss": 0.0518,
"step": 904
},
{
"epoch": 0.8582266477003319,
"grad_norm": 0.7279488477743896,
"learning_rate": 8.982538315944573e-06,
"loss": 0.0603,
"step": 905
},
{
"epoch": 0.8591749644381224,
"grad_norm": 0.41210687518386613,
"learning_rate": 8.979197582301662e-06,
"loss": 0.0508,
"step": 906
},
{
"epoch": 0.8601232811759127,
"grad_norm": 1.4900739335277513,
"learning_rate": 8.97585199654846e-06,
"loss": 0.072,
"step": 907
},
{
"epoch": 0.8610715979137031,
"grad_norm": 0.5450963951689192,
"learning_rate": 8.972501562764476e-06,
"loss": 0.0566,
"step": 908
},
{
"epoch": 0.8620199146514936,
"grad_norm": 0.5356916948533633,
"learning_rate": 8.969146285035119e-06,
"loss": 0.0471,
"step": 909
},
{
"epoch": 0.862968231389284,
"grad_norm": 0.6064958608566305,
"learning_rate": 8.965786167451713e-06,
"loss": 0.0586,
"step": 910
},
{
"epoch": 0.8639165481270744,
"grad_norm": 0.6550030676781202,
"learning_rate": 8.962421214111486e-06,
"loss": 0.0622,
"step": 911
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.5789487697080219,
"learning_rate": 8.959051429117551e-06,
"loss": 0.0587,
"step": 912
},
{
"epoch": 0.8658131816026553,
"grad_norm": 0.6480466907010984,
"learning_rate": 8.955676816578922e-06,
"loss": 0.0596,
"step": 913
},
{
"epoch": 0.8667614983404457,
"grad_norm": 0.703037972481164,
"learning_rate": 8.9522973806105e-06,
"loss": 0.0836,
"step": 914
},
{
"epoch": 0.8677098150782361,
"grad_norm": 0.49499510899266297,
"learning_rate": 8.94891312533306e-06,
"loss": 0.0493,
"step": 915
},
{
"epoch": 0.8686581318160266,
"grad_norm": 0.4679737716122778,
"learning_rate": 8.945524054873261e-06,
"loss": 0.0473,
"step": 916
},
{
"epoch": 0.869606448553817,
"grad_norm": 0.4868047238192127,
"learning_rate": 8.942130173363628e-06,
"loss": 0.0617,
"step": 917
},
{
"epoch": 0.8705547652916074,
"grad_norm": 0.48143223119722567,
"learning_rate": 8.938731484942557e-06,
"loss": 0.0459,
"step": 918
},
{
"epoch": 0.8715030820293979,
"grad_norm": 0.5109365563225756,
"learning_rate": 8.935327993754307e-06,
"loss": 0.0603,
"step": 919
},
{
"epoch": 0.8724513987671882,
"grad_norm": 0.5946328530954544,
"learning_rate": 8.931919703948981e-06,
"loss": 0.0663,
"step": 920
},
{
"epoch": 0.8733997155049786,
"grad_norm": 0.6675396299202498,
"learning_rate": 8.928506619682549e-06,
"loss": 0.0522,
"step": 921
},
{
"epoch": 0.8743480322427691,
"grad_norm": 0.5242785281728278,
"learning_rate": 8.925088745116817e-06,
"loss": 0.0477,
"step": 922
},
{
"epoch": 0.8752963489805595,
"grad_norm": 0.4607255100157249,
"learning_rate": 8.921666084419435e-06,
"loss": 0.0444,
"step": 923
},
{
"epoch": 0.8762446657183499,
"grad_norm": 0.6127086410246447,
"learning_rate": 8.918238641763894e-06,
"loss": 0.0505,
"step": 924
},
{
"epoch": 0.8771929824561403,
"grad_norm": 0.7108664485212953,
"learning_rate": 8.914806421329505e-06,
"loss": 0.0372,
"step": 925
},
{
"epoch": 0.8781412991939308,
"grad_norm": 0.48171514690034495,
"learning_rate": 8.911369427301418e-06,
"loss": 0.0467,
"step": 926
},
{
"epoch": 0.8790896159317212,
"grad_norm": 0.5032020795283936,
"learning_rate": 8.907927663870592e-06,
"loss": 0.0383,
"step": 927
},
{
"epoch": 0.8800379326695116,
"grad_norm": 0.6490864569323296,
"learning_rate": 8.90448113523381e-06,
"loss": 0.0703,
"step": 928
},
{
"epoch": 0.8809862494073021,
"grad_norm": 0.5274849878368799,
"learning_rate": 8.901029845593658e-06,
"loss": 0.0497,
"step": 929
},
{
"epoch": 0.8819345661450925,
"grad_norm": 0.7209898569229573,
"learning_rate": 8.897573799158534e-06,
"loss": 0.0845,
"step": 930
},
{
"epoch": 0.8828828828828829,
"grad_norm": 0.653701403062353,
"learning_rate": 8.894113000142636e-06,
"loss": 0.0528,
"step": 931
},
{
"epoch": 0.8838311996206734,
"grad_norm": 0.5252034559155617,
"learning_rate": 8.890647452765954e-06,
"loss": 0.054,
"step": 932
},
{
"epoch": 0.8847795163584637,
"grad_norm": 0.6597062824750437,
"learning_rate": 8.887177161254267e-06,
"loss": 0.0508,
"step": 933
},
{
"epoch": 0.8857278330962541,
"grad_norm": 0.9841434864966624,
"learning_rate": 8.883702129839144e-06,
"loss": 0.06,
"step": 934
},
{
"epoch": 0.8866761498340445,
"grad_norm": 0.4716559195813748,
"learning_rate": 8.880222362757928e-06,
"loss": 0.0484,
"step": 935
},
{
"epoch": 0.887624466571835,
"grad_norm": 0.6275887169553205,
"learning_rate": 8.87673786425374e-06,
"loss": 0.055,
"step": 936
},
{
"epoch": 0.8885727833096254,
"grad_norm": 0.5480616561224483,
"learning_rate": 8.87324863857547e-06,
"loss": 0.0512,
"step": 937
},
{
"epoch": 0.8895211000474158,
"grad_norm": 0.5716073816122306,
"learning_rate": 8.869754689977774e-06,
"loss": 0.0575,
"step": 938
},
{
"epoch": 0.8904694167852063,
"grad_norm": 0.8761043849726794,
"learning_rate": 8.866256022721062e-06,
"loss": 0.0508,
"step": 939
},
{
"epoch": 0.8914177335229967,
"grad_norm": 0.7017157731117182,
"learning_rate": 8.862752641071499e-06,
"loss": 0.0546,
"step": 940
},
{
"epoch": 0.8923660502607871,
"grad_norm": 1.5138916151321196,
"learning_rate": 8.859244549301005e-06,
"loss": 0.0658,
"step": 941
},
{
"epoch": 0.8933143669985776,
"grad_norm": 0.8433261605133346,
"learning_rate": 8.855731751687233e-06,
"loss": 0.0553,
"step": 942
},
{
"epoch": 0.894262683736368,
"grad_norm": 0.5494966721887847,
"learning_rate": 8.852214252513582e-06,
"loss": 0.0494,
"step": 943
},
{
"epoch": 0.8952110004741584,
"grad_norm": 0.6006177701179363,
"learning_rate": 8.848692056069184e-06,
"loss": 0.0612,
"step": 944
},
{
"epoch": 0.8961593172119487,
"grad_norm": 0.6876171031491582,
"learning_rate": 8.84516516664889e-06,
"loss": 0.0609,
"step": 945
},
{
"epoch": 0.8971076339497392,
"grad_norm": 0.846588378426009,
"learning_rate": 8.841633588553287e-06,
"loss": 0.0593,
"step": 946
},
{
"epoch": 0.8980559506875296,
"grad_norm": 1.175631640532978,
"learning_rate": 8.838097326088667e-06,
"loss": 0.0767,
"step": 947
},
{
"epoch": 0.89900426742532,
"grad_norm": 0.7010270158444133,
"learning_rate": 8.834556383567042e-06,
"loss": 0.0637,
"step": 948
},
{
"epoch": 0.8999525841631105,
"grad_norm": 0.7103962193756044,
"learning_rate": 8.831010765306124e-06,
"loss": 0.047,
"step": 949
},
{
"epoch": 0.9009009009009009,
"grad_norm": 0.9919713077792982,
"learning_rate": 8.827460475629334e-06,
"loss": 0.0699,
"step": 950
},
{
"epoch": 0.9018492176386913,
"grad_norm": 0.9438936607800321,
"learning_rate": 8.823905518865782e-06,
"loss": 0.0962,
"step": 951
},
{
"epoch": 0.9027975343764817,
"grad_norm": 0.41357107371942303,
"learning_rate": 8.820345899350275e-06,
"loss": 0.0393,
"step": 952
},
{
"epoch": 0.9037458511142722,
"grad_norm": 0.6094306471098007,
"learning_rate": 8.8167816214233e-06,
"loss": 0.0547,
"step": 953
},
{
"epoch": 0.9046941678520626,
"grad_norm": 0.45434395748515616,
"learning_rate": 8.81321268943103e-06,
"loss": 0.0458,
"step": 954
},
{
"epoch": 0.905642484589853,
"grad_norm": 0.584662000585842,
"learning_rate": 8.809639107725308e-06,
"loss": 0.0684,
"step": 955
},
{
"epoch": 0.9065908013276435,
"grad_norm": 0.6281479664499341,
"learning_rate": 8.80606088066365e-06,
"loss": 0.0485,
"step": 956
},
{
"epoch": 0.9075391180654339,
"grad_norm": 0.5220137398785665,
"learning_rate": 8.802478012609235e-06,
"loss": 0.0478,
"step": 957
},
{
"epoch": 0.9084874348032242,
"grad_norm": 0.7613507347001472,
"learning_rate": 8.798890507930899e-06,
"loss": 0.0534,
"step": 958
},
{
"epoch": 0.9094357515410147,
"grad_norm": 0.5338153539509801,
"learning_rate": 8.795298371003138e-06,
"loss": 0.0467,
"step": 959
},
{
"epoch": 0.9103840682788051,
"grad_norm": 0.508435320780577,
"learning_rate": 8.791701606206092e-06,
"loss": 0.05,
"step": 960
},
{
"epoch": 0.9113323850165955,
"grad_norm": 0.6801979027503147,
"learning_rate": 8.788100217925541e-06,
"loss": 0.0654,
"step": 961
},
{
"epoch": 0.9122807017543859,
"grad_norm": 0.5472159955708181,
"learning_rate": 8.78449421055291e-06,
"loss": 0.0566,
"step": 962
},
{
"epoch": 0.9132290184921764,
"grad_norm": 0.5546852372370231,
"learning_rate": 8.78088358848525e-06,
"loss": 0.0544,
"step": 963
},
{
"epoch": 0.9141773352299668,
"grad_norm": 0.7376086419870055,
"learning_rate": 8.777268356125244e-06,
"loss": 0.0618,
"step": 964
},
{
"epoch": 0.9151256519677572,
"grad_norm": 0.461174714622349,
"learning_rate": 8.773648517881194e-06,
"loss": 0.0527,
"step": 965
},
{
"epoch": 0.9160739687055477,
"grad_norm": 1.100649311314461,
"learning_rate": 8.770024078167017e-06,
"loss": 0.075,
"step": 966
},
{
"epoch": 0.9170222854433381,
"grad_norm": 0.5385193734337945,
"learning_rate": 8.766395041402245e-06,
"loss": 0.056,
"step": 967
},
{
"epoch": 0.9179706021811285,
"grad_norm": 0.4215583451342763,
"learning_rate": 8.762761412012011e-06,
"loss": 0.045,
"step": 968
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.5690890175604749,
"learning_rate": 8.75912319442705e-06,
"loss": 0.0568,
"step": 969
},
{
"epoch": 0.9198672356567094,
"grad_norm": 0.5598668678593514,
"learning_rate": 8.755480393083694e-06,
"loss": 0.0629,
"step": 970
},
{
"epoch": 0.9208155523944997,
"grad_norm": 0.4230299561301444,
"learning_rate": 8.751833012423861e-06,
"loss": 0.0402,
"step": 971
},
{
"epoch": 0.9217638691322901,
"grad_norm": 0.8504416588391118,
"learning_rate": 8.74818105689505e-06,
"loss": 0.0521,
"step": 972
},
{
"epoch": 0.9227121858700806,
"grad_norm": 0.461086821346764,
"learning_rate": 8.744524530950351e-06,
"loss": 0.0426,
"step": 973
},
{
"epoch": 0.923660502607871,
"grad_norm": 0.5086789755859074,
"learning_rate": 8.740863439048412e-06,
"loss": 0.0487,
"step": 974
},
{
"epoch": 0.9246088193456614,
"grad_norm": 0.45915883182777006,
"learning_rate": 8.737197785653457e-06,
"loss": 0.0444,
"step": 975
},
{
"epoch": 0.9255571360834519,
"grad_norm": 0.6701095989032753,
"learning_rate": 8.73352757523527e-06,
"loss": 0.0707,
"step": 976
},
{
"epoch": 0.9265054528212423,
"grad_norm": 0.460793794881083,
"learning_rate": 8.729852812269192e-06,
"loss": 0.0462,
"step": 977
},
{
"epoch": 0.9274537695590327,
"grad_norm": 0.42146552351647865,
"learning_rate": 8.726173501236115e-06,
"loss": 0.0413,
"step": 978
},
{
"epoch": 0.9284020862968232,
"grad_norm": 0.4515670497285217,
"learning_rate": 8.722489646622477e-06,
"loss": 0.0486,
"step": 979
},
{
"epoch": 0.9293504030346136,
"grad_norm": 0.7298661971153528,
"learning_rate": 8.718801252920257e-06,
"loss": 0.0728,
"step": 980
},
{
"epoch": 0.930298719772404,
"grad_norm": 0.6123325398467794,
"learning_rate": 8.715108324626967e-06,
"loss": 0.0528,
"step": 981
},
{
"epoch": 0.9312470365101944,
"grad_norm": 0.5334963078534037,
"learning_rate": 8.711410866245648e-06,
"loss": 0.0409,
"step": 982
},
{
"epoch": 0.9321953532479849,
"grad_norm": 0.44851971952458897,
"learning_rate": 8.70770888228487e-06,
"loss": 0.0509,
"step": 983
},
{
"epoch": 0.9331436699857752,
"grad_norm": 0.9770313333004932,
"learning_rate": 8.704002377258714e-06,
"loss": 0.0463,
"step": 984
},
{
"epoch": 0.9340919867235656,
"grad_norm": 0.7370636377202378,
"learning_rate": 8.700291355686779e-06,
"loss": 0.0637,
"step": 985
},
{
"epoch": 0.9350403034613561,
"grad_norm": 0.6070776528057518,
"learning_rate": 8.69657582209417e-06,
"loss": 0.0488,
"step": 986
},
{
"epoch": 0.9359886201991465,
"grad_norm": 0.7278417266877663,
"learning_rate": 8.692855781011494e-06,
"loss": 0.0501,
"step": 987
},
{
"epoch": 0.9369369369369369,
"grad_norm": 0.4731052806759658,
"learning_rate": 8.689131236974853e-06,
"loss": 0.0417,
"step": 988
},
{
"epoch": 0.9378852536747273,
"grad_norm": 0.45598792555472306,
"learning_rate": 8.68540219452584e-06,
"loss": 0.0396,
"step": 989
},
{
"epoch": 0.9388335704125178,
"grad_norm": 0.5661429908370399,
"learning_rate": 8.681668658211535e-06,
"loss": 0.0577,
"step": 990
},
{
"epoch": 0.9397818871503082,
"grad_norm": 0.41955875165931145,
"learning_rate": 8.677930632584496e-06,
"loss": 0.0432,
"step": 991
},
{
"epoch": 0.9407302038880986,
"grad_norm": 0.4107826749470781,
"learning_rate": 8.674188122202756e-06,
"loss": 0.0535,
"step": 992
},
{
"epoch": 0.9416785206258891,
"grad_norm": 0.47653411892607034,
"learning_rate": 8.670441131629816e-06,
"loss": 0.0586,
"step": 993
},
{
"epoch": 0.9426268373636795,
"grad_norm": 0.53171021829938,
"learning_rate": 8.66668966543464e-06,
"loss": 0.0518,
"step": 994
},
{
"epoch": 0.9435751541014699,
"grad_norm": 0.43148473645836083,
"learning_rate": 8.662933728191651e-06,
"loss": 0.0431,
"step": 995
},
{
"epoch": 0.9445234708392604,
"grad_norm": 0.4471351558402442,
"learning_rate": 8.659173324480722e-06,
"loss": 0.0438,
"step": 996
},
{
"epoch": 0.9454717875770507,
"grad_norm": 0.5782265716940447,
"learning_rate": 8.65540845888717e-06,
"loss": 0.0719,
"step": 997
},
{
"epoch": 0.9464201043148411,
"grad_norm": 0.4141433604011682,
"learning_rate": 8.651639136001762e-06,
"loss": 0.0469,
"step": 998
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.6061219180547935,
"learning_rate": 8.647865360420686e-06,
"loss": 0.0489,
"step": 999
},
{
"epoch": 0.948316737790422,
"grad_norm": 0.48916698447496854,
"learning_rate": 8.644087136745572e-06,
"loss": 0.0513,
"step": 1000
},
{
"epoch": 0.9492650545282124,
"grad_norm": 0.37441669864478105,
"learning_rate": 8.640304469583469e-06,
"loss": 0.0412,
"step": 1001
},
{
"epoch": 0.9502133712660028,
"grad_norm": 1.0623152293680482,
"learning_rate": 8.636517363546838e-06,
"loss": 0.0655,
"step": 1002
},
{
"epoch": 0.9511616880037933,
"grad_norm": 0.7061581986197312,
"learning_rate": 8.63272582325357e-06,
"loss": 0.0499,
"step": 1003
},
{
"epoch": 0.9521100047415837,
"grad_norm": 0.5399127227606683,
"learning_rate": 8.62892985332694e-06,
"loss": 0.0518,
"step": 1004
},
{
"epoch": 0.9530583214793741,
"grad_norm": 0.4559892605058489,
"learning_rate": 8.625129458395643e-06,
"loss": 0.0459,
"step": 1005
},
{
"epoch": 0.9540066382171646,
"grad_norm": 0.485355373272851,
"learning_rate": 8.621324643093762e-06,
"loss": 0.0454,
"step": 1006
},
{
"epoch": 0.954954954954955,
"grad_norm": 0.7459047370537332,
"learning_rate": 8.617515412060771e-06,
"loss": 0.06,
"step": 1007
},
{
"epoch": 0.9559032716927454,
"grad_norm": 0.7374476556281685,
"learning_rate": 8.613701769941526e-06,
"loss": 0.0677,
"step": 1008
},
{
"epoch": 0.9568515884305357,
"grad_norm": 0.5640575902917073,
"learning_rate": 8.609883721386266e-06,
"loss": 0.0464,
"step": 1009
},
{
"epoch": 0.9577999051683262,
"grad_norm": 0.5329518829334081,
"learning_rate": 8.606061271050601e-06,
"loss": 0.0422,
"step": 1010
},
{
"epoch": 0.9587482219061166,
"grad_norm": 0.5672285885118362,
"learning_rate": 8.602234423595509e-06,
"loss": 0.0432,
"step": 1011
},
{
"epoch": 0.959696538643907,
"grad_norm": 0.49279890911522445,
"learning_rate": 8.598403183687328e-06,
"loss": 0.0411,
"step": 1012
},
{
"epoch": 0.9606448553816975,
"grad_norm": 0.5195118583178678,
"learning_rate": 8.594567555997755e-06,
"loss": 0.0575,
"step": 1013
},
{
"epoch": 0.9615931721194879,
"grad_norm": 1.372925234445775,
"learning_rate": 8.590727545203833e-06,
"loss": 0.0615,
"step": 1014
},
{
"epoch": 0.9625414888572783,
"grad_norm": 0.7147315054833345,
"learning_rate": 8.586883155987955e-06,
"loss": 0.0712,
"step": 1015
},
{
"epoch": 0.9634898055950688,
"grad_norm": 0.5802509696174448,
"learning_rate": 8.583034393037848e-06,
"loss": 0.0552,
"step": 1016
},
{
"epoch": 0.9644381223328592,
"grad_norm": 0.49007583048635933,
"learning_rate": 8.579181261046576e-06,
"loss": 0.0449,
"step": 1017
},
{
"epoch": 0.9653864390706496,
"grad_norm": 0.48751614831454176,
"learning_rate": 8.57532376471253e-06,
"loss": 0.0475,
"step": 1018
},
{
"epoch": 0.96633475580844,
"grad_norm": 0.6496160692100631,
"learning_rate": 8.571461908739415e-06,
"loss": 0.0523,
"step": 1019
},
{
"epoch": 0.9672830725462305,
"grad_norm": 0.481345745516473,
"learning_rate": 8.567595697836266e-06,
"loss": 0.0515,
"step": 1020
},
{
"epoch": 0.9682313892840209,
"grad_norm": 0.5247818144993567,
"learning_rate": 8.563725136717419e-06,
"loss": 0.0494,
"step": 1021
},
{
"epoch": 0.9691797060218112,
"grad_norm": 0.8474516614825078,
"learning_rate": 8.559850230102513e-06,
"loss": 0.0578,
"step": 1022
},
{
"epoch": 0.9701280227596017,
"grad_norm": 0.7494686751693889,
"learning_rate": 8.555970982716492e-06,
"loss": 0.0613,
"step": 1023
},
{
"epoch": 0.9710763394973921,
"grad_norm": 0.528161959351856,
"learning_rate": 8.55208739928959e-06,
"loss": 0.0446,
"step": 1024
},
{
"epoch": 0.9720246562351825,
"grad_norm": 0.7556057248494816,
"learning_rate": 8.54819948455733e-06,
"loss": 0.0611,
"step": 1025
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.4857201457975449,
"learning_rate": 8.54430724326051e-06,
"loss": 0.0396,
"step": 1026
},
{
"epoch": 0.9739212897107634,
"grad_norm": 0.4633933638270801,
"learning_rate": 8.540410680145213e-06,
"loss": 0.045,
"step": 1027
},
{
"epoch": 0.9748696064485538,
"grad_norm": 0.5215732727679809,
"learning_rate": 8.536509799962784e-06,
"loss": 0.047,
"step": 1028
},
{
"epoch": 0.9758179231863442,
"grad_norm": 1.5449712519877792,
"learning_rate": 8.532604607469839e-06,
"loss": 0.0717,
"step": 1029
},
{
"epoch": 0.9767662399241347,
"grad_norm": 0.46693259860172376,
"learning_rate": 8.528695107428247e-06,
"loss": 0.0458,
"step": 1030
},
{
"epoch": 0.9777145566619251,
"grad_norm": 0.5388054089062692,
"learning_rate": 8.52478130460513e-06,
"loss": 0.047,
"step": 1031
},
{
"epoch": 0.9786628733997155,
"grad_norm": 0.5283181708144433,
"learning_rate": 8.520863203772858e-06,
"loss": 0.0496,
"step": 1032
},
{
"epoch": 0.979611190137506,
"grad_norm": 0.5890035811704775,
"learning_rate": 8.516940809709044e-06,
"loss": 0.0437,
"step": 1033
},
{
"epoch": 0.9805595068752964,
"grad_norm": 0.446739345865473,
"learning_rate": 8.513014127196533e-06,
"loss": 0.042,
"step": 1034
},
{
"epoch": 0.9815078236130867,
"grad_norm": 0.49851759898580866,
"learning_rate": 8.509083161023399e-06,
"loss": 0.0553,
"step": 1035
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.8986990099986447,
"learning_rate": 8.505147915982943e-06,
"loss": 0.0491,
"step": 1036
},
{
"epoch": 0.9834044570886676,
"grad_norm": 0.4813313700157437,
"learning_rate": 8.501208396873677e-06,
"loss": 0.0524,
"step": 1037
},
{
"epoch": 0.984352773826458,
"grad_norm": 0.7823009578163489,
"learning_rate": 8.497264608499332e-06,
"loss": 0.0542,
"step": 1038
},
{
"epoch": 0.9853010905642484,
"grad_norm": 0.5256393060960738,
"learning_rate": 8.49331655566884e-06,
"loss": 0.0545,
"step": 1039
},
{
"epoch": 0.9862494073020389,
"grad_norm": 0.5400471979930811,
"learning_rate": 8.489364243196334e-06,
"loss": 0.0495,
"step": 1040
},
{
"epoch": 0.9871977240398293,
"grad_norm": 0.5862041954662611,
"learning_rate": 8.485407675901142e-06,
"loss": 0.0442,
"step": 1041
},
{
"epoch": 0.9881460407776197,
"grad_norm": 0.6834922008296388,
"learning_rate": 8.48144685860778e-06,
"loss": 0.064,
"step": 1042
},
{
"epoch": 0.9890943575154102,
"grad_norm": 0.8002369541010694,
"learning_rate": 8.477481796145945e-06,
"loss": 0.0464,
"step": 1043
},
{
"epoch": 0.9900426742532006,
"grad_norm": 0.47393154077930216,
"learning_rate": 8.47351249335051e-06,
"loss": 0.0485,
"step": 1044
},
{
"epoch": 0.990990990990991,
"grad_norm": 0.4987272807246751,
"learning_rate": 8.469538955061525e-06,
"loss": 0.0478,
"step": 1045
},
{
"epoch": 0.9919393077287814,
"grad_norm": 0.6406968710094035,
"learning_rate": 8.465561186124193e-06,
"loss": 0.0494,
"step": 1046
},
{
"epoch": 0.9928876244665719,
"grad_norm": 0.5319476049591959,
"learning_rate": 8.46157919138889e-06,
"loss": 0.038,
"step": 1047
},
{
"epoch": 0.9938359412043622,
"grad_norm": 0.5377926003236448,
"learning_rate": 8.457592975711128e-06,
"loss": 0.0415,
"step": 1048
},
{
"epoch": 0.9947842579421526,
"grad_norm": 0.5054973123174826,
"learning_rate": 8.45360254395158e-06,
"loss": 0.0509,
"step": 1049
},
{
"epoch": 0.9957325746799431,
"grad_norm": 0.6511826899131821,
"learning_rate": 8.449607900976056e-06,
"loss": 0.0496,
"step": 1050
},
{
"epoch": 0.9966808914177335,
"grad_norm": 0.34335574918053036,
"learning_rate": 8.445609051655497e-06,
"loss": 0.0322,
"step": 1051
},
{
"epoch": 0.9976292081555239,
"grad_norm": 0.5324023086103392,
"learning_rate": 8.441606000865978e-06,
"loss": 0.0465,
"step": 1052
},
{
"epoch": 0.9985775248933144,
"grad_norm": 0.3971741987281817,
"learning_rate": 8.437598753488693e-06,
"loss": 0.0316,
"step": 1053
},
{
"epoch": 0.9995258416311048,
"grad_norm": 0.4702644191912913,
"learning_rate": 8.43358731440996e-06,
"loss": 0.0424,
"step": 1054
},
{
"epoch": 0.9995258416311048,
"eval_loss": 0.05579984560608864,
"eval_runtime": 205.6016,
"eval_samples_per_second": 34.547,
"eval_steps_per_second": 1.08,
"step": 1054
}
],
"logging_steps": 1,
"max_steps": 3162,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.8007437509892506e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}